682 lines
19 KiB
JavaScript
682 lines
19 KiB
JavaScript
var DataFile, IndexFile, LRU, Promise, WordNet, async, fs, path,
|
|
slice = [].slice;
|
|
|
|
IndexFile = require('./index_file');
|
|
|
|
DataFile = require('./data_file');
|
|
|
|
async = require('async');
|
|
|
|
Promise = require('bluebird');
|
|
|
|
path = require('path');
|
|
|
|
fs = require('fs');
|
|
|
|
LRU = require('lru-cache');
|
|
|
|
WordNet = (function() {
|
|
var _forms, _loadExceptions, _validForms, _validFormsWithExceptions, exceptions, forms, tokenDetach, unique;
|
|
|
|
/**
 * Builds a WordNet reader over a dictionary directory.
 *
 * @param {string|Object} [options] - Either the dictionary directory itself or
 *   an options object. Recognised keys: `dataDir` (dictionary directory; when
 *   absent the path exported by the `wndb-with-exceptions` package is used)
 *   and `cache` (falsy: no cache; `true`: LRU cache with max 2000 entries; an
 *   object exposing `get`: used as the cache directly; any other value is
 *   passed to `LRU` as its options).
 * @throws Rethrows the `require` failure when no `dataDir` is given and
 *   `wndb-with-exceptions` cannot be loaded.
 */
function WordNet(options) {
  var database, cacheSetting, directory;

  // A bare string is shorthand for { dataDir: <string> }.
  if (typeof options === 'string') {
    options = { dataDir: options };
  } else if (options == null) {
    options = {};
  }

  if (options.dataDir == null) {
    try {
      database = require('wndb-with-exceptions');
    } catch (loadFailure) {
      console.error("Please 'npm install wndb-with-exceptions' before using WordNet module or specify a dict directory.");
      throw loadFailure;
    }
    options.dataDir = database.path;
  }

  if (options.cache) {
    if (options.cache === true) {
      options.cache = { max: 2000 };
    }
    cacheSetting = options.cache;
    // Anything that already exposes get() is treated as a ready-made cache.
    if (typeof cacheSetting === 'object' && typeof cacheSetting.get === 'function') {
      this.cache = cacheSetting;
    } else {
      this.cache = LRU(cacheSetting);
    }
  } else {
    this.cache = null;
  }

  this.path = options.dataDir;
  directory = this.path;

  this.nounIndex = new IndexFile(directory, 'noun');
  this.verbIndex = new IndexFile(directory, 'verb');
  this.adjIndex = new IndexFile(directory, 'adj');
  this.advIndex = new IndexFile(directory, 'adv');

  this.nounData = new DataFile(directory, 'noun');
  this.verbData = new DataFile(directory, 'verb');
  this.adjData = new DataFile(directory, 'adj');
  this.advData = new DataFile(directory, 'adv');

  // Index/data pairs together with the part-of-speech tag used in "word#pos" queries.
  this.allFiles = [
    { index: this.nounIndex, data: this.nounData, pos: 'n' },
    { index: this.verbIndex, data: this.verbData, pos: 'v' },
    { index: this.adjIndex, data: this.adjData, pos: 'a' },
    { index: this.advIndex, data: this.advData, pos: 'r' }
  ];
}
|
|
|
|
/**
 * Fetches one synset record by offset and part of speech.
 *
 * Results are served from / stored in `this.cache` when a cache is configured
 * (key: "get:<offset>:<pos>"); failed reads are not cached.
 *
 * @param {*} synsetOffset - Offset handed to the data file's `get`.
 * @param {string} pos - Part-of-speech tag understood by `getDataFile`.
 * @param {Function} callback - Invoked with `this` bound to the WordNet
 *   instance. A callback declaring exactly one parameter receives `(result)`;
 *   any other callback receives `(err, result)`.
 */
WordNet.prototype.get = function(synsetOffset, pos, callback) {
  var self = this;
  var cacheKey, cached;

  // Dispatch on the callback's declared arity (legacy single-argument style).
  var respond = function(err, data) {
    if (callback.length === 1) {
      return callback.call(self, data);
    }
    return callback.call(self, err, data);
  };

  if (self.cache) {
    cacheKey = "get:" + synsetOffset + ":" + pos;
    cached = self.cache.get(cacheKey);
    if (cached) {
      return respond(null, cached);
    }
  }

  return self.getDataFile(pos).get(synsetOffset, function(err, result) {
    // cacheKey is only set when a cache exists.
    if (cacheKey && err == null) {
      self.cache.set(cacheKey, result);
    }
    return respond(err, result);
  });
};
|
|
|
|
/**
 * Promise-returning variant of `get`.
 *
 * @param {*} synsetOffset - Offset forwarded to `get`.
 * @param {string} pos - Part-of-speech tag forwarded to `get`.
 * @returns {Promise} Resolves with the record, rejects with the error `get` reports.
 */
WordNet.prototype.getAsync = function(synsetOffset, pos) {
  var self = this;
  return new Promise(function(resolve, reject) {
    // Two declared parameters so that get() uses the (err, data) convention.
    return self.get(synsetOffset, pos, function(err, data) {
      return err != null ? reject(err) : resolve(data);
    });
  });
};
|
|
|
|
/**
 * Looks a word up in the index files and collects its synset records.
 *
 * @param {string} input - "word" or "word#pos". Without a pos every file in
 *   `allFiles` is searched; with one, only files whose `pos` matches.
 *   The word is lower-cased and whitespace runs become "_" before searching.
 * @param {Function} callback - Called with `this` bound to the instance.
 *   On failure it receives `(err)`. On success a one-parameter callback
 *   receives `(results)`, any other receives `(null, results)`.
 */
WordNet.prototype.lookup = function(input, callback) {
  var self = this;
  var pieces = input.split('#');
  var word = pieces[0];
  var pos = pieces[1];
  var normalized = word.toLowerCase().replace(/\s+/g, '_');
  var cacheKey, cached, candidates;

  // Success delivery honours the callback's declared arity.
  var succeed = function(data) {
    if (callback.length === 1) {
      return callback.call(self, data);
    }
    return callback.call(self, null, data);
  };

  if (self.cache) {
    cacheKey = "lookup:" + input;
    cached = self.cache.get(cacheKey);
    if (cached) {
      return succeed(cached);
    }
  }

  // lookupFromFiles consumes the list it is given, so always hand it a fresh array.
  if (pos) {
    candidates = self.allFiles.filter(function(entry) {
      return entry.pos === pos;
    });
  } else {
    candidates = self.allFiles.slice();
  }

  return self.lookupFromFiles(candidates, [], normalized, function(err, results) {
    if (err != null) {
      return callback.call(self, err);
    }
    if (cacheKey) {
      self.cache.set(cacheKey, results);
    }
    return succeed(results);
  });
};
|
|
|
|
/**
 * Promise-returning variant of `lookup`.
 *
 * @param {string} input - Query forwarded to `lookup`.
 * @param {Function} [callback] - Accepted for signature compatibility; not used.
 * @returns {Promise} Resolves with the lookup results, rejects with its error.
 */
WordNet.prototype.lookupAsync = function(input, callback) {
  var self = this;
  return new Promise(function(resolve, reject) {
    return self.lookup(input, function(err, data) {
      return err != null ? reject(err) : resolve(data);
    });
  });
};
|
|
|
|
/**
 * Resolves a sense key of the form "word#pos#senseNumber" to a single record.
 *
 * The word is looked up in the files whose `pos` matches, and the record at
 * position `senseNumber - 1` of that result list is returned (undefined when
 * the list is shorter). Results are cached under "findSense:<input>" when a
 * cache is configured.
 *
 * @param {string} input - Sense key, e.g. "dog#n#1".
 * @param {Function} callback - Called with `this` bound to the instance.
 *   On failure it receives `(err)`. On success a one-parameter callback
 *   receives `(result)`, any other receives `(null, result)`.
 * @throws {Error} Synchronously, when the sense number is missing/not a
 *   decimal integer or is smaller than 1 (only checked on a cache miss).
 */
WordNet.prototype.findSense = function(input, callback) {
  var hit, lword, query, selectedFiles, sense, wordnet;
  var parts = input.split('#');
  var word = parts[0];
  var pos = parts[1];
  var senseNumber = parts[2];

  wordnet = this;

  if (this.cache) {
    query = "findSense:" + input;
    if (hit = wordnet.cache.get(query)) {
      if (callback.length === 1) {
        return callback.call(wordnet, hit);
      } else {
        return callback.call(wordnet, null, hit);
      }
    }
  }

  // Explicit radix: without it a "0x"-prefixed value would be read as hex
  // and silently accepted as a sense number.
  sense = parseInt(senseNumber, 10);
  if (Number.isNaN(sense)) {
    throw new Error("Sense number should be an integer");
  } else if (sense < 1) {
    throw new Error("Sense number should be a positive integer");
  }

  lword = word.toLowerCase().replace(/\s+/g, '_');
  selectedFiles = wordnet.allFiles.filter(function(file) {
    return file.pos === pos;
  });

  return wordnet.lookupFromFiles(selectedFiles, [], lword, function(err, response) {
    var result;
    if (err != null) {
      return callback.call(wordnet, err);
    }
    // Sense numbers are 1-based.
    result = response[sense - 1];
    if (query) {
      wordnet.cache.set(query, result);
    }
    if (callback.length === 1) {
      return callback.call(wordnet, result);
    } else {
      return callback.call(wordnet, null, result);
    }
  });
};
|
|
|
|
/**
 * Promise-returning variant of `findSense`.
 *
 * @param {string} input - Sense key forwarded to `findSense`.
 * @returns {Promise} Resolves with the record; rejects with the reported
 *   error (an exception thrown by `findSense` surfaces through the promise
 *   executor).
 */
WordNet.prototype.findSenseAsync = function(input) {
  var self = this;
  return new Promise(function(resolve, reject) {
    return self.findSense(input, function(err, data) {
      return err != null ? reject(err) : resolve(data);
    });
  });
};
|
|
|
|
/**
 * Lists the sense keys ("word#pos#n") available for a query.
 *
 * Runs `lookup(input)` and numbers the returned records per part of speech,
 * starting at 1 and in result order. Records tagged 's' are counted and
 * labelled as 'a'. The word part of each key is the word as given in `input`.
 * Results are cached under "querySense:<input>" when a cache is configured.
 *
 * @param {string} input - "word" or "word#pos".
 * @param {Function} callback - Called with `this` bound to the instance.
 *   On failure it receives `(err)`. On success a one-parameter callback
 *   receives `(senses)`, any other receives `(null, senses)`.
 */
WordNet.prototype.querySense = function(input, callback) {
  var self = this;
  var word = input.split('#')[0];
  var cacheKey, cached;

  var succeed = function(data) {
    if (callback.length === 1) {
      return callback.call(self, data);
    }
    return callback.call(self, null, data);
  };

  if (self.cache) {
    cacheKey = "querySense:" + input;
    cached = self.cache.get(cacheKey);
    if (cached) {
      return succeed(cached);
    }
  }

  return self.lookup(input, function(err, results) {
    var counters, senses;
    if (err != null) {
      return callback.call(self, err);
    }

    counters = {};
    senses = results.map(function(record) {
      var tag = record.pos === 's' ? 'a' : record.pos;
      var label;
      if (counters[tag] == null) {
        counters[tag] = 1;
      }
      label = word + "#" + tag + "#" + counters[tag];
      counters[tag] += 1;
      return label;
    });

    if (cacheKey) {
      self.cache.set(cacheKey, senses);
    }
    return succeed(senses);
  });
};
|
|
|
|
/**
 * Promise-returning variant of `querySense`.
 *
 * @param {string} input - Query forwarded to `querySense`.
 * @returns {Promise} Resolves with the list of sense keys, rejects with the reported error.
 */
WordNet.prototype.querySenseAsync = function(input) {
  var self = this;
  return new Promise(function(resolve, reject) {
    return self.querySense(input, function(err, data) {
      return err != null ? reject(err) : resolve(data);
    });
  });
};
|
|
|
|
/**
 * Recursively searches a list of index/data file pairs for a word.
 *
 * Files are taken from the END of `files` (the array is consumed). For every
 * index hit, the record's `synsetOffset` list is resolved through
 * `pushResults` into `results`. An error reported by an index lookup is not
 * propagated; the file is simply treated as having no match.
 *
 * @param {Array} files - Entries with `index` and `data` members; emptied by this call.
 * @param {Array} results - Accumulator that receives the data records.
 * @param {string} word - Already-normalised word to search for.
 * @param {Function} callback - Called as `(null, results)` with `this` bound
 *   to the instance once every file has been processed.
 */
WordNet.prototype.lookupFromFiles = function(files, results, word, callback) {
  var self = this;
  var current, proceed;

  if (files.length === 0) {
    return callback.call(self, null, results);
  }

  current = files.pop();
  // Continue with whatever files are left.
  proceed = function() {
    return self.lookupFromFiles(files, results, word, callback);
  };

  return current.index.lookup(word, function(err, record) {
    if (!record) {
      return proceed();
    }
    return self.pushResults(current.data, results, record.synsetOffset, proceed);
  });
};
|
|
|
|
/**
 * Resolves a list of offsets against one data file, appending each record to `results`.
 *
 * Offsets are taken from the END of `offsets` (the array is consumed), one
 * read at a time. The error argument of each read is ignored and whatever
 * record value was delivered is pushed.
 *
 * @param {Object} data - Data file exposing `get(offset, cb)`.
 * @param {Array} results - Accumulator that receives the records.
 * @param {Array} offsets - Offsets still to resolve; emptied by this call.
 * @param {Function} callback - Called as `(results)` when no offsets remain.
 */
WordNet.prototype.pushResults = function(data, results, offsets, callback) {
  var self = this;

  if (offsets.length === 0) {
    return callback(results);
  }

  return data.get(offsets.pop(), function(err, entry) {
    results.push(entry);
    return self.pushResults(data, results, offsets, callback);
  });
};
|
|
|
|
/**
 * Works through a queue of synset records, loading the synsets their pointers refer to.
 *
 * Takes the last record off `results` and hands its pointers to
 * `loadSynonyms`, which calls back into this method once those pointers are
 * exhausted. When `results` is empty the accumulated list is delivered.
 *
 * @param {Array} synonyms - Accumulator of loaded synsets.
 * @param {Array} results - Work queue of records with a `ptrs` array; consumed
 *   by this call, so pass a copy if the array is shared.
 * @param {Function} callback - Called as `(synonyms)` when the queue is empty.
 */
WordNet.prototype.loadResultSynonyms = function(synonyms, results, callback) {
  var result, wordnet;

  wordnet = this;

  if (results.length > 0) {
    result = results.pop();
    // loadSynonyms pops its pointer list until it is empty. The record may be
    // the very object stored in the cache, so give it a copy rather than
    // letting it drain the record's own `ptrs`.
    return wordnet.loadSynonyms(synonyms, results, result.ptrs.slice(), callback);
  } else {
    return callback(synonyms);
  }
};
|
|
|
|
/**
 * Loads the synsets referenced by a pointer list, one at a time.
 *
 * Pointers are taken from the END of `ptrs` (the array is consumed). Each is
 * fetched with `get(ptr.synsetOffset, ptr.pos)` using a single-parameter
 * callback, so the fetched value is appended to `synonyms` without any error
 * information. Once `ptrs` is empty, control returns to `loadResultSynonyms`
 * to process the remaining `results`.
 *
 * @param {Array} synonyms - Accumulator of loaded synsets.
 * @param {Array} results - Remaining records, passed through to `loadResultSynonyms`.
 * @param {Array} ptrs - Pointers with `synsetOffset` and `pos`; emptied by this call.
 * @param {Function} callback - Final callback, passed through unchanged.
 */
WordNet.prototype.loadSynonyms = function(synonyms, results, ptrs, callback) {
  var self = this;
  var pointer;

  if (ptrs.length === 0) {
    return self.loadResultSynonyms(synonyms, results, callback);
  }

  pointer = ptrs.pop();
  // One declared parameter: get() then delivers only the record.
  return this.get(pointer.synsetOffset, pointer.pos, function(synset) {
    synonyms.push(synset);
    return self.loadSynonyms(synonyms, results, ptrs, callback);
  });
};
|
|
|
|
/**
 * Looks a word up and loads every synset its results point to.
 *
 * @param {string} word - Query forwarded to `lookup` ("word" or "word#pos").
 * @param {Function} callback - Called as `(synonyms)` with the loaded synsets.
 *   If the lookup fails, an empty list is delivered.
 */
WordNet.prototype.lookupSynonyms = function(word, callback) {
  var wordnet;

  wordnet = this;

  // Single-parameter callback: lookup() delivers the results directly, or the
  // error object in their place when it fails.
  return wordnet.lookup(word, function(results) {
    // loadResultSynonyms empties the array it is given, and `results` is the
    // same array lookup() keeps in the cache, so work on a copy. Anything
    // that is not an array (i.e. a lookup error) yields no synonyms.
    var queue = Array.isArray(results) ? results.slice() : [];
    return wordnet.loadResultSynonyms([], queue, callback);
  });
};
|
|
|
|
/**
 * Loads the synsets a given synset points to.
 *
 * Two call shapes are accepted:
 *   getSynonyms(record, callback)            - `record` has `synsetOffset` and `pos`
 *   getSynonyms(synsetOffset, pos, callback)
 *
 * @returns Whatever `get` returns.
 * The callback is called as `(synonyms)` with the loaded synsets.
 */
WordNet.prototype.getSynonyms = function() {
  var callback, pos, synsetOffset, target, wordnet;

  wordnet = this;
  target = arguments[0];

  // Arguments are positional and depend on the call shape; see above.
  callback = arguments[2] ? arguments[2] : arguments[1];
  pos = target.pos ? target.pos : arguments[1];
  synsetOffset = target.synsetOffset ? target.synsetOffset : target;

  // Single-parameter callback: get() delivers only the record.
  return this.get(synsetOffset, pos, function(result) {
    // loadSynonyms empties the pointer list it is given, and `result` may be
    // the object held in the cache, so pass a copy of its `ptrs`.
    return wordnet.loadSynonyms([], [], result.ptrs.slice(), callback);
  });
};
|
|
|
|
/**
 * Maps a part-of-speech tag to the matching data file.
 *
 * @param {string} pos - 'n', 'v', 'a', 's' or 'r' ('a' and 's' share the adjective file).
 * @returns The data file for that tag, or undefined for any other value.
 */
WordNet.prototype.getDataFile = function(pos) {
  if (pos === 'n') {
    return this.nounData;
  }
  if (pos === 'v') {
    return this.verbData;
  }
  if (pos === 'a' || pos === 's') {
    return this.adjData;
  }
  if (pos === 'r') {
    return this.advData;
  }
  return void 0;
};
|
|
|
|
// Exception-list files that _loadExceptions reads from the data directory,
// each paired with the part-of-speech tag its entries are stored under.
exceptions = [
  { name: "noun.exc", pos: 'n' },
  { name: "verb.exc", pos: 'v' },
  { name: "adj.exc", pos: 'a' },
  { name: "adv.exc", pos: 'r' }
];
|
|
|
|
/**
 * Reads the exception-list files and publishes them on `WordNet.prototype.exceptions`.
 *
 * While the files are being read the property holds the string 'pending';
 * afterwards it holds a map from part-of-speech tag to a table of
 * `firstTerm -> [remaining terms]`, built from the space-separated lines of
 * each file. Note that the property lives on the prototype, so the tables are
 * shared by every instance regardless of which instance triggered the load.
 *
 * A file that cannot be read contributes no table; the tables that did load
 * are still published so callers waiting on 'pending' are released.
 *
 * @param {WordNet} wordnet - Instance whose `path` locates the files.
 * @param {Function} callback - Called with no arguments on success, or with
 *   the read error when at least one file failed.
 */
_loadExceptions = function(wordnet, callback) {
  var loadFile;

  WordNet.prototype.exceptions = 'pending';

  loadFile = function(exception, done) {
    var fullPath;

    fullPath = path.join(wordnet.path, exception.name);
    return fs.readFile(fullPath, function(err, data) {
      var k, lines, table, terms, first;

      if (err) {
        return done(err);
      }

      // Prototype-less table: entries are keyed by arbitrary words, and a
      // plain object would answer for "constructor", "toString", etc.
      table = Object.create(null);
      // Accept CRLF as well as LF so the last term never carries a "\r".
      lines = data.toString().split(/\r?\n/);
      for (k = 0; k < lines.length; k++) {
        if (lines[k].length > 0) {
          terms = lines[k].split(' ');
          first = terms[0];
          if (table[first] == null) {
            table[first] = [];
          }
          Array.prototype.push.apply(table[first], terms.slice(1));
        }
      }

      return done(null, {
        pos: exception.pos,
        data: table
      });
    });
  };

  return async.map(exceptions, loadFile, function(err, results) {
    var k, loaded, result;

    // Local map: the closure-level `exceptions` file list must stay intact.
    loaded = Object.create(null);
    results = results || [];
    for (k = 0; k < results.length; k++) {
      result = results[k];
      // Entries for files that failed to load are missing.
      if (result != null) {
        loaded[result.pos] = result.data;
      }
    }

    WordNet.prototype.exceptions = loaded;
    return err != null ? callback(err) : callback();
  });
};
|
|
|
|
/**
 * Closes all index and data files, indexes first.
 *
 * @returns The value returned by closing the adverb data file (the last one closed).
 */
WordNet.prototype.close = function() {
  var handles = [
    this.nounIndex, this.verbIndex, this.adjIndex, this.advIndex,
    this.nounData, this.verbData, this.adjData, this.advData
  ];
  var outcome, k;

  for (k = 0; k < handles.length; k++) {
    outcome = handles[k].close();
  }
  return outcome;
};
|
|
|
|
/**
 * Returns the items of `a` with duplicates removed, keeping first occurrences
 * in their original order. Items are compared by their string form.
 *
 * @param {Array} a - Items to de-duplicate.
 * @returns {Array} A new array without repeated items.
 */
unique = function(a) {
  // Prototype-less lookup table: with a plain object, words such as
  // "constructor" or "toString" would look already-seen and be dropped.
  var seen = Object.create(null);

  return a.filter(function(item) {
    if (seen[item]) {
      return false;
    }
    seen[item] = true;
    return true;
  });
};
|
|
|
|
/**
 * Generates candidate base forms of a single token by suffix detachment.
 *
 * The result always starts with the word itself, followed by one candidate
 * per matching rule (in rule order), with duplicates removed. Candidates are
 * not checked against the dictionary here.
 *
 * @param {string} string - "word#pos" (anything after a second '#' is ignored).
 *   Rules exist for 'n', 'v', 'a' and 'r'; any other pos yields just the word.
 * @returns {Array<string>} Candidate forms without a pos suffix.
 */
tokenDetach = function(string) {
  // Each rule is [suffix, number of trailing characters to drop, ending to append].
  var nounRules = [
    ["s", 1, ""], ["ses", 2, ""], ["xes", 2, ""], ["zes", 2, ""],
    ["ches", 2, ""], ["shes", 2, ""], ["men", 3, "man"], ["ies", 3, "y"]
  ];
  var verbRules = [
    ["s", 1, ""], ["ies", 3, "y"], ["es", 2, ""], ["ed", 1, ""],
    ["ed", 2, ""], ["ing", 3, "e"], ["ing", 3, ""]
  ];
  var comparativeRules = [
    ["er", 2, ""], ["er", 1, ""], ["est", 3, ""], ["est", 2, ""]
  ];

  var parts = string.split('#');
  var word = parts[0];
  var pos = parts[1];
  var length = word.length;
  var detach = [word];
  var rules, rule, k;

  switch (pos) {
    case 'n':
      rules = nounRules;
      break;
    case 'v':
      rules = verbRules;
      break;
    // The -er/-est rules are WordNet's adjective rules. They were previously
    // reachable only through 'r'; that tag is kept so existing callers see
    // the same candidates as before.
    case 'a':
    case 'r':
      rules = comparativeRules;
      break;
    default:
      rules = [];
  }

  for (k = 0; k < rules.length; k++) {
    rule = rules[k];
    if (word.endsWith(rule[0])) {
      detach.push(word.substring(0, length - rule[1]) + rule[2]);
    }
  }

  return unique(detach);
};
|
|
|
|
/**
 * Produces candidate base forms for a word or collocation.
 *
 * - If the lower-cased word has an entry in the exception table for `pos`,
 *   the result is the word followed by the entry's forms.
 * - A single token is expanded by `tokenDetach`.
 * - A collocation (tokens separated by ' ' or '_') is expanded token by token
 *   and every combination is emitted, joined with '_'. The first token's
 *   forms vary fastest.
 *
 * @param {Object} wordnet - Object whose `exceptions` maps pos -> exception table.
 * @param {string} word - Word or collocation, without a pos suffix.
 * @param {string} pos - Part-of-speech tag.
 * @returns {Array<string>} Candidate forms without a pos suffix.
 */
_forms = function(wordnet, word, pos) {
  var hasOwn = Object.prototype.hasOwnProperty;
  var lword = word.toLowerCase();
  var table, exception, tokens, variants, cursor, combos, colloc, slot, k;

  // Own-property checks only: both keys come from the query string, and an
  // inherited member such as "constructor" must not be taken for an entry.
  table = hasOwn.call(wordnet.exceptions, pos) ? wordnet.exceptions[pos] : void 0;
  exception = table != null && hasOwn.call(table, lword) ? table[lword] : void 0;
  if (exception) {
    return [word].concat(exception);
  }

  tokens = word.split(/[ _]/g);
  if (tokens.length === 1) {
    return tokenDetach(tokens[0] + "#" + pos);
  }

  variants = tokens.map(function(token) {
    return _forms(wordnet, token, pos);
  });
  cursor = tokens.map(function() {
    return 0;
  });
  combos = [];

  // Odometer over the per-token variant lists: cursor[k] selects the variant
  // currently used for token k.
  while (true) {
    colloc = variants[0][cursor[0]];
    for (k = 1; k < tokens.length; k++) {
      colloc = colloc + '_' + variants[k][cursor[k]];
    }
    combos.push(colloc);

    // Advance the lowest position that can still move; positions that wrap
    // are reset to 0 and carry into the next one.
    slot = 0;
    while (slot < tokens.length) {
      cursor[slot] = cursor[slot] + 1;
      if (cursor[slot] < variants[slot].length) {
        break;
      }
      cursor[slot] = 0;
      slot = slot + 1;
    }
    // Every position wrapped: all combinations have been produced.
    if (slot >= tokens.length) {
      break;
    }
  }

  return combos;
};
|
|
|
|
/**
 * Expands "word#pos" into candidate base forms, each re-tagged with the same pos.
 *
 * @param {Object} wordnet - Passed through to `_forms`.
 * @param {string} string - "word#pos" (a trailing "#sense" part is ignored).
 * @returns {Array<string>} Candidates in the form "candidate#pos".
 */
forms = function(wordnet, string) {
  var pieces = string.split('#');
  var tag = pieces[1];

  return _forms(wordnet, pieces[0], tag).map(function(candidate) {
    return candidate + "#" + tag;
  });
};
|
|
|
|
/**
 * Finds the candidate forms of a word that actually exist in the dictionary.
 *
 * With a pos, the candidates from `forms` are each looked up and those with
 * at least one result are kept (in lookup completion order). Without a pos,
 * the search is repeated for 'n', 'v', 'a' and 'r' in that order and the
 * results are concatenated; that path always reports a null error.
 *
 * @param {Object} wordnet - Object exposing `lookup` (and used by `forms`).
 * @param {string} string - "word" or "word#pos".
 * @param {Function} callback - Called as `(err, validForms)`.
 */
_validForms = function(wordnet, string, callback) {
  var parts = string.split('#');
  var word = parts[0];
  var pos = parts[1];
  var collect, possibleForms, filteredResults, check;

  if (!pos) {
    collect = function(previous, current, next) {
      // Build the per-pos query from the bare word, not from `string`: an
      // input with an empty pos ("dogs#") would otherwise become "dogs##n",
      // still have an empty pos, and recurse without end.
      return _validForms(wordnet, word + "#" + current, function(err, value) {
        if (value === void 0) {
          return next(null, previous);
        }
        return next(null, previous.concat(value));
      });
    };

    return async.reduce(['n', 'v', 'a', 'r'], [], collect, function(err, result) {
      return callback(null, result);
    });
  }

  possibleForms = forms(wordnet, word + "#" + pos);
  filteredResults = [];

  check = function(term, done) {
    return wordnet.lookup(term, function(err, data) {
      if (err != null) {
        return done(err);
      }
      if (data.length > 0) {
        filteredResults.push(term);
      }
      return done();
    });
  };

  return async.each(possibleForms, check, function(err) {
    return callback(err, filteredResults);
  });
};
|
|
|
|
/**
 * Runs `_validForms` once the exception tables are available.
 *
 * `wordnet.exceptions` acts as a three-state flag: undefined (not loaded yet:
 * start loading, then retry), the string 'pending' (a load is in flight:
 * retry on the next `setImmediate` tick), anything else (loaded: proceed).
 *
 * @param {Object} wordnet - WordNet instance.
 * @param {string} string - Query forwarded to `_validForms`.
 * @param {Function} callback - Forwarded to `_validForms`.
 */
_validFormsWithExceptions = function(wordnet, string, callback) {
  var state = wordnet.exceptions;

  if (state === void 0) {
    return _loadExceptions(wordnet, function() {
      return _validFormsWithExceptions(wordnet, string, callback);
    });
  }

  if (state === 'pending') {
    return setImmediate(_validFormsWithExceptions, wordnet, string, callback);
  }

  return _validForms(wordnet, string, callback);
};
|
|
|
|
/**
 * Finds the dictionary-valid base forms of a word.
 *
 * Successful results are cached under "validForms:<string>" when a cache is
 * configured; results of a failed run are not cached.
 *
 * @param {string} string - "word" or "word#pos".
 * @param {Function} callback - Called with `this` bound to the instance.
 *   A callback declaring exactly one parameter receives `(result)`; any other
 *   receives `(err, result)`, where `err` is null on success.
 */
WordNet.prototype.validForms = function(string, callback) {
  var hit, query, wordnet;

  wordnet = this;

  if (this.cache) {
    query = "validForms:" + string;
    if (hit = wordnet.cache.get(query)) {
      if (callback.length === 1) {
        return callback.call(wordnet, hit);
      } else {
        return callback.call(wordnet, null, hit);
      }
    }
  }

  return _validFormsWithExceptions(this, string, function(err, result) {
    var failed = err != null;

    // Same policy as get(): never cache the outcome of a failed run, and let
    // node-style callbacks see the error instead of a hard-coded null.
    if (query && !failed) {
      wordnet.cache.set(query, result);
    }
    if (callback.length === 1) {
      return callback.call(wordnet, result);
    } else {
      return callback.call(wordnet, failed ? err : null, result);
    }
  });
};
|
|
|
|
/**
 * Promise-returning variant of `validForms`.
 *
 * @param {string} string - Query forwarded to `validForms`.
 * @returns {Promise} Resolves with the valid forms; rejects when `validForms` reports an error.
 */
WordNet.prototype.validFormsAsync = function(string) {
  var self = this;
  return new Promise(function(resolve, reject) {
    return self.validForms(string, function(err, data) {
      return err != null ? reject(err) : resolve(data);
    });
  });
};
|
|
|
|
return WordNet;
|
|
|
|
})();
|
|
|
|
module.exports = WordNet;
|