// fchat-rising/learn/dictionary/wordnet/wordnet.js
//
// 682 lines
// 19 KiB
// JavaScript

var DataFile, IndexFile, LRU, Promise, WordNet, async, fs, path,
slice = [].slice;
IndexFile = require('./index_file');
DataFile = require('./data_file');
async = require('async');
Promise = require('bluebird');
path = require('path');
fs = require('fs');
LRU = require('lru-cache');
WordNet = (function() {
var _forms, _loadExceptions, _validForms, _validFormsWithExceptions, exceptions, forms, tokenDetach, unique;
/**
 * Build a WordNet reader over a dictionary directory.
 *
 * @param {string|Object} [options] Either the dictionary directory itself, or
 *   an options object. Recognised keys:
 *   - `dataDir`: dictionary directory; when absent the path exported by the
 *     `wndb-with-exceptions` package is used (and written back to `options`).
 *   - `cache`: falsy disables caching; `true` means `{ max: 2000 }`; an object
 *     exposing a `get` function is used as the cache directly; any other
 *     value is handed to `LRU(...)`.
 * @throws Rethrows the `require` failure when no `dataDir` is given and
 *   `wndb-with-exceptions` cannot be loaded.
 */
function WordNet(options) {
  var wndb;
  var usableAsCache;

  if (typeof options === 'string') {
    options = { dataDir: options };
  } else if (options == null) {
    options = {};
  }

  if (options.dataDir == null) {
    try {
      wndb = require('wndb-with-exceptions');
    } catch (loadFailure) {
      console.error("Please 'npm install wndb-with-exceptions' before using WordNet module or specify a dict directory.");
      throw loadFailure;
    }
    options.dataDir = wndb.path;
  }

  if (options.cache) {
    if (options.cache === true) {
      options.cache = { max: 2000 };
    }
    usableAsCache = typeof options.cache === 'object' && typeof options.cache.get === 'function';
    this.cache = usableAsCache ? options.cache : LRU(options.cache);
  } else {
    this.cache = null;
  }

  this.path = options.dataDir;

  // Index files first, then data files, one of each per part of speech.
  this.nounIndex = new IndexFile(this.path, 'noun');
  this.verbIndex = new IndexFile(this.path, 'verb');
  this.adjIndex = new IndexFile(this.path, 'adj');
  this.advIndex = new IndexFile(this.path, 'adv');
  this.nounData = new DataFile(this.path, 'noun');
  this.verbData = new DataFile(this.path, 'verb');
  this.adjData = new DataFile(this.path, 'adj');
  this.advData = new DataFile(this.path, 'adv');

  // Index/data pairs tagged with the one-letter part-of-speech code.
  this.allFiles = [
    { index: this.nounIndex, data: this.nounData, pos: 'n' },
    { index: this.verbIndex, data: this.verbData, pos: 'v' },
    { index: this.adjIndex, data: this.adjData, pos: 'a' },
    { index: this.advIndex, data: this.advData, pos: 'r' }
  ];
}
/**
 * Fetch the record stored at `synsetOffset` in the data file for `pos`.
 *
 * When a cache is configured, a truthy cached value under
 * "get:<offset>:<pos>" is returned without touching the data file, and
 * error-free reads are stored under that key.
 *
 * @param {*} synsetOffset Offset forwarded to the data file's `get`.
 * @param {string} pos Part-of-speech code understood by `getDataFile`.
 * @param {Function} callback Invoked with `this` set to the WordNet instance.
 *   A callback declaring exactly one parameter receives `(result)`; any
 *   other arity receives `(err, result)`.
 */
WordNet.prototype.get = function(synsetOffset, pos, callback) {
  var self = this;
  var cacheKey;
  var cached;

  // Single-parameter callbacks only ever see the value, never the error.
  function respond(err, value) {
    if (callback.length === 1) {
      return callback.call(self, value);
    }
    return callback.call(self, err, value);
  }

  if (this.cache) {
    cacheKey = "get:" + synsetOffset + ":" + pos;
    cached = self.cache.get(cacheKey);
    if (cached) {
      return respond(null, cached);
    }
  }

  return self.getDataFile(pos).get(synsetOffset, function(err, record) {
    // cacheKey is only set when caching is enabled.
    if (cacheKey && err == null) {
      self.cache.set(cacheKey, record);
    }
    return respond(err, record);
  });
};
/**
 * Promise-returning form of `get`.
 *
 * @param {*} synsetOffset Offset forwarded to `get`.
 * @param {string} pos Part-of-speech code forwarded to `get`.
 * @returns {Promise} Resolves with the record; rejects with the error `get`
 *   reports.
 */
WordNet.prototype.getAsync = function(synsetOffset, pos) {
  var self = this;

  function executor(resolve, reject) {
    // Two declared parameters so `get` uses its (err, result) convention.
    return self.get(synsetOffset, pos, function(err, data) {
      return err != null ? reject(err) : resolve(data);
    });
  }

  return new Promise(executor);
};
/**
 * Look up every record for a word, optionally restricted to one part of
 * speech.
 *
 * @param {string} input "word" or "word#pos". The word is lower-cased and
 *   runs of whitespace become underscores before the index search.
 * @param {Function} callback Invoked with `this` set to the WordNet instance.
 *   On success a callback declaring exactly one parameter receives
 *   `(results)`, any other arity `(null, results)`. A failure reported by
 *   `lookupFromFiles` is passed as the only argument.
 *
 * With a cache configured, results are stored under "lookup:<input>" and a
 * truthy cached value short-circuits the search.
 */
WordNet.prototype.lookup = function(input, callback) {
  var self = this;
  var parts = input.split('#');
  var pos = parts[1];
  var normalized = parts[0].toLowerCase().replace(/\s+/g, '_');
  var cacheKey;
  var cached;
  var candidates;

  function deliver(value) {
    if (callback.length === 1) {
      return callback.call(self, value);
    }
    return callback.call(self, null, value);
  }

  if (this.cache) {
    cacheKey = "lookup:" + input;
    cached = self.cache.get(cacheKey);
    if (cached) {
      return deliver(cached);
    }
  }

  if (pos) {
    candidates = self.allFiles.filter(function(file) {
      return file.pos === pos;
    });
  } else {
    // Copy: lookupFromFiles consumes the list it is given.
    candidates = self.allFiles.slice();
  }

  return self.lookupFromFiles(candidates, [], normalized, function(err, results) {
    if (err != null) {
      return callback.call(self, err);
    }
    if (cacheKey) {
      self.cache.set(cacheKey, results);
    }
    return deliver(results);
  });
};
/**
 * Promise-returning form of `lookup`.
 *
 * @param {string} input "word" or "word#pos", forwarded to `lookup`.
 * @param {*} [callback] Accepted for signature compatibility; never used.
 * @returns {Promise} Resolves with the lookup results; rejects with the
 *   error `lookup` reports.
 */
WordNet.prototype.lookupAsync = function(input, callback) {
  var self = this;

  function executor(resolve, reject) {
    // Two declared parameters so `lookup` uses its (err, results) convention.
    return self.lookup(input, function(err, data) {
      return err != null ? reject(err) : resolve(data);
    });
  }

  return new Promise(executor);
};
/**
 * Resolve a fully qualified sense key to a single record.
 *
 * @param {string} input "word#pos#senseNumber"; the sense number is 1-based
 *   and parsed as a decimal integer.
 * @param {Function} callback Invoked with `this` set to the WordNet instance.
 *   On success a callback declaring exactly one parameter receives
 *   `(result)`, any other arity `(null, result)`. `result` is `undefined`
 *   when the lookup returned fewer entries than the sense number. A failure
 *   reported by `lookupFromFiles` is passed as the only argument.
 * @throws {Error} Synchronously, when the sense number is missing, not an
 *   integer, or less than 1 (only reached when the cache did not answer).
 *
 * With a cache configured, results are stored under "findSense:<input>" and
 * a truthy cached value short-circuits the search.
 */
WordNet.prototype.findSense = function(input, callback) {
  var wordnet = this;
  var parts = input.split('#');
  var word = parts[0];
  var pos = parts[1];
  var senseNumber = parts[2];
  var query, hit, sense, lword, selectedFiles;

  if (this.cache) {
    query = "findSense:" + input;
    if (hit = wordnet.cache.get(query)) {
      if (callback.length === 1) {
        return callback.call(wordnet, hit);
      } else {
        return callback.call(wordnet, null, hit);
      }
    }
  }

  // Explicit radix: without it a value such as "0x2" is read as hexadecimal.
  sense = parseInt(senseNumber, 10);
  if (Number.isNaN(sense)) {
    throw new Error("Sense number should be an integer");
  } else if (sense < 1) {
    throw new Error("Sense number should be a positive integer");
  }

  lword = word.toLowerCase().replace(/\s+/g, '_');
  selectedFiles = wordnet.allFiles.filter(function(file) {
    return file.pos === pos;
  });

  return wordnet.lookupFromFiles(selectedFiles, [], lword, function(err, response) {
    var result;
    if (err != null) {
      return callback.call(wordnet, err);
    }
    // Sense numbers are 1-based positions in the lookup results.
    result = response[sense - 1];
    if (query) {
      wordnet.cache.set(query, result);
    }
    if (callback.length === 1) {
      return callback.call(wordnet, result);
    } else {
      return callback.call(wordnet, null, result);
    }
  });
};
/**
 * Promise-returning form of `findSense`.
 *
 * @param {string} input "word#pos#senseNumber", forwarded to `findSense`.
 * @returns {Promise} Resolves with the selected record; rejects with the
 *   error `findSense` reports. Because `findSense` runs inside the promise
 *   executor, anything it throws synchronously also becomes a rejection.
 */
WordNet.prototype.findSenseAsync = function(input) {
  var self = this;

  function executor(resolve, reject) {
    // Two declared parameters so `findSense` uses its (err, result) convention.
    return self.findSense(input, function(err, data) {
      return err != null ? reject(err) : resolve(data);
    });
  }

  return new Promise(executor);
};
/**
 * List the sense keys ("word#pos#n") for a word.
 *
 * @param {string} input "word" or "word#pos", forwarded unchanged to
 *   `lookup`; the word part is used as the prefix of every key.
 * @param {Function} callback Invoked with `this` set to the WordNet instance.
 *   On success a callback declaring exactly one parameter receives
 *   `(senses)`, any other arity `(null, senses)`. A lookup failure is passed
 *   as the only argument.
 *
 * Keys are numbered per part of speech in lookup order, starting at 1.
 * Records tagged 's' are counted and labelled as 'a'. With a cache
 * configured, the list is stored under "querySense:<input>".
 */
WordNet.prototype.querySense = function(input, callback) {
  var self = this;
  var word = input.split('#')[0];
  var cacheKey;
  var cached;

  function deliver(value) {
    if (callback.length === 1) {
      return callback.call(self, value);
    }
    return callback.call(self, null, value);
  }

  if (this.cache) {
    cacheKey = "querySense:" + input;
    cached = self.cache.get(cacheKey);
    if (cached) {
      return deliver(cached);
    }
  }

  return self.lookup(input, function(err, results) {
    var nextNumber;
    var keys;
    if (err != null) {
      return callback.call(self, err);
    }
    nextNumber = {};
    keys = results.map(function(record) {
      var tag = record.pos === 's' ? 'a' : record.pos;
      if (nextNumber[tag] == null) {
        nextNumber[tag] = 1;
      }
      return word + "#" + tag + "#" + nextNumber[tag]++;
    });
    if (cacheKey) {
      self.cache.set(cacheKey, keys);
    }
    return deliver(keys);
  });
};
/**
 * Promise-returning form of `querySense`.
 *
 * @param {string} input "word" or "word#pos", forwarded to `querySense`.
 * @returns {Promise} Resolves with the list of sense keys; rejects with the
 *   error `querySense` reports.
 */
WordNet.prototype.querySenseAsync = function(input) {
  var self = this;

  function executor(resolve, reject) {
    // Two declared parameters so `querySense` uses its (err, senses) convention.
    return self.querySense(input, function(err, data) {
      return err != null ? reject(err) : resolve(data);
    });
  }

  return new Promise(executor);
};
/**
 * Search the given index files one after another, appending the data
 * records of every hit to `results`.
 *
 * `files` is consumed from the end (each entry is popped), so the last entry
 * is searched first. The `err` value reported by an index lookup is not
 * inspected; only the presence of a record matters.
 *
 * @param {Array} files Entries with `index` and `data` members; emptied by
 *   this call.
 * @param {Array} results Accumulator the records are pushed onto.
 * @param {string} word Term handed to each index's `lookup`.
 * @param {Function} callback Called as `(null, results)` with `this` set to
 *   the WordNet instance once no files remain.
 */
WordNet.prototype.lookupFromFiles = function(files, results, word, callback) {
  var self = this;
  var current;

  function searchRemaining() {
    return self.lookupFromFiles(files, results, word, callback);
  }

  if (files.length === 0) {
    return callback.call(self, null, results);
  }

  current = files.pop();
  return current.index.lookup(word, function(err, record) {
    if (!record) {
      return searchRemaining();
    }
    return self.pushResults(current.data, results, record.synsetOffset, searchRemaining);
  });
};
/**
 * Read the data records for a list of offsets and append them to `results`.
 *
 * Offsets are read from the last to the first, one at a time, matching the
 * order produced by the previous pop-based implementation. Unlike that
 * implementation, `offsets` is left untouched: the array belongs to the
 * caller's index record and must not be emptied as a side effect.
 *
 * The `err` value reported by `data.get` is not inspected; whatever record
 * value accompanies it is pushed as-is.
 *
 * @param {Object} data Object exposing `get(offset, cb)`.
 * @param {Array} results Accumulator the records are pushed onto.
 * @param {Array} offsets Offsets to read; not modified.
 * @param {Function} callback Called with `(results)` after the last read.
 */
WordNet.prototype.pushResults = function(data, results, offsets, callback) {
  var remaining = offsets.length;

  function readNext() {
    if (remaining === 0) {
      return callback(results);
    }
    remaining -= 1;
    return data.get(offsets[remaining], function(err, record) {
      results.push(record);
      return readNext();
    });
  }

  return readNext();
};
/**
 * Work through `results`, loading the records referenced by each entry's
 * `ptrs` into `synonyms`.
 *
 * One entry is popped from the end of `results` per call and its pointers
 * are handed to `loadSynonyms`, which calls back into this method once they
 * are exhausted. `results` is therefore consumed.
 *
 * @param {Array} synonyms Accumulator for the loaded records.
 * @param {Array} results Entries exposing `ptrs`; emptied by this call.
 * @param {Function} callback Called with `(synonyms)` when `results` is empty.
 */
WordNet.prototype.loadResultSynonyms = function(synonyms, results, callback) {
  var entry;

  if (results.length <= 0) {
    return callback(synonyms);
  }

  entry = results.pop();
  return this.loadSynonyms(synonyms, results, entry.ptrs, callback);
};
/**
 * Load the records referenced by `ptrs` into `synonyms`, then continue with
 * the remaining `results` via `loadResultSynonyms`.
 *
 * Pointers are followed from the last to the first, one at a time, matching
 * the order of the previous pop-based implementation. `ptrs` itself is not
 * modified: it belongs to a record that `get` may have cached, and emptying
 * it would strip the pointers from every later read of that record.
 *
 * @param {Array} synonyms Accumulator for the loaded records.
 * @param {Array} results Remaining entries, forwarded to `loadResultSynonyms`.
 * @param {Array} ptrs Pointers exposing `synsetOffset` and `pos`; not modified.
 * @param {Function} callback Forwarded to `loadResultSynonyms`.
 */
WordNet.prototype.loadSynonyms = function(synonyms, results, ptrs, callback) {
  var wordnet = this;
  var remaining = ptrs.length;

  function followNext() {
    var ptr;
    if (remaining <= 0) {
      return wordnet.loadResultSynonyms(synonyms, results, callback);
    }
    remaining -= 1;
    ptr = ptrs[remaining];
    // One declared parameter: `get` then passes the record only.
    return wordnet.get(ptr.synsetOffset, ptr.pos, function(result) {
      synonyms.push(result);
      return followNext();
    });
  }

  return followNext();
};
/**
 * Look up a word and load the records referenced by the pointers of every
 * result.
 *
 * @param {string} word "word" or "word#pos", forwarded to `lookup`.
 * @param {Function} callback Called with `(synonyms)`.
 */
WordNet.prototype.lookupSynonyms = function(word, callback) {
  var wordnet = this;
  // One declared parameter: `lookup` then passes the results only.
  return wordnet.lookup(word, function(results) {
    // loadResultSynonyms consumes the list it is given; hand it a copy so the
    // array `lookup` may have stored in the cache is not emptied.
    return wordnet.loadResultSynonyms([], results.slice(), callback);
  });
};
/**
 * Load the records referenced by the pointers of one synset.
 *
 * Accepts either `(synsetOffset, pos, callback)` or `(record, callback)`
 * where `record` carries truthy `synsetOffset` and `pos` members.
 *
 * The callback receives `(synonyms)`.
 */
WordNet.prototype.getSynonyms = function() {
  var self = this;
  var first = arguments[0];
  var second = arguments[1];
  // A third argument, when truthy, is the callback; otherwise the second is.
  var callback = arguments[2] || second;
  var pos = first.pos || second;
  var synsetOffset = first.synsetOffset || first;

  // One declared parameter: `get` then passes the record only.
  return this.get(synsetOffset, pos, function(record) {
    return self.loadSynonyms([], [], record.ptrs, callback);
  });
};
/**
 * Map a part-of-speech code to the matching data file.
 *
 * @param {string} pos 'n', 'v', 'a', 's' or 'r'; 's' shares the adjective
 *   file with 'a'.
 * @returns The data file, or `undefined` for any other code.
 */
WordNet.prototype.getDataFile = function(pos) {
  if (pos === 'n') {
    return this.nounData;
  }
  if (pos === 'v') {
    return this.verbData;
  }
  if (pos === 'a' || pos === 's') {
    return this.adjData;
  }
  if (pos === 'r') {
    return this.advData;
  }
};
// Exception-list files read by _loadExceptions, each paired with the
// one-letter part-of-speech code its entries are filed under.
exceptions = [
  ["noun.exc", 'n'],
  ["verb.exc", 'v'],
  ["adj.exc", 'a'],
  ["adv.exc", 'r']
].map(function(entry) {
  return { name: entry[0], pos: entry[1] };
});
/**
 * Read the exception-list files from `wordnet.path` and publish them on
 * `WordNet.prototype.exceptions` as `{ pos: { inflected: [base, ...] } }`.
 *
 * The prototype property (shared by all instances) is set to 'pending' while
 * the files are being read. Each non-empty line is split on single spaces:
 * the first term is the key, the remaining terms are appended to its list.
 *
 * If any file cannot be read, the shared state is set to an empty map rather
 * than being left on 'pending', so callers that poll the state can proceed
 * without exception data, and the error is passed to `callback`.
 *
 * @param {Object} wordnet Instance whose `path` holds the dictionary directory.
 * @param {Function} callback Called with no arguments on success, or with the
 *   read error on failure.
 */
_loadExceptions = function(wordnet, callback) {
  var loadFile;
  WordNet.prototype.exceptions = 'pending';

  loadFile = function(exception, done) {
    var fullPath = path.join(wordnet.path, exception.name);
    return fs.readFile(fullPath, function(err, data) {
      var table, lines, terms, key, i;
      if (err) {
        return done(err);
      }
      table = {};
      // Accept both LF and CRLF line endings.
      lines = data.toString().split(/\r?\n/);
      for (i = 0; i < lines.length; i++) {
        if (lines[i].length > 0) {
          terms = lines[i].split(' ');
          key = terms[0];
          if (table[key] == null) {
            table[key] = [];
          }
          Array.prototype.push.apply(table[key], slice.call(terms, 1));
        }
      }
      return done(null, {
        pos: exception.pos,
        data: table
      });
    });
  };

  return async.map(exceptions, loadFile, function(err, results) {
    // Kept in a local so the file list in `exceptions` is not overwritten.
    var loaded = {};
    var i;
    if (err) {
      WordNet.prototype.exceptions = loaded;
      return callback(err);
    }
    for (i = 0; i < results.length; i++) {
      loaded[results[i].pos] = results[i].data;
    }
    WordNet.prototype.exceptions = loaded;
    return callback();
  });
};
/**
 * Close every index and data file held by this instance, index files first.
 *
 * @returns Whatever the last call, `advData.close()`, returns.
 */
WordNet.prototype.close = function() {
  var self = this;
  var outcome;
  [
    'nounIndex', 'verbIndex', 'adjIndex', 'advIndex',
    'nounData', 'verbData', 'adjData', 'advData'
  ].forEach(function(member) {
    outcome = self[member].close();
  });
  return outcome;
};
/**
 * Return the items of `a` with duplicates removed, keeping first occurrences
 * in their original order. Items are compared by their string form.
 *
 * The membership map has no prototype, so strings that happen to name
 * inherited object members (for example "constructor") are not mistaken for
 * already-seen items.
 *
 * @param {Array} a Items to de-duplicate; not modified.
 * @returns {Array} A new array of the distinct items.
 */
unique = function(a) {
  var seen = Object.create(null);
  return a.filter(function(item) {
    if (seen[item]) {
      return false;
    }
    seen[item] = true;
    return true;
  });
};
tokenDetach = function(string) {
var detach, length, pos, ref, sense, word;
ref = string.split('#'), word = ref[0], pos = ref[1], sense = ref[2];
detach = [word];
length = word.length;
switch (pos) {
case 'n':
if (word.endsWith("s")) {
detach.push(word.substring(0, length - 1));
}
if (word.endsWith("ses")) {
detach.push(word.substring(0, length - 2));
}
if (word.endsWith("xes")) {
detach.push(word.substring(0, length - 2));
}
if (word.endsWith("zes")) {
detach.push(word.substring(0, length - 2));
}
if (word.endsWith("ches")) {
detach.push(word.substring(0, length - 2));
}
if (word.endsWith("shes")) {
detach.push(word.substring(0, length - 2));
}
if (word.endsWith("men")) {
detach.push(word.substring(0, length - 3) + "man");
}
if (word.endsWith("ies")) {
detach.push(word.substring(0, length - 3) + "y");
}
break;
case 'v':
if (word.endsWith("s")) {
detach.push(word.substring(0, length - 1));
}
if (word.endsWith("ies")) {
detach.push(word.substring(0, length - 3) + "y");
}
if (word.endsWith("es")) {
detach.push(word.substring(0, length - 2));
}
if (word.endsWith("ed")) {
detach.push(word.substring(0, length - 1));
}
if (word.endsWith("ed")) {
detach.push(word.substring(0, length - 2));
}
if (word.endsWith("ing")) {
detach.push(word.substring(0, length - 3) + "e");
}
if (word.endsWith("ing")) {
detach.push(word.substring(0, length - 3));
}
break;
case 'r':
if (word.endsWith("er")) {
detach.push(word.substring(0, length - 2));
}
if (word.endsWith("er")) {
detach.push(word.substring(0, length - 1));
}
if (word.endsWith("est")) {
detach.push(word.substring(0, length - 3));
}
if (word.endsWith("est")) {
detach.push(word.substring(0, length - 2));
}
}
return unique(detach);
};
/**
 * List candidate base forms of `word` for one part of speech.
 *
 * - If the lower-cased word has an entry in `wordnet.exceptions[pos]`, the
 *   result is the word followed by that entry.
 * - A single token is delegated to `tokenDetach`.
 * - For several tokens (separated by spaces or underscores) every token is
 *   expanded recursively and all combinations are returned, joined with '_',
 *   with the first token's alternatives cycling fastest.
 *
 * @param {Object} wordnet Object whose `exceptions` map has been loaded.
 * @param {string} word Word or collocation.
 * @param {string} pos Part-of-speech code.
 * @returns {Array<string>} Candidate forms (not validated against the
 *   dictionary).
 */
_forms = function(wordnet, word, pos) {
  var lowered = word.toLowerCase();
  var posExceptions = wordnet.exceptions[pos];
  var irregular = posExceptions != null ? posExceptions[lowered] : void 0;
  var parts, perToken, cursor, combos, slot;

  if (irregular) {
    return [word].concat(irregular);
  }

  parts = word.split(/[ _]/g);
  if (parts.length === 1) {
    return tokenDetach(parts[0] + "#" + pos);
  }

  perToken = parts.map(function(part) {
    return _forms(wordnet, part, pos);
  });
  // One position counter per token, advanced like an odometer.
  cursor = parts.map(function() {
    return 0;
  });
  combos = [];

  do {
    combos.push(cursor.reduce(function(joined, choice, k) {
      return k === 0 ? perToken[0][choice] : joined + '_' + perToken[k][choice];
    }, void 0));

    // Bump the first counter; on overflow reset it and carry to the next.
    for (slot = 0; slot < parts.length; slot++) {
      cursor[slot] = cursor[slot] + 1;
      if (cursor[slot] < perToken[slot].length) {
        break;
      }
      cursor[slot] = 0;
    }
    // slot runs off the end once every counter has wrapped around.
  } while (slot < parts.length);

  return combos;
};
/**
 * Candidate base forms for a "word#pos" key, each re-tagged with the pos.
 *
 * @param {Object} wordnet Forwarded to `_forms`.
 * @param {string} string "word#pos"; anything after a second '#' is ignored.
 * @returns {Array<string>} One "form#pos" string per form from `_forms`.
 */
forms = function(wordnet, string) {
  var parts = string.split('#');
  var pos = parts[1];
  return _forms(wordnet, parts[0], pos).map(function(candidate) {
    return candidate + "#" + pos;
  });
};
/**
 * Find the candidate base forms of a word that actually resolve via
 * `wordnet.lookup`.
 *
 * With a pos in `string`, every form from `forms(...)` is looked up and those
 * with at least one result are returned in the order `forms` produced them,
 * regardless of the order in which the lookups complete.
 *
 * Without a pos (missing or empty), the word is tried as 'n', 'v', 'a' and
 * 'r' in turn and the results are concatenated; failures of individual
 * parts of speech are ignored. The per-pos key is rebuilt from the bare word,
 * so an input with an empty pos segment (such as "dog#") cannot recurse
 * indefinitely.
 *
 * @param {Object} wordnet Instance providing `lookup` and loaded exceptions.
 * @param {string} string "word" or "word#pos".
 * @param {Function} callback Called with `(err, validForms)`.
 */
_validForms = function(wordnet, string, callback) {
  var parts = string.split('#');
  var word = parts[0];
  var pos = parts[1];
  var candidates;

  if (!pos) {
    return async.reduce(['n', 'v', 'a', 'r'], [], function(collected, tag, next) {
      return _validForms(wordnet, word + "#" + tag, function(err, value) {
        if (value === void 0) {
          return next(null, collected);
        }
        return next(null, collected.concat(value));
      });
    }, function(err, result) {
      return callback(null, result);
    });
  }

  candidates = forms(wordnet, word + "#" + pos);
  // map keeps one flag per candidate, so the output order is that of
  // `candidates` rather than the order in which lookups finish.
  return async.map(candidates, function(term, done) {
    return wordnet.lookup(term, function(err, data) {
      if (err != null) {
        return done(err);
      }
      return done(null, data.length > 0);
    });
  }, function(err, flags) {
    var valid = candidates.filter(function(term, i) {
      return Boolean(flags && flags[i]);
    });
    return callback(err, valid);
  });
};
/**
 * Run `_validForms` once the exception lists are available.
 *
 * `wordnet.exceptions` acts as a three-state flag:
 * - `undefined`: nothing loaded yet, so start `_loadExceptions` and retry
 *   when it calls back;
 * - 'pending': a load is in progress, so retry on the next `setImmediate`;
 * - anything else: proceed with `_validForms`.
 *
 * @param {Object} wordnet WordNet instance.
 * @param {string} string "word" or "word#pos".
 * @param {Function} callback Forwarded to `_validForms`.
 */
_validFormsWithExceptions = function(wordnet, string, callback) {
  var state = wordnet.exceptions;

  if (state === void 0) {
    return _loadExceptions(wordnet, function() {
      return _validFormsWithExceptions(wordnet, string, callback);
    });
  }
  if (state === 'pending') {
    return setImmediate(_validFormsWithExceptions, wordnet, string, callback);
  }
  return _validForms(wordnet, string, callback);
};
/**
 * List the dictionary-backed base forms of a word.
 *
 * @param {string} string "word" or "word#pos".
 * @param {Function} callback Invoked with `this` set to the WordNet instance.
 *   A callback declaring exactly one parameter receives `(forms)`; any other
 *   arity receives `(err, forms)`, where `err` is `null` on success and the
 *   underlying error on failure.
 *
 * With a cache configured, successful results are stored under
 * "validForms:<string>" and a truthy cached value short-circuits the search.
 * Failed results are never cached.
 */
WordNet.prototype.validForms = function(string, callback) {
  var wordnet = this;
  var query, hit;

  if (this.cache) {
    query = "validForms:" + string;
    if (hit = wordnet.cache.get(query)) {
      if (callback.length === 1) {
        return callback.call(wordnet, hit);
      } else {
        return callback.call(wordnet, null, hit);
      }
    }
  }

  return _validFormsWithExceptions(this, string, function(err, result) {
    var failed = err != null;
    // Only cache successful results; a failure should be retried next time.
    if (query && !failed) {
      wordnet.cache.set(query, result);
    }
    if (callback.length === 1) {
      return callback.call(wordnet, result);
    } else {
      return callback.call(wordnet, failed ? err : null, result);
    }
  });
};
/**
 * Promise-returning form of `validForms`.
 *
 * @param {string} string "word" or "word#pos", forwarded to `validForms`.
 * @returns {Promise} Resolves with the list of valid forms; rejects with the
 *   error `validForms` reports.
 */
WordNet.prototype.validFormsAsync = function(string) {
  var self = this;

  function executor(resolve, reject) {
    // Two declared parameters so `validForms` uses its (err, forms) convention.
    return self.validForms(string, function(err, data) {
      return err != null ? reject(err) : resolve(data);
    });
  }

  return new Promise(executor);
};
return WordNet;
})();
module.exports = WordNet;