'use strict';

/**
 * Typo is a JavaScript implementation of a spellchecker using hunspell-style
 * dictionaries.
 */

/**
 * Typo constructor.
 *
 * @param {String} [dictionary] The locale code of the dictionary being used. e.g.,
 *                              "en_US". This is only used to auto-load dictionaries.
 * @param {String} [affData]    The data from the dictionary's .aff file. If omitted
 *                              and the first argument is supplied, on the "chrome"
 *                              platform the .aff file is loaded from
 *                              lib/typo/dictionaries/[dictionary]/[dictionary].aff
 *                              On any other platform it is loaded from
 *                              [settings.dictionaryPath]/[dictionary]/[dictionary].aff
 * @param {String} [wordsData]  The data from the dictionary's .dic file. If omitted
 *                              and the first argument is supplied, on the "chrome"
 *                              platform the .dic file is loaded from
 *                              lib/typo/dictionaries/[dictionary]/[dictionary].dic
 *                              On any other platform it is loaded from
 *                              [settings.dictionaryPath]/[dictionary]/[dictionary].dic
 * @param {Object} [settings]   Constructor settings. Available properties are:
 *                              {String} [platform]: "chrome" (the default) for a
 *                                Chrome Extension, or any other value for the usual web.
 *                              {String} [dictionaryPath]: path to load dictionaries
 *                                from in a non-chrome environment.
 *                              {Object} [flags]: flag information. The object is
 *                                used as-is (not copied), so flags parsed from the
 *                                .aff file are written into it.
 *
 * @returns {Typo} A Typo object.
 */
var Typo = function (dictionary, affData, wordsData, settings) {
  settings = settings || {};

  /** Determines the method used for auto-loading .aff and .dic files. **/
  this.platform = settings.platform || "chrome";

  this.dictionary = null;
  this.rules = {};
  this.dictionaryTable = {};
  this.compoundRules = [];
  this.compoundRuleCodes = {};
  this.replacementTable = [];
  this.flags = settings.flags || {};

  // Resolves a path inside the extension package. chrome.extension.getURL is
  // deprecated, so chrome.runtime.getURL is preferred when it exists.
  function extensionURL(relativePath) {
    if (chrome.runtime && typeof chrome.runtime.getURL === "function") {
      return chrome.runtime.getURL(relativePath);
    }
    return chrome.extension.getURL(relativePath);
  }

  // Escapes regex metacharacters so a dictionary word is matched literally.
  function escapeForRegExp(text) {
    return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  }

  var i, j, _len, _jlen;

  if (dictionary) {
    this.dictionary = dictionary;

    if (this.platform === "chrome") {
      var chromeBase = "lib/typo/dictionaries/" + dictionary + "/" + dictionary;
      if (!affData) affData = this._readFile(extensionURL(chromeBase + ".aff"));
      if (!wordsData) wordsData = this._readFile(extensionURL(chromeBase + ".dic"));
    } else {
      var path = settings.dictionaryPath || '';
      if (!affData) affData = this._readFile(path + "/" + dictionary + "/" + dictionary + ".aff");
      if (!wordsData) wordsData = this._readFile(path + "/" + dictionary + "/" + dictionary + ".dic");
    }

    this.rules = this._parseAFF(affData);

    // Save the rule codes that are used in compound rules.
    this.compoundRuleCodes = {};

    for (i = 0, _len = this.compoundRules.length; i < _len; i++) {
      var rule = this.compoundRules[i];

      for (j = 0, _jlen = rule.length; j < _jlen; j++) {
        this.compoundRuleCodes[rule[j]] = [];
      }
    }

    // If we add this ONLYINCOMPOUND flag to this.compoundRuleCodes, then _parseDIC
    // will do the work of saving the list of words that are compound-only.
    if ("ONLYINCOMPOUND" in this.flags) {
      this.compoundRuleCodes[this.flags.ONLYINCOMPOUND] = [];
    }

    this.dictionaryTable = this._parseDIC(wordsData);

    // Get rid of any codes from the compound rule codes that are never used
    // (or that were special regex characters). Not especially necessary...
    for (var code in this.compoundRuleCodes) {
      if (this.compoundRuleCodes[code].length === 0) {
        delete this.compoundRuleCodes[code];
      }
    }

    // Build the full regular expressions for each compound rule.
    // I have a feeling (but no confirmation yet) that this method of
    // testing for compound words is probably slow.
    for (i = 0, _len = this.compoundRules.length; i < _len; i++) {
      var ruleText = this.compoundRules[i];
      var expressionText = "";

      for (j = 0, _jlen = ruleText.length; j < _jlen; j++) {
        var character = ruleText[j];

        if (character in this.compoundRuleCodes) {
          // A flag character stands for "any word carrying this flag".
          expressionText += "(" + this.compoundRuleCodes[character].map(escapeForRegExp).join("|") + ")";
        } else {
          // Anything else (e.g. "*" or "?") is kept as regex syntax.
          expressionText += character;
        }
      }

      // Anchor the expression: a compound rule has to describe the whole word,
      // not just some substring of it.
      this.compoundRules[i] = new RegExp("^(?:" + expressionText + ")$", "i");
    }
  }

  return this;
};

Typo.prototype = {
  /**
   * Loads a Typo instance from a hash of all of the Typo properties.
   *
   * @param {Object} obj A hash of Typo properties, probably gotten from a
   *                     JSON.parse(JSON.stringify(typo_instance)).
   * @returns {Typo} This instance, with every enumerable property of obj copied onto it.
   */
  load : function (obj) {
    for (var i in obj) {
      this[i] = obj[i];
    }

    return this;
  },

  /**
   * Read the contents of a file with a synchronous XMLHttpRequest.
   *
   * @param {String} path The path (relative) to the file.
   * @param {String} [charset="ISO8859-1"] The expected charset of the file
   * @returns {String} The file data.
   */
  _readFile : function (path, charset) {
    if (!charset) charset = "ISO8859-1";

    var req = new XMLHttpRequest();
    // Third argument false = synchronous request.
    req.open("GET", path, false);

    if (req.overrideMimeType) {
      req.overrideMimeType("text/plain; charset=" + charset);
    }

    req.send(null);

    return req.responseText;
  },

  /**
   * Parse the rules out from a .aff file.
   *
   * Besides returning the PFX/SFX rules, this fills this.compoundRules,
   * this.replacementTable and this.flags as a side effect.
   *
   * @param {String} data The contents of the affix file.
   * @returns {Object} The affix rules from the file, keyed by rule code.
   */
  _parseAFF : function (data) {
    var rules = {};
    var i, j, _len, _jlen, line, lineParts, numEntries;

    // Remove comment lines
    data = this._removeAffixComments(data);

    var lines = data.split("\n");

    for (i = 0, _len = lines.length; i < _len; i++) {
      line = lines[i];

      var definitionParts = line.split(/\s+/);
      var ruleType = definitionParts[0];

      if (ruleType === "PFX" || ruleType === "SFX") {
        // Header line: TYPE code combineable(Y/N) numberOfEntries
        var ruleCode = definitionParts[1];
        var combineable = definitionParts[2];
        numEntries = parseInt(definitionParts[3], 10);

        var entries = [];

        for (j = i + 1, _jlen = i + 1 + numEntries; j < _jlen; j++) {
          // Entry line: TYPE code charactersToRemove addition[/continuation] condition
          lineParts = lines[j].split(/\s+/);

          var charactersToRemove = lineParts[2];

          var additionParts = lineParts[3].split("/");

          var charactersToAdd = additionParts[0];
          if (charactersToAdd === "0") charactersToAdd = "";

          var continuationClasses = this.parseRuleCodes(additionParts[1]);

          var regexToMatch = lineParts[4];

          var entry = {};
          entry.add = charactersToAdd;

          if (continuationClasses.length > 0) entry.continuationClasses = continuationClasses;

          // "." means the entry applies to every word.
          if (regexToMatch !== ".") {
            if (ruleType === "SFX") {
              entry.match = new RegExp(regexToMatch + "$");
            } else {
              entry.match = new RegExp("^" + regexToMatch);
            }
          }

          // "0" means nothing is stripped before the affix is added.
          if (charactersToRemove !== "0") {
            if (ruleType === "SFX") {
              entry.remove = new RegExp(charactersToRemove + "$");
            } else {
              entry.remove = charactersToRemove;
            }
          }

          entries.push(entry);
        }

        rules[ruleCode] = { "type" : ruleType, "combineable" : (combineable === "Y"), "entries" : entries };

        // Skip the entry lines that were just consumed.
        i += numEntries;
      } else if (ruleType === "COMPOUNDRULE") {
        numEntries = parseInt(definitionParts[1], 10);

        for (j = i + 1, _jlen = i + 1 + numEntries; j < _jlen; j++) {
          lineParts = lines[j].split(/\s+/);
          this.compoundRules.push(lineParts[1]);
        }

        i += numEntries;
      } else if (ruleType === "REP") {
        lineParts = line.split(/\s+/);

        // The "REP <count>" header has only two parts and is ignored here.
        if (lineParts.length === 3) {
          this.replacementTable.push([ lineParts[1], lineParts[2] ]);
        }
      } else {
        // ONLYINCOMPOUND
        // COMPOUNDMIN
        // FLAG
        // KEEPCASE
        // NEEDAFFIX

        this.flags[ruleType] = definitionParts[1];
      }
    }

    return rules;
  },

  /**
   * Removes comment lines and then cleans up blank lines and trailing whitespace.
   *
   * @param {String} data The data from an affix file.
   * @return {String} The cleaned-up data.
   */
  _removeAffixComments : function (data) {
    // Remove comments
    data = data.replace(/#.*$/mg, "");

    // Trim each line. The "g" flag is required; without it only the first
    // match in the whole file would be trimmed.
    data = data.replace(/^[ \t]+/mg, '').replace(/[ \t\r]+$/mg, '');

    // Remove blank lines.
    data = data.replace(/\n{2,}/g, "\n");

    // Trim the entire string
    data = data.replace(/^\s\s*/, '').replace(/\s\s*$/, '');

    return data;
  },

  /**
   * Parses the words out from the .dic file.
   *
   * Also appends each word to this.compoundRuleCodes[code] for every one of
   * its flags that is registered there.
   *
   * @param {String} data The data from the dictionary file.
   * @returns {Object} The lookup table containing all of the words and
   *                   word forms from the dictionary. Each value is a list of
   *                   flag arrays, one per time the word was added.
   */
  _parseDIC : function (data) {
    data = this._removeDicComments(data);

    var lines = data.split(/\r?\n/);
    var dictionaryTable = {};
    var hasOwn = Object.prototype.hasOwnProperty;

    function addWord(word, rules) {
      // Some dictionaries will list the same word multiple times with different rule sets.
      // An own-property test keeps words such as "constructor" from colliding
      // with members inherited from Object.prototype.
      if (!hasOwn.call(dictionaryTable, word)) {
        dictionaryTable[word] = [];
      }

      dictionaryTable[word].push(rules);
    }

    // The first line is the number of words in the dictionary.
    for (var i = 1, _len = lines.length; i < _len; i++) {
      var line = lines[i];

      // Blank lines (a trailing newline, or a removed comment) are not words.
      if (!line) {
        continue;
      }

      var parts = line.split("/", 2);

      var word = parts[0];

      // Now for each affix rule, generate that form of the word.
      if (parts.length > 1) {
        var ruleCodesArray = this.parseRuleCodes(parts[1]);

        // Save the ruleCodes for compound word situations.
        // A word flagged NEEDAFFIX is not valid on its own, so only its
        // affixed forms are added.
        if (!("NEEDAFFIX" in this.flags) || ruleCodesArray.indexOf(this.flags.NEEDAFFIX) === -1) {
          addWord(word, ruleCodesArray);
        }

        for (var j = 0, _jlen = ruleCodesArray.length; j < _jlen; j++) {
          var code = ruleCodesArray[j];

          var rule = this.rules[code];

          if (rule) {
            var newWords = this._applyRule(word, rule);

            for (var ii = 0, _iilen = newWords.length; ii < _iilen; ii++) {
              var newWord = newWords[ii];

              addWord(newWord, []);

              if (rule.combineable) {
                // Combine with every later rule of the opposite type
                // (prefix + suffix) that is also combineable.
                for (var k = j + 1; k < _jlen; k++) {
                  var combineCode = ruleCodesArray[k];

                  var combineRule = this.rules[combineCode];

                  if (combineRule) {
                    if (combineRule.combineable && (rule.type !== combineRule.type)) {
                      var otherNewWords = this._applyRule(newWord, combineRule);

                      for (var iii = 0, _iiilen = otherNewWords.length; iii < _iiilen; iii++) {
                        addWord(otherNewWords[iii], []);
                      }
                    }
                  }
                }
              }
            }
          }

          if (code in this.compoundRuleCodes) {
            this.compoundRuleCodes[code].push(word);
          }
        }
      } else {
        addWord(word.trim(), []);
      }
    }

    return dictionaryTable;
  },

  /**
   * Removes comment lines from a .dic file.
   *
   * Only tab-indented comment lines are blanked out; the line count is not
   * changed, and _parseDIC skips the resulting blank lines.
   *
   * @param {String} data The data from a .dic file.
   * @return {String} The cleaned-up data.
   */
  _removeDicComments : function (data) {
    // I can't find any official documentation on it, but at least the de_DE
    // dictionary uses tab-indented lines as comments.

    // Remove comments
    data = data.replace(/^\t.*$/mg, "");

    return data;
  },

  /**
   * Splits the flag text of a word or affix entry into individual rule codes,
   * according to the FLAG setting of the affix file.
   *
   * @param {String} [textCodes] The raw flag text, e.g. "AB", "AaBb" or "101,102".
   * @returns {String[]} The rule codes; an empty array when textCodes is empty.
   */
  parseRuleCodes : function (textCodes) {
    if (!textCodes) {
      return [];
    }

    if (this.flags.FLAG === "long") {
      // Two characters per flag.
      var flags = [];

      for (var i = 0, _len = textCodes.length; i < _len; i += 2) {
        flags.push(textCodes.substr(i, 2));
      }

      return flags;
    }

    if (this.flags.FLAG === "num") {
      // Comma-separated numeric flags.
      return textCodes.split(",");
    }

    // No FLAG setting, or one that is not handled specially (e.g. "UTF-8"):
    // one character per flag.
    return textCodes.split("");
  },

  /**
   * Applies an affix rule to a word.
   *
   * @param {String} word The base word.
   * @param {Object} rule The affix rule.
   * @returns {String[]} The new words generated by the rule.
   */
  _applyRule : function (word, rule) {
    var entries = rule.entries;
    var newWords = [];

    for (var i = 0, _len = entries.length; i < _len; i++) {
      var entry = entries[i];

      if (!entry.match || word.match(entry.match)) {
        var newWord = word;

        if (entry.remove) {
          newWord = newWord.replace(entry.remove, "");
        }

        if (rule.type === "SFX") {
          newWord = newWord + entry.add;
        } else {
          newWord = entry.add + newWord;
        }

        newWords.push(newWord);

        if ("continuationClasses" in entry) {
          for (var j = 0, _jlen = entry.continuationClasses.length; j < _jlen; j++) {
            var continuationRule = this.rules[entry.continuationClasses[j]];

            // A missing continuation rule shouldn't happen, but it does, at
            // least in the de_DE dictionary (apparently lower-case rule codes
            // supplied instead of upper-case ones). Such codes are skipped.
            if (continuationRule) {
              newWords = newWords.concat(this._applyRule(newWord, continuationRule));
            }
          }
        }
      }
    }

    return newWords;
  },

  /**
   * Checks whether a word or a capitalization variant exists in the current dictionary.
   * The word is trimmed and several variations of capitalizations are checked.
   * If you want to check a word without any changes made to it, call checkExact()
   *
   * @see http://blog.stevenlevithan.com/archives/faster-trim-javascript re:trimming function
   *
   * @param {String} aWord The word to check.
   * @returns {Boolean}
   */
  check : function (aWord) {
    // Remove leading and trailing whitespace
    var trimmedWord = aWord.replace(/^\s\s*/, '').replace(/\s\s*$/, '');

    if (this.checkExact(trimmedWord)) {
      return true;
    }

    // The exact word is not in the dictionary.
    if (trimmedWord.toUpperCase() === trimmedWord) {
      // The word was supplied in all uppercase.
      // Check for a capitalized form of the word.
      // charAt() yields "" for an empty word, where [0] would yield undefined.
      var capitalizedWord = trimmedWord.charAt(0) + trimmedWord.substring(1).toLowerCase();

      if (this.hasFlag(capitalizedWord, "KEEPCASE")) {
        // Capitalization variants are not allowed for this word.
        return false;
      }

      if (this.checkExact(capitalizedWord)) {
        return true;
      }
    }

    var lowercaseWord = trimmedWord.toLowerCase();

    if (lowercaseWord !== trimmedWord) {
      if (this.hasFlag(lowercaseWord, "KEEPCASE")) {
        // Capitalization variants are not allowed for this word.
        return false;
      }

      // Check for a lowercase form
      if (this.checkExact(lowercaseWord)) {
        return true;
      }
    }

    return false;
  },

  /**
   * Checks whether a word exists in the current dictionary.
   *
   * @param {String} word The word to check.
   * @returns {Boolean}
   */
  checkExact : function (word) {
    var i, _len;

    // Own-property test: inherited members such as "constructor" or
    // "toString" are not dictionary entries.
    if (!Object.prototype.hasOwnProperty.call(this.dictionaryTable, word)) {
      // Check if this might be a compound word.
      if ("COMPOUNDMIN" in this.flags && word.length >= this.flags.COMPOUNDMIN) {
        for (i = 0, _len = this.compoundRules.length; i < _len; i++) {
          if (word.match(this.compoundRules[i])) {
            return true;
          }
        }
      }

      return false;
    }

    var ruleCodes = this.dictionaryTable[word];

    // The word is valid if at least one of its entries is not compound-only.
    for (i = 0, _len = ruleCodes.length; i < _len; i++) {
      if (!this.hasFlag(word, "ONLYINCOMPOUND", ruleCodes[i])) {
        return true;
      }
    }

    return false;
  },

  /**
   * Looks up whether a given word is flagged with a given flag.
   *
   * @param {String} word The word in question.
   * @param {String} flag The flag in question.
   * @param {String[]} [wordFlags] The flags to inspect. When omitted, all flags
   *                               stored for the word in the dictionary are used.
   * @return {Boolean}
   */
  hasFlag : function (word, flag, wordFlags) {
    if (!(flag in this.flags)) {
      return false;
    }

    if (typeof wordFlags === 'undefined') {
      // Flatten the per-entry flag arrays of the word into one list.
      wordFlags = Object.prototype.hasOwnProperty.call(this.dictionaryTable, word)
        ? Array.prototype.concat.apply([], this.dictionaryTable[word])
        : [];
    }

    return !!wordFlags && wordFlags.indexOf(this.flags[flag]) !== -1;
  },

  /**
   * The letters tried by suggest() for replacements and insertions.
   * suggest() currently resets this to the lowercase ASCII alphabet on every
   * call; deriving it from the words in the dictionary is disabled.
   */
  alphabet : "",

  /**
   * Returns a list of suggestions for a misspelled word.
   *
   * @see http://www.norvig.com/spell-correct.html for the basis of this suggestor.
   * This suggestor is primitive, but it works.
   *
   * @param {String} word The misspelling.
   * @param {Number} [limit=5] The maximum number of suggestions to return.
   * @returns {String[]} The array of suggestions; empty when the word is
   *                     already spelled correctly.
   */
  suggest : function (word, limit) {
    if (!limit) limit = 5;

    if (this.check(word)) return [];

    // Check the replacement table.
    for (var r = 0, _rlen = this.replacementTable.length; r < _rlen; r++) {
      var replacementEntry = this.replacementTable[r];

      if (word.indexOf(replacementEntry[0]) !== -1) {
        var correctedWord = word.replace(replacementEntry[0], replacementEntry[1]);

        if (this.check(correctedWord)) {
          return [ correctedWord ];
        }
      }
    }

    var self = this;
    self.alphabet = "abcdefghijklmnopqrstuvwxyz";

    // Returns every string that is one edit (delete, transpose, replace or
    // insert) away from any of the given words. Duplicates are kept on
    // purpose: correct() uses them as weights.
    function edits1(words) {
      var rv = [];
      var alphabet = self.alphabet;

      for (var ii = 0, _iilen = words.length; ii < _iilen; ii++) {
        var w = words[ii];
        var len = w.length;
        var i, j, head, tail;

        // Deletes
        for (i = 0; i < len; i++) {
          rv.push(w.substring(0, i) + w.substring(i + 1));
        }

        // Transposes of adjacent characters
        for (i = 0; i < len - 1; i++) {
          rv.push(w.substring(0, i) + w.charAt(i + 1) + w.charAt(i) + w.substring(i + 2));
        }

        // Replaces
        for (i = 0; i < len; i++) {
          head = w.substring(0, i);
          tail = w.substring(i + 1);

          for (j = 0; j < alphabet.length; j++) {
            rv.push(head + alphabet.charAt(j) + tail);
          }
        }

        // Inserts, at every position including the end of the word.
        for (i = 0; i <= len; i++) {
          head = w.substring(0, i);
          tail = w.substring(i);

          for (j = 0; j < alphabet.length; j++) {
            rv.push(head + alphabet.charAt(j) + tail);
          }
        }
      }

      return rv;
    }

    // Keeps only the candidates that pass the spellcheck.
    function known(words) {
      var rv = [];

      for (var i = 0; i < words.length; i++) {
        if (self.check(words[i])) {
          rv.push(words[i]);
        }
      }

      return rv;
    }

    function correct(word) {
      // Get the edit-distance-1 and edit-distance-2 forms of this word.
      var ed1 = edits1([word]);
      var ed2 = edits1(ed1);

      var corrections = known(ed1).concat(known(ed2));

      // Sort the edits based on how many different ways they were created.
      var weighted_corrections = {};
      var hasOwn = Object.prototype.hasOwnProperty;
      var i, _len;

      for (i = 0, _len = corrections.length; i < _len; i++) {
        if (hasOwn.call(weighted_corrections, corrections[i])) {
          weighted_corrections[corrections[i]] += 1;
        } else {
          weighted_corrections[corrections[i]] = 1;
        }
      }

      var sorted_corrections = [];

      for (var candidate in weighted_corrections) {
        sorted_corrections.push([ candidate, weighted_corrections[candidate] ]);
      }

      // Highest weight first. Returning 0 for ties gives the sort a
      // consistent comparator.
      sorted_corrections.sort(function (a, b) {
        return b[1] - a[1];
      });

      // Apply the limit after filtering, so that NOSUGGEST words do not use
      // up suggestion slots.
      var rv = [];

      for (i = 0, _len = sorted_corrections.length; i < _len && rv.length < limit; i++) {
        if (!self.hasFlag(sorted_corrections[i][0], "NOSUGGEST")) {
          rv.push(sorted_corrections[i][0]);
        }
      }

      return rv;
    }

    return correct(word);
  }
};