// simplemde-markdown-editor/src/js/typo.js

'use strict';

/**
 * Typo is a JavaScript implementation of a spellchecker using hunspell-style
 * dictionaries.
 */

/**
 * Typo constructor.
 *
 * @param {String} [dictionary] The locale code of the dictionary being used. e.g.,
 *                              "en_US". This is only used to auto-load dictionaries.
 * @param {String} [affData] The data from the dictionary's .aff file. If omitted
 *                           and the first argument is supplied, on the "chrome" platform
 *                           the .aff file is loaded automatically from
 *                           lib/typo/dictionaries/[dictionary]/[dictionary].aff
 *                           On any other platform it is loaded from
 *                           [settings.dictionaryPath]/[dictionary]/[dictionary].aff
 * @param {String} [wordsData] The data from the dictionary's .dic file. If omitted
 *                           and the first argument is supplied, on the "chrome" platform
 *                           the .dic file is loaded automatically from
 *                           lib/typo/dictionaries/[dictionary]/[dictionary].dic
 *                           On any other platform it is loaded from
 *                           [settings.dictionaryPath]/[dictionary]/[dictionary].dic
 * @param {Object} [settings] Constructor settings. Available properties are:
 *                            {String} [platform]: "chrome" (the default) for a Chrome
 *                              Extension, or any other value for the usual web.
 *                            {String} [dictionaryPath]: path to load dictionaries from in a
 *                              non-chrome environment.
 *                            {Object} [flags]: flag information. The object is stored as-is and
 *                              is extended with the flags found in the .aff file.
 *
 *
 * @returns {Typo} A Typo object.
 */

var Typo = function (dictionary, affData, wordsData, settings) {
	settings = settings || {};

	/** Determines the method used for auto-loading .aff and .dic files. **/
	this.platform = settings.platform || "chrome";

	this.dictionary = null;

	this.rules = {};
	this.dictionaryTable = {};

	this.compoundRules = [];
	this.compoundRuleCodes = {};

	this.replacementTable = [];

	this.flags = settings.flags || {};

	// Dictionary words are spliced into the compound-rule regular expressions
	// below, so any regex metacharacters they contain must be matched literally.
	function escapeForRegExp(text) {
		return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
	}

	var i, j, _len, _jlen, code;

	if (dictionary) {
		this.dictionary = dictionary;

		if (this.platform == "chrome") {
			if (!affData) affData = this._readFile(chrome.extension.getURL("lib/typo/dictionaries/" + dictionary + "/" + dictionary + ".aff"));
			if (!wordsData) wordsData = this._readFile(chrome.extension.getURL("lib/typo/dictionaries/" + dictionary + "/" + dictionary + ".dic"));
		} else {
			var path = settings.dictionaryPath || '';

			if (!affData) affData = this._readFile(path + "/" + dictionary + "/" + dictionary + ".aff");
			if (!wordsData) wordsData = this._readFile(path + "/" + dictionary + "/" + dictionary + ".dic");
		}

		// _parseAFF also fills this.compoundRules, this.replacementTable and this.flags.
		this.rules = this._parseAFF(affData);

		// Save the rule codes that are used in compound rules.
		this.compoundRuleCodes = {};

		for (i = 0, _len = this.compoundRules.length; i < _len; i++) {
			var rule = this.compoundRules[i];

			for (j = 0, _jlen = rule.length; j < _jlen; j++) {
				this.compoundRuleCodes[rule[j]] = [];
			}
		}

		// If we add this ONLYINCOMPOUND flag to this.compoundRuleCodes, then _parseDIC
		// will do the work of saving the list of words that are compound-only.
		if ("ONLYINCOMPOUND" in this.flags) {
			this.compoundRuleCodes[this.flags.ONLYINCOMPOUND] = [];
		}

		this.dictionaryTable = this._parseDIC(wordsData);

		// Get rid of any codes from the compound rule codes that are never used
		// (or that were special regex characters).  Not especially necessary...
		for (code in this.compoundRuleCodes) {
			if (this.compoundRuleCodes[code].length === 0) {
				delete this.compoundRuleCodes[code];
			}
		}

		// Build the full regular expressions for each compound rule.
		// I have a feeling (but no confirmation yet) that this method of
		// testing for compound words is probably slow.
		for (i = 0, _len = this.compoundRules.length; i < _len; i++) {
			var ruleText = this.compoundRules[i];

			var expressionText = "";

			for (j = 0, _jlen = ruleText.length; j < _jlen; j++) {
				var character = ruleText[j];

				if (character in this.compoundRuleCodes) {
					expressionText += "(" + this.compoundRuleCodes[character].map(escapeForRegExp).join("|") + ")";
				}
				else {
					// Anything that is not a known rule code (e.g. "*" or "?") is kept
					// as a regex operator.
					expressionText += character;
				}
			}

			// Anchor the expression: the rule has to describe the whole word, not
			// just some substring of it.
			this.compoundRules[i] = new RegExp("^" + expressionText + "$", "i");
		}
	}

	return this;
};

/**
 * Instance methods shared by every Typo object. Methods whose names start with
 * an underscore are internal helpers used while loading a dictionary.
 */
Typo.prototype = {
	/**
	 * Loads a Typo instance from a hash of all of the Typo properties.
	 *
	 * @param {Object} obj A hash of Typo properties, probably gotten from a JSON.parse(JSON.stringify(typo_instance)).
	 * @returns {Typo} This instance, with every enumerable property of obj copied onto it.
	 */

	load : function (obj) {
		for (var i in obj) {
			this[i] = obj[i];
		}

		return this;
	},

	/**
	 * Read the contents of a file with a synchronous XMLHttpRequest.
	 *
	 * @param {String} path The path (relative) to the file.
	 * @param {String} [charset="ISO8859-1"] The expected charset of the file
	 * @returns {String} The file data (the response text of the request).
	 */

	_readFile : function (path, charset) {
		if (!charset) charset = "ISO8859-1";

		var req = new XMLHttpRequest();
		// Third argument false: the request blocks until the file has been read.
		req.open("GET", path, false);

		if (req.overrideMimeType)
			req.overrideMimeType("text/plain; charset=" + charset);

		req.send(null);

		return req.responseText;
	},

	/**
	 * Parse the rules out from a .aff file.
	 *
	 * Besides returning the affix rules, this fills this.compoundRules (from
	 * COMPOUNDRULE lines), this.replacementTable (from REP lines) and
	 * this.flags (from every other line, keyed by the first word on the line).
	 *
	 * @param {String} data The contents of the affix file.
	 * @returns {Object} The PFX/SFX rules from the file, keyed by rule code.
	 */

	_parseAFF : function (data) {
		var rules = {};

		// Remove comments, surrounding whitespace and blank lines.
		data = this._removeAffixComments(data);

		var lines = data.split("\n");
		var i, j, _len, _jlen, lineParts, numEntries;

		for (i = 0, _len = lines.length; i < _len; i++) {
			var line = lines[i];

			if (!line) continue;

			var definitionParts = line.split(/\s+/);

			var ruleType = definitionParts[0];

			if (ruleType == "PFX" || ruleType == "SFX") {
				var ruleCode = definitionParts[1];
				var combineable = definitionParts[2];

				// A missing or malformed count is treated as "no entries" rather than
				// turning the loop index into NaN and dropping the rest of the file.
				numEntries = parseInt(definitionParts[3], 10) || 0;

				var entries = [];

				// Never read past the end of the file if the declared count is too large.
				for (j = i + 1, _jlen = Math.min(i + 1 + numEntries, _len); j < _jlen; j++) {
					lineParts = lines[j].split(/\s+/);

					// Skip entries that do not even have the "characters to add" field.
					if (typeof lineParts[3] === "undefined") continue;

					var charactersToRemove = lineParts[2];

					var additionParts = lineParts[3].split("/");

					var charactersToAdd = additionParts[0];
					if (charactersToAdd === "0") charactersToAdd = "";

					var continuationClasses = this.parseRuleCodes(additionParts[1]);

					var regexToMatch = lineParts[4];

					var entry = {};
					entry.add = charactersToAdd;

					if (continuationClasses.length > 0) entry.continuationClasses = continuationClasses;

					// "." (or an absent condition field) means the entry applies to every
					// word, so no match expression is stored for it.
					if (regexToMatch && regexToMatch !== ".") {
						if (ruleType === "SFX") {
							entry.match = new RegExp(regexToMatch + "$");
						}
						else {
							entry.match = new RegExp("^" + regexToMatch);
						}
					}

					if (charactersToRemove != "0") {
						if (ruleType === "SFX") {
							entry.remove = new RegExp(charactersToRemove  + "$");
						}
						else {
							entry.remove = charactersToRemove;
						}
					}

					entries.push(entry);
				}

				rules[ruleCode] = { "type" : ruleType, "combineable" : (combineable == "Y"), "entries" : entries };

				// The entry lines have been consumed; continue after them.
				i += numEntries;
			}
			else if (ruleType === "COMPOUNDRULE") {
				numEntries = parseInt(definitionParts[1], 10) || 0;

				for (j = i + 1, _jlen = Math.min(i + 1 + numEntries, _len); j < _jlen; j++) {
					lineParts = lines[j].split(/\s+/);

					if (lineParts[1]) {
						this.compoundRules.push(lineParts[1]);
					}
				}

				i += numEntries;
			}
			else if (ruleType === "REP") {
				// The "REP <count>" header line only has two parts and is skipped here.
				if (definitionParts.length === 3) {
					this.replacementTable.push([ definitionParts[1], definitionParts[2] ]);
				}
			}
			else {
				// ONLYINCOMPOUND
				// COMPOUNDMIN
				// FLAG
				// KEEPCASE
				// NEEDAFFIX

				this.flags[ruleType] = definitionParts[1];
			}
		}

		return rules;
	},

	/**
	 * Removes comments and then cleans up blank lines and leading/trailing
	 * whitespace on every line. Both "\n" and "\r\n" line endings are accepted;
	 * the result always uses "\n".
	 *
	 * @param {String} data The data from an affix file.
	 * @return {String} The cleaned-up data.
	 */

	_removeAffixComments : function (data) {
		// Remove comments (everything from "#" to the end of the line).
		var rawLines = data.replace(/#.*$/mg, "").split(/\r?\n/);
		var cleanedLines = [];

		for (var i = 0, _len = rawLines.length; i < _len; i++) {
			// Trim each line and drop the ones that end up empty.
			var line = rawLines[i].trim();

			if (line) {
				cleanedLines.push(line);
			}
		}

		return cleanedLines.join("\n");
	},

	/**
	 * Parses the words out from the .dic file.
	 *
	 * Also appends to this.compoundRuleCodes[code] every word that carries a
	 * code used by a compound rule.
	 *
	 * @param {String} data The data from the dictionary file.
	 * @returns {Object} The lookup table containing all of the words and
	 *                 word forms from the dictionary. Each value is an array
	 *                 holding one rule-code array per occurrence of the word.
	 */

	_parseDIC : function (data) {
		data = this._removeDicComments(data);

		var lines = data.split(/\r?\n/);
		var dictionaryTable = {};
		var hasOwn = Object.prototype.hasOwnProperty;

		function addWord(word, rules) {
			// Some dictionaries will list the same word multiple times with different rule sets.
			// An own-property test keeps words such as "constructor" from being
			// confused with members inherited from Object.prototype.
			if (!hasOwn.call(dictionaryTable, word)) {
				dictionaryTable[word] = [];
			}

			dictionaryTable[word].push(rules);
		}

		// The first line is the number of words in the dictionary.
		for (var i = 1, _len = lines.length; i < _len; i++) {
			var line = lines[i];

			// Blank lines (including removed comment lines) are not words.
			if (!line.trim()) continue;

			var parts = line.split("/", 2);

			var word = parts[0];

			// Now for each affix rule, generate that form of the word.
			if (parts.length > 1) {
				var ruleCodesArray = this.parseRuleCodes(parts[1]);

				// Save the ruleCodes for compound word situations.
				if (!("NEEDAFFIX" in this.flags) || ruleCodesArray.indexOf(this.flags.NEEDAFFIX) == -1) {
					addWord(word, ruleCodesArray);
				}

				for (var j = 0, _jlen = ruleCodesArray.length; j < _jlen; j++) {
					var code = ruleCodesArray[j];

					var rule = this.rules[code];

					if (rule) {
						var newWords = this._applyRule(word, rule);

						for (var ii = 0, _iilen = newWords.length; ii < _iilen; ii++) {
							var newWord = newWords[ii];

							addWord(newWord, []);

							// A combineable rule may be stacked with a later combineable
							// rule of the other type (prefix + suffix).
							if (rule.combineable) {
								for (var k = j + 1; k < _jlen; k++) {
									var combineCode = ruleCodesArray[k];

									var combineRule = this.rules[combineCode];

									if (combineRule) {
										if (combineRule.combineable && (rule.type != combineRule.type)) {
											var otherNewWords = this._applyRule(newWord, combineRule);

											for (var iii = 0, _iiilen = otherNewWords.length; iii < _iiilen; iii++) {
												var otherNewWord = otherNewWords[iii];
												addWord(otherNewWord, []);
											}
										}
									}
								}
							}
						}
					}

					if (code in this.compoundRuleCodes) {
						this.compoundRuleCodes[code].push(word);
					}
				}
			}
			else {
				addWord(word.trim(), []);
			}
		}

		return dictionaryTable;
	},


	/**
	 * Removes comment lines from .dic data. The emptied lines are left in
	 * place; _parseDIC ignores blank lines.
	 *
	 * @param {String} data The data from a .dic file.
	 * @return {String} The data with the content of comment lines removed.
	 */

	_removeDicComments : function (data) {
		// I can't find any official documentation on it, but at least the de_DE
		// dictionary uses tab-indented lines as comments.

		// Remove comments
		return data.replace(/^\t.*$/mg, "");
	},

	/**
	 * Splits the textual flag list of a word or affix entry into separate codes,
	 * according to the FLAG setting read from the .aff file.
	 *
	 * @param {String} [textCodes] The flags exactly as written in the file.
	 * @returns {String[]} The individual codes: pairs of characters for FLAG "long",
	 *                     comma-separated values for FLAG "num", single characters
	 *                     otherwise. Empty when textCodes is empty or missing.
	 */

	parseRuleCodes : function (textCodes) {
		if (!textCodes) {
			return [];
		}

		if (this.flags.FLAG === "long") {
			var flags = [];

			for (var i = 0, _len = textCodes.length; i < _len; i += 2) {
				flags.push(textCodes.substr(i, 2));
			}

			return flags;
		}

		if (this.flags.FLAG === "num") {
			return textCodes.split(",");
		}

		// No FLAG setting, or one that is not handled specially: one code per character.
		return textCodes.split("");
	},

	/**
	 * Applies an affix rule to a word.
	 *
	 * @param {String} word The base word.
	 * @param {Object} rule The affix rule.
	 * @returns {String[]} The new words generated by the rule.
	 */

	_applyRule : function (word, rule) {
		var entries = rule.entries;
		var newWords = [];

		for (var i = 0, _len = entries.length; i < _len; i++) {
			var entry = entries[i];

			// Entries without a match expression apply to every word.
			if (!entry.match || word.match(entry.match)) {
				var newWord = word;

				if (entry.remove) {
					newWord = newWord.replace(entry.remove, "");
				}

				if (rule.type === "SFX") {
					newWord = newWord + entry.add;
				}
				else {
					newWord = entry.add + newWord;
				}

				newWords.push(newWord);

				if ("continuationClasses" in entry) {
					for (var j = 0, _jlen = entry.continuationClasses.length; j < _jlen; j++) {
						var continuationRule = this.rules[entry.continuationClasses[j]];

						// A missing continuation rule shouldn't happen, but it does, at least
						// in the de_DE dictionary (probably lower-case rule codes supplied
						// instead of upper-case ones). Such codes are ignored.
						if (continuationRule) {
							newWords = newWords.concat(this._applyRule(newWord, continuationRule));
						}
					}
				}
			}
		}

		return newWords;
	},

	/**
	 * Checks whether a word or a capitalization variant exists in the current dictionary.
	 * The word is trimmed and several variations of capitalizations are checked.
	 * If you want to check a word without any changes made to it, call checkExact()
	 *
	 * @see http://blog.stevenlevithan.com/archives/faster-trim-javascript re:trimming function
	 *
	 * @param {String} aWord The word to check.
	 * @returns {Boolean} true if the word is accepted; false otherwise, including
	 *                    for an empty or whitespace-only word.
	 */

	check : function (aWord) {
		if (!aWord) {
			return false;
		}

		// Remove leading and trailing whitespace
		var trimmedWord = aWord.replace(/^\s\s*/, '').replace(/\s\s*$/, '');

		if (!trimmedWord) {
			return false;
		}

		if (this.checkExact(trimmedWord)) {
			return true;
		}

		// The exact word is not in the dictionary.
		if (trimmedWord.toUpperCase() === trimmedWord) {
			// The word was supplied in all uppercase.
			// Check for a capitalized form of the word.
			var capitalizedWord = trimmedWord.charAt(0) + trimmedWord.substring(1).toLowerCase();

			if (this.hasFlag(capitalizedWord, "KEEPCASE")) {
				// Capitalization variants are not allowed for this word.
				return false;
			}

			if (this.checkExact(capitalizedWord)) {
				return true;
			}
		}

		var lowercaseWord = trimmedWord.toLowerCase();

		if (lowercaseWord !== trimmedWord) {
			if (this.hasFlag(lowercaseWord, "KEEPCASE")) {
				// Capitalization variants are not allowed for this word.
				return false;
			}

			// Check for a lowercase form
			if (this.checkExact(lowercaseWord)) {
				return true;
			}
		}

		return false;
	},

	/**
	 * Checks whether a word exists in the current dictionary.
	 *
	 * @param {String} word The word to check.
	 * @returns {Boolean}
	 */

	checkExact : function (word) {
		var i, _len;

		// Only own entries count; otherwise names inherited from Object.prototype
		// ("constructor", "hasOwnProperty", ...) would pass as dictionary words.
		if (!Object.prototype.hasOwnProperty.call(this.dictionaryTable, word)) {
			// Check if this might be a compound word.
			if ("COMPOUNDMIN" in this.flags && word.length >= this.flags.COMPOUNDMIN) {
				for (i = 0, _len = this.compoundRules.length; i < _len; i++) {
					if (word.match(this.compoundRules[i])) {
						return true;
					}
				}
			}

			return false;
		}

		var ruleCodes = this.dictionaryTable[word];

		// The word is valid if at least one of its entries is not compound-only.
		for (i = 0, _len = ruleCodes.length; i < _len; i++) {
			if (!this.hasFlag(word, "ONLYINCOMPOUND", ruleCodes[i])) {
				return true;
			}
		}

		return false;
	},

	/**
	 * Looks up whether a given word is flagged with a given flag.
	 *
	 * @param {String} word The word in question.
	 * @param {String} flag The flag in question.
	 * @param {String[]} [wordFlags] The rule codes to inspect. When omitted, all
	 *                               rule codes stored for the word are used.
	 * @return {Boolean}
	 */

	hasFlag : function (word, flag, wordFlags) {
		if (!(flag in this.flags)) {
			return false;
		}

		if (typeof wordFlags === 'undefined') {
			// Flatten the per-occurrence code arrays of the word into one list.
			if (Object.prototype.hasOwnProperty.call(this.dictionaryTable, word)) {
				wordFlags = Array.prototype.concat.apply([], this.dictionaryTable[word]);
			}
			else {
				wordFlags = [];
			}
		}

		return !!wordFlags && wordFlags.indexOf(this.flags[flag]) !== -1;
	},

	/**
	 * The letters tried by suggest() when generating candidate words. It is
	 * (re)assigned on the instance every time suggest() runs.
	 */

	alphabet : "",

	/**
	 * Returns a list of suggestions for a misspelled word.
	 *
	 * @see http://www.norvig.com/spell-correct.html for the basis of this suggestor.
	 * This suggestor is primitive, but it works.
	 *
	 * @param {String} word The misspelling.
	 * @param {Number} [limit=5] The maximum number of suggestions to return.
	 * @returns {String[]} The array of suggestions; empty if the word is empty
	 *                     or already passes check().
	 */

	suggest : function (word, limit) {
		if (!limit) limit = 5;

		if (!word || this.check(word)) return [];

		// Check the replacement table.
		for (var r = 0, _rlen = this.replacementTable.length; r < _rlen; r++) {
			var replacementEntry = this.replacementTable[r];

			if (word.indexOf(replacementEntry[0]) !== -1) {
				var correctedWord = word.replace(replacementEntry[0], replacementEntry[1]);

				if (this.check(correctedWord)) {
					return [ correctedWord ];
				}
			}
		}

		var self = this;
		var hasOwn = Object.prototype.hasOwnProperty;

		// A fixed Latin alphabet is used; it is not derived from the dictionary.
		self.alphabet = "abcdefghijklmnopqrstuvwxyz";

		// Returns every string that is one edit (delete, transpose, replace or
		// insert) away from any of the given words. Duplicates are kept on
		// purpose: they are what the weighting in correct() counts.
		function edits1(words) {
			var rv = [];

			for (var ii = 0, _iilen = words.length; ii < _iilen; ii++) {
				var current = words[ii];

				var deletes = [];
				var transposes = [];
				var replaces = [];
				var inserts = [];

				// Walk over every split point, including the one after the last letter.
				for (var i = 0, _len = current.length + 1; i < _len; i++) {
					var head = current.substring(0, i);
					var tail = current.substring(i);

					if (tail) {
						deletes.push(head + tail.substring(1));
					}

					if (tail.length > 1) {
						transposes.push(head + tail[1] + tail[0] + tail.substring(2));
					}

					for (var j = 0, _jlen = self.alphabet.length; j < _jlen; j++) {
						if (tail) {
							replaces.push(head + self.alphabet[j] + tail.substring(1));
						}

						// A letter can be inserted at every position, the end included.
						inserts.push(head + self.alphabet[j] + tail);
					}
				}

				// Append in place; re-copying rv for every word would be quadratic.
				Array.prototype.push.apply(rv, deletes);
				Array.prototype.push.apply(rv, transposes);
				Array.prototype.push.apply(rv, replaces);
				Array.prototype.push.apply(rv, inserts);
			}

			return rv;
		}

		// Keeps only the candidates that pass check().
		function known(words) {
			var rv = [];

			for (var i = 0; i < words.length; i++) {
				if (self.check(words[i])) {
					rv.push(words[i]);
				}
			}

			return rv;
		}

		function correct(word) {
			// Get the edit-distance-1 and edit-distance-2 forms of this word.
			var ed1 = edits1([word]);
			var ed2 = edits1(ed1);

			var corrections = known(ed1).concat(known(ed2));

			// Count how many different ways each correction was created, and
			// remember the order in which the corrections were first seen.
			var weights = {};
			var ranked = [];
			var i, _len;

			for (i = 0, _len = corrections.length; i < _len; i++) {
				var candidate = corrections[i];

				if (hasOwn.call(weights, candidate)) {
					weights[candidate] += 1;
				}
				else {
					weights[candidate] = 1;
					ranked.push(candidate);
				}
			}

			for (i = 0, _len = ranked.length; i < _len; i++) {
				ranked[i] = [ ranked[i], weights[ranked[i]], i ];
			}

			// Most frequently generated first; ties keep first-seen order, which puts
			// edit-distance-1 results ahead of edit-distance-2 ones.
			ranked.sort(function (a, b) {
				return (b[1] - a[1]) || (a[2] - b[2]);
			});

			var rv = [];

			// Skip NOSUGGEST words without letting them use up the limit.
			for (i = 0, _len = ranked.length; i < _len && rv.length < limit; i++) {
				if (!self.hasFlag(ranked[i][0], "NOSUGGEST")) {
					rv.push(ranked[i][0]);
				}
			}

			return rv;
		}

		return correct(word);
	}
};