001    /*
002     * Copyright (C) 2008-2010 by Holger Arndt
003     *
004     * This file is part of the Universal Java Matrix Package (UJMP).
005     * See the NOTICE file distributed with this work for additional
006     * information regarding copyright ownership and licensing.
007     *
008     * UJMP is free software; you can redistribute it and/or modify
009     * it under the terms of the GNU Lesser General Public License as
010     * published by the Free Software Foundation; either version 2
011     * of the License, or (at your option) any later version.
012     *
013     * UJMP is distributed in the hope that it will be useful,
014     * but WITHOUT ANY WARRANTY; without even the implied warranty of
015     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016     * GNU Lesser General Public License for more details.
017     *
018     * You should have received a copy of the GNU Lesser General Public
019     * License along with UJMP; if not, write to the
020     * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
021     * Boston, MA  02110-1301  USA
022     */
023    
024    package org.ujmp.core.util;
025    
026    /*
027    
028     Porter stemmer in Java. The original paper is in
029    
030     Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
031     no. 3, pp 130-137,
032    
033     See also http://www.tartarus.org/~martin/PorterStemmer
034    
035     History:
036    
037     Release 1
038    
039     Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
040     The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
041     is then out outside the bounds of b.
042    
043     Release 2
044    
045     Similarly,
046    
047     Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
048     'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
049     b[j] is then outside the bounds of b.
050    
051     Release 3
052    
053     Considerably revised 4/9/00 in the light of many helpful suggestions
054     from Brian Goetz of Quiotix Corporation (brian@quiotix.com).
055    
056     Release 4
057    
058     */
059    
060    import java.io.FileInputStream;
061    import java.io.FileNotFoundException;
062    import java.io.IOException;
063    
064    /**
065     * Stemmer, implementing the Porter Stemming Algorithm
066     * 
067     * The Stemmer class transforms a word into its root form. The input word can be
068     * provided a character at time (by calling add()), or at once by calling one of
069     * the various stem(something) methods.
070     */
071    
072    public class PorterStemmer {
073            private char[] b;
074            private int i, /* offset into b */
075            i_end, /* offset to end of stemmed word */
076            j, k;
077            private static final int INC = 50;
078    
079            /* unit of size whereby b is increased */
080            public PorterStemmer() {
081                    b = new char[INC];
082                    i = 0;
083                    i_end = 0;
084            }
085    
086            /**
087             * Add a character to the word being stemmed. When you are finished adding
088             * characters, you can call stem(void) to stem the word.
089             */
090    
091            public void add(char ch) {
092                    if (i == b.length) {
093                            char[] new_b = new char[i + INC];
094                            for (int c = 0; c < i; c++)
095                                    new_b[c] = b[c];
096                            b = new_b;
097                    }
098                    b[i++] = ch;
099            }
100    
101            /**
102             * Adds wLen characters to the word being stemmed contained in a portion of
103             * a char[] array. This is like repeated calls of add(char ch), but faster.
104             */
105    
106            public void add(char[] w, int wLen) {
107                    if (i + wLen >= b.length) {
108                            char[] new_b = new char[i + wLen + INC];
109                            for (int c = 0; c < i; c++)
110                                    new_b[c] = b[c];
111                            b = new_b;
112                    }
113                    for (int c = 0; c < wLen; c++)
114                            b[i++] = w[c];
115            }
116    
117            /**
118             * After a word has been stemmed, it can be retrieved by toString(), or a
119             * reference to the internal buffer can be retrieved by getResultBuffer and
120             * getResultLength (which is generally more efficient.)
121             */
122            public String toString() {
123                    return new String(b, 0, i_end);
124            }
125    
126            /**
127             * Returns the length of the word resulting from the stemming process.
128             */
129            public int getResultLength() {
130                    return i_end;
131            }
132    
133            /**
134             * Returns a reference to a character buffer containing the results of the
135             * stemming process. You also need to consult getResultLength() to determine
136             * the length of the result.
137             */
138            public char[] getResultBuffer() {
139                    return b;
140            }
141    
142            /* cons(i) is true <=> b[i] is a consonant. */
143    
144            private final boolean cons(int i) {
145                    switch (b[i]) {
146                    case 'a':
147                    case 'e':
148                    case 'i':
149                    case 'o':
150                    case 'u':
151                            return false;
152                    case 'y':
153                            return (i == 0) ? true : !cons(i - 1);
154                    default:
155                            return true;
156                    }
157            }
158    
159            /*
160             * m() measures the number of consonant sequences between 0 and j. if c is a
161             * consonant sequence and v a vowel sequence, and <..> indicates arbitrary
162             * presence,
163             * 
164             * <c><v> gives 0 <c>vc<v> gives 1 <c>vcvc<v> gives 2 <c>vcvcvc<v> gives 3
165             * ....
166             */
167    
168            private final int m() {
169                    int n = 0;
170                    int i = 0;
171                    while (true) {
172                            if (i > j)
173                                    return n;
174                            if (!cons(i))
175                                    break;
176                            i++;
177                    }
178                    i++;
179                    while (true) {
180                            while (true) {
181                                    if (i > j)
182                                            return n;
183                                    if (cons(i))
184                                            break;
185                                    i++;
186                            }
187                            i++;
188                            n++;
189                            while (true) {
190                                    if (i > j)
191                                            return n;
192                                    if (!cons(i))
193                                            break;
194                                    i++;
195                            }
196                            i++;
197                    }
198            }
199    
200            /* vowelinstem() is true <=> 0,...j contains a vowel */
201    
202            private final boolean vowelinstem() {
203                    int i;
204                    for (i = 0; i <= j; i++)
205                            if (!cons(i))
206                                    return true;
207                    return false;
208            }
209    
210            /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
211    
212            private final boolean doublec(int j) {
213                    if (j < 1)
214                            return false;
215                    if (b[j] != b[j - 1])
216                            return false;
217                    return cons(j);
218            }
219    
220            /*
221             * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
222             * and also if the second c is not w,x or y. this is used when trying to
223             * restore an e at the end of a short word. e.g.
224             * 
225             * cav(e), lov(e), hop(e), crim(e), but snow, box, tray.
226             */
227    
228            private final boolean cvc(int i) {
229                    if (i < 2 || !cons(i) || cons(i - 1) || !cons(i - 2))
230                            return false;
231                    {
232                            int ch = b[i];
233                            if (ch == 'w' || ch == 'x' || ch == 'y')
234                                    return false;
235                    }
236                    return true;
237            }
238    
239            private final boolean ends(String s) {
240                    int l = s.length();
241                    int o = k - l + 1;
242                    if (o < 0)
243                            return false;
244                    for (int i = 0; i < l; i++)
245                            if (b[o + i] != s.charAt(i))
246                                    return false;
247                    j = k - l;
248                    return true;
249            }
250    
251            /*
252             * setto(s) sets (j+1),...k to the characters in the string s, readjusting
253             * k.
254             */
255    
256            private final void setto(String s) {
257                    int l = s.length();
258                    int o = j + 1;
259                    for (int i = 0; i < l; i++)
260                            b[o + i] = s.charAt(i);
261                    k = j + l;
262            }
263    
264            /* r(s) is used further down. */
265    
266            private final void r(String s) {
267                    if (m() > 0)
268                            setto(s);
269            }
270    
271            /*
272             * step1() gets rid of plurals and -ed or -ing. e.g.
273             * 
274             * caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat
275             * 
276             * feed -> feed agreed -> agree disabled -> disable
277             * 
278             * matting -> mat mating -> mate meeting -> meet milling -> mill messing ->
279             * mess
280             * 
281             * meetings -> meet
282             */
283    
284            private final void step1() {
285                    if (b[k] == 's') {
286                            if (ends("sses"))
287                                    k -= 2;
288                            else if (ends("ies"))
289                                    setto("i");
290                            else if (b[k - 1] != 's')
291                                    k--;
292                    }
293                    if (ends("eed")) {
294                            if (m() > 0)
295                                    k--;
296                    } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
297                            k = j;
298                            if (ends("at"))
299                                    setto("ate");
300                            else if (ends("bl"))
301                                    setto("ble");
302                            else if (ends("iz"))
303                                    setto("ize");
304                            else if (doublec(k)) {
305                                    k--;
306                                    {
307                                            int ch = b[k];
308                                            if (ch == 'l' || ch == 's' || ch == 'z')
309                                                    k++;
310                                    }
311                            } else if (m() == 1 && cvc(k))
312                                    setto("e");
313                    }
314            }
315    
316            /* step2() turns terminal y to i when there is another vowel in the stem. */
317    
318            private final void step2() {
319                    if (ends("y") && vowelinstem())
320                            b[k] = 'i';
321            }
322    
323            /*
324             * step3() maps double suffices to single ones. so -ization ( = -ize plus
325             * -ation) maps to -ize etc. note that the string before the suffix must
326             * give m() > 0.
327             */
328    
329            private final void step3() {
330                    if (k == 0)
331                            return; /* For Bug 1 */
332                    switch (b[k - 1]) {
333                    case 'a':
334                            if (ends("ational")) {
335                                    r("ate");
336                                    break;
337                            }
338                            if (ends("tional")) {
339                                    r("tion");
340                                    break;
341                            }
342                            break;
343                    case 'c':
344                            if (ends("enci")) {
345                                    r("ence");
346                                    break;
347                            }
348                            if (ends("anci")) {
349                                    r("ance");
350                                    break;
351                            }
352                            break;
353                    case 'e':
354                            if (ends("izer")) {
355                                    r("ize");
356                                    break;
357                            }
358                            break;
359                    case 'l':
360                            if (ends("bli")) {
361                                    r("ble");
362                                    break;
363                            }
364                            if (ends("alli")) {
365                                    r("al");
366                                    break;
367                            }
368                            if (ends("entli")) {
369                                    r("ent");
370                                    break;
371                            }
372                            if (ends("eli")) {
373                                    r("e");
374                                    break;
375                            }
376                            if (ends("ousli")) {
377                                    r("ous");
378                                    break;
379                            }
380                            break;
381                    case 'o':
382                            if (ends("ization")) {
383                                    r("ize");
384                                    break;
385                            }
386                            if (ends("ation")) {
387                                    r("ate");
388                                    break;
389                            }
390                            if (ends("ator")) {
391                                    r("ate");
392                                    break;
393                            }
394                            break;
395                    case 's':
396                            if (ends("alism")) {
397                                    r("al");
398                                    break;
399                            }
400                            if (ends("iveness")) {
401                                    r("ive");
402                                    break;
403                            }
404                            if (ends("fulness")) {
405                                    r("ful");
406                                    break;
407                            }
408                            if (ends("ousness")) {
409                                    r("ous");
410                                    break;
411                            }
412                            break;
413                    case 't':
414                            if (ends("aliti")) {
415                                    r("al");
416                                    break;
417                            }
418                            if (ends("iviti")) {
419                                    r("ive");
420                                    break;
421                            }
422                            if (ends("biliti")) {
423                                    r("ble");
424                                    break;
425                            }
426                            break;
427                    case 'g':
428                            if (ends("logi")) {
429                                    r("log");
430                                    break;
431                            }
432                    }
433            }
434    
435            /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
436    
437            private final void step4() {
438                    switch (b[k]) {
439                    case 'e':
440                            if (ends("icate")) {
441                                    r("ic");
442                                    break;
443                            }
444                            if (ends("ative")) {
445                                    r("");
446                                    break;
447                            }
448                            if (ends("alize")) {
449                                    r("al");
450                                    break;
451                            }
452                            break;
453                    case 'i':
454                            if (ends("iciti")) {
455                                    r("ic");
456                                    break;
457                            }
458                            break;
459                    case 'l':
460                            if (ends("ical")) {
461                                    r("ic");
462                                    break;
463                            }
464                            if (ends("ful")) {
465                                    r("");
466                                    break;
467                            }
468                            break;
469                    case 's':
470                            if (ends("ness")) {
471                                    r("");
472                                    break;
473                            }
474                            break;
475                    }
476            }
477    
478            /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
479    
480            private final void step5() {
481                    if (k == 0)
482                            return; /* for Bug 1 */
483                    switch (b[k - 1]) {
484                    case 'a':
485                            if (ends("al"))
486                                    break;
487                            return;
488                    case 'c':
489                            if (ends("ance"))
490                                    break;
491                            if (ends("ence"))
492                                    break;
493                            return;
494                    case 'e':
495                            if (ends("er"))
496                                    break;
497                            return;
498                    case 'i':
499                            if (ends("ic"))
500                                    break;
501                            return;
502                    case 'l':
503                            if (ends("able"))
504                                    break;
505                            if (ends("ible"))
506                                    break;
507                            return;
508                    case 'n':
509                            if (ends("ant"))
510                                    break;
511                            if (ends("ement"))
512                                    break;
513                            if (ends("ment"))
514                                    break;
515                            /* element etc. not stripped before the m */
516                            if (ends("ent"))
517                                    break;
518                            return;
519                    case 'o':
520                            if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
521                                    break;
522                            /* j >= 0 fixes Bug 2 */
523                            if (ends("ou"))
524                                    break;
525                            return;
526                            /* takes care of -ous */
527                    case 's':
528                            if (ends("ism"))
529                                    break;
530                            return;
531                    case 't':
532                            if (ends("ate"))
533                                    break;
534                            if (ends("iti"))
535                                    break;
536                            return;
537                    case 'u':
538                            if (ends("ous"))
539                                    break;
540                            return;
541                    case 'v':
542                            if (ends("ive"))
543                                    break;
544                            return;
545                    case 'z':
546                            if (ends("ize"))
547                                    break;
548                            return;
549                    default:
550                            return;
551                    }
552                    if (m() > 1)
553                            k = j;
554            }
555    
556            /* step6() removes a final -e if m() > 1. */
557    
558            private final void step6() {
559                    j = k;
560                    if (b[k] == 'e') {
561                            int a = m();
562                            if (a > 1 || a == 1 && !cvc(k - 1))
563                                    k--;
564                    }
565                    if (b[k] == 'l' && doublec(k) && m() > 1)
566                            k--;
567            }
568    
569            /**
570             * Stem a word provided as a String. Returns the result as a String.
571             */
572            public String stem(String s) {
573                    i = 0;
574                    add(s.toCharArray(), s.length());
575                    stem();
576                    return toString();
577            }
578    
579            /**
580             * Stem the word placed into the Stemmer buffer through calls to add().
581             * Returns true if the stemming process resulted in a word different from
582             * the input. You can retrieve the result with
583             * getResultLength()/getResultBuffer() or toString().
584             */
585            public void stem() {
586                    k = i - 1;
587                    if (k > 1) {
588                            step1();
589                            step2();
590                            step3();
591                            step4();
592                            step5();
593                            step6();
594                    }
595                    i_end = k + 1;
596                    i = 0;
597            }
598    
599            /**
600             * Test program for demonstrating the Stemmer. It reads text from a a list
601             * of files, stems each word, and writes the result to standard output. Note
602             * that the word stemmed is expected to be in lower case: forcing lower case
603             * must be done outside the Stemmer class. Usage: Stemmer file-name
604             * file-name ...
605             */
606            public static void main(String[] args) {
607                    char[] w = new char[501];
608                    PorterStemmer s = new PorterStemmer();
609                    for (int i = 0; i < args.length; i++)
610                            try {
611                                    FileInputStream in = new FileInputStream(args[i]);
612    
613                                    try {
614                                            while (true)
615    
616                                            {
617                                                    int ch = in.read();
618                                                    if (Character.isLetter((char) ch)) {
619                                                            int j = 0;
620                                                            while (true) {
621                                                                    ch = Character.toLowerCase((char) ch);
622                                                                    w[j] = (char) ch;
623                                                                    if (j < 500)
624                                                                            j++;
625                                                                    ch = in.read();
626                                                                    if (!Character.isLetter((char) ch)) {
627                                                                            /* to test add(char ch) */
628                                                                            for (int c = 0; c < j; c++)
629                                                                                    s.add(w[c]);
630    
631                                                                            /* or, to test add(char[] w, int j) */
632                                                                            /* s.add(w, j); */
633    
634                                                                            s.stem();
635                                                                            {
636                                                                                    String u;
637    
638                                                                                    /* and now, to test toString() : */
639                                                                                    u = s.toString();
640    
641                                                                                    /*
642                                                                                     * to test getResultBuffer(),
643                                                                                     * getResultLength() :
644                                                                                     */
645                                                                                    /*
646                                                                                     * u = new String(s.getResultBuffer(),
647                                                                                     * 0, s.getResultLength());
648                                                                                     */
649    
650                                                                                    System.out.print(u);
651                                                                            }
652                                                                            break;
653                                                                    }
654                                                            }
655                                                    }
656                                                    if (ch < 0)
657                                                            break;
658                                                    System.out.print((char) ch);
659                                            }
660                                    } catch (IOException e) {
661                                            System.out.println("error reading " + args[i]);
662                                            break;
663                                    }
664                            } catch (FileNotFoundException e) {
665                                    System.out.println("file " + args[i] + " not found");
666                                    break;
667                            }
668            }
669    }