001 /* 002 * Copyright (C) 2008-2010 by Holger Arndt 003 * 004 * This file is part of the Universal Java Matrix Package (UJMP). 005 * See the NOTICE file distributed with this work for additional 006 * information regarding copyright ownership and licensing. 007 * 008 * UJMP is free software; you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation; either version 2 011 * of the License, or (at your option) any later version. 012 * 013 * UJMP is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU Lesser General Public License for more details. 017 * 018 * You should have received a copy of the GNU Lesser General Public 019 * License along with UJMP; if not, write to the 020 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, 021 * Boston, MA 02110-1301 USA 022 */ 023 024 package org.ujmp.core.util; 025 026 /* 027 028 Porter stemmer in Java. The original paper is in 029 030 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, 031 no. 3, pp 130-137, 032 033 See also http://www.tartarus.org/~martin/PorterStemmer 034 035 History: 036 037 Release 1 038 039 Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below. 040 The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1] 041 is then out outside the bounds of b. 042 043 Release 2 044 045 Similarly, 046 047 Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below. 048 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and 049 b[j] is then outside the bounds of b. 050 051 Release 3 052 053 Considerably revised 4/9/00 in the light of many helpful suggestions 054 from Brian Goetz of Quiotix Corporation (brian@quiotix.com). 055 056 Release 4 057 058 */ 059 060 import java.io.FileInputStream; 061 import java.io.FileNotFoundException; 062 import java.io.IOException; 063 064 /** 065 * Stemmer, implementing the Porter Stemming Algorithm 066 * 067 * The Stemmer class transforms a word into its root form. The input word can be 068 * provided a character at time (by calling add()), or at once by calling one of 069 * the various stem(something) methods. 070 */ 071 072 public class PorterStemmer { 073 private char[] b; 074 private int i, /* offset into b */ 075 i_end, /* offset to end of stemmed word */ 076 j, k; 077 private static final int INC = 50; 078 079 /* unit of size whereby b is increased */ 080 public PorterStemmer() { 081 b = new char[INC]; 082 i = 0; 083 i_end = 0; 084 } 085 086 /** 087 * Add a character to the word being stemmed. When you are finished adding 088 * characters, you can call stem(void) to stem the word. 089 */ 090 091 public void add(char ch) { 092 if (i == b.length) { 093 char[] new_b = new char[i + INC]; 094 for (int c = 0; c < i; c++) 095 new_b[c] = b[c]; 096 b = new_b; 097 } 098 b[i++] = ch; 099 } 100 101 /** 102 * Adds wLen characters to the word being stemmed contained in a portion of 103 * a char[] array. This is like repeated calls of add(char ch), but faster. 104 */ 105 106 public void add(char[] w, int wLen) { 107 if (i + wLen >= b.length) { 108 char[] new_b = new char[i + wLen + INC]; 109 for (int c = 0; c < i; c++) 110 new_b[c] = b[c]; 111 b = new_b; 112 } 113 for (int c = 0; c < wLen; c++) 114 b[i++] = w[c]; 115 } 116 117 /** 118 * After a word has been stemmed, it can be retrieved by toString(), or a 119 * reference to the internal buffer can be retrieved by getResultBuffer and 120 * getResultLength (which is generally more efficient.) 121 */ 122 public String toString() { 123 return new String(b, 0, i_end); 124 } 125 126 /** 127 * Returns the length of the word resulting from the stemming process. 128 */ 129 public int getResultLength() { 130 return i_end; 131 } 132 133 /** 134 * Returns a reference to a character buffer containing the results of the 135 * stemming process. You also need to consult getResultLength() to determine 136 * the length of the result. 137 */ 138 public char[] getResultBuffer() { 139 return b; 140 } 141 142 /* cons(i) is true <=> b[i] is a consonant. */ 143 144 private final boolean cons(int i) { 145 switch (b[i]) { 146 case 'a': 147 case 'e': 148 case 'i': 149 case 'o': 150 case 'u': 151 return false; 152 case 'y': 153 return (i == 0) ? true : !cons(i - 1); 154 default: 155 return true; 156 } 157 } 158 159 /* 160 * m() measures the number of consonant sequences between 0 and j. if c is a 161 * consonant sequence and v a vowel sequence, and <..> indicates arbitrary 162 * presence, 163 * 164 * <c><v> gives 0 <c>vc<v> gives 1 <c>vcvc<v> gives 2 <c>vcvcvc<v> gives 3 165 * .... 166 */ 167 168 private final int m() { 169 int n = 0; 170 int i = 0; 171 while (true) { 172 if (i > j) 173 return n; 174 if (!cons(i)) 175 break; 176 i++; 177 } 178 i++; 179 while (true) { 180 while (true) { 181 if (i > j) 182 return n; 183 if (cons(i)) 184 break; 185 i++; 186 } 187 i++; 188 n++; 189 while (true) { 190 if (i > j) 191 return n; 192 if (!cons(i)) 193 break; 194 i++; 195 } 196 i++; 197 } 198 } 199 200 /* vowelinstem() is true <=> 0,...j contains a vowel */ 201 202 private final boolean vowelinstem() { 203 int i; 204 for (i = 0; i <= j; i++) 205 if (!cons(i)) 206 return true; 207 return false; 208 } 209 210 /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ 211 212 private final boolean doublec(int j) { 213 if (j < 1) 214 return false; 215 if (b[j] != b[j - 1]) 216 return false; 217 return cons(j); 218 } 219 220 /* 221 * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant 222 * and also if the second c is not w,x or y. this is used when trying to 223 * restore an e at the end of a short word. e.g. 224 * 225 * cav(e), lov(e), hop(e), crim(e), but snow, box, tray. 226 */ 227 228 private final boolean cvc(int i) { 229 if (i < 2 || !cons(i) || cons(i - 1) || !cons(i - 2)) 230 return false; 231 { 232 int ch = b[i]; 233 if (ch == 'w' || ch == 'x' || ch == 'y') 234 return false; 235 } 236 return true; 237 } 238 239 private final boolean ends(String s) { 240 int l = s.length(); 241 int o = k - l + 1; 242 if (o < 0) 243 return false; 244 for (int i = 0; i < l; i++) 245 if (b[o + i] != s.charAt(i)) 246 return false; 247 j = k - l; 248 return true; 249 } 250 251 /* 252 * setto(s) sets (j+1),...k to the characters in the string s, readjusting 253 * k. 254 */ 255 256 private final void setto(String s) { 257 int l = s.length(); 258 int o = j + 1; 259 for (int i = 0; i < l; i++) 260 b[o + i] = s.charAt(i); 261 k = j + l; 262 } 263 264 /* r(s) is used further down. */ 265 266 private final void r(String s) { 267 if (m() > 0) 268 setto(s); 269 } 270 271 /* 272 * step1() gets rid of plurals and -ed or -ing. e.g. 273 * 274 * caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat 275 * 276 * feed -> feed agreed -> agree disabled -> disable 277 * 278 * matting -> mat mating -> mate meeting -> meet milling -> mill messing -> 279 * mess 280 * 281 * meetings -> meet 282 */ 283 284 private final void step1() { 285 if (b[k] == 's') { 286 if (ends("sses")) 287 k -= 2; 288 else if (ends("ies")) 289 setto("i"); 290 else if (b[k - 1] != 's') 291 k--; 292 } 293 if (ends("eed")) { 294 if (m() > 0) 295 k--; 296 } else if ((ends("ed") || ends("ing")) && vowelinstem()) { 297 k = j; 298 if (ends("at")) 299 setto("ate"); 300 else if (ends("bl")) 301 setto("ble"); 302 else if (ends("iz")) 303 setto("ize"); 304 else if (doublec(k)) { 305 k--; 306 { 307 int ch = b[k]; 308 if (ch == 'l' || ch == 's' || ch == 'z') 309 k++; 310 } 311 } else if (m() == 1 && cvc(k)) 312 setto("e"); 313 } 314 } 315 316 /* step2() turns terminal y to i when there is another vowel in the stem. */ 317 318 private final void step2() { 319 if (ends("y") && vowelinstem()) 320 b[k] = 'i'; 321 } 322 323 /* 324 * step3() maps double suffices to single ones. so -ization ( = -ize plus 325 * -ation) maps to -ize etc. note that the string before the suffix must 326 * give m() > 0. 327 */ 328 329 private final void step3() { 330 if (k == 0) 331 return; /* For Bug 1 */ 332 switch (b[k - 1]) { 333 case 'a': 334 if (ends("ational")) { 335 r("ate"); 336 break; 337 } 338 if (ends("tional")) { 339 r("tion"); 340 break; 341 } 342 break; 343 case 'c': 344 if (ends("enci")) { 345 r("ence"); 346 break; 347 } 348 if (ends("anci")) { 349 r("ance"); 350 break; 351 } 352 break; 353 case 'e': 354 if (ends("izer")) { 355 r("ize"); 356 break; 357 } 358 break; 359 case 'l': 360 if (ends("bli")) { 361 r("ble"); 362 break; 363 } 364 if (ends("alli")) { 365 r("al"); 366 break; 367 } 368 if (ends("entli")) { 369 r("ent"); 370 break; 371 } 372 if (ends("eli")) { 373 r("e"); 374 break; 375 } 376 if (ends("ousli")) { 377 r("ous"); 378 break; 379 } 380 break; 381 case 'o': 382 if (ends("ization")) { 383 r("ize"); 384 break; 385 } 386 if (ends("ation")) { 387 r("ate"); 388 break; 389 } 390 if (ends("ator")) { 391 r("ate"); 392 break; 393 } 394 break; 395 case 's': 396 if (ends("alism")) { 397 r("al"); 398 break; 399 } 400 if (ends("iveness")) { 401 r("ive"); 402 break; 403 } 404 if (ends("fulness")) { 405 r("ful"); 406 break; 407 } 408 if (ends("ousness")) { 409 r("ous"); 410 break; 411 } 412 break; 413 case 't': 414 if (ends("aliti")) { 415 r("al"); 416 break; 417 } 418 if (ends("iviti")) { 419 r("ive"); 420 break; 421 } 422 if (ends("biliti")) { 423 r("ble"); 424 break; 425 } 426 break; 427 case 'g': 428 if (ends("logi")) { 429 r("log"); 430 break; 431 } 432 } 433 } 434 435 /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ 436 437 private final void step4() { 438 switch (b[k]) { 439 case 'e': 440 if (ends("icate")) { 441 r("ic"); 442 break; 443 } 444 if (ends("ative")) { 445 r(""); 446 break; 447 } 448 if (ends("alize")) { 449 r("al"); 450 break; 451 } 452 break; 453 case 'i': 454 if (ends("iciti")) { 455 r("ic"); 456 break; 457 } 458 break; 459 case 'l': 460 if (ends("ical")) { 461 r("ic"); 462 break; 463 } 464 if (ends("ful")) { 465 r(""); 466 break; 467 } 468 break; 469 case 's': 470 if (ends("ness")) { 471 r(""); 472 break; 473 } 474 break; 475 } 476 } 477 478 /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */ 479 480 private final void step5() { 481 if (k == 0) 482 return; /* for Bug 1 */ 483 switch (b[k - 1]) { 484 case 'a': 485 if (ends("al")) 486 break; 487 return; 488 case 'c': 489 if (ends("ance")) 490 break; 491 if (ends("ence")) 492 break; 493 return; 494 case 'e': 495 if (ends("er")) 496 break; 497 return; 498 case 'i': 499 if (ends("ic")) 500 break; 501 return; 502 case 'l': 503 if (ends("able")) 504 break; 505 if (ends("ible")) 506 break; 507 return; 508 case 'n': 509 if (ends("ant")) 510 break; 511 if (ends("ement")) 512 break; 513 if (ends("ment")) 514 break; 515 /* element etc. not stripped before the m */ 516 if (ends("ent")) 517 break; 518 return; 519 case 'o': 520 if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) 521 break; 522 /* j >= 0 fixes Bug 2 */ 523 if (ends("ou")) 524 break; 525 return; 526 /* takes care of -ous */ 527 case 's': 528 if (ends("ism")) 529 break; 530 return; 531 case 't': 532 if (ends("ate")) 533 break; 534 if (ends("iti")) 535 break; 536 return; 537 case 'u': 538 if (ends("ous")) 539 break; 540 return; 541 case 'v': 542 if (ends("ive")) 543 break; 544 return; 545 case 'z': 546 if (ends("ize")) 547 break; 548 return; 549 default: 550 return; 551 } 552 if (m() > 1) 553 k = j; 554 } 555 556 /* step6() removes a final -e if m() > 1. */ 557 558 private final void step6() { 559 j = k; 560 if (b[k] == 'e') { 561 int a = m(); 562 if (a > 1 || a == 1 && !cvc(k - 1)) 563 k--; 564 } 565 if (b[k] == 'l' && doublec(k) && m() > 1) 566 k--; 567 } 568 569 /** 570 * Stem a word provided as a String. Returns the result as a String. 571 */ 572 public String stem(String s) { 573 i = 0; 574 add(s.toCharArray(), s.length()); 575 stem(); 576 return toString(); 577 } 578 579 /** 580 * Stem the word placed into the Stemmer buffer through calls to add(). 581 * Returns true if the stemming process resulted in a word different from 582 * the input. You can retrieve the result with 583 * getResultLength()/getResultBuffer() or toString(). 584 */ 585 public void stem() { 586 k = i - 1; 587 if (k > 1) { 588 step1(); 589 step2(); 590 step3(); 591 step4(); 592 step5(); 593 step6(); 594 } 595 i_end = k + 1; 596 i = 0; 597 } 598 599 /** 600 * Test program for demonstrating the Stemmer. It reads text from a a list 601 * of files, stems each word, and writes the result to standard output. Note 602 * that the word stemmed is expected to be in lower case: forcing lower case 603 * must be done outside the Stemmer class. Usage: Stemmer file-name 604 * file-name ... 605 */ 606 public static void main(String[] args) { 607 char[] w = new char[501]; 608 PorterStemmer s = new PorterStemmer(); 609 for (int i = 0; i < args.length; i++) 610 try { 611 FileInputStream in = new FileInputStream(args[i]); 612 613 try { 614 while (true) 615 616 { 617 int ch = in.read(); 618 if (Character.isLetter((char) ch)) { 619 int j = 0; 620 while (true) { 621 ch = Character.toLowerCase((char) ch); 622 w[j] = (char) ch; 623 if (j < 500) 624 j++; 625 ch = in.read(); 626 if (!Character.isLetter((char) ch)) { 627 /* to test add(char ch) */ 628 for (int c = 0; c < j; c++) 629 s.add(w[c]); 630 631 /* or, to test add(char[] w, int j) */ 632 /* s.add(w, j); */ 633 634 s.stem(); 635 { 636 String u; 637 638 /* and now, to test toString() : */ 639 u = s.toString(); 640 641 /* 642 * to test getResultBuffer(), 643 * getResultLength() : 644 */ 645 /* 646 * u = new String(s.getResultBuffer(), 647 * 0, s.getResultLength()); 648 */ 649 650 System.out.print(u); 651 } 652 break; 653 } 654 } 655 } 656 if (ch < 0) 657 break; 658 System.out.print((char) ch); 659 } 660 } catch (IOException e) { 661 System.out.println("error reading " + args[i]); 662 break; 663 } 664 } catch (FileNotFoundException e) { 665 System.out.println("file " + args[i] + " not found"); 666 break; 667 } 668 } 669 }