/*
 * Copyright (C) 2008-2010 by Holger Arndt
 *
 * This file is part of the Universal Java Matrix Package (UJMP).
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership and licensing.
 *
 * UJMP is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * UJMP is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with UJMP; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301 USA
 */

package org.ujmp.core.longmatrix.calculation;

import java.util.regex.Pattern;

import org.ujmp.core.Matrix;
import org.ujmp.core.exceptions.MatrixException;
import org.ujmp.core.longmatrix.impl.DefaultSparseLongMatrix;
import org.ujmp.core.mapmatrix.DefaultMapMatrix;
import org.ujmp.core.mapmatrix.MapMatrix;

/**
 * Calculation that turns a matrix of text cells into a document-term count
 * matrix.
 * <p>
 * Each cell of the source matrix is read as a string and split on runs of
 * whitespace. Every distinct non-empty token gets its own column, numbered in
 * the order the tokens are first encountered while iterating the source's
 * available coordinates. The result has one row per source row; the entry at
 * {@code (row, column)} is the number of times that column's token occurs in
 * the cells {@code (row, 0) .. (row, columnCount - 1)} of the source.
 * <p>
 * The result is computed on first access and cached afterwards.
 */
public class DocTerm extends AbstractLongCalculation {
    private static final long serialVersionUID = 9021699761386822606L;

    /** Token separator: one or more whitespace characters. Compiled once and shared. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Maps each token to its column index in the result; filled by {@link #calculate()}. */
    private MapMatrix<String, Long> wordMapping = null;

    /** Cached count matrix; {@code null} until first requested. */
    private Matrix result = null;

    /**
     * Creates the calculation for the given source matrix.
     *
     * @param m
     *            source matrix whose cells are read as text
     */
    public DocTerm(Matrix m) {
        super(m);
    }

    /**
     * Returns one entry of the document-term matrix, computing the matrix
     * first if that has not happened yet.
     *
     * @param coordinates
     *            position in the result matrix
     * @return the count stored at the given position
     * @throws MatrixException
     *             declared by the calculation contract
     */
    public long getLong(long... coordinates) throws MatrixException {
        return getResult().getAsLong(coordinates);
    }

    /**
     * Returns the size of the document-term matrix, computing the matrix
     * first if that has not happened yet.
     *
     * @return the size reported by the computed result matrix
     */
    public long[] getSize() {
        return getResult().getSize();
    }

    /**
     * @return always {@code true}
     */
    public boolean isSparse() {
        return true;
    }

    /**
     * Returns the cached result, computing it on first use.
     */
    private Matrix getResult() {
        if (result == null) {
            result = calculate();
        }
        return result;
    }

    /**
     * Builds the vocabulary and the count matrix from the source.
     * <p>
     * Both structures are assembled in local variables and only then
     * published, so the fields never refer to half-built data.
     *
     * @return the filled document-term count matrix
     */
    private Matrix calculate() {
        Matrix source = getSource();

        // Pass 1: assign a column index to every distinct token.
        MapMatrix<String, Long> vocabulary = new DefaultMapMatrix<String, Long>();
        for (long[] c : source.availableCoordinates()) {
            String text = source.getAsString(c);
            if (text == null) {
                continue;
            }
            for (String word : WHITESPACE.split(text)) {
                if (word.length() == 0) {
                    continue;
                }
                if (vocabulary.get(word) == null) {
                    // the next free column index is the current number of entries
                    vocabulary.put(word, vocabulary.getRowCount());
                }
            }
        }
        wordMapping = vocabulary;

        // Pass 2: count token occurrences per source row.
        long rowCount = source.getRowCount();
        long colCount = source.getColumnCount();
        Matrix counts = new DefaultSparseLongMatrix(rowCount, vocabulary.getRowCount());
        for (long row = 0; row < rowCount; row++) {
            for (long col = 0; col < colCount; col++) {
                String text = source.getAsString(row, col);
                if (text == null || text.length() == 0) {
                    continue;
                }
                for (String word : WHITESPACE.split(text)) {
                    if (word.length() == 0) {
                        continue;
                    }
                    Long index = vocabulary.get(word);
                    if (index == null) {
                        // Pass 1 walks availableCoordinates() while this pass
                        // walks every (row, col). A token that only shows up
                        // here has no column, so skip it instead of failing
                        // when the missing index is unboxed.
                        continue;
                    }
                    long column = index.longValue();
                    int count = counts.getAsInt(row, column);
                    counts.setAsInt(count + 1, row, column);
                }
            }
        }
        return counts;
    }

}