001    /*
002     * Copyright (C) 2008-2010 by Holger Arndt
003     *
004     * This file is part of the Universal Java Matrix Package (UJMP).
005     * See the NOTICE file distributed with this work for additional
006     * information regarding copyright ownership and licensing.
007     *
008     * UJMP is free software; you can redistribute it and/or modify
009     * it under the terms of the GNU Lesser General Public License as
010     * published by the Free Software Foundation; either version 2
011     * of the License, or (at your option) any later version.
012     *
013     * UJMP is distributed in the hope that it will be useful,
014     * but WITHOUT ANY WARRANTY; without even the implied warranty of
015     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016     * GNU Lesser General Public License for more details.
017     *
018     * You should have received a copy of the GNU Lesser General Public
019     * License along with UJMP; if not, write to the
020     * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
021     * Boston, MA  02110-1301  USA
022     */
023    
024    package org.ujmp.core.longmatrix.calculation;
025    
026    import org.ujmp.core.Matrix;
027    import org.ujmp.core.exceptions.MatrixException;
028    import org.ujmp.core.longmatrix.impl.DefaultSparseLongMatrix;
029    import org.ujmp.core.mapmatrix.DefaultMapMatrix;
030    import org.ujmp.core.mapmatrix.MapMatrix;
031    
032    public class DocTerm extends AbstractLongCalculation {
033            private static final long serialVersionUID = 9021699761386822606L;
034    
035            private MapMatrix<String, Long> wordMapping = null;
036    
037            private Matrix result = null;
038    
039            public DocTerm(Matrix m) {
040                    super(m);
041            }
042    
043            public long getLong(long... coordinates) throws MatrixException {
044                    if (result == null) {
045                            result = calculate();
046                    }
047                    return result.getAsLong(coordinates);
048            }
049    
050            public long[] getSize() {
051                    if (result == null) {
052                            result = calculate();
053                    }
054                    return result.getSize();
055            }
056    
057            public boolean isSparse() {
058                    return true;
059            }
060    
061            private Matrix calculate() {
062                    wordMapping = new DefaultMapMatrix<String, Long>();
063                    Matrix m = getSource();
064                    for (long[] c : m.availableCoordinates()) {
065                            String s = m.getAsString(c);
066                            if (s != null) {
067                                    String[] words = s.split("\\s+");
068                                    for (String w : words) {
069                                            if (w.length() == 0) {
070                                                    continue;
071                                            }
072                                            Long i = wordMapping.get(w);
073                                            if (i == null) {
074                                                    wordMapping.put(w, wordMapping.getRowCount());
075                                            }
076                                    }
077                            }
078                    }
079                    result = new DefaultSparseLongMatrix(m.getRowCount(), wordMapping.getRowCount());
080    
081                    long rowCount = m.getRowCount();
082                    long colCount = m.getColumnCount();
083                    for (long row = 0; row < rowCount; row++) {
084                            for (long col = 0; col < colCount; col++) {
085                                    String string = m.getAsString(row, col);
086                                    if (string != null && string.length() > 0) {
087                                            String[] words = string.split("[\\s]+");
088                                            for (String w : words) {
089                                                    if (w.length() == 0) {
090                                                            continue;
091                                                    }
092                                                    long i = wordMapping.get(w);
093                                                    int count = result.getAsInt(row, i);
094                                                    result.setAsInt(++count, row, i);
095                                            }
096                                    }
097                            }
098                    }
099                    return result;
100            }
101    
102    }