001    /*
002     * Copyright (C) 2008-2010 by Holger Arndt
003     *
004     * This file is part of the Universal Java Matrix Package (UJMP).
005     * See the NOTICE file distributed with this work for additional
006     * information regarding copyright ownership and licensing.
007     *
008     * UJMP is free software; you can redistribute it and/or modify
009     * it under the terms of the GNU Lesser General Public License as
010     * published by the Free Software Foundation; either version 2
011     * of the License, or (at your option) any later version.
012     *
013     * UJMP is distributed in the hope that it will be useful,
014     * but WITHOUT ANY WARRANTY; without even the implied warranty of
015     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
016     * GNU Lesser General Public License for more details.
017     *
018     * You should have received a copy of the GNU Lesser General Public
019     * License along with UJMP; if not, write to the
020     * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
021     * Boston, MA  02110-1301  USA
022     */
023    
024    package org.ujmp.core.doublematrix.calculation.general.misc;
025    
026    import org.ujmp.core.Matrix;
027    import org.ujmp.core.doublematrix.calculation.AbstractDoubleCalculation;
028    import org.ujmp.core.exceptions.MatrixException;
029    import org.ujmp.core.longmatrix.calculation.DocTerm;
030    import org.ujmp.core.util.MathUtil;
031    
032    public class TfIdf extends AbstractDoubleCalculation {
033            private static final long serialVersionUID = 4262822624560201379L;
034    
035            /**
036             * matrix with [documents x terms]
037             */
038            private Matrix docTerm = null;
039    
040            private Matrix sumPerDoc = null;
041    
042            private Matrix sumPerTerm = null;
043    
044            private boolean calculateTf = false;
045    
046            private boolean calculateIdf = false;
047    
048            private boolean normalize = false;
049    
050            public TfIdf(Matrix matrix, boolean calculateTf, boolean calculateIdf, boolean normalize) {
051                    super(matrix);
052                    this.calculateTf = calculateTf;
053                    this.calculateIdf = calculateIdf;
054                    this.normalize = normalize;
055                    if (normalize) {
056                            throw new MatrixException("not yet implemented");
057                    }
058            }
059    
060            
061            public double getDouble(long... coordinates) throws MatrixException {
062                    if (docTerm == null) {
063                            calculate();
064                    }
065    
066                    double tf = docTerm.getAsDouble(coordinates);
067                    double idf = 1.0;
068    
069                    double numDocs = docTerm.getRowCount();
070    
071                    if (calculateTf) {
072                            tf = docTerm.getAsDouble(coordinates) / sumPerDoc.getAsDouble(coordinates[ROW], 0);
073                    }
074    
075                    if (calculateIdf) {
076                            idf = MathUtil.log10(numDocs / sumPerTerm.getAsDouble(0, coordinates[COLUMN]));
077                    }
078    
079                    double result = tf * idf;
080                    return MathUtil.isNaNOrInfinite(result) ? 0.0 : result;
081            }
082    
083            private void calculate() {
084                    docTerm = new DocTerm(getSource()).calcNew();
085                    if (calculateTf) {
086                            sumPerDoc = docTerm.sum(Ret.NEW, Matrix.COLUMN, true);
087                    }
088                    if (calculateIdf) {
089                            sumPerTerm = docTerm.toBooleanMatrix().sum(Ret.NEW, Matrix.ROW, true);
090                    }
091            }
092    
093            
094            public long[] getSize() {
095                    if (docTerm == null) {
096                            calculate();
097                    }
098                    return docTerm.getSize();
099            }
100    
101    }