001 /* 002 * Copyright (C) 2008-2010 by Holger Arndt 003 * 004 * This file is part of the Universal Java Matrix Package (UJMP). 005 * See the NOTICE file distributed with this work for additional 006 * information regarding copyright ownership and licensing. 007 * 008 * UJMP is free software; you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation; either version 2 011 * of the License, or (at your option) any later version. 012 * 013 * UJMP is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU Lesser General Public License for more details. 017 * 018 * You should have received a copy of the GNU Lesser General Public 019 * License along with UJMP; if not, write to the 020 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, 021 * Boston, MA 02110-1301 USA 022 */ 023 024 package org.ujmp.core.doublematrix.calculation.general.misc; 025 026 import org.ujmp.core.Matrix; 027 import org.ujmp.core.doublematrix.calculation.AbstractDoubleCalculation; 028 import org.ujmp.core.exceptions.MatrixException; 029 import org.ujmp.core.longmatrix.calculation.DocTerm; 030 import org.ujmp.core.util.MathUtil; 031 032 public class TfIdf extends AbstractDoubleCalculation { 033 private static final long serialVersionUID = 4262822624560201379L; 034 035 /** 036 * matrix with [documents x terms] 037 */ 038 private Matrix docTerm = null; 039 040 private Matrix sumPerDoc = null; 041 042 private Matrix sumPerTerm = null; 043 044 private boolean calculateTf = false; 045 046 private boolean calculateIdf = false; 047 048 private boolean normalize = false; 049 050 public TfIdf(Matrix matrix, boolean calculateTf, boolean calculateIdf, boolean normalize) { 051 super(matrix); 052 this.calculateTf = calculateTf; 053 this.calculateIdf = calculateIdf; 054 this.normalize = normalize; 055 if (normalize) { 056 throw new MatrixException("not yet implemented"); 057 } 058 } 059 060 061 public double getDouble(long... coordinates) throws MatrixException { 062 if (docTerm == null) { 063 calculate(); 064 } 065 066 double tf = docTerm.getAsDouble(coordinates); 067 double idf = 1.0; 068 069 double numDocs = docTerm.getRowCount(); 070 071 if (calculateTf) { 072 tf = docTerm.getAsDouble(coordinates) / sumPerDoc.getAsDouble(coordinates[ROW], 0); 073 } 074 075 if (calculateIdf) { 076 idf = MathUtil.log10(numDocs / sumPerTerm.getAsDouble(0, coordinates[COLUMN])); 077 } 078 079 double result = tf * idf; 080 return MathUtil.isNaNOrInfinite(result) ? 0.0 : result; 081 } 082 083 private void calculate() { 084 docTerm = new DocTerm(getSource()).calcNew(); 085 if (calculateTf) { 086 sumPerDoc = docTerm.sum(Ret.NEW, Matrix.COLUMN, true); 087 } 088 if (calculateIdf) { 089 sumPerTerm = docTerm.toBooleanMatrix().sum(Ret.NEW, Matrix.ROW, true); 090 } 091 } 092 093 094 public long[] getSize() { 095 if (docTerm == null) { 096 calculate(); 097 } 098 return docTerm.getSize(); 099 } 100 101 }