001 /* 002 * Copyright (C) 2008-2010 by Holger Arndt 003 * 004 * This file is part of the Universal Java Matrix Package (UJMP). 005 * See the NOTICE file distributed with this work for additional 006 * information regarding copyright ownership and licensing. 007 * 008 * UJMP is free software; you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation; either version 2 011 * of the License, or (at your option) any later version. 012 * 013 * UJMP is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU Lesser General Public License for more details. 017 * 018 * You should have received a copy of the GNU Lesser General Public 019 * License along with UJMP; if not, write to the 020 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, 021 * Boston, MA 02110-1301 USA 022 */ 023 024 package org.ujmp.lucene; 025 026 import java.io.ByteArrayInputStream; 027 import java.io.ByteArrayOutputStream; 028 import java.io.Closeable; 029 import java.io.File; 030 import java.io.Flushable; 031 import java.io.IOException; 032 import java.io.ObjectInputStream; 033 import java.io.ObjectOutputStream; 034 import java.io.Serializable; 035 import java.util.HashSet; 036 import java.util.Set; 037 038 import org.apache.lucene.analysis.Analyzer; 039 import org.apache.lucene.analysis.standard.StandardAnalyzer; 040 import org.apache.lucene.document.Document; 041 import org.apache.lucene.document.Field; 042 import org.apache.lucene.index.CorruptIndexException; 043 import org.apache.lucene.index.IndexReader; 044 import org.apache.lucene.index.IndexWriter; 045 import org.apache.lucene.index.Term; 046 import org.apache.lucene.index.IndexWriter.MaxFieldLength; 047 import org.apache.lucene.search.IndexSearcher; 048 import org.apache.lucene.search.ScoreDoc; 049 import org.apache.lucene.search.TermQuery; 050 import org.apache.lucene.search.TopDocs; 051 import org.apache.lucene.search.WildcardQuery; 052 import org.apache.lucene.store.Directory; 053 import org.apache.lucene.store.FSDirectory; 054 import org.apache.lucene.util.Version; 055 import org.ujmp.core.collections.AbstractMap; 056 import org.ujmp.core.exceptions.MatrixException; 057 import org.ujmp.core.interfaces.Erasable; 058 import org.ujmp.core.util.StringUtil; 059 import org.ujmp.core.util.io.FileUtil; 060 061 public class LuceneMap<K, V> extends AbstractMap<K, V> implements Flushable, 062 Closeable, Erasable { 063 private static final long serialVersionUID = 8998898900190996038L; 064 065 private static final String KEYSTRING = "KS"; 066 067 private static final String KEYDATA = "KD"; 068 069 private static final String VALUESTRING = "VS"; 070 071 private static final String VALUEDATA = "VD"; 072 073 private transient Directory directory = null; 074 075 private transient IndexWriter indexWriter = null; 076 077 private transient IndexSearcher indexSearcher = null; 078 079 private static final int MAXSIZE = 1000000; 080 081 private static final int AUTOFLUSHCOUNT = 100000; 082 083 private boolean readOnly = false; 084 085 private transient File path = null; 086 087 private transient Analyzer analyzer = null; 088 089 private int count = 0; 090 091 public LuceneMap() throws IOException { 092 this(null, false); 093 } 094 095 public LuceneMap(File dir) throws IOException { 096 this(dir, false); 097 } 098 099 public LuceneMap(File path, boolean readOnly) throws IOException { 100 this.readOnly = readOnly; 101 this.path = path; 102 } 103 104 public Directory getDirectory() throws IOException { 105 if (directory == null) { 106 directory = FSDirectory.open(getPath()); 107 } 108 return directory; 109 } 110 111 public File getPath() throws IOException { 112 if (path == null) { 113 path = File.createTempFile("lucene", ""); 114 path.delete(); 115 path.mkdir(); 116 } 117 return path; 118 } 119 120 public synchronized void optimize() throws CorruptIndexException, 121 IOException { 122 getIndexWriter().optimize(); 123 } 124 125 public synchronized void clear() { 126 try { 127 getIndexWriter().deleteAll(); 128 } catch (Exception e) { 129 throw new MatrixException("cannot clear index", e); 130 } 131 } 132 133 public synchronized boolean containsKey(Object key) { 134 try { 135 Term term = new Term(KEYSTRING, getUniqueString(key)); 136 return getIndexSearcher().docFreq(term) > 0; 137 } catch (Exception e) { 138 throw new MatrixException("could not search documents: " + key, e); 139 } 140 } 141 142 public synchronized boolean containsValue(Object value) { 143 try { 144 Term term = new Term(VALUESTRING, getUniqueString(value)); 145 return getIndexSearcher().docFreq(term) > 0; 146 } catch (Exception e) { 147 throw new MatrixException("could not search documents: " + value, e); 148 } 149 } 150 151 public synchronized V get(Object key) { 152 try { 153 Term term = new Term(KEYSTRING, getUniqueString(key)); 154 TermQuery query = new TermQuery(term); 155 TopDocs docs = getIndexSearcher().search(query, 1); 156 if (docs.totalHits > 0) { 157 ScoreDoc match = docs.scoreDocs[0]; 158 Document doc = getIndexSearcher().doc(match.doc); 159 return getObjectFromBytes(doc.getBinaryValue(VALUEDATA)); 160 } 161 } catch (Exception e) { 162 throw new MatrixException("could not search documents: " + key, e); 163 } 164 return null; 165 } 166 167 @SuppressWarnings("unchecked") 168 private V getObjectFromBytes(byte[] bytes) { 169 try { 170 ByteArrayInputStream bis = new ByteArrayInputStream(bytes); 171 ObjectInputStream ois = new ObjectInputStream(bis); 172 Object o = ois.readObject(); 173 ois.close(); 174 bis.close(); 175 return (V) o; 176 } catch (Exception e) { 177 throw new MatrixException("could not convert to object", e); 178 } 179 } 180 181 @SuppressWarnings("unchecked") 182 public synchronized Set<K> keySet() { 183 Set<K> set = new HashSet<K>(); 184 if (isEmpty()) { 185 return set; 186 } 187 try { 188 Term term = new Term(KEYSTRING, "*"); 189 WildcardQuery query = new WildcardQuery(term); 190 TopDocs docs = getIndexSearcher().search(query, MAXSIZE); 191 192 for (ScoreDoc sd : docs.scoreDocs) { 193 Document d = getIndexSearcher().doc(sd.doc); 194 set.add((K) getObjectFromBytes(d.getBinaryValue(KEYDATA))); 195 } 196 return set; 197 } catch (Exception e) { 198 throw new MatrixException("could not search documents", e); 199 } 200 } 201 202 private static String getUniqueString(Object o) throws IOException { 203 if (o == null) { 204 return ""; 205 } else if (o instanceof String) { 206 return (String) o; 207 } else { 208 return StringUtil.encodeToHex((Serializable) o); 209 } 210 } 211 212 public synchronized V put(K key, V value) { 213 try { 214 Term term = new Term(KEYSTRING, getUniqueString(key)); 215 Document doc = new Document(); 216 doc.add(new Field(KEYSTRING, getUniqueString(key), Field.Store.YES, 217 Field.Index.NOT_ANALYZED)); 218 doc.add(new Field(KEYDATA, getBytes(key), Field.Store.YES)); 219 doc.add(new Field(VALUESTRING, getUniqueString(value), 220 Field.Store.YES, Field.Index.NOT_ANALYZED)); 221 doc.add(new Field(VALUEDATA, getBytes(value), Field.Store.YES)); 222 getIndexWriter().updateDocument(term, doc); 223 224 // auto flush from time to time 225 if (++count % AUTOFLUSHCOUNT == 0) { 226 flush(); 227 } 228 return null; 229 } catch (Exception e) { 230 throw new MatrixException("could not add document: " + key, e); 231 } 232 } 233 234 public synchronized V remove(Object key) { 235 try { 236 Term term = new Term(KEYSTRING, getUniqueString(key)); 237 getIndexWriter().deleteDocuments(term); 238 // auto flush from time to time 239 if (++count % AUTOFLUSHCOUNT == 0) { 240 flush(); 241 } 242 return null; 243 } catch (Exception e) { 244 throw new MatrixException("could not delete document: " + key, e); 245 } 246 } 247 248 public Analyzer getAnalyzer() { 249 if (analyzer == null) { 250 analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); 251 } 252 return analyzer; 253 } 254 255 public void setAnalyzer(Analyzer analyzer) { 256 this.analyzer = analyzer; 257 } 258 259 public synchronized int size() { 260 try { 261 flush(); 262 if (indexSearcher != null 263 && indexSearcher.getIndexReader().isCurrent()) { 264 return indexSearcher.getIndexReader().numDocs(); 265 } else { 266 int size = getIndexWriter().numDocs(); 267 return size; 268 } 269 } catch (Exception e) { 270 throw new MatrixException("could not count documents", e); 271 } 272 } 273 274 public synchronized void flush() throws IOException { 275 IndexWriter iw = getIndexWriter(); 276 iw.expungeDeletes(true); 277 iw.commit(); 278 iw.close(true); 279 indexWriter = null; 280 281 } 282 283 public synchronized void close() throws IOException { 284 if (indexSearcher != null) { 285 indexSearcher.close(); 286 indexSearcher = null; 287 } 288 if (indexWriter != null) { 289 indexWriter.close(true); 290 indexWriter = null; 291 } 292 293 } 294 295 private synchronized IndexWriter getIndexWriter() { 296 try { 297 if (!readOnly && indexSearcher != null) { 298 indexSearcher.close(); 299 indexSearcher = null; 300 } 301 if (indexWriter == null) { 302 if (IndexReader.indexExists(getDirectory())) { 303 if (!readOnly) { 304 if (IndexWriter.isLocked(getDirectory())) { 305 IndexWriter.unlock(getDirectory()); 306 } 307 indexWriter = new IndexWriter(getDirectory(), 308 getAnalyzer(), MaxFieldLength.UNLIMITED); 309 } 310 } else { 311 if (!readOnly) { 312 indexWriter = new IndexWriter(getDirectory(), 313 getAnalyzer(), true, MaxFieldLength.UNLIMITED); 314 } 315 } 316 } 317 return indexWriter; 318 } catch (Exception e) { 319 throw new MatrixException("could not prepare writher", e); 320 } 321 } 322 323 private synchronized IndexSearcher getIndexSearcher() { 324 try { 325 if (!IndexReader.indexExists(getDirectory())) { 326 getIndexWriter(); 327 } 328 if (indexWriter != null) { 329 if (indexSearcher != null) { 330 indexSearcher.close(); 331 indexSearcher = null; 332 } 333 indexWriter.commit(); 334 indexWriter.waitForMerges(); 335 indexWriter.expungeDeletes(true); 336 indexWriter.close(); 337 indexWriter = null; 338 } 339 if (indexSearcher != null) { 340 if (!indexSearcher.getIndexReader().isCurrent()) { 341 indexSearcher.close(); 342 indexSearcher = null; 343 } 344 } 345 if (indexSearcher == null) { 346 indexSearcher = new IndexSearcher(directory, true); 347 } 348 return indexSearcher; 349 } catch (Exception e) { 350 throw new MatrixException("could not prepare reader", e); 351 } 352 } 353 354 private byte[] getBytes(Object o) { 355 try { 356 ByteArrayOutputStream bao = new ByteArrayOutputStream(); 357 ObjectOutputStream oos = new ObjectOutputStream(bao); 358 oos.writeObject(o); 359 oos.close(); 360 bao.close(); 361 return bao.toByteArray(); 362 } catch (Exception e) { 363 throw new MatrixException("could not convert to bytes: " + o, e); 364 } 365 } 366 367 // @SuppressWarnings("unchecked") 368 // private void readObject(ObjectInputStream s) throws IOException, 369 // ClassNotFoundException { 370 // s.defaultReadObject(); 371 // while (true) { 372 // try { 373 // K k = (K) s.readObject(); 374 // V v = (V) s.readObject(); 375 // put(k, v); 376 // } catch (OptionalDataException e) { 377 // return; 378 // } 379 // } 380 // } 381 // 382 // private void writeObject(ObjectOutputStream s) throws IOException, 383 // MatrixException { 384 // s.defaultWriteObject(); 385 // for (Object k : keySet()) { 386 // Object v = get(k); 387 // s.writeObject(k); 388 // s.writeObject(v); 389 // } 390 // } 391 392 public synchronized void erase() throws IOException { 393 close(); 394 FileUtil.deleteRecursive(path); 395 } 396 397 }