1 package org.kit.furia;
2
3 import hep.aida.bin.StaticBin1D;
4
5 import java.io.BufferedReader;
6 import java.io.File;
7 import java.io.FileReader;
8 import java.io.IOException;
9 import java.text.DecimalFormat;
10 import java.text.NumberFormat;
11 import java.util.Iterator;
12 import java.util.List;
13
14 import org.ajmm.obsearch.asserts.OBAsserts;
15 import org.ajmm.obsearch.exception.NotFrozenException;
16 import org.ajmm.obsearch.exception.OBException;
17 import org.ajmm.obsearch.index.IndexFactory;
18 import org.ajmm.obsearch.index.IndexShort;
19 import org.ajmm.obsearch.index.PPTreeShort;
20 import org.ajmm.obsearch.index.UnsafePPTreeShort;
21 import org.ajmm.obsearch.index.pivotselection.AcceptAll;
22 import org.ajmm.obsearch.index.pivotselection.KMeansPPPivotSelector;
23 import org.apache.log4j.Logger;
24 import org.kit.furia.exceptions.IRException;
25 import org.kit.furia.fragment.OBFragment;
26 import org.kit.furia.index.FIRIndexShort;
27 import org.kit.furia.io.FuriaInputOBFragment;
28
29 import com.sleepycat.je.DatabaseException;
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55 public class FuriaChanEngine {
56
57 private static final Logger logger = Logger.getLogger("FuriaChanEngine");
58
59
60
61
62 protected static String OBSEARCH_FOLDER = "obsearch";
63
64 protected static String IRINDEX_FOLDER = "irindex";
65
66
67
68
69
70 protected IndexShort < OBFragment > index;
71
72
73
74
75
76 protected IRIndexShort < OBFragment > mIndex;
77
78
79
80
81 private byte k = 1;
82
83
84
85
86 private short r = 7;
87
88
89
90
91 private short n = 10;
92
93
94
95
96
97
98
99
100 private boolean validationMode = false;
101
102 public boolean isValidationMode() {
103 return validationMode;
104 }
105
106 public void setValidate(boolean validationMode) {
107 this.validationMode = validationMode;
108 }
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124 public FuriaChanEngine(File directory) throws IOException,
125 DatabaseException, NotFrozenException, IllegalAccessException,
126 InstantiationException, OBException {
127 File obFolder = new File(directory, OBSEARCH_FOLDER);
128 File irFolder = new File(directory, IRINDEX_FOLDER);
129 if (!directory.exists()) {
130 directoryCreation(directory);
131 directoryCreation(obFolder);
132 directoryCreation(irFolder);
133 index = createIndex(obFolder);
134 } else {
135 OBAsserts.chkFileExists(obFolder);
136 OBAsserts.chkFileExists(irFolder);
137 IndexFactory < OBFragment > ifac = new IndexFactory < OBFragment >();
138 if (ifac.isFrozen(obFolder)) {
139 index = (UnsafePPTreeShort < OBFragment >) ifac
140 .createFromOBFolder(obFolder);
141 index.relocateInitialize(null);
142 } else {
143 index = createIndex(obFolder);
144 }
145 }
146 mIndex = new FIRIndexShort < OBFragment >(index, irFolder);
147 }
148
149 public void close() throws IRException {
150 mIndex.close();
151 }
152
153
154
155
156
157
158 public void freeze() throws IRException {
159 mIndex.freeze();
160 }
161
162
163
164
165
166
167
168
169
170 public void insert(File dir) throws IOException, IRException {
171 FuriaInputOBFragment reader = new FuriaInputOBFragment(dir);
172 Iterator < Document < OBFragment >> it = reader
173 .getDocumentsFromDirectory();
174 while (it.hasNext()) {
175 long prevTime = System.currentTimeMillis();
176 Document < OBFragment > toAdd = it.next();
177
178 if (toAdd.size() >= FuriaChanConstants.MIN_DOC_SIZE) {
179 mIndex.insert(toAdd);
180 logger.info("Loaded: " + toAdd.getName() + " size: "
181 + toAdd.size() + " msec: "
182 + (System.currentTimeMillis() - prevTime));
183 } else {
184 logger.info("Document " + toAdd.getName()
185 + " was ignored because it is too small.");
186 }
187 }
188 }
189
190
191
192
193
194
195
196
197
198 public float search(File dir) throws IOException, IRException {
199 logger.debug("Starting search with n:" + n + " r: " + r + " k: " + k
200 + " validation: " + this.validationMode + " msetThreshold "
201 + mIndex.getMSetScoreThreshold() + " setThreshold "
202 + mIndex.getSetScoreThreshold());
203 FuriaInputOBFragment reader = new FuriaInputOBFragment(dir);
204 Iterator < Document < OBFragment >> it = reader
205 .getDocumentsFromDirectory();
206 int foundResults = 0;
207 int totalDocs = 0;
208
209
210 StaticBin1D setScoreStats = new StaticBin1D();
211
212 StaticBin1D mSetScoreStats = new StaticBin1D();
213
214 StaticBin1D nStats = new StaticBin1D();
215 StaticBin1D objectsPerSecond = new StaticBin1D();
216 int maxSizeOfAppsNotFound = 0;
217 StaticBin1D maxSizeStatsOfAppsNotFound = new StaticBin1D();
218
219
220 StaticBin1D notMatchedMSet = new StaticBin1D();
221
222 StaticBin1D notMatchedSet = new StaticBin1D();
223
224 StaticBin1D notMatchedN = new StaticBin1D();
225
226
227 int notMatchedFountAfter = 0;
228 StaticBin1D completelyUnableToFindSize = new StaticBin1D();
229
230
231
232
233
234
235
236
237 StaticBin1D notMatchedMSetWithinN = new StaticBin1D();
238
239
240
241 StaticBin1D notMatchedSetWithinN = new StaticBin1D();
242
243
244
245
246
247
248 StaticBin1D notMatchedMSetAfterN = new StaticBin1D();
249
250
251
252 StaticBin1D notMatchedSetAfterN = new StaticBin1D();
253
254
255
256
257 logger.info("# of docs in the DB: " + this.mIndex.getSize());
258 try {
259 logger.info("# of words in the DB:" + this.mIndex.getWordsSize());
260 } catch (DatabaseException d) {
261
262 }
263 logger.info("(name, luceneScore, scoreMSet, scoreSet, size)");
264 NumberFormat f = new DecimalFormat("0.000");
265 short nToUse = n;
266 int notFound = 0;
267
268 if (this.validationMode) {
269 mIndex.setValidationMode(true);
270 nToUse = (short) (n + mIndex.getSize());
271
272
273
274 }
275 while (it.hasNext()) {
276 Document < OBFragment > toSearch = it.next();
277 if (validationMode) {
278 if (mIndex.shouldSkipDoc(toSearch)) {
279 logger.info("Validation mode: skipping:"
280 + toSearch.getName());
281 continue;
282 }
283 }
284 if (toSearch.size() >= FuriaChanConstants.MIN_DOC_SIZE) {
285 totalDocs++;
286 long prevTime = System.currentTimeMillis();
287
288 List < ResultCandidate > result = mIndex.search(toSearch, k, r,
289 nToUse);
290 float time = (float) (System.currentTimeMillis() - prevTime)
291 / (float) 1000;
292 logger.info("|| Match for " + toSearch.getName() + " sec:"
293 + time + " MSet: " + toSearch.multiSetSize() + " Set:"
294 + toSearch.size());
295 if (time > 0) {
296 objectsPerSecond.add((float) toSearch.size() / time);
297 }
298 Iterator < ResultCandidate > it2 = result.iterator();
299 int nth = 1;
300 boolean found = false;
301
302 logger.debug("Total results:" + result.size());
303 while (it2.hasNext() && nth <= this.n) {
304 ResultCandidate resultCandidate = it2.next();
305 String pre = "";
306
307 if (validationMode
308 && resultCandidate.getDocumentName().equals(
309 toSearch.getName())) {
310 foundResults++;
311 setScoreStats.add(resultCandidate.getNaiveScoreSet());
312 mSetScoreStats.add(resultCandidate.getNaiveScoreMSet());
313 nStats.add(nth);
314 found = true;
315 pre = "<<";
316 } else if (validationMode) {
317 notMatchedMSetWithinN.add(resultCandidate
318 .getNaiveScoreMSet());
319 notMatchedSetWithinN.add(resultCandidate
320 .getNaiveScoreSet());
321 }
322
323 logger.info(pre + resultCandidate.toString());
324
325 nth++;
326 }
327
328
329 if (validationMode && !found) {
330 if (maxSizeOfAppsNotFound < toSearch.size()) {
331 maxSizeOfAppsNotFound = toSearch.size();
332 }
333 maxSizeStatsOfAppsNotFound.add(toSearch.size());
334 boolean found2 = false;
335 while (it2.hasNext()) {
336 ResultCandidate resultCandidate = it2.next();
337 String docName = resultCandidate.getDocumentName();
338
339 if (docName.equals(toSearch.getName())) {
340 found2 = true;
341 notMatchedMSet.add(resultCandidate
342 .getNaiveScoreMSet());
343 notMatchedSet.add(resultCandidate
344 .getNaiveScoreSet());
345 notMatchedN.add(nth);
346 logger.info(":(:(:( Found! pos: " + nth + " "
347 + resultCandidate.toString());
348 break;
349 } else {
350 notMatchedMSetAfterN.add(resultCandidate
351 .getNaiveScoreMSet());
352 notMatchedSetAfterN.add(resultCandidate
353 .getNaiveScoreSet());
354 }
355 nth++;
356 }
357 if (!found2) {
358 completelyUnableToFindSize.add(toSearch.size());
359 logger.info(":(:(:(not found :( ");
360 notFound++;
361 }
362 }
363 } else {
364 logger.warn(toSearch.getName()
365 + " ignored because it is too small");
366 }
367
368 }
369 float result = ((float) foundResults / (float) totalDocs);
370
371 if (validationMode) {
372
373
374 printStats("MSet. Mean: ", mSetScoreStats);
375 printStats("Set. Mean: ", setScoreStats);
376 printStats("N. Mean: ", nStats);
377 printStats("OBs per sec: ", objectsPerSecond);
378 printStats("OBs not found (size). Mean: ",
379 maxSizeStatsOfAppsNotFound);
380
381 printStats("Not matched (within N) MSet. Mean: ",
382 notMatchedMSetWithinN);
383 printStats("Not matched (within N) Set. Mean: ",
384 notMatchedSetWithinN);
385 printStats("Not matched (after N) MSet. Mean: ",
386 notMatchedMSetAfterN);
387 printStats("Not matched (after N) Set. Mean: ", notMatchedSetAfterN);
388 printStats(":(:(:(MSet. Mean: ", notMatchedMSet);
389 printStats(":(:(:(Set. Mean: ", notMatchedSet);
390 printStats(":(:(:(Nth. Mean: ", notMatchedN);
391 printStats("Not in the results", completelyUnableToFindSize);
392 logger.info("Not found count: " + notFound);
393 logger
394 .info("*** FuriaPrecision: (% of programs found in the first n documents) "
395 + result + " " + foundResults + " of " + totalDocs);
396
397
398 }
399 return result;
400 }
401
402 private void printStats(String msg, StaticBin1D stats) {
403 logger.info(msg + " " + stats.mean() + " StdDev: "
404 + stats.standardDeviation() + " min: " + stats.min() + " max: "
405 + stats.max());
406 }
407
408
409
410
411
412
413
414
415
416 public static String readString(final File file) throws IOException {
417 final StringBuilder res = new StringBuilder();
418 final BufferedReader metadata = new BufferedReader(new FileReader(file));
419 String r = metadata.readLine();
420 while (r != null) {
421 res.append(r);
422 r = metadata.readLine();
423 }
424 metadata.close();
425 return res.toString();
426 }
427
428
429
430
431
432
433
434
435 private void directoryCreation(File directory) throws IOException {
436 directory.mkdirs();
437 OBAsserts.chkFileExists(directory);
438 }
439
440
441
442
443
444
445
446 protected IndexShort < OBFragment > createIndex(File folder)
447 throws IOException, DatabaseException {
448
449 KMeansPPPivotSelector < OBFragment > ps = new KMeansPPPivotSelector < OBFragment >(
450 new AcceptAll < OBFragment >());
451 ps.setRetries(1);
452
453 return new UnsafePPTreeShort < OBFragment >(folder, (short) 30,
454 (byte) 12, (short) 0,
455 (short) (FuriaChanConstants.MAX_NODES_PER_FRAGMENT * 2), ps,
456 OBFragment.class);
457 }
458
459 public void setR(short r) {
460 this.r = r;
461 }
462
463 public void setN(short n) throws OBException {
464 OBAsserts.chkAssert(n > 0, "n should be greater than 0");
465 this.n = n;
466 }
467
468 public void setK(byte k) {
469 this.k = k;
470 }
471
472 public byte getK() {
473 return k;
474 }
475
476 public short getR() {
477 return r;
478 }
479
480 public short getN() {
481 return n;
482 }
483
484 public void setValidationMode(boolean validationMode) {
485 this.validationMode = validationMode;
486 }
487
488 public float getMSetScoreThreshold() {
489 return mIndex.getMSetScoreThreshold();
490 }
491
492 public float getSetScoreThreshold() {
493 return mIndex.getSetScoreThreshold();
494 }
495
496 public void setMSetScoreThreshold(float setScoreThreshold) {
497 mIndex.setMSetScoreThreshold(setScoreThreshold);
498 }
499
500 public void setSetScoreThreshold(float setScoreThreshold) {
501 mIndex.setSetScoreThreshold(setScoreThreshold);
502 }
503
504 }