View Javadoc

1   package org.kit.furia;
2   
3   import hep.aida.bin.StaticBin1D;
4   
5   import java.io.BufferedReader;
6   import java.io.File;
7   import java.io.FileReader;
8   import java.io.IOException;
9   import java.text.DecimalFormat;
10  import java.text.NumberFormat;
11  import java.util.Iterator;
12  import java.util.List;
13  
14  import org.ajmm.obsearch.asserts.OBAsserts;
15  import org.ajmm.obsearch.exception.NotFrozenException;
16  import org.ajmm.obsearch.exception.OBException;
17  import org.ajmm.obsearch.index.IndexFactory;
18  import org.ajmm.obsearch.index.IndexShort;
19  import org.ajmm.obsearch.index.PPTreeShort;
20  import org.ajmm.obsearch.index.UnsafePPTreeShort;
21  import org.ajmm.obsearch.index.pivotselection.AcceptAll;
22  import org.ajmm.obsearch.index.pivotselection.KMeansPPPivotSelector;
23  import org.apache.log4j.Logger;
24  import org.kit.furia.exceptions.IRException;
25  import org.kit.furia.fragment.OBFragment;
26  import org.kit.furia.index.FIRIndexShort;
27  import org.kit.furia.io.FuriaInputOBFragment;
28  
29  import com.sleepycat.je.DatabaseException;
30  
31  /*
32   Furia-chan: An Open Source software license violation detector.    
33   Copyright (C) 2008 Kyushu Institute of Technology
34  
35   This program is free software: you can redistribute it and/or modify
36   it under the terms of the GNU General Public License as published by
37   the Free Software Foundation, either version 3 of the License, or
38   (at your option) any later version.
39  
40   This program is distributed in the hope that it will be useful,
41   but WITHOUT ANY WARRANTY; without even the implied warranty of
42   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
43   GNU General Public License for more details.
44  
45   You should have received a copy of the GNU General Public License
46   along with this program.  If not, see <http://www.gnu.org/licenses/>.
47   */
48  
49  /**
50   * FuriaChanEngine holds the logic necessary to open fragmented programs insert
51   * them, and search them in the database.
52   * @author Arnoldo Jose Muller Molina
53   */
54  
55  public class FuriaChanEngine {
56  
57      private static final Logger logger = Logger.getLogger("FuriaChanEngine");
58  
59      /**
60       * Folder name where OB will reside.
61       */
62      protected static String OBSEARCH_FOLDER = "obsearch";
63  
64      protected static String IRINDEX_FOLDER = "irindex";
65  
66      /**
67       * OBsearch index of our database.
68       */
69  
70      protected IndexShort < OBFragment > index;
71  
72      /**
73       * Multi-set index. The information retrieval engine that calculates the
74       * similarity on documents or multi-sets of OBFragments.
75       */
76      protected IRIndexShort < OBFragment > mIndex;
77  
78      /**
79       * The number of elements to get from OBSearch.
80       */
81      private byte k = 1;
82  
83      /**
84       * Range to use for the tree distance function.
85       */
86      private short r = 7;
87  
88      /**
89       * Get the top n elements.
90       */
91      private short n = 10;
92  
93      /**
94       * Validation mode assumes that the query's document names are the same
95       * names as in the DB. Based on this, it creates a summary of the queries
96       * whose corresponding documents were found in the DB. For this to make
97       * sense, all the apps in the DB should be different. Adding different
98       * versions of a program might not give the indented effect.
99       */
100     private boolean validationMode = false;
101 
102     public boolean isValidationMode() {
103         return validationMode;
104     }
105 
106     public void setValidate(boolean validationMode) {
107         this.validationMode = validationMode;
108     }
109 
110     /**
111      * Creates a FuriaChan object. If the given directory does not exist, the
112      * directory will be created and two folders (one for OBSearch and one for
113      * Lucene) will be created beneath it. If the directory exists, then the
114      * corresponding OBSearch index and the IRIndex index will be loaded.
115      * @param directory
116      *                the database directory that will be used.
117      * @throws IOException
118      *                 If directory does not exist and it cannot be created.
119      * @throws OBException
120      * @throws InstantiationException
121      * @throws IllegalAccessException
122      * @throws NotFrozenException
123      */
124     public FuriaChanEngine(File directory) throws IOException,
125             DatabaseException, NotFrozenException, IllegalAccessException,
126             InstantiationException, OBException {
127         File obFolder = new File(directory, OBSEARCH_FOLDER);
128         File irFolder = new File(directory, IRINDEX_FOLDER);
129         if (!directory.exists()) {
130             directoryCreation(directory);
131             directoryCreation(obFolder);
132             directoryCreation(irFolder);
133             index = createIndex(obFolder);
134         } else { // load OBsearch and IRIndex
135             OBAsserts.chkFileExists(obFolder);
136             OBAsserts.chkFileExists(irFolder);
137             IndexFactory < OBFragment > ifac = new IndexFactory < OBFragment >();
138             if (ifac.isFrozen(obFolder)) {
139                 index = (UnsafePPTreeShort < OBFragment >) ifac
140                         .createFromOBFolder(obFolder);
141                 index.relocateInitialize(null);
142             } else { // the index is not frozen
143                 index = createIndex(obFolder);
144             }
145         }
146         mIndex = new FIRIndexShort < OBFragment >(index, irFolder);
147     }
148 
149     public void close() throws IRException {
150         mIndex.close();
151     }
152 
153     /**
154      * Freeze the index. After the index is frozen, matches can be performed.
155      * @throws IRException
156      *                 If there is an error in the freezing process.
157      */
158     public void freeze() throws IRException {
159         mIndex.freeze();
160     }
161 
162     /**
163      * Insert the given directory into Furia-chan We will ignore applications
164      * that have less than
165      * @param dir
166      *                The dir of an application or a directory of applications.
167      * @throws IOException
168      * @throws IRException
169      */
170     public void insert(File dir) throws IOException, IRException {
171         FuriaInputOBFragment reader = new FuriaInputOBFragment(dir);
172         Iterator < Document < OBFragment >> it = reader
173                 .getDocumentsFromDirectory();
174         while (it.hasNext()) {
175             long prevTime = System.currentTimeMillis();
176             Document < OBFragment > toAdd = it.next();
177 
178             if (toAdd.size() >= FuriaChanConstants.MIN_DOC_SIZE) {
179                 mIndex.insert(toAdd);
180                 logger.info("Loaded: " + toAdd.getName() + " size: "
181                         + toAdd.size() + " msec: "
182                         + (System.currentTimeMillis() - prevTime));
183             } else {
184                 logger.info("Document " + toAdd.getName()
185                         + " was ignored because it is too small.");
186             }
187         }
188     }
189 
190     /**
191      * Performs a search in the database and prints the result to the user.
192      * @param dir
193      * @return The FuriaPrecision value (queries found in the top n results /
194      *         total of queries)
195      * @throws IOException
196      * @throws IRException
197      */
198     public float search(File dir) throws IOException, IRException {
199         logger.debug("Starting search with n:" + n + " r: " + r + " k: " + k
200                 + " validation: " + this.validationMode + " msetThreshold "
201                 + mIndex.getMSetScoreThreshold() + " setThreshold "
202                 + mIndex.getSetScoreThreshold());
203         FuriaInputOBFragment reader = new FuriaInputOBFragment(dir);
204         Iterator < Document < OBFragment >> it = reader
205                 .getDocumentsFromDirectory();
206         int foundResults = 0; // only meaningful in validationMode
207         int totalDocs = 0;
208         // ********************************************************
209         // The following applies to queries that are found within n top docs
210         StaticBin1D setScoreStats = new StaticBin1D(); // statistics on sets
211         // scores.
212         StaticBin1D mSetScoreStats = new StaticBin1D(); // statistics on
213         // multi-sets scores.
214         StaticBin1D nStats = new StaticBin1D(); // statistics on n
215         StaticBin1D objectsPerSecond = new StaticBin1D(); // statistics on n
216         int maxSizeOfAppsNotFound = 0;
217         StaticBin1D maxSizeStatsOfAppsNotFound = new StaticBin1D();
218         // The following are stats that apply only for real answers found after
219         // n
220         StaticBin1D notMatchedMSet = new StaticBin1D(); // MSet score when the
221         // query is not in n
222         StaticBin1D notMatchedSet = new StaticBin1D(); // Set score when the
223         // query is not in n
224         StaticBin1D notMatchedN = new StaticBin1D(); // Position when the
225         // result is not found
226         // in n
227         int notMatchedFountAfter = 0; // amount of not matched found after n.
228         StaticBin1D completelyUnableToFindSize = new StaticBin1D(); // the guys
229         // that even
230         // after
231         // extending
232         // n can't
233         // be found
234         // The following are stats that apply only to apps that are not
235         // candidates but are within the top
236         // n results
237         StaticBin1D notMatchedMSetWithinN = new StaticBin1D(); // MSet score
238         // when the
239         // query is not
240         // in n
241         StaticBin1D notMatchedSetWithinN = new StaticBin1D(); // Set score
242         // when the
243         // query is not
244         // in n
245         // The following are stats that apply only to the apps that are not the
246         // candidate that
247         // are found after n
248         StaticBin1D notMatchedMSetAfterN = new StaticBin1D(); // MSet score
249         // when the
250         // query is not
251         // in n
252         StaticBin1D notMatchedSetAfterN = new StaticBin1D(); // Set score
253         // when the
254         // query is not
255         // in n
256         // ********************************************************
257         logger.info("# of docs in the DB: " + this.mIndex.getSize());
258         try {
259             logger.info("# of words in the DB:" + this.mIndex.getWordsSize());
260         } catch (DatabaseException d) {
261             // :)
262         }
263         logger.info("(name, luceneScore, scoreMSet, scoreSet, size)");
264         NumberFormat f = new DecimalFormat("0.000");
265         short nToUse = n;
266         int notFound = 0; // # of guys that were not even found in the list of
267                             // candidates.
268         if (this.validationMode) {
269             mIndex.setValidationMode(true);
270             nToUse = (short) (n + mIndex.getSize()); // used to get the
271             // unmatched guys and
272             // get statistics about
273             // them
274         }
275         while (it.hasNext()) {
276             Document < OBFragment > toSearch = it.next();
277             if (validationMode) {
278                 if (mIndex.shouldSkipDoc(toSearch)) {
279                     logger.info("Validation mode: skipping:"
280                             + toSearch.getName());
281                     continue;
282                 }
283             }
284             if (toSearch.size() >= FuriaChanConstants.MIN_DOC_SIZE) {
285                 totalDocs++;
286                 long prevTime = System.currentTimeMillis();
287 
288                 List < ResultCandidate > result = mIndex.search(toSearch, k, r,
289                         nToUse);
290                 float time = (float) (System.currentTimeMillis() - prevTime)
291                         / (float) 1000;
292                 logger.info("|| Match for " + toSearch.getName() + " sec:"
293                         + time + " MSet: " + toSearch.multiSetSize() + " Set:"
294                         + toSearch.size());
295                 if (time > 0) {
296                     objectsPerSecond.add((float) toSearch.size() / time);
297                 }
298                 Iterator < ResultCandidate > it2 = result.iterator();
299                 int nth = 1;
300                 boolean found = false;
301 
302                 logger.debug("Total results:" + result.size());
303                 while (it2.hasNext() && nth <= this.n) {
304                     ResultCandidate resultCandidate = it2.next();
305                     String pre = "";
306                     // hightlight the matched result.
307                     if (validationMode
308                             && resultCandidate.getDocumentName().equals(
309                                     toSearch.getName())) {
310                         foundResults++;
311                         setScoreStats.add(resultCandidate.getNaiveScoreSet());
312                         mSetScoreStats.add(resultCandidate.getNaiveScoreMSet());
313                         nStats.add(nth);
314                         found = true;
315                         pre = "<<";
316                     } else if (validationMode) {
317                         notMatchedMSetWithinN.add(resultCandidate
318                                 .getNaiveScoreMSet());
319                         notMatchedSetWithinN.add(resultCandidate
320                                 .getNaiveScoreSet());
321                     }
322 
323                     logger.info(pre + resultCandidate.toString());
324 
325                     nth++;
326                 }
327 
328                 // check if the item was found
329                 if (validationMode && !found) {
330                     if (maxSizeOfAppsNotFound < toSearch.size()) {
331                         maxSizeOfAppsNotFound = toSearch.size();
332                     }
333                     maxSizeStatsOfAppsNotFound.add(toSearch.size());
334                     boolean found2 = false;
335                     while (it2.hasNext()) {
336                         ResultCandidate resultCandidate = it2.next();
337                         String docName = resultCandidate.getDocumentName();
338                         // hightlight the matched result.
339                         if (docName.equals(toSearch.getName())) {
340                             found2 = true;
341                             notMatchedMSet.add(resultCandidate
342                                     .getNaiveScoreMSet());
343                             notMatchedSet.add(resultCandidate
344                                     .getNaiveScoreSet());
345                             notMatchedN.add(nth);
346                             logger.info(":(:(:( Found! pos: " + nth + " "
347                                     + resultCandidate.toString());
348                             break;
349                         } else {
350                             notMatchedMSetAfterN.add(resultCandidate
351                                     .getNaiveScoreMSet());
352                             notMatchedSetAfterN.add(resultCandidate
353                                     .getNaiveScoreSet());
354                         }
355                         nth++;
356                     }
357                     if (!found2) {
358                         completelyUnableToFindSize.add(toSearch.size());
359                         logger.info(":(:(:(not found :( ");
360                         notFound++;
361                     }
362                 }
363             } else {
364                 logger.warn(toSearch.getName()
365                         + " ignored because it is too small");
366             }
367 
368         }
369         float result = ((float) foundResults / (float) totalDocs);
370         // validationMode's summary
371         if (validationMode) {
372             
373 
374             printStats("MSet. Mean: ", mSetScoreStats);
375             printStats("Set. Mean: ", setScoreStats);
376             printStats("N. Mean: ", nStats);
377             printStats("OBs per sec: ", objectsPerSecond);
378             printStats("OBs not found (size). Mean: ",
379                     maxSizeStatsOfAppsNotFound);
380 
381             printStats("Not matched (within N) MSet. Mean: ",
382                     notMatchedMSetWithinN);
383             printStats("Not matched (within N) Set. Mean: ",
384                     notMatchedSetWithinN);
385             printStats("Not matched (after N) MSet. Mean: ",
386                     notMatchedMSetAfterN);
387             printStats("Not matched (after N) Set. Mean: ", notMatchedSetAfterN);
388             printStats(":(:(:(MSet. Mean: ", notMatchedMSet);
389             printStats(":(:(:(Set. Mean: ", notMatchedSet);
390             printStats(":(:(:(Nth. Mean: ", notMatchedN);
391             printStats("Not in the results", completelyUnableToFindSize);
392             logger.info("Not found count: " + notFound);
393             logger
394             .info("*** FuriaPrecision: (% of programs found in the first n documents) "
395                     + result + " " + foundResults + " of " + totalDocs);
396 
397           
398         }
399         return result;
400     }
401 
402     private void printStats(String msg, StaticBin1D stats) {
403         logger.info(msg + " " + stats.mean() + " StdDev: "
404                 + stats.standardDeviation() + " min: " + stats.min() + " max: "
405                 + stats.max());
406     }
407 
408     /**
409      * Reads a String from the given file.
410      * @param file
411      *                File to Read
412      * @return A String representation of the file
413      * @throws IOException
414      *                 If there is an IO error
415      */
416     public static String readString(final File file) throws IOException {
417         final StringBuilder res = new StringBuilder();
418         final BufferedReader metadata = new BufferedReader(new FileReader(file));
419         String r = metadata.readLine();
420         while (r != null) {
421             res.append(r);
422             r = metadata.readLine();
423         }
424         metadata.close();
425         return res.toString();
426     }
427 
428     /**
429      * Utility class for creating the given directory
430      * @param directory
431      *                creates the given directory
432      * @throws IOException
433      *                 If the directory cannot be created
434      */
435     private void directoryCreation(File directory) throws IOException {
436         directory.mkdirs();
437         OBAsserts.chkFileExists(directory);
438     }
439 
440     /**
441      * A convenience method that creates an OBSearch index optimized for our
442      * distance function.
443      * @param folder
444      * @return an optimized index
445      */
446     protected IndexShort < OBFragment > createIndex(File folder)
447             throws IOException, DatabaseException {
448 
449         KMeansPPPivotSelector < OBFragment > ps = new KMeansPPPivotSelector < OBFragment >(
450                 new AcceptAll < OBFragment >());
451         ps.setRetries(1);
452 
453         return new UnsafePPTreeShort < OBFragment >(folder, (short) 30,
454                 (byte) 12, (short) 0,
455                 (short) (FuriaChanConstants.MAX_NODES_PER_FRAGMENT * 2), ps,
456                 OBFragment.class);
457     }
458 
459     public void setR(short r) {
460         this.r = r;
461     }
462 
463     public void setN(short n) throws OBException {
464         OBAsserts.chkAssert(n > 0, "n should be greater than 0");
465         this.n = n;
466     }
467 
468     public void setK(byte k) {
469         this.k = k;
470     }
471 
472     public byte getK() {
473         return k;
474     }
475 
476     public short getR() {
477         return r;
478     }
479 
480     public short getN() {
481         return n;
482     }
483 
484     public void setValidationMode(boolean validationMode) {
485         this.validationMode = validationMode;
486     }
487 
488     public float getMSetScoreThreshold() {
489         return mIndex.getMSetScoreThreshold();
490     }
491 
492     public float getSetScoreThreshold() {
493         return mIndex.getSetScoreThreshold();
494     }
495 
496     public void setMSetScoreThreshold(float setScoreThreshold) {
497         mIndex.setMSetScoreThreshold(setScoreThreshold);
498     }
499 
500     public void setSetScoreThreshold(float setScoreThreshold) {
501         mIndex.setSetScoreThreshold(setScoreThreshold);
502     }
503 
504 }