1 package org.kit.furia;
2
3 import org.ajmm.obsearch.OB;
4
5 import java.io.IOException;
6 import java.util.List;
7 import org.ajmm.obsearch.Index;
8 import org.ajmm.obsearch.exception.AlreadyFrozenException;
9 import org.ajmm.obsearch.exception.IllegalIdException;
10 import org.ajmm.obsearch.exception.OBException;
11 import org.ajmm.obsearch.exception.OutOfRangeException;
12 import org.ajmm.obsearch.exception.UndefinedPivotsException;
13 import org.apache.lucene.index.CorruptIndexException;
14 import org.kit.furia.exceptions.IRException;
15
16 import com.sleepycat.je.DatabaseException;
17
18 /*
19 Furia-chan: An Open Source software license violation detector.
20 Copyright (C) 2007 Kyushu Institute of Technology
21
22 This program is free software: you can redistribute it and/or modify
23 it under the terms of the GNU General Public License as published by
24 the Free Software Foundation, either version 3 of the License, or
25 (at your option) any later version.
26
27 This program is distributed in the hope that it will be useful,
28 but WITHOUT ANY WARRANTY; without even the implied warranty of
29 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
30 GNU General Public License for more details.
31
32 You should have received a copy of the GNU General Public License
33 along with this program. If not, see <http://www.gnu.org/licenses/>.
34 */
35
36 /**
37 * IRIndex holds the basic functionality for an Information Retrieval system
38 * that works on OB objects (please see obsearch.berlios.de). By using a
39 * distance function d, we transform the queries in terms of the closest
40 * elements that are in the database, and once this transformation is performed,
41 * we utilize an information retrieval system to perform the matching. Because
42 * our documents are multi-sets, the distribution of OB objects inside a
43 * document is taken into account. So, instead of matching a huge syntax tree of
44 * for example, music, we cut a song into pieces, match the pieces and then the
45 * overall finger-print of the multi-set of OB objects is matched.
46 * @author Arnoldo Jose Muller Molina
47 * @since 0
48 */
49 public interface IRIndex < O extends OB > {
50
51 /**
52 * Inserts a new document into the database.
53 * @param document
54 * The document to be inserted.
55 * @throws IRException
56 * If something goes wrong with the IR engine or with
57 * OBSearch.
58 */
59 void insert(Document < O > document) throws IRException;
60
61 /**
62 * Deletes the given string document from the database. If more than one
63 * documents have the same name, all the documents will be erased.
64 * @return The number of documents deleted.
65 * @throws IRException
66 * If something goes wrong with the IR engine or with
67 * OBSearch.
68 */
69 int delete(String documentName) throws IRException;
70
71 /**
72 * Returns the underlying OBSearch index.
73 * @return the underlying OBSearch index.
74 */
75 Index < O > getIndex();
76
77 /**
78 * Freezes the index. From this point data can be inserted, searched and
79 * deleted. The index might deteriorate at some point so every once in a
80 * while it is a good idea to rebuild the index. This method will also
81 * @throws IRException
82 * If something goes wrong with the IR engine or with
83 * OBSearch.
84 */
85 void freeze() throws IRException;
86
87 /**
88 * Closes the databases. You *should* close the databases after using an
89 * IRIndex.
90 * @throws IRException
91 * If something goes wrong with the IR engine or with
92 * OBSearch.
93 */
94 void close() throws IRException;
95
96 /**
97 * Returns the number of documents stored in this index.
98 * @return the number of documents stored in this index.
99 */
100 int getSize();
101
102 /**
103 * Returns true if the document corresponding to x's name exists in the DB.
104 * This method is intended to be used in validation mode only.
105 * @param x
106 * @return true if the DB does not contain a document with name x.getName()
107 */
108 boolean shouldSkipDoc(Document<O> x) throws IOException;
109
110 /**
111 * The M-set score threshold is the minimum naive score for multi-sets
112 * that the index will accept.
113 * @return Returns the current M-set score threshold.
114 */
115 float getMSetScoreThreshold();
116
117 /**
118 * The M-set score threshold is the minimum naive score for multi-sets
119 * that the index will accept.
120 * @param setScoreThreshold the new threshold
121 */
122 void setMSetScoreThreshold(float setScoreThreshold);
123
124 /**
125 * * The Set score threshold is the minimum naive score for Sets
126 * that the index will accept.
127 * @return Returns the current Set score threshold.
128 */
129 float getSetScoreThreshold();
130 /**
131 * The Set score threshold is the minimum naive score for Sets
132 * that the index will accept.
133 * @param setScoreThreshold the new threshold
134 */
135 void setSetScoreThreshold(float setScoreThreshold);
136
137 /**
138 * Returns the count different words that
139 * are used by the documents indexed.
140 * @return the count different words that
141 * are used by the documents indexed.
142 */
143 int getWordsSize() throws DatabaseException;
144
145 /**
146 * Tells whether or not the index is in validation mode.
147 * In validation mode we assume that documents with the same name are equal.
148 * This helps us to add additional statistics on the performance of the scoring technique.
149 * @return true if this index is in validation mode.
150 */
151 boolean isValidationMode();
152 /**
153 * Sets whether or not the index is in validation mode.
154 * In validation mode we assume that documents with the same name are equal.
155 * This helps us to add additional statistics on the performance of the scoring technique.
156 * @param validationMode The new validation mode.
157 * */
158 void setValidationMode(boolean validationMode);
159 }