1 package org.kit.furia.io;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileReader;
6 import java.io.FilenameFilter;
7 import java.io.IOException;
8 import java.util.Iterator;
9 import java.util.NoSuchElementException;
10
11 import org.ajmm.obsearch.OB;
12 import org.ajmm.obsearch.exception.OBException;
13 import org.apache.log4j.Logger;
14 import org.kit.furia.Document;
15
16 /*
17 Furia-chan: An Open Source software license violation detector.
18 Copyright (C) 2007 Kyushu Institute of Technology
19
20 This program is free software: you can redistribute it and/or modify
21 it under the terms of the GNU General Public License as published by
22 the Free Software Foundation, either version 3 of the License, or
23 (at your option) any later version.
24
25 This program is distributed in the hope that it will be useful,
26 but WITHOUT ANY WARRANTY; without even the implied warranty of
27 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
28 GNU General Public License for more details.
29
30 You should have received a copy of the GNU General Public License
31 along with this program. If not, see <http://www.gnu.org/licenses/>.
32 */
33
34 /**
35 * AbstractFuriaInput is in charge of reading fragment files and creating
36 * documents out of them.
37 * @author Arnoldo Jose Muller Molina
38 * @since 0
39 */
40
41 public abstract class AbstractFuriaInput < O extends OB > {
42
43 private static final Logger logger = Logger.getLogger(AbstractFuriaInput.class);
44
45 private File directory;
46
47 /**
48 * Creates a new fragment file reader based on the given directory.
49 * @param directory
50 */
51 public AbstractFuriaInput(File directory){
52 this.directory = directory;
53 }
54
55 /**
56 * Reads and creates an O object from the given string.
57 * @param data
58 * The string to be parsed
59 * @return an O object that was created from data
60 * @throws OBException if something goes wrong when parsing the data.
61 */
62 protected abstract O readObjectFromStringLine(String data) throws OBException;
63
64 /**
65 * The name of the file that holds the fragments (words) inside a directory.
66 */
67 public static final String fragmentFileName = "fragments";
68
69 /**
70 * This method receives a directory and returns an iterator that will lazily
71 * create documents from the given directory. The directory is composed of
72 * directories in which a file called "fragments" was previously created.
73 * @param directory
74 * that will be processed
75 * @throws IOException
76 * If the given directory does not exist.
77 * @return An iterator that will return one by one all the documents found
78 * in the given directory.
79 */
80 public Iterator < Document < O >> getDocumentsFromDirectory()
81 throws IOException {
82 if (!directory.exists()) {
83 throw new IOException("Directory does not exist: " + directory);
84 }
85 return new FuriaInputIterator(directory);
86 }
87
88 /**
89 * Parses a file that is in the furia-chan fragment file format: "#" starts
90 * a comment and it is ignored. Every object is a string separated by a
91 * newline. The subclass knows how to interpret this line, and an
92 * appropriate O object will be generated from this line.
93 * @param fragments
94 * A file in which fragment files can be found.
95 * @param id
96 * The id that the document will hold.
97 * @return A document of O objects created from the given file.
98 * @throws IOException
99 * If fragments does not exist, or any other error occurs.
100 */
101 public Document < O > getDocument(String id, File fragments)
102 throws IOException, OBException {
103 if (!fragments.exists()) {
104 throw new IOException("File does not exist: " + fragments);
105 }
106 Document < O > doc = new Document < O >(id);
107 BufferedReader r = new BufferedReader(new FileReader(fragments));
108 String re = r.readLine();
109 while (re != null) {
110 if (isParsableLine(re)) {
111 String[] tuple = re.split("\t");
112 int multiplicity = Integer.parseInt(tuple[0]);
113 O word = readObjectFromStringLine(tuple[1]);
114 doc.setWord(word,multiplicity);
115 }
116 re = r.readLine();
117 }
118 r.close();
119 return doc;
120 }
121
122 /**
123 * Returns true if the given line is not null or if it is not a comment.
124 * @return true if the given line can be parsed.
125 */
126 public boolean isParsableLine(final String line) {
127 return !("".equals(line.trim()) || (line.startsWith("#")));
128 }
129
130 /**
131 * Iterator class that creates Documents from the given directory.
132 * @author Arnoldo Jose Muller Molina
133 *
134 */
135 private class FuriaInputIterator implements Iterator < Document < O >> {
136
137 private class FragmentsFileFilter implements FilenameFilter{
138
139 public boolean accept(File dir, String name) {
140 return name.equals(fragmentFileName);
141 }
142 }
143
144 private FragmentsFileFilter fileFilter = new FragmentsFileFilter();
145
146 /**
147 * The documents that will be lazily processed.
148 */
149 private File[] documents;
150
151 /**
152 * The current index that will be processed.
153 */
154 private int i;
155
156 /**
157 * Builds an iterator of applications. If the given directory
158 * has a "fragments" file, then the program works in Single app mode.
159 * Otherwise, we run in directory of applications mode.
160 * @param directory
161 */
162 FuriaInputIterator(File directory) {
163 if(directoryOfDirectoriesMode(directory)){
164 documents = directory.listFiles();
165 }else{
166 documents = new File[1];
167 documents[0] = directory;
168 }
169
170 i = 0;
171 moveTapeToNextValidDocument();
172 }
173
174 /**
175 * Returns true if the given directory does not have a fragments file.
176 * This means that we will operate on a directory of directories.
177 * @param directory
178 * @return
179 */
180 private boolean directoryOfDirectoriesMode(File directory){
181 File [] all = directory.listFiles(fileFilter);
182 return all.length !=1;
183 }
184
185 /**
186 * Moves to the next valid document.
187 */
188 private void moveTapeToNextValidDocument() {
189 while (i < documents.length) {
190 if (documents[i].isDirectory()) {
191 File data = new File(documents[i], fragmentFileName);
192 if (data.exists()) {
193 break;
194 }
195 }
196 i++;
197 }
198 }
199
200 /**
201 * Returns the next document.
202 * @return The next document.
203 * @throws NoSuchElementException
204 * if {@link #hasNext()} == false or if the current
205 * element could not be processed.
206 */
207 public Document < O > next() {
208 if (!hasNext()) {
209 throw new NoSuchElementException("No more elements!");
210 }
211 File data = new File(documents[i], fragmentFileName);
212 String name = documents[i].getName();
213 Document < O > res = null;
214 try {
215 res = getDocument(name, data);
216 i++;
217 moveTapeToNextValidDocument();
218 } catch (Exception e) {
219 if(logger.isDebugEnabled()){
220 logger.debug(e);
221 }
222 throw new NoSuchElementException(res.toString());
223 }
224 return res;
225 }
226
227 /**
228 * The remove operation does not make sense in this Iterator. This
229 * method does not do anything.
230 */
231 public void remove() {
232 assert false;
233 }
234
235 public boolean hasNext() {
236 return i < documents.length;
237 }
238 }
239 }