Problems executing a Lucene BooleanQuery on a huge file

I have a problem with my huge N-Quads file (about 4,000 lines) when I execute a BooleanQuery. I try a query like this:

Query query1 = new TermQuery(new Term(FIELD_CONTENTS, "Albania"));
Query query2 = new TermQuery(new Term(FIELD_CONTENTS, "Hitchcock"));

BooleanQuery booleanQuery = new BooleanQuery();
booleanQuery.add(query1, BooleanClause.Occur.MUST);
booleanQuery.add(query2, BooleanClause.Occur.MUST);

The query works correctly when the words I search for appear before line 780 of the file; for words that appear after line 780, the search returns no hits.

This is a snippet of my nquad file:

<http://dbpedia.org/resource/A_Clockwork_Orange> <http://dbpedia.org/ontology/numberOfPages> "192"^^<http://www.w3.org/2001/XMLSchema#positiveInteger> <http://en.wikipedia.org/wiki/A_Clockwork_Orange?oldid=606117686#absolute-line=12> .
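
To narrow this down, a check like the one below should show whether a failing term ever made it into the index at all (a rough sketch; it reuses the FIELD_CONTENTS and INDEX_DIRECTORY constants and the Lucene 2.x API from the main class further down):

Directory directory = FSDirectory.getDirectory(INDEX_DIRECTORY);
IndexReader reader = IndexReader.open(directory);

// docFreq == 0 would mean the term was never indexed at all,
// i.e. the problem is at indexing time rather than at query time.
System.out.println("docFreq(Albania)   = "
        + reader.docFreq(new Term(FIELD_CONTENTS, "Albania")));
System.out.println("docFreq(Hitchcock) = "
        + reader.docFreq(new Term(FIELD_CONTENTS, "Hitchcock")));

reader.close();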

I wrote a custom analyzer to distinguish the tokens:

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

class TestAnalyzer1 extends Analyzer {
    public static final String[] TEST_STOP_WORDS = { "http", "https",
            "resource", "foaf/0.1", "dbpedia.org", "en.wikipedia.org",
            "xmlns.com", "purl.org", "elements/1.1",
            "www.w3.org/2001/XMLSchema", "www.w3.org/1999/02/22-rdf",
            "www.w3.org/2003/01", "oldid", "wiki" };

    @SuppressWarnings("rawtypes")
    private Set stopWords = StopFilter.makeStopSet(TEST_STOP_WORDS);

    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream ts = new StandardTokenizer(reader);
        ts = new StandardFilter(ts);
        ts = new StopFilter(ts, stopWords);
        return ts;
    }
}
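
To see which tokens this analyzer actually produces for one line of the file, a small test like the following can print them (a rough sketch; it assumes the older TokenStream.next() / Token.termText() API that matches the Lucene version used by the rest of my code, and needs java.io.StringReader and org.apache.lucene.analysis.Token as additional imports):

Analyzer analyzer = new TestAnalyzer1();
String line = "<http://dbpedia.org/resource/A_Clockwork_Orange> "
        + "<http://dbpedia.org/ontology/numberOfPages> \"192\" .";

// Run one line through the analyzer and print every token that survives the filters.
TokenStream ts = analyzer.tokenStream("contents", new StringReader(line));
Token token;
while ((token = ts.next()) != null) {
    System.out.println(token.termText());
}
ts.close();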

This is the main class:

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;

@SuppressWarnings("deprecation")
public class TestPreFinal {

    public static final String FILES_TO_INDEX_DIRECTORY = "filesToIndex_1";
    public static final String INDEX_DIRECTORY = "indexDirectory";

    public static final String FIELD_PATH = "path";
    public static final String FIELD_CONTENTS = "contents";

    public static void main(String[] args) throws CorruptIndexException,
            LockObtainFailedException, IOException, ParseException {

        long startTime = System.currentTimeMillis();

        Analyzer analyzer = new TestAnalyzer1();
        IndexWriter indexWriter = new IndexWriter(INDEX_DIRECTORY, analyzer,
                true);

        File dir = new File(FILES_TO_INDEX_DIRECTORY);
        File[] files = dir.listFiles();

        for (File file : files) {
            Reader reader = new FileReader(file);
            Document document = new Document();
            String path = file.getCanonicalPath();

            Field fieldPath = new Field(FIELD_PATH, path, Field.Store.YES,
                    Field.Index.UN_TOKENIZED);
            Field fieldContents = new Field(FIELD_CONTENTS, reader,
                    Field.TermVector.WITH_POSITIONS_OFFSETS);

            document.add(fieldPath);
            document.add(fieldContents);

            indexWriter.addDocument(document);
        }

        indexWriter.commit();
        indexWriter.close();

        Directory directory = FSDirectory.getDirectory(INDEX_DIRECTORY);
        IndexSearcher indexSearcher = new IndexSearcher(directory);
        IndexReader indexReader = IndexReader.open(directory);

        Query query1 = new TermQuery(new Term(FIELD_CONTENTS, "Albania"));
        Query query2 = new TermQuery(new Term(FIELD_CONTENTS, "Hitchcock"));

        BooleanQuery booleanQuery = new BooleanQuery();
        booleanQuery.add(query1, BooleanClause.Occur.MUST);
        booleanQuery.add(query2, BooleanClause.Occur.MUST);

        Hits hits = indexSearcher.search(booleanQuery);
        @SuppressWarnings({ "unchecked" })
        Iterator<Hit> it = hits.iterator();
        TermFreqVector tfv = null;

        while (it.hasNext()) {
            Hit hit = it.next();
            Document document = hit.getDocument();
            String path = document.get(FIELD_PATH);
            System.out.println("Hit: " + path);
        }

        for (int i = 0; i < hits.length(); i++) {
            // Use the document id of the i-th hit, not the loop index itself,
            // as the document number passed to getTermFreqVector.
            tfv = indexReader.getTermFreqVector(hits.id(i), FIELD_CONTENTS);
            System.out.println(tfv);
        }

    }
}
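
One setting I have not verified yet: as far as I understand, the IndexWriter constructor used above keeps the default maximum field length (10,000 tokens per field in Lucene 2.x), so anything tokenized after that limit would be silently dropped from the index. If that were the cause, raising the limit before adding documents would look like this (a sketch, not tested):

IndexWriter indexWriter = new IndexWriter(INDEX_DIRECTORY, analyzer, true);
// Index every token of the contents field instead of stopping at the default limit.
indexWriter.setMaxFieldLength(Integer.MAX_VALUE);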

I do not know what else to try. Can you help, please? Thanks in advance.
