require 'java'

import java.io.StringReader


Dir.glob(File.expand_path File.dirname(__FILE__) + '/java-lib/*.jar').each { |jar| $CLASSPATH <<  jar }


import org.apache.lucene.index.IndexWriter
import org.apache.lucene.analysis.Analyzer
import org.apache.lucene.analysis.StopFilter
import org.apache.lucene.analysis.LowerCaseFilter
import org.apache.lucene.analysis.ngram.NGramTokenizer
import org.apache.lucene.analysis.tokenattributes.TermAttribute
import org.apache.lucene.store.FSDirectory
import org.apache.lucene.index.IndexReader
import org.apache.lucene.search.IndexSearcher
import org.apache.lucene.search.BooleanQuery
import org.apache.lucene.search.BooleanClause
import org.apache.lucene.search.TermQuery
import org.apache.lucene.index.Term
import org.apache.lucene.search.TopScoreDocCollector

import org.apache.lucene.document.Document
import org.apache.lucene.document.Field


module Exilis
    
	# Lucene analyzer that lower-cases text and cuts it into 4-character n-grams,
	# optionally dropping the n-grams listed as stop words for a given field.
	class NgramParser < Analyzer
		# Hash mapping a field name (e.g. "EN.t") to a list of n-grams to drop.
		# Assigned by callers through `stop=`; nil or a missing/empty entry
		# means "no stop filtering" for that field.
		attr_accessor :stop

		# Builds the token stream for one field.
		#
		# fieldName - name of the Lucene field being analyzed
		# reader    - java.io.Reader supplying the text
		#
		# Returns a LowerCaseFilter over 4-grams, wrapped in a StopFilter when
		# stop words are configured for fieldName.
		def tokenStream(fieldName, reader)
			grams = LowerCaseFilter.new(NGramTokenizer.new(reader, 4, 4))
			# @stop may never have been assigned, so guard before indexing into it.
			stopWords = @stop && @stop[fieldName]
			return grams if stopWords.nil? || stopWords.empty?
			StopFilter.new(true, grams, java::util::HashSet.new(stopWords))
		end

		# Tokenizes phrase as if it were the text field of the given language.
		#
		# fieldName - language prefix; '.t' is appended to form the field name
		# phrase    - String to split into n-grams
		#
		# Returns a Hash of n-gram => number of occurrences in phrase.
		def terms(fieldName, phrase)
			stream = tokenStream(fieldName + '.t', StringReader.new(phrase))
			begin
				termAtt = stream.addAttribute(TermAttribute.java_class)
				stream.reset
				counts = {}
				while stream.incrementToken
					gram = termAtt.term
					counts[gram] = (counts[gram] || 0) + 1
				end
				stream.end
				counts
			ensure
				# Release the stream even when tokenizing raises.
				stream.close
			end
		end
	end
	
	# Writes translation units into a Lucene index, one document per unit.
	class ExilisWriter

		# Opens (or creates) the index located at destDir.
		#
		# destDir - path of the index directory
		# stop    - Hash of field name => stop n-grams, handed to the analyzer
		# isNew   - true to create a fresh index; forced to true when destDir
		#           does not exist yet
		def initialize(destDir,stop={},isNew=false)
			isNew = true unless File.exist?(destDir)
			parser = NgramParser.new
			parser.stop = stop
			@writer = IndexWriter.new(FSDirectory.open(java.io.File.new(destDir)), parser, isNew, IndexWriter::MaxFieldLength::UNLIMITED)
			@count = 0
		end

		# Adds one unit to the index.
		#
		# tu - object exposing `props` (Hash of unit-level properties) and
		#      `variants` (Hash of language => variant with `props` and `text`)
		#
		# Field layout:
		#   ".p.<name>"      unit property, stored only
		#   "<LANG>.p.<name>" variant property, stored only
		#   "<LANG>.t"       variant text, stored, analyzed, with term vectors
		def write(tu)
			doc = org.apache.lucene.document.Document.new
			tu.props.each_pair do |name, val|
				doc.add Field.new(".p." + name, val, Field::Store::YES, Field::Index::NO, Field::TermVector::NO)
			end
			tu.variants.each_pair do |lang, tuv|
				code = lang.upcase
				# Variant-level properties come from the variant itself; iterating
				# tu.props here would only duplicate the unit-level properties
				# under every language prefix.
				tuv.props.each_pair do |name, val|
					doc.add Field.new("#{code}.p." + name, val, Field::Store::YES, Field::Index::NO, Field::TermVector::NO)
				end
				doc.add Field.new("#{code}.t", tuv.text, Field::Store::YES, Field::Index::ANALYZED_NO_NORMS, Field::TermVector::YES)
			end
			@writer.addDocument(doc)
			@count += 1
		end

		# Closes the underlying index writer.
		def close
			@writer.close
		end

	end
    
    # Read-only access to an index: enumerates stored documents and converts
    # them back into Spongiae multilingual units.
    class ExilisExplorer
        # Opens the index stored in the directory destDir.
        def initialize(destDir)
            @reader = IndexReader.open(FSDirectory.open(java.io.File.new(destDir)))
        end

        # Rebuilds a unit from a stored document.
        #
        # doc     - document whose `fields` respond to `name` and `stringValue`
        # id      - identifier passed to Spongiae::Unit::Multilingual.new
        # targets - nil to keep every language, or a list of language codes;
        #           variants of other languages are skipped
        #
        # Field names are interpreted as:
        #   ".p.<name>"       unit property
        #   "<LANG>.t"        variant text
        #   "<LANG>.p.<name>" variant property
        #
        # Returns the populated Spongiae::Unit::Multilingual.
        def toEntry(doc,id,targets=nil)
            res = Spongiae::Unit::Multilingual.new(id,{})
            doc.fields.each do |field|
                case field.name
                when /^\.p\.(.+)$/
                    res.props[$1] = field.stringValue
                when /^([\w\-]+)\.t$/
                    # Bind the capture right away; $1 is a global that any later
                    # regexp match would overwrite.
                    lang = $1
                    next unless targets.nil? || targets.any? { |t| t === lang }
                    known = res.variants[lang]
                    # Keep properties collected earlier for this language, if any.
                    res.variants[lang] = Spongiae::Unit::Variant.new(lang, known ? known.props : {}, field.stringValue)
                when /^([\w\-]+)\.p\.(.+)$/
                    lang, prop = $1, $2
                    next unless targets.nil? || targets.any? { |t| t === lang }
                    # Placeholder text until the "<LANG>.t" field is seen.
                    res.variants[lang] ||= Spongiae::Unit::Variant.new(lang, {}, '--temp--')
                    res.variants[lang].props[prop] = field.stringValue
                end
            end
            res
        end

        # Yields every non-deleted document as a unit (see toEntry), using the
        # document number as id. Without a block, returns an Enumerator.
        def findAll(targets=nil)
            return enum_for(:findAll, targets) unless block_given?
            (0 .. @reader.maxDoc - 1).each do |i|
                next if @reader.deleted?(i)
                yield toEntry(@reader.document(i), i, targets)
            end
        end

        # Number of documents reported by the reader's numDocs.
        def count() @reader.numDocs end

        # Distinct language prefixes of the fields indexed with term vectors,
        # i.e. the part of each field name before its first dot.
        def languages
            names = @reader.getFieldNames(IndexReader::FieldOption::INDEXED_WITH_TERMVECTOR)
            names.map { |name| name[/^([\w\-]+?)\./, 1] }.compact.uniq
        end

        # Closes the underlying index reader.
        def close
            @reader.close
        end
    end
    
	# Explorer that can additionally run n-gram based fuzzy searches.
	class ExilisSearcher < ExilisExplorer
		# Opens the index in destDir and prepares the query analyzer. Stop
		# lists are loaded from "<destDir>/stop.lists" when that file exists.
		def initialize(destDir)
			super(destDir)
			@searcher = IndexSearcher.new(@reader)
			@analyzer = NgramParser.new
			stopFile = destDir + "/stop.lists"
			# Default to an empty table so the analyzer never sees nil stop lists.
			@analyzer.stop = File.exist?(stopFile) ? Exilis::readStopWords(stopFile) : {}
		end

		# Finds units whose text in the source language shares n-grams with phrase.
		#
		# phrase   - text to look for
		# lang     - "SRC" to return all languages, or "SRC:T1,T2" to return
		#            only the listed target languages plus the source
		# minScore - percentage; half of it is the share of the phrase's
		#            distinct n-grams that a hit must contain
		# maxCount - maximum number of hits collected
		#
		# Calls the block with each hit converted by toEntry, best score first.
		# Without a block, returns an Enumerator.
		def fuzzySearch(phrase,lang,minScore,maxCount,&f)
			return enum_for(:fuzzySearch, phrase, lang, minScore, maxCount) unless f

			if lang =~ /^(.+):(.+)/
				source = $1.upcase
				targets = $2.upcase.split(/,/) << source
			else
				source = lang.upcase
				targets = nil
			end

			field = source + '.t'
			query = BooleanQuery.new
			@analyzer.terms(source, phrase).each_key do |gram|
				query.add(TermQuery.new(Term.new(field, gram)), BooleanClause::Occur::SHOULD)
			end
			minShould = query.clauses.count * minScore / 200
			query.setMinimumNumberShouldMatch(minShould) if minShould > 1  # minScore as %, divided by 2.

			topCollector = TopScoreDocCollector.create(maxCount, true)
			@searcher.search(query, topCollector)

			topCollector.topDocs.scoreDocs.each do |sdoc|
				# Load the stored document and convert it with the inherited toEntry.
				f.call toEntry(@reader.document(sdoc.doc), sdoc.doc, targets)
			end
		end
	end

end
