#! /usr/bin/ruby

require 'getoptlong'
# Table of accepted command-line switches: each row lists the option names
# (long name first, then aliases) followed by its GetoptLong argument mode.
options = {}
switches = [
    [ '--query-file', '--file', '-f', GetoptLong::OPTIONAL_ARGUMENT ],
    [ '--query-string', '--string', '-S', GetoptLong::OPTIONAL_ARGUMENT ],
    [ '--collection', '--coll', '-c', GetoptLong::OPTIONAL_ARGUMENT ],
    [ '--docname', '--doc', '-d', GetoptLong::OPTIONAL_ARGUMENT ],
    [ '--segmenter', '--seg', '-s', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--config-file', '--conf', '-C', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--output-file', '--of', '-o', GetoptLong::OPTIONAL_ARGUMENT ],
    [ '--output-encoding', '--enc', '-e', GetoptLong::OPTIONAL_ARGUMENT ],
    [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ]
]
# Store every switch found on the command line under the name GetoptLong
# reports for it; the rest of the script looks options up by their long name.
GetoptLong.new(*switches).each do |name, value|
    options[name] = value
end

# The second positional argument is the language specification. It is either a
# bare source language or "source:target1,target2,..."; in the latter case the
# source part and the list of targets are split out.
lang = ARGV[1]
srcLang = lang
traLangs = nil
if srcLang =~ /^([\w\-]+):([\w\-\,]+)$/
    srcLang, traLangs = $1, $2.split(',')
end

# Both positional arguments (database and language) are mandatory.
if ARGV.size < 2
    puts "Syntax: ",
         "\t#{__FILE__} DB-Name(:user:password)?(:host:port)? source-lang(:target-langs?) [--collection coll-name]? [--doc docName]? [--query-string|-S] segment",
         "\t#{__FILE__} DB-Name(:user:password)?(:host:port)? source-lang(:target-langs?) [--collection coll-name]? [--doc docName]? [--query-file|-f] file.txt"
    exit
end

$LOAD_PATH << "#{File.dirname(__FILE__)}/../lib"                # For non-standard installation

# First positional argument: the database specification. It is removed from
# ARGV here.
db = ARGV.shift

# Configuration lookup order: an explicit --config-file, then conf/elefas.yml
# next to the installation, otherwise no file at all (all defaults).
default_conf = "#{File.dirname(__FILE__)}/../conf/elefas.yml"
conf_path = options['--config-file']
conf_path = default_conf if conf_path.nil? && File.exist?(default_conf)
if conf_path
    require 'yaml'
    config = YAML.load_file(conf_path)
else
    config = nil
end
# All defaults, but variable must not be nil. An empty YAML file does not load
# as a Hash, so that case falls back to an empty configuration as well instead
# of failing on the first config[...] lookup below.
config = {} unless config.is_a?(Hash)

require 'musci'
# A database name starting with '#' triggers registration of the configured
# 'memories' entries as aliases.
Musci::add_aliases(config['memories']) if db =~ /^\#/
Musci::def_token_policy(config['tokens-policy']) unless config['tokens-policy'].nil?

culter = nil
# Load culter module only if required. So, Culter is not mandatory to use Spongiae
#
# When --segmenter is not given on the command line, the segmenter is taken
# from the 'segmentation' entry of the configuration, which may be either:
#   * a Hash mapping a memory name to a segmenter, with an optional 'default' key;
#   * a String, in which "$DB" is replaced by the database name.
if options['--segmenter'].nil?
    seg_conf = config['segmentation']
    if seg_conf.nil?
        options['--segmenter'] = ''
        STDERR.puts "Warning: no configured segmentation, assume input file is correctly segmented"
    else
        if seg_conf.is_a? Hash
            # Lookup key: the alias without its leading '#', otherwise the part of
            # the database specification before the first ':' (the usage text
            # gives the format as DB-Name(:user:password)?(:host:port)?).
            # NOTE(review): the original lookup expression was not runnable, so
            # this key is the presumed intent -- confirm against the configuration.
            mem_name = (db =~ /^\#/) ? db[1..-1] : db.split(':').first
            options['--segmenter'] = seg_conf[mem_name]
            options['--segmenter'] = seg_conf['default'] if options['--segmenter'].nil?
        else
            options['--segmenter'] = seg_conf # String
            if options['--segmenter'] =~ /\$DB/
                if db =~ /^\#/
                    options['--segmenter'].gsub!(/\$DB/, db[1..-1])
                else
                    # From here on db is a connection object rather than a String;
                    # it is passed on in that form to the code that follows.
                    db = Musci::Connection.new(db)
                    options['--segmenter'].gsub!(/\$DB/, db.dbName)
                end
            end
        end
        STDERR.puts "From configuration, segmentation = #{options['--segmenter']}"
    end
end

# Instantiate the segmenter: the literal name "simple", or a path to an .srx
# rules file. Any other value leaves culter nil (no segmentation applied).
if options['--segmenter'] =~ /^simple$/
    require 'culter/simple'
    culter = Culter::Simple.new
elsif options['--segmenter'] =~ /\.srx$/
    require 'culter/srx'
    culter = Culter::SRX::SrxDocument.new(options['--segmenter']).segmenter(srcLang)
end


require 'elefas/read'
searcher = Elefas::ElefasSearcher.new(db, config, options['--collection'], options['--docname'])

count_units = 0
# Guarantee a 'search' section with a minimum score. A missing section or a
# missing 'min-score' key both end up with the default of 30.
config['search'] ||= {}
config['search']['min-score'] ||= 30

require 'spongiae/tmx'

# Header parameters handed to the TMX writer.
# NOTE(review): VERSION is invoked as a class method of ElefasSearcher here; if
# it is actually a constant this must be written ::VERSION -- confirm.
TMX_PARAMS = { :tool => 'Elefas Maximus', :version => Elefas::ElefasSearcher.VERSION, :srclang => srcLang }

# --output-encoding takes an optional argument, so it may be present but empty;
# treat that like an absent option rather than building the file mode "w:".
requested_enc = options['--output-encoding']
encoding = (requested_enc.nil? || requested_enc.empty?) ? 'UTF-8' : requested_enc

# Results go to "<output-file>.running" when --output-file is given, otherwise
# to standard output.
outFile = (options.key? '--output-file') ? File.open("#{options['--output-file']}.running", "w:#{encoding}") : STDOUT

# Shared end-of-run step: closes the given TMX writer and the searcher; when
# --output-file was given, also closes the "<name>.running" file and renames it
# to the requested output file name.
finish_run = lambda do |tmx_writer|
    tmx_writer.close
    searcher.close
    if options.key? '--output-file'
        outFile.close
        require 'fileutils'
        FileUtils.mv outFile.path, options['--output-file']
    end
end

# ----------------- single segment mode ----------------
# Searches one segment given on the command line and writes every match.
if !options['--query-string'].nil?
    seg = options['--query-string']
    writer = Spongiae::TMX::TmxWriter.new(outFile, TMX_PARAMS)
    config['search']['simple-search-results'] ||= 50
    searcher.fuzzySearch(seg, lang, config['search']['min-score'], config['search']['simple-search-results']) do |tu|
        writer.add tu.id, tu
        count_units += 1
    end
    writer.comment "Result: #{count_units} units found"
    finish_run.call(writer)

# ---------------- text file mode --------------------
# Reads a file unit by unit, optionally cuts each unit into phrases with the
# segmenter, searches every phrase and optionally collects match statistics.
elsif !options['--query-file'].nil?
    file = options['--query-file']
    require 'spongiae/formats/all'
    writer = Spongiae::TMX::TmxWriter.new(outFile, TMX_PARAMS)
    searcher.track_duplicates!      # multiple searches, so same segment may reappear
    config['search']['file-search-results'] ||= 5

    # Statistics are optional: everything below stays nil/false when the
    # configuration has no 'stats' section.
    stats_conf = config['stats']
    stats = nil
    limits = nil
    repetitions = nil
    rp_count = nil
    track_repetitions = false
    if stats_conf
        require 'spongiae/stats'
        # Statistics row that renders its counters as XML attributes.
        class ElefasStatsRow < Spongiae::Stats::SegmentsStatsRow
            def attr_xml() "segments='#{@segments}' words='#{@words}' characters='#{@chars}' non-blank-characters='#{@nbChars}'" end
        end

        # The list of limits is forced to start with 100 and to end with 0, with
        # one statistics row per limit. Scores are compared against the limits
        # in list order.
        limits = (stats_conf['limits'] ||= [])
        limits.unshift(100) unless limits[0] == 100
        limits << 0 unless limits[-1] == 0
        stats = limits.map { ElefasStatsRow.new }

        track_repetitions = (stats_conf['repetitions'] == true)
        if track_repetitions
            repetitions = {}
            rp_count = ElefasStatsRow.new
        end
    end

    # Pick the reader from the file extension; an unknown extension or a file
    # the format does not recognise is reported instead of raising.
    ext = $1 if file =~ /\.(\w+)$/
    handler = Spongiae::Formats::ALL[ext]
    fmt = handler.nil? ? nil : handler.sniff(file)
    if fmt.nil?
        STDERR.puts "Format #{ext} is not yet supported"
        exit
    end
    fmt.load!
    reader = fmt.create(file)

    verbose = options.key? '--verbose'
    if verbose
        read_segs = 0
        start = Time.now
        STDERR.puts "#{start} - Start analyse #{file}"
    end

    reader.read_unit do |fileUnit|
        # Without a segmenter the whole unit text is searched as one phrase.
        phrases = culter.nil? ? [ fileUnit.srcText ] : culter.cut(fileUnit.srcText)
        phrases.each do |phrase|
            next if phrase !~ /\w\w+/       # skip phrases without two consecutive word characters
            if track_repetitions
                if repetitions.key? phrase
                    repetitions[phrase] += 1
                    rp_count.add_unit(phrase)
                    next    # do not search this phrase
                else
                    repetitions[phrase] = 1
                end
            end
            writer.comment "Search #{phrase}"
            if verbose
                read_segs += 1
                STDERR.puts "#{Time.now.to_i - start.to_i} seconds - Analyzed #{read_segs} segments, #{count_units} found matches"
            end
            # 'first' stays true until one match of this phrase has been counted
            # in the statistics, so each phrase is counted at most once.
            first = true
            searcher.fuzzySearch(phrase, lang, config['search']['min-score'], config['search']['file-search-results']) do |tu|
                if first && stats
                    # The score is scaled by 100 before comparison with the limits
                    # (presumably a 0..1 fraction -- confirm against the searcher).
                    score = tu.props['score'].to_f * 100.0
                    idx = limits.index { |val| score >= val }
                    unless idx.nil?
                        stats[idx].add_unit(phrase)
                        first = false
                    end
                end
                writer.add tu.id, tu
                count_units += 1
            end
            # No match counted: the phrase goes into the last (limit 0) row.
            stats[-1].add_unit(phrase) if stats && first
        end
    end

    writer.comment "Result: #{count_units} units found"
    if stats
        limits.each_with_index do |val, idx|
            writer.comment "<stat limit='#{val}' #{stats[idx].attr_xml()} />"
        end
        writer.comment "<repetitions #{rp_count.attr_xml()} />" if track_repetitions
    end
    finish_run.call(writer)
    if verbose
        STDERR.puts "#{Time.now.to_i - start.to_i} seconds - End analyze #{file}, #{read_segs} segments read, #{count_units} units found"
    end

# ----------------- error case ----------------
else
    puts "Either query string or query file is mandatory"
    puts "Syntax: "
    puts "\t#{__FILE__} DB-Name(:user:password)?(:host:port)? source-lang(:target-langs?) [--collection coll-name]? [--doc docName]? [--query-string|-S] segment"
    puts "\t#{__FILE__} DB-Name(:user:password)?(:host:port)? source-lang(:target-langs?) [--collection coll-name]? [--doc docName]? [--query-file|-f] file.txt"
    exit
end
