#! /usr/bin/ruby

require 'getoptlong'
# Collect the command-line switches into a hash, keyed by the option name
# that GetoptLong reports for each of them.
options = {}
option_parser = GetoptLong.new(
  ['--segmenter', '--seg', '-s', GetoptLong::REQUIRED_ARGUMENT],
  ['--src-lang', '--srcLang', '-o', GetoptLong::REQUIRED_ARGUMENT],
  ['--config-file', '--conf', '-c', GetoptLong::REQUIRED_ARGUMENT]
)
option_parser.each do |name, argument|
  options[name] = argument
end


# After option parsing at least two positional arguments must remain;
# otherwise print the usage text and stop.
unless ARGV.count >= 2
  puts "Syntax: #{__FILE__} [#DatabaseName or parameters] [--conf config.yml]? [--segmenter file.srx]? [--src-lang lang]? file",
       "\t#DatabaseName : parameters are in conf/castor.yml (or parameter set with --conf)",
       "\tDatabase parameters: see conf/castor.yml for clarification about format"
  exit
end

# Make the lib directory next to this script loadable (for non-standard installation).
$LOAD_PATH.push("#{File.dirname(__FILE__)}/../lib")

# Pick the segmenter requested with --segmenter. The culter libraries are only
# required inside the branch that needs them, so Culter is not mandatory when
# no segmentation is requested. Without a matching value, culter stays nil.
culter =
  case options['--segmenter']
  when /^simple$/
    require 'culter/simple'
    Culter::Simple.new
  when /\.srx$/
    require 'culter/srx'
    srx_document = Culter::SRX::SrxDocument.new(options['--segmenter'])
    if options['--src-lang'].nil?
      # No source language given: use the 'x-unknown' segmenter and say so.
      fallback_segmenter = srx_document.segmenter('x-unknown')
      puts "SRX found, but missing source language, segmentation will be based on common rules"
      fallback_segmenter
    else
      srx_document.segmenter(options['--src-lang'])
    end
  end

# Load the configuration. The file given with --config-file takes precedence;
# otherwise conf/castor.yml next to this script is used when it exists;
# otherwise an empty configuration (all defaults) is used.
default_config_file = "#{File.dirname(__FILE__)}/../conf/castor.yml"
config_file = options['--config-file']
if config_file.nil? && File.exist?(default_config_file)
  config_file = default_config_file
end

if config_file
  require 'yaml'
  # An empty YAML file does not load as a Hash (nil or false is returned);
  # fall back to an empty configuration so later config[...] lookups work.
  config = YAML.load_file(config_file) || {}
else
  config = {} # no options, all defaults
end
 
require 'musci'
# Resolve the first positional argument into connection parameters and open
# the connection. The argument is either "#Name", looked up under the
# 'databases' key of the configuration, or a colon-separated parameter string.

# Splits a colon-separated specification into exactly seven positional
# parameters, padding with nil when fewer fields are present.
split_db_params = lambda do |spec|
  fields = spec.split(':')
  Array.new(7) { |index| fields[index] }
end

dbParamsSet = ARGV.shift
if dbParamsSet =~ /^#(\w+)/
  db_name = $1
  # The configuration may have no 'databases' section at all (for instance
  # when no configuration file was loaded), so guard the lookup.
  dbParamsSet = (config['databases'] || {})[db_name]
  case dbParamsSet
  when nil
    puts "Database #{db_name} not found in configuration file"
    exit
  when Hash
    dbParams = dbParamsSet.values_at('db', 'user', 'pass', 'host', 'port')
  when String
    dbParams = split_db_params.call(dbParamsSet)
  else
    # Neither a mapping nor a parameter string: nothing usable to connect with.
    puts "Database #{db_name} has an unsupported definition in configuration file"
    exit
  end
else
  dbParams = split_db_params.call(dbParamsSet)
end
# Indexes beyond the end of dbParams (Hash form has five entries) yield nil.
conn = Musci::Connection.new(dbParams[0], dbParams[1], dbParams[2], dbParams[3], dbParams[4], dbParams[5], dbParams[6])


# Register the input file in CR_DOCUMENTS; doc_id is taken from position 1 of
# each row yielded by the insert statement.
file = ARGV.shift
doc_id = nil
insert_document_sql = "insert into CR_DOCUMENTS(fileName) values(:file) returning doc_id"
conn.exec(insert_document_sql, { 'file' => file }) do |row|
  doc_id = row[1]
end
puts "Inserted file #{file} as #{doc_id}"

# Statements prepared once and reused for every segment: one stores a segment
# for a document, the other finds stored segments with identical contents.
insert_segment_sql = 'insert into CR_SEGMENTS(document,contents)
     values(:doc,:contents)'
st_ins_seg = conn.prepare('st_ins_seg', insert_segment_sql)

find_segment_sql = 'select * from CR_SEGMENTS where contents = :text'
st_find_seg = conn.prepare('st_find_seg', find_segment_sql)

# Maps a document identifier to the accumulated length it has in common with
# the file being imported.
matching_set = {}

# Evaluation rule defaults: count by segments unless configured otherwise.
config['eval-rule'] ||= {}
config['eval-rule']['criteria'] ||= 'segments'

# Define seg_len(text), the weight one segment contributes, according to the
# first letter of the criteria (case-insensitive):
#   s : every segment counts as 1
#   c : number of characters
#   n : number of characters once whitespace is removed
case config['eval-rule']['criteria']
when /^s/i
  def seg_len(text)
    1
  end
when /^c/i
  def seg_len(text)
    text.length
  end
when /^n/i
  def seg_len(text)
    text.gsub(/\s/, '').length
  end
else
  # Without this branch seg_len would stay undefined and the script would
  # fail with NoMethodError on the first segment, in the middle of the import.
  puts "Unknown eval-rule criteria '#{config['eval-rule']['criteria']}' (expected a value starting with s, c or n)"
  exit
end


require 'spongiae/formats/all'
# Choose the file format from the extension and build a reader for the file.
ext = file[/\.(\w+)$/, 1] # nil when the file name has no extension
# When ALL has no entry for the extension the lookup gives nil; guard it so
# the message below is printed instead of raising NoMethodError on nil.
format_entry = Spongiae::Formats::ALL[ext]
fmt = format_entry.nil? ? nil : format_entry.sniff(file)
if fmt.nil?
  puts "Format #{ext} is not yet supported"
  exit
end
fmt.load!
reader = fmt.create(file)
# Walk every unit of the file and cut it into phrases when a segmenter is
# configured (otherwise the whole source text is one phrase). For each phrase:
# add its length to every document already holding an identical segment, then
# store the phrase as a segment of the new document.
to_insert = { 'doc' => doc_id }
count_tu = 0
total_len = 0
reader.read_unit do |unit|
  phrases = culter.nil? ? [unit.srcText] : culter.cut(unit.srcText)
  phrases.each do |phrase|
    puts "\tSearch #{phrase}"
    cur_len = seg_len(phrase)
    total_len += cur_len
    st_find_seg.exec({ 'text' => phrase }) do |found|
      owner = found['document']
      matching_set[owner] = (matching_set[owner] || 0) + cur_len
    end
    to_insert['contents'] = phrase
    st_ins_seg.exec(to_insert)
    count_tu += 1
  end
end
# Store the accumulated length on the new document's row.
conn.exec("update CR_DOCUMENTS set length = #{total_len} where doc_id = #{doc_id}")

# Filter the candidate documents and report the ones that match, according to
# the thresholds found under the 'eval-rule' configuration key.
rule = config['eval-rule']

# Drop documents whose common length is below the absolute minimum, if set.
unless rule['minimum-absolute'].nil?
  minimum_absolute = rule['minimum-absolute'].to_i
  matching_set.select! { |_doc, count| count >= minimum_absolute }
end

puts "#{count_tu} new segments inserted, now calculating rates"
st_find_doc = conn.prepare('st_find_doc', 'select * from CR_DOCUMENTS where doc_id = :id')
# Visit candidates from the highest common length to the lowest.
matching_set.sort_by { |_doc, count| -count }.each do |doc, count|
  st_find_doc.exec({ 'id' => doc }) do |rs|
    # The threshold is coerced to a number like the other thresholds are, so a
    # quoted value in the YAML file does not raise on Integer >= String.
    # to_f keeps the comparison unchanged for numeric values.
    if !rule['accept-absolute'].nil? && count >= rule['accept-absolute'].to_f
      puts "\tMatching document #{rs['filename']} (#{count} common #{rule['criteria']})"
    elsif !rule['minimum-score'].nil?
      # Score: common length relative to the combined length of both documents.
      score = count * 1.0 / (total_len + rs['length'].to_i)
      if score >= rule['minimum-score'].to_f
        puts "\tMatching document #{rs['filename']} (score = #{score})"
      end
    end
  end
end
    
