#! /usr/bin/env jruby 

# Print the usage text and stop when fewer than two command-line arguments
# were given. The count is taken on the raw argument list, i.e. before any
# .p.* property switches are stripped out of ARGV.
if ARGV.count < 2
    puts "Syntax: #{__FILE__} <prop>* <destination directory> <source tmx>"
    puts "\t<prop>*: one or more .p.XXX=YYY to set properties added to each translation unit"
    puts "\tSpecial property .p.file (without required value) will be filled with file name"
    exit
end

# Pulls the property switches out of an argument list.
#
# Recognised switches:
#   .p.NAME=VALUE  -> stored as props['other'][NAME] = VALUE
#   .p.file        -> sets props['file'] to true
#
# Every argument that starts with ".p." is removed from +args+ in place
# (including malformed switches such as ".p.name" without a value, which are
# ignored). Arguments that merely start with ".p" -- e.g. a file called
# ".pending.tmx" -- are left alone so they can still be used as paths.
#
# Returns the hash { 'file' => true/false, 'other' => { name => value } }.
def extract_cmdprops(args)
    props = { 'file' => false, 'other' => {} }
    args.each do |arg|
        # \A / \z anchor the whole argument; both groups stay greedy, so
        # ".p.a=b=c" yields name "a=b" and value "c".
        assignment = /\A\.p\.(.+)=(.+)\z/.match(arg)
        if assignment
            props['other'][assignment[1]] = assignment[2]
        elsif arg == '.p.file'
            props['file'] = true
        end
    end
    args.reject! { |arg| arg.start_with?('.p.') }
    props
end

# Properties requested on the command line; ARGV keeps only the positional
# arguments afterwards.
cmdprops = extract_cmdprops(ARGV)

$LOAD_PATH << "#{File.dirname(__FILE__)}/../lib"                # For non-standard installation

require 'exilis'
require 'spongiae/tmx'

# The first positional argument is the destination directory; everything left
# in ARGV afterwards is treated as a source TMX file.
output = ARGV.shift
isNew = !File.exist?(output)

if isNew
    # New destination: count how often each term occurs per language across
    # all source files, then keep the most frequent ones as stop ngrams.
    puts "First phase: search for stop ngrams"
    stopWordsKeys = {}
    term_counts = {}
    parser = Exilis::NgramParser.new
    count = 0
    ARGV.each do |tmx_path|
        puts "\tAnalyze #{tmx_path}"
        Spongiae::TMX::TmxReader.new(tmx_path).read_unit do |unit|
            unit.variants.each_pair do |lang, variant|
                # The per-language table is created even when no term is
                # produced, so the language still gets a section later on.
                per_lang = (term_counts[lang] ||= Hash.new(0))
                parser.terms(lang, variant.text).each do |term, _unused|
                    per_lang[term] += 1
                end
            end
            count += 1
        end
    end
    puts "#{count} segments"
    if count >= 1000
        require 'fileutils'
        FileUtils.mkdir output
        # A term is kept when it occurs more often than 5% of the segment count.
        threshold = count * 0.05
        File.open("#{output}/stop.lists",'w:UCS2-LE') do |outf|
            term_counts.each do |lang, occurrences|
                outf.puts "[#{lang}.t]\n"
                kept = stopWordsKeys[lang + ".t"] = []
                occurrences.each do |term, hits|
                    next unless hits > threshold
                    outf.puts "#{term}\n"
                    kept << term
                end
            end
        end
    else
        puts "Cannot use stop ngrams for less than 1000 segments"
    end
    # Drop the reference to the counting tables before the second phase.
    term_counts = nil
else
    # Existing destination: reuse the stop lists stored alongside it.
    stopWordsKeys = Exilis.readStopWords("#{output}/stop.lists")
end

# Second phase: feed every translation unit of every source file to the
# writer, after stamping it with the properties requested on the command line.
puts "Second phase: indexation"
writer = Exilis::ExilisWriter.new(output, stopWordsKeys, isNew)
tag_with_file = cmdprops['file']
extra_props = cmdprops['other']
ARGV.each do |tmx_path|
    reader = Spongiae::TMX::TmxReader.new(tmx_path)
    reader.read_unit do |unit|
        unit.props['file'] = tmx_path if tag_with_file
        # Explicit .p.NAME=VALUE properties are applied after the file name,
        # so a property called "file" overrides it.
        extra_props.each_pair do |name, value|
            unit.props[name] = value
        end
        writer.write(unit)
    end
end
writer.close
print "#{writer.count} entries added\n"

 
