#! /usr/bin/perl

=encoding utf-8
=head1 DESCRIPTION

This script implements search on Elefas memory

=head1 USAGE

The script produces a TMX document in standard output.

    perl ef-find.pl (DBI-String or #alias) source-lang(:target-langs?) [-doc <doc-name>|-col <coll-name>]? -seg segment

Searchs given segment

    perl ef-find.pl (DBI-String or #alias) source-lang(:target-langs?) [-doc <doc-name>|-col <coll-name>]? -file file.(txt or xliff)

Searchs all segments from the file (plain text with one segment per line or any XLIFF, including CAT-tool specific variants)
Note: search begins when file is closed, so if you pass standard input, search starts when the input ends.

For DBI parameters, see the documentation of ef-import.pl
    
Source language is mandatory, while by default, all target languages are extracted.

-doc followed by a document name will restrict results to one document.

-coll followed by a collection name will restrict results to one collection.

=cut

use DBI;

my $db = shift or die "Syntax: $0 DBI-String source-lang(:target-langs?) [-doc <doc-name>|-col <coll-name>]? [-seg segment | -file file.txt]";
my $lang = shift or die "Syntax: $0 DBI-String source-lang(:target-langs?) [-doc <doc-name>|-col <coll-name>]? [-seg segment | -file file.txt]";

require File::Basename; push(@INC,'.');
require sprintf('%s/../conf/elefas-config.pl', File::Basename::dirname($0));
if ($db =~ s/^\#//) {
    $db = $Elefas::Config{databases}{$db} or die "No such alias: $db";
}

my @traLangs = split(/,/, $1) if $lang =~ /:([\w\-\,]+)/;
my $count_units = 0;

# ---------------- document or collection -------------

our $docId = undef, $collId = undef;

if ($ARGV[0] =~ /^-doc/) { 
	if ($ARGV[0] =~ /:/) { $docId = shift; $docId = substr($docId, index($docId,':') + 1); }
	else { shift; $docId = shift; }
}
elsif ($ARGV[0] =~ /^-coll/) {
	if ($ARGV[0] =~ /:/) { $collId = shift; $collId = substr($collId, index($collId,':') + 1); }
	else { shift; $collId = shift; }
}

# ----------------- single segment mode ----------------
if ($ARGV[0] =~ /^-seg/) {
	my $seg = $ARGV[1]; $seg = $1 if $ARGV[0] =~ /:(.+)$/;
	my $conn = db_connect($db);
	my $st = db_search_statement($conn, $lang);
	$st->execute($seg);
	tmx_header();
	my $count = $Elefas::Config{search}{string}{limit} || 1_000_000_000;
	while (my $ref = $st->fetchrow_hashref) {
		tmx_produce($ref);
		last if --$count == 0;
	}
	tmx_footer();
}

# ---------------- text file mode --------------------
elsif ($ARGV[0] =~ /^-file/) {
	my $file = $ARGV[1]; $file = $1 if $ARGV[0] =~ /:(.+)$/;
	my $conn = db_connect($db); my @after = ();
	if ($file =~ /\.te?xt$/i) {
        open(FILE, $file) or die "Cannot open $file: $!";
    } elsif ($file =~ /\.\w*xli?ff?$/i) {
        require File::Basename; my $script = File::Basename::dirname($0);
        open(FILE, "perl -I. $script/xliff2text.pl -lang $file |");
    } elsif ($Elefas::Config{converters}) {
        my $ext = $1 if $file =~ /\.(\w+)$/; my $config = $Elefas::Config{converters}{$ext} || $Elefas::Config{converters};
        die "Command for format $1 not configured" unless $config->{command};
        if ($config->{out_file}) {
            my $cmd = $config->{command}; $cmd =~ s/\$file/$file/; $cmd =~ s/\$lang/$lang/;
            print STDERR $cmd, "\n"; system $cmd;
            if (ref($config->{out_file}) =~ /CODE/) { 
                $file = &{$config->{out_file}}($file,$lang);
                if ("$config->{after}" =~ /perl:unlink/) { push(@after, sub { unlink $file; }); }
                elsif ($config->{after}) { $cmd = $config->{after}; $cmd =~ s/\$file/$file/; push(@after, sub { system $cmd; }); }
            } else {
                my $ofile = $config->{out_file}; $ofile =~ s/\$file/$file/;
                print STDERR "Using $ofile\n";
                require File::Basename; my $script = File::Basename::dirname($0);
                open(FILE, "perl -I. $script/xliff2text.pl -lang $ofile |");
                if ("$config->{after}" =~ /perl:unlink/) { push(@after, sub { unlink $ofile; }); }
                elsif ($config->{after}) { $cmd = $config->{after}; $cmd =~ s/\$file/$ofile/; push(@after, sub { system $cmd; }); }
            }
        }
    } else {
        die "Format $1 is not supported" if $file =~ /\.(\w+)$/;
    }
	if ($lang =~ /^\-/) { # read parameters from file
	    my %params = (); while (<FILE>) { 		# warn: parameters must end with empty line
               $params{lc(substr($1,0,1))} = $2		# temporary: since we have only languages, collection and doc-name, take first letter only 
                   if /^\s*(\S+)\s*[:=]\s*(.+)\s*$/; 	# accept = or : as separator, and ignore spaces
               last unless /\w/;
           }
           $lang = $params{s} || $params{l}; # lang, srcLang...
           @traLangs = split(/,/, $params{t});
           $docId = $params{d}; $collId = $params{c};
           die "File does not contain language parameter" unless $lang;
	}
	my $st = db_search_statement($conn, $lang);
	tmx_header();
	my %found_ids = ();
	my $count = $Elefas::Config{search}{file}{limit} || 1_000_000_000;
	while (my $seg = <FILE>) {
		$seg =~ s/\r?\n//g; print "\t<!-- Search $seg -->\n";
		$st->execute($seg);
		$count = $Elefas::Config{search}{file}{limit} || 1_000_000_000;
		while (my $ref = $st->fetchrow_hashref) {
			next if $found_ids{$ref->{tuid}};
			tmx_produce($ref);
			$found_ids{$ref->{tuid}}++;
			last if --$count == 0;
		}
	}
	tmx_footer();
	foreach (@after) { &$_(); }
}

# ----------------- error case ----------------
else {
	die "Wrong extraction type ($ARGV[0])\nSyntax: $0 DBI-String source-lang(:target-langs?) [-seg segment | -file file.txt]";
}

# ========================== FUNCTIONS ========================

sub db_connect {
	my $db = shift;
	
	$db = "dbname=$db" unless $db =~ /=/;
	$db = "dbi:Pg:$db" unless $db =~ /^dbi:/i;

	my $user = undef; $user = $1 if $db =~ s/\buser(?:name)?=(.+?)(;|$)//;
	my $pass = undef; $pass = $1 if $db =~ s/\bpass(?:word)?=(.+?)(;|$)//;

	return DBI->connect($db,$user,$pass);
}

sub db_search_statement {
	my $conn = shift;
	my $lang = substr(shift, 0, 2);
	$st_tuv ||= $conn->prepare("select * from EF_SEG where unit=?");
	my $spec = ''; if ($docId) { $spec = "ef_doc.name = '$docId' and "; }
	elsif ($collId) {
		my ($collId1) = $conn->selectrow_array("select id from EF_COLLECTION where name='$collId'");
		die "Cannot find collection $collId" unless $collId1;
		$spec = "ef_doc.collection = $collId1 and ";
	}
	return $conn->prepare(
		"select ef_unit.tuid, ef_unit.id as id, ef_doc.name as doc
		   from ef_unit 
		   left join ef_seg on ef_seg.unit = ef_unit.id
		   left join ef_doc on ef_doc.id = ef_unit.doc_id
		  where $spec upper(ef_seg.lang)='\U$lang\E' and ef_seg.contents % ?"
	);
}

sub tmx_header {
	print "<tmx>\n\t<header srclang='$lang' creationtool='Elefas extractor' />\n\t<body>\n";
}

sub tmx_footer {
	print "\t\t<!-- $count_units units found -->\n\t</body>\n</tmx>"
}

our $st_tuv;

sub tmx_produce {
	my $ref = shift;
	$st_tuv->execute($ref->{id});
	print "\t\t<tu tuid='$ref->{tuid}'>\n";
	print "\t\t\t<prop type='Document name'>$ref->{doc}</prop>\n";
	while (my $tuv = $st_tuv->fetchrow_hashref) {
		print "\n\t\t\t<tuv xml:lang='$tuv->{lang}'";
		print " creationid='$tuv->{author}' " if $tuv->{author};
		print ">\n\t\t\t\t<seg>$tuv->{contents}</seg>\n\t\t\t</tuv>";
	}
	print "\n\t\t</tu>\n";
	$count_units++; # global variable
}

=head1 LICENSE

Copyright 2013 Silvestris Project (L<http://www.silvestris-lab.org/>)

Licensed under the EUPL, Version 1.1 or – as soon they will be approved by the European Commission - subsequent versions of the EUPL (the "Licence");
You may not use this work except in compliance with the Licence.
You may obtain a copy of the Licence at: L<http://ec.europa.eu/idabc/eupl>

Unless required by applicable law or agreed to in writing, software distributed under the Licence is distributed on an "AS IS" basis,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the Licence for the specific language governing permissions and limitations under the Licence. 

=cut
