#! /usr/bin/env perl
# -*- mode: perl; -*-

# file getTranslators.pl
# This file is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This software is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this software; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# Copyright (c) 2015 Kornel Benko <kornel@lyx.org>

use strict;

package IdentityParse;
use base "HTML::Parser";
use locale;
use POSIX qw(locale_h);
use feature 'fc';

setlocale(LC_CTYPE, "");
setlocale(LC_MESSAGES, "en_US.UTF-8");
$ENV{LC_ALL} = "C";		# set language for the output of command 'msgfmt'

# Prototypes
sub insertEntry(\%$$);
sub start($$$$$);
sub text($$);
sub end($$$);
sub actual_tag();
sub convertlang($);
sub scanPoFile($);
sub row_init(\%);
sub row_valid(\%);
sub phantomscript();

my $p = new IdentityParse;
$p->strict_names(0);

my $lyxurl = "http://www.lyx.org/I18n-trunk";
my $podir = "./po";	# script starts in top binary dir
my $jsfile = "$podir/lyxtranslators.js";

my %langs = (			# translation to make language spec unique 
  "pt" => "Portuguese",
  "pt_BR" => "Portuguese (Brazilian)",
  "ar" => "Arabic",
  "bg" => "Bulgarian",
  "ca" => "Catalan",
  "cs" => "Czech",
  "da" => "Danish",
  "de" => "German",
  "el" => "Greek",
  "en" => "English",
  "es" => "Spanish",
  "eu" => "Basque",
  "fi" => "Finnish",
  "fr" => "French",
  "gl" => "Galician",
  "he" => "Hebrew",
  "hu" => "Hungarian",
  "ia" => "Interlingua",
  "id" => "Indonesian",
  "it" => "Italian",
  "ja" => "Japanese",
  "ko" => "Korean",
  "sk" => "Slovak",
  "nb" => "Norwegian (Bokmål)",
  "nl" => "Dutch",
  "nn" => "Norwegian (Nynorsk)",
  "pl" => "Polish",
  "ro" => "Romanian",
  "ru" => "Russian",
  "sr" => "Serbian",
  "sv" => "Swedish",
  "tr" => "Turkish",
  "uk" => "Ukrainian",
  "zh_CN" => "Chinese (simplified)",
  "zh_TW" => "Chinese (traditional)",
  "Simplified" => "Chinese (simplified)",
  "Simplified Chinese" => "Chinese (simplified)",
  "Traditional Chinese" => "Chinese (traditional)",
    );

open(FO, '>', "$jsfile");
print FO &phantomscript();
close(FO);

my %status = ();
my @tag = ();			# stack for active html-tags
my %page_row = ();			# entries for mail, name pofile, language (in the actual row or po-file)
my @translation_entries = qw(language mail name pofile);

my %list = ();			# collected list of rows, entry key is language

my $errors = 0;
if (open(my $fh, "phantomjs $jsfile|")) {
  $p->parse_file($fh);
  print "Parsed \"$lyxurl\"\n";
}
else {
  $errors++;	# cannot parse html file
  print "ERROR: Program \"phantomjs\" to parse \"$lyxurl\" could not be executed\n";
}

if (opendir(DI, $podir)) {
  my $po_count = 0;
  while (my $po = readdir(DI)) {
    if ($po =~ /\.po$/) {
      my $res = &scanPoFile("$po");
      if ($res == 0) {
	print "No valid entry found in \"$po\"\n";
	$errors++;
      }
      else {
	$po_count += $res;
      }
    }
  }
  closedir(DI);
  if ($po_count < 1) {
    print "ERROR: No correct po-files in directory \"$podir\" found\n";
    $errors++;
  }
  else {
    print "Found $po_count po-files with valid translator entry\n";
  }
}
else {
  print "Directory for po-files ($podir) missing\n";
  $errors++;	# PO directory not found, so cannot check
}

for my $lang (sort keys %list) {
  for my $rentry (@{$list{$lang}}) {
    my $prefix = sprintf("(%03d%) ", "$rentry->{fract}");
    if (defined($rentry->{error})) {
      $errors++;
      $prefix .= sprintf("%-24s", "$rentry->{error}:");
    }
    else {
      $prefix .= sprintf("%24s", "");
    }
    my $msg = sprintf("%-24s%-10s", "$lang:", "$rentry->{po},");
    print "$prefix$msg mail = \"$rentry->{name}\" <$rentry->{mail}>\n";
  }
}

if ($errors > 0) {
  exit(1);
}
else {
  exit(0);
}
###################################################################
# Insert collected row values in %list
sub insertEntry(\%$$)
{
  my ($rrow, $cycle, $fract) = @_;

  # Convert mangled mail
  $rrow->{mail} =~ s/ pound /\@/;
  $rrow->{mail} =~ s/ dot /\./g;
  $rrow->{mail} =~ s/ underscore /_/g;
  my %entry = ();
  my $language = $rrow->{language};
  $entry{mail} = $rrow->{mail};
  $entry{name} = $rrow->{name};
  $entry{po} = $rrow->{pofile};
  $entry{cycle} = $cycle;
  # Check, if entry already exists
  if (! defined($list{$language})) {
    $list{$language} = [];
  }
  my $found = 0;
  my $empty = 1;
  my $name1 = undef;
  for my $rentry (@{$list{$language}}) {
    $empty = 0;
    if ($cycle == 2) {
      if (! defined($rentry->{fract})) {
	$rentry->{fract} = $fract;
      }
    }
    next if (fc($rentry->{mail}) ne fc($rrow->{mail}));	# char case does not matter in mail strings
    next if ($rentry->{po} ne $rrow->{pofile});
    $name1 = $rentry->{name};
    $found = 1;
    last;
  }
  if ($cycle == 1) {
    push(@{$list{$language}}, \%entry);
  }
  else {
    $entry{fract} = $fract;
    if ($empty) {
      if ($fract > 40) {
	$entry{error} = "Missing in page";
	push(@{$list{$language}}, \%entry);
      }
    }
    elsif (! $found) {
      $entry{error} = "Different mail in po";
      push(@{$list{$language}}, \%entry);
    }
    else {
      # found, but maybe incorrect name?
      if ($name1 ne $rrow->{name}) {
	$entry{error} = "Different name in po";
	push(@{$list{$language}}, \%entry);
      }
    }
  }
}

#######################################################################
# Routines called from parse_file(): start(), text(), end().
sub start($$$$$)
{
  my ($self, $tag, $attr, $attrseq, $origtext) = @_;

  push(@tag, $tag);
  if ($tag eq "tr") {		# new table row
    &row_init(\%page_row);
    for my $k (keys %status) {
      $status{$k} = 0;
    }
  }
  $status{"Tag_" . $tag} =  $status{"Tag_" . $tag} + 1;

  if ($tag eq "a") {
    if (defined($attr->{class})) {
      if ($attr->{class} eq "urllink") {
	if ($status{Tag_td} == 6) {
	  if ($attr->{href} =~ /^mailto:(.*)$/) {
	    $page_row{mail} = $1;
	    $page_row{mail} =~ s/\%20/ /g;
	  }
	}
	elsif ($status{Tag_td} == 1) {
	  if ($attr->{href} =~ /f=po\/([a-z][a-z](_[A-Z][A-Z])?\.po)$/) {
	    $page_row{pofile} = $1;
	  }
	}
      }
    }
  }
}

sub text($$)
{
  my ($self, $text) = @_;

  if ($status{Tag_td} == 1) {
    if (&actual_tag() eq "a") {
      if ($text =~ /^[A-Z][a-z]+( .+)?$/) {
	$page_row{language} = &convertlang($text);
      }
    }
  }
  if ($status{Tag_td} == 6) {
    if (&actual_tag() eq "a") {
      $page_row{name} .= $text;	# '.=' because text can be splitted
    }
    elsif (&actual_tag() eq "td") { # name without associated e-mail
      $page_row{name} .= $text;
    }
  }
}

sub end($$$)
{
  my ($self, $tag, $origtext) = @_;

  while (my $t = pop(@tag)) {
    last if ($t eq $tag);
  }
  if ($tag eq "tr") {
    # check row entry for completeness
    return if (! &row_valid(\%page_row));
    &insertEntry(\%page_row, 1);
  }
}

sub actual_tag()
{
  return undef if (@tag == 0);
  return($tag[$#tag]);
}

sub convertlang($)
{
  my ($ilang) = @_;
  $ilang =~ s/\s+$//;
  if (defined($langs{$ilang})) {
    return($langs{$ilang});
  }
  else {
    return($ilang);
  }
}

sub scanPoFile($)
{
  my ($pofile) = @_;
  my %po_row;
  my ($translated, $fuzzy, $untranslated) = (0, 0, 0);

  if (open(FM, "msgfmt -c --statistics $podir/$pofile 2>&1 |")) {
    while (my $l = <FM>) {
      if ($l =~ s/^(\d+)\s+translated messages.\s*//) {
	$translated = $1;
      }
      if ($l =~ s/^(\d+)\s+fuzzy translations.\s*//) {
	$fuzzy = $1;
      }
      if ($l =~ s/^(\d+)\s+untranslated messages//) {
	$untranslated = $1;
      }
    }
    close(FM);
  }
  return 0 if ($translated == 0);
  my $fract = int(($translated * 100)/($translated+$fuzzy+$untranslated));
  if (open(FI, "$podir/$pofile")) {
    &row_init(\%po_row);
    $po_row{pofile} = $pofile;
    my $i = 0;
    while (my $l = <FI>) {
      last if ($l =~ /^"[A-Z].*:/);
    }
    my $inserted = 0;
    while (my $l = <FI>) {
      last if ($l !~ /^"/);
      chomp($l);
      if ($l =~ s/^"Last-Translator:\s//) {
	while ($l !~ />\\n"$/) {
	  $l =~ s/"$//;
	  my $extraline = <FI>;
	  chomp($extraline);
	  $extraline =~ s/^"//;
	  $l .= $extraline
	}
	if ($l =~ /^([^<]*)<([^>]*)>/) {	# allow empty mail
	  $po_row{mail} = $2;
	  ($po_row{name} = $1) =~ s/\s+$//;
	  $i += 2;
	}
      }
      elsif ($l =~/^"Language:\s*([^\\]+)\\n/) {
	$po_row{language} = &convertlang($1);
	$i += 1;
      }
      if (&row_valid(\%po_row)) {
	&insertEntry(\%po_row, 2, $fract);
	$inserted++;
	last;
      }
    }
    close(FI);
    return($inserted);
  }
  else {
    return(0);
  }
}

###########################################################
# handling of row entries

sub row_init(\%)
{
  my ($rrow) = @_;
  %{$rrow} = ();
  $rrow->{mail} = "";		# Allow for empty mail
}

sub row_valid(\%)
{
  my ($rrow) = @_;
  for my $k (@translation_entries) {
    return 0 if (! defined($rrow->{$k}));
  }
  return(1);
}

# used by phantomjs command to output the refered html page
sub phantomscript()
{
  return "var page = require(\"webpage\").create();
var url = \"$lyxurl\";
page.open(url,
 function (status) {
  var f = function () {
   var html = page.evaluate(function () { return document.documentElement.innerHTML });
   console.log(html);
   phantom.exit();
  };
  setTimeout(f, 5000);
 }
);
";
}