diff --git a/development/checkurls/CheckURL.pm b/development/checkurls/CheckURL.pm
index 4ab70abfcb..330f7fe3f7 100755
--- a/development/checkurls/CheckURL.pm
+++ b/development/checkurls/CheckURL.pm
@@ -1,5 +1,6 @@
# -*- mode: perl; -*-
package CheckURL;
+
# file CheckURL.pm
#
# This file is part of LyX, the document processor.
@@ -13,9 +14,10 @@ package CheckURL;
#
use strict;
-our(@EXPORT, @ISA);
+our (@EXPORT, @ISA);
+
BEGIN {
- use Exporter ();
+ use Exporter ();
@ISA = qw(Exporter);
@EXPORT = qw(check_url);
}
@@ -25,16 +27,18 @@ sub check_http_url($$$$);
sub check_ftp_dir_entry($$);
sub check_ftp_url($$$$);
sub check_unknown_url($$$$);
-sub check_url($$);
+sub check_url($$$$);
################
-sub check_http_url($$$$)
-{
+my $fe;
+my $fs;
+
+sub check_http_url($$$$) {
require LWP::UserAgent;
my ($protocol, $host, $path, $file) = @_;
- my $ua = LWP::UserAgent->new;
+ my $ua = LWP::UserAgent->new(timeout => 20);
my $getp = "/";
if ($path ne "") {
$getp .= $path;
@@ -54,11 +58,11 @@ sub check_http_url($$$$)
$buf = $response->decoded_content;
}
else {
- print " " . $response->status_line . ": ";
+ print $fe " " . $response->status_line . ": ";
return 3;
}
my @title = ();
- my $res = 0;
+ my $res = 0;
  while ($buf =~ s/\<title\>([^\<]*)\<\/title\>//i) {
my $title = $1;
$title =~ s/[\r\n]/ /g;
@@ -66,9 +70,9 @@ sub check_http_url($$$$)
$title =~ s/^ //;
$title =~ s/ $//;
push(@title, $title);
- print "title = \"$title\": ";
+ print $fe "title = \"$title\": ";
if ($title =~ /Error 404|Not Found/) {
- print " Page reports 'Not Found' from \"$protocol://$host$getp\": ";
+ print $fe " Page reports 'Not Found' from \"$protocol://$host$getp\": ";
$res = 3;
}
}
@@ -79,39 +83,39 @@ sub check_http_url($$$$)
# returns 0, x if file does not match entry
# 1, x everything OK
# 2, x if not accesible (permission)
-sub check_ftp_dir_entry($$)
-{
+sub check_ftp_dir_entry($$) {
my ($file, $e) = @_;
my $other = '---';
my $isdir = 0;
- #print "Checking '$file' against '$e'\n";
+ #print $fe "Checking '$file' against '$e'\n";
$file =~ s/^\///;
$isdir = 1 if ($e =~ /^d/);
- return(0,$isdir) if ($e !~ /\s$file$/);
+ return (0, $isdir) if ($e !~ /\s$file$/);
if ($e =~ /^.[-rwx]{6}([-rwx]{3})\s/) {
$other = $1;
}
else {
- #print "Invalid entry\n";
+ #print $fe "Invalid entry\n";
# Invalid entry
- return(0,$isdir);
+ return (0, $isdir);
}
- return(2,$isdir) if ($other !~ /^r/); # not readable
+ return (2, $isdir) if ($other !~ /^r/); # not readable
if ($isdir) {
+
#return(2,$isdir) if ($other !~ /x$/); # directory, but not executable
}
- return(1,$isdir);
+ return (1, $isdir);
}
-sub check_ftp2_url($$$$)
-{
+sub check_ftp2_url($$$$) {
my ($protocol, $host, $path, $file) = @_;
my $checkentry = 1;
- print "\nhost $host\n";
- print "path $path\n";
- print "file $file\n";
+
+ #print $fe "\nhost $host\n";
+ #print $fe "path $path\n";
+ #print $fe "file $file\n";
my $url = "$protocol://$host";
$path =~ s/\/$//;
if (defined($file)) {
@@ -120,7 +124,8 @@ sub check_ftp2_url($$$$)
else {
$url = "$url/$path/.";
}
- print "curl $url, file = $file\n";
+
+ #print $fe "curl $url, file = $file\n";
my %listfiles = ();
if (open(FFTP, "curl --anyauth -l $url|")) {
    while (my $l = <FFTP>) {
@@ -130,44 +135,44 @@ sub check_ftp2_url($$$$)
close(FFTP);
}
if (%listfiles) {
- if (! defined($file)) {
- return(0, "OK");
+ if (!defined($file)) {
+ return (0, "OK");
}
elsif (defined($listfiles{$file})) {
- return(0, "OK");
+ return (0, "OK");
}
elsif (defined($listfiles{"ftpinfo.txt"})) {
- return(0, "Probably a directory");
+ return (0, "Probably a directory");
}
else {
- return(1, "Not found");
+ return (1, "Not found");
}
}
else {
- return(1, "Error");
+ return (1, "Error");
}
}
-sub check_ftp_url($$$$)
-{
+sub check_ftp_url($$$$) {
use Net::FTP;
my ($protocol, $host, $path, $file) = @_;
- my $res = 0;
+ my $res = 0;
my $message = "";
- my $ftp = Net::FTP->new($host, Debug => 0, Timeout => 120);
- if(!$ftp) {
- return(3,"Cannot connect to $host");
+ my $ftp = Net::FTP->new($host, Debug => 0, Timeout => 20);
+ if (!$ftp) {
+ return (3, "Cannot connect to $host");
}
- if (! $ftp->login("anonymous",'-anonymous@')) {
+ if (!$ftp->login("anonymous", '-anonymous@')) {
$message = $ftp->message;
- $res = 3;
+ $res = 3;
}
else {
my $rEntries;
if ($path ne "") {
- #print "Path = $path\n";
+
+ #print $fe "Path = $path\n";
#if (!$ftp->cwd($path)) {
# $message = $ftp->message;
# $res = 3;
@@ -177,41 +182,43 @@ sub check_ftp_url($$$$)
else {
$rEntries = $ftp->dir();
}
- if (! $rEntries) {
- $res = 3;
+ if (!$rEntries) {
+ $res = 3;
$message = "Could not read directory \"$path\"";
}
elsif (defined($file)) {
- my $found = 0;
+ my $found = 0;
my $found2 = 0;
- for my $f ( @{$rEntries}) {
- #print "Entry: $path $f\n";
- my ($res1,$isdir) = check_ftp_dir_entry($file,$f);
- if ($res1 == 1) {
- $found = 1;
- last;
- }
- elsif ($res1 == 2) {
- # found, but not accessible
- $found2 = 1;
- $message = "Permission denied for '$file'";
- }
+ for my $f (@{$rEntries}) {
+
+ #print $fe "Entry: $path $f\n";
+ my ($res1, $isdir) = check_ftp_dir_entry($file, $f);
+ if ($res1 == 1) {
+ $found = 1;
+ last;
+ }
+ elsif ($res1 == 2) {
+
+ # found, but not accessible
+ $found2 = 1;
+ $message = "Permission denied for '$file'";
+ }
}
- if (! $found) {
- $res = 4;
- if (! $found2) {
- $message = "File or directory '$file' not found";
- }
+ if (!$found) {
+ $res = 4;
+ if (!$found2) {
+ $message = "File or directory '$file' not found";
+ }
}
}
}
$ftp->quit;
- #print "returning ($res,$message)\n";
- return($res, $message);
+
+ #print $fe "returning ($res,$message)\n";
+ return ($res, $message);
}
-sub check_unknown_url($$$$)
-{
+sub check_unknown_url($$$$) {
use LWP::Simple;
my ($protocol, $host, $path, $file) = @_;
@@ -226,49 +233,53 @@ sub check_unknown_url($$$$)
$url .= "/$path";
}
}
- if(defined($file)) {
- #print "Trying $url$file\n";
+ if (defined($file)) {
+
+ #print $fe "Trying $url$file\n";
$res = head("$url/$file");
- if(! $res) {
+ if (!$res) {
+
# try to check for directory '/';
- #print "Trying $url$file/\n";
+ #print $fe "Trying $url$file/\n";
$res = head("$url/$file/");
}
}
else {
- #print "Trying $url\n";
+ #print $fe "Trying $url\n";
$res = head($url);
}
- return(! $res);
+ return (!$res);
}
#
# Main entry
-sub check_url($$)
-{
- my($url,$use_curl) = @_;
+sub check_url($$$$) {
+ my ($url, $use_curl, $fex, $fsx) = @_;
+ $fe = $fex;
+ $fs = $fsx;
my $file = undef;
- my ($protocol,$host,$path);
+ my ($protocol, $host, $path);
my $res = 0;
# Split the url to protocol,host,path
if ($url =~ /^([a-z]+):\/\/([^\/]+)(.*)$/) {
$protocol = $1;
- $host = $2;
- $path = $3;
+ $host = $2;
+ $path = $3;
$path =~ s/^\///;
- if($path =~ s/\/([^\/]+)$//) {
+ if ($path =~ s/\/([^\/]+)$//) {
$file = $1;
- if($file =~ / /) {
- # Filename contains ' ', maybe invalid. Don't check
- $file = undef;
+ if ($file =~ / /) {
+
+ # Filename contains ' ', maybe invalid. Don't check
+ $file = undef;
}
$path .= "/";
}
}
else {
- print " Invalid url '$url'";
+ print $fe " Invalid url '$url'";
return 2;
}
if ($protocol =~ /^https?$/) {
@@ -286,7 +297,7 @@ sub check_url($$)
}
else {
# it never should reach this point
- print " What protocol is '$protocol'?";
+ print $fe " What protocol is '$protocol'?";
$res = check_unknown_url($protocol, $host, $path, $file);
return $res;
}
diff --git a/development/checkurls/search_url.pl b/development/checkurls/search_url.pl
index 8bba11c1fc..59586ae11b 100755
--- a/development/checkurls/search_url.pl
+++ b/development/checkurls/search_url.pl
@@ -31,47 +31,59 @@
use strict;
-BEGIN {
+BEGIN {
use File::Spec;
my $p = File::Spec->rel2abs(__FILE__);
$p =~ s/[\/\\]?[^\/\\]+$//;
unshift(@INC, "$p");
}
+use warnings;
+use Cwd qw(abs_path);
use CheckURL;
use Try::Tiny;
use locale;
use POSIX qw(locale_h);
+use Readonly;
-setlocale(LC_CTYPE, "");
+Readonly::Scalar my $NR_JOBS => 10;
+
+setlocale(LC_CTYPE, "");
setlocale(LC_MESSAGES, "en_US.UTF-8");
+use File::Temp qw/ tempfile tempdir /;
+use File::Spec;
+use Fcntl qw(:flock SEEK_END);
+
# Prototypes
sub printNotUsedURLS($\%);
sub replaceSpecialChar($);
sub readUrls($\%);
sub parse_file($ );
sub handle_url($$$ );
+sub printx($$$$);
+sub getnrjobs($$$);
##########
-my %URLS = ();
-my %ignoredURLS = ();
-my %revertedURLS = ();
-my %extraURLS = ();
-my %selectedURLS = ();
+my %URLS = ();
+my %ignoredURLS = ();
+my %revertedURLS = ();
+my %extraURLS = ();
+my %selectedURLS = ();
my %knownToRegisterURLS = ();
-my $summaryFile = undef;
+my $summaryFile = undef;
my $checkSelectedOnly = 0;
for my $arg (@ARGV) {
die("Bad argument \"$arg\"") if ($arg !~ /=/);
- my ($type,$val) = split("=", $arg);
+ my ($type, $val) = split("=", $arg);
if ($type eq "filesToScan") {
+
#The file should be a list of files to search in
if (open(FLIST, $val)) {
     while (my $l = <FLIST>) {
- chomp($l);
- parse_file($l);
+ chomp($l);
+ parse_file($l);
}
close(FLIST);
}
@@ -102,7 +114,9 @@ for my $arg (@ARGV) {
}
}
-my @urls = sort keys %URLS, keys %extraURLS;
+my @urls = sort keys %URLS, keys %extraURLS;
+my @testvals = ();
+
# Tests
#my @urls = ("ftp://ftp.edpsciences.org/pub/aa/readme.html", "ftp://ftp.springer.de/pub/tex/latex/compsc/proc/author");
my $errorcount = 0;
@@ -126,57 +140,137 @@ for my $u (@urls) {
if (defined($selectedURLS{$u})) {
${selectedURLS}{$u}->{count} += 1;
}
- next if ($checkSelectedOnly && ! defined($selectedURLS{$u}));
+ next if ($checkSelectedOnly && !defined($selectedURLS{$u}));
$URLScount++;
- print "Checking '$u': ";
- my ($res, $prnt, $outSum);
- try {
- $res = check_url($u, $use_curl);
- if ($res) {
- print "Failed\n";
- $prnt = "";
- $outSum = 1;
- }
- else {
- $prnt = "OK\n";
- $outSum = 0;
- }
- }
- catch {
- $prnt = "Failed, caught error: $_\n";
- $outSum = 1;
- $res = 700;
- };
- printx("$prnt", $outSum);
- my $printSourceFiles = 0;
- my $err_txt = "Error url:";
-
- if ($res || $checkSelectedOnly) {
- $printSourceFiles = 1;
- }
- if ($res && defined($revertedURLS{$u})) {
- $err_txt = "Failed url:";
- }
- $res = ! $res if (defined($revertedURLS{$u}));
- if ($res || $checkSelectedOnly) {
- printx("$err_txt \"$u\"\n", $outSum);
- }
- if ($printSourceFiles) {
- if (defined($URLS{$u})) {
- for my $f(sort keys %{$URLS{$u}}) {
- my $lines = ":" . join(',', @{$URLS{$u}->{$f}});
- printx(" $f$lines\n", $outSum);
- }
- }
- if ($res ) {
- $errorcount++;
- }
- }
+ push(@testvals, {u => $u, use_curl => $use_curl,});
}
+# Ready to go multitasking
+my ($vol, $dir, $file) = File::Spec->splitpath($summaryFile);
+my $tempdir = tempdir("$dir/CounterXXXXXXX", CLEANUP => 1);
+my $countfile = "$tempdir/counter";
+my $counter = 0;
+if (open(my $FO, '>', $countfile)) {
+ print {$FO} $counter;
+ close($FO);
+}
+else {
+ unlink($countfile);
+ die("Could not write to $countfile");
+}
+
+print "Using tempdir \"" . abs_path($tempdir) . "\"\n";
+
+my @wait = ();
+for (my $i = 0; $i < $NR_JOBS; $i++) { # Number of subprocesses
+ my $pid = fork();
+ if ($pid == 0) {
+
+ # I am child
+ open(my $fe, '>:encoding(UTF-8)', "$tempdir/xxxError$i");
+ open(my $fs, '>:encoding(UTF-8)', "$tempdir/xxxSum$i");
+ while (1) {
+ open(my $fh, '+<', $countfile) or die("cannot open $countfile");
+ flock($fh, LOCK_EX) or die "$i: Cannot lock $countfile - $!\n";
+ my $l = <$fh>; # get actual count number
+ if (!defined($testvals[$l])) {
+ close($fs);
+ print $fe "NumberOfErrors $errorcount\n";
+ close($fe);
+ exit(0);
+ }
+ my $diff = getnrjobs(scalar @testvals, $l, $NR_JOBS);
+ if ($diff < 1) {
+ $diff = 1;
+ }
+ my $next = $l + $diff;
+ seek($fh, 0, 0);
+ truncate($fh, 0);
+ print $fh $next;
+ close($fh);
+ for (my $i = 0; $i < $diff; $i++) {
+ my $entryidx = $l + $i;
+ my $rentry = $testvals[$entryidx];
+ next if (!defined($rentry));
+ my $u = $rentry->{u};
+ my $use_curl = $rentry->{use_curl};
+
+ print $fe "Checking($entryidx) '$u': ";
+ my ($res, $prnt, $outSum);
+ try {
+ $res = check_url($u, $use_curl, $fe, $fs);
+ if ($res) {
+ print $fe "Failed\n";
+ $prnt = "";
+ $outSum = 1;
+ }
+ else {
+ $prnt = "OK\n";
+ $outSum = 0;
+ }
+ }
+ catch {
+ $prnt = "Failed, caught error: $_\n";
+ $outSum = 1;
+ $res = 700;
+ };
+ printx("$prnt", $outSum, $fe, $fs);
+ my $printSourceFiles = 0;
+ my $err_txt = "Error url:";
+
+ if ($res || $checkSelectedOnly) {
+ $printSourceFiles = 1;
+ }
+ if ($res && defined($revertedURLS{$u})) {
+ $err_txt = "Failed url:";
+ }
+ $res = !$res if (defined($revertedURLS{$u}));
+ if ($res || $checkSelectedOnly) {
+ printx("$err_txt \"$u\"\n", $outSum, $fe, $fs);
+ }
+ if ($printSourceFiles) {
+ if (defined($URLS{$u})) {
+ for my $f (sort keys %{$URLS{$u}}) {
+ my $lines = ":" . join(',', @{$URLS{$u}->{$f}});
+ printx(" $f$lines\n", $outSum, $fe, $fs);
+ }
+ }
+ if ($res) {
+ $errorcount++;
+ }
+ }
+ }
+ }
+ }
+ $wait[$i] = $pid;
+}
+
+for (my $i = 0; $i < $NR_JOBS; $i++) {
+ my $p = $wait[$i];
+ if ($p > 0) {
+ waitpid($p, 0);
+ open(my $fe, '<', "$tempdir/xxxError$i");
+ while (my $l = <$fe>) {
+ if ($l =~ /^NumberOfErrors\s(\d+)/) {
+ $errorcount += $1;
+ }
+ else {
+ print $l;
+ }
+ }
+ close($fe);
+ open(my $fs, '<', "$tempdir/xxxSum$i");
+ while (my $l = <$fs>) {
+ print SFO $l;
+ }
+ close($fs);
+ }
+}
+unlink($countfile);
+
if (%URLS) {
- printNotUsedURLS("Ignored", %ignoredURLS);
- printNotUsedURLS("Selected", %selectedURLS);
+ printNotUsedURLS("Ignored", %ignoredURLS);
+ printNotUsedURLS("Selected", %selectedURLS);
printNotUsedURLS("KnownInvalid", %extraURLS);
}
@@ -187,125 +281,134 @@ if (defined($summaryFile)) {
exit($errorcount);
###############################################################################
-sub printx($$)
-{
- my ($txt, $outSum) = @_;
- print "$txt";
+sub printx($$$$) {
+ my ($txt, $outSum, $fe, $fs) = @_;
+ print $fe "$txt";
if ($outSum && defined($summaryFile)) {
- print SFO "$txt";
+ print $fs "$txt";
}
}
-sub printNotUsedURLS($\%)
-{
+sub printNotUsedURLS($\%) {
my ($txt, $rURLS) = @_;
my @msg = ();
- for my $u ( sort keys %{$rURLS}) {
+ for my $u (sort keys %{$rURLS}) {
if ($rURLS->{$u}->{count} < 2) {
my @submsg = ();
for my $f (sort keys %{$rURLS->{$u}}) {
- next if ($f eq "count");
- push(@submsg, "$f:" . $rURLS->{$u}->{$f});
+ next if ($f eq "count");
+ push(@submsg, "$f:" . $rURLS->{$u}->{$f});
}
push(@msg, "\n $u\n " . join("\n ", @submsg) . "\n");
}
}
if (@msg) {
- print "\n$txt URLs not found in sources: " . join(' ',@msg) . "\n";
+ print "\n$txt URLs not found in sources: " . join(' ', @msg) . "\n";
}
}
-sub replaceSpecialChar($)
-{
+sub replaceSpecialChar($) {
my ($l) = @_;
$l =~ s/\\SpecialChar(NoPassThru)?\s*(TeX|LaTeX|LyX)[\s]?/\2/;
- return($l);
+ return ($l);
}
-sub readUrls($\%)
-{
+sub readUrls($\%) {
my ($file, $rUrls) = @_;
- die("Could not read file $file") if (! open(ULIST, $file));
+ die("Could not read file $file") if (!open(ULIST, $file));
my $line = 0;
   while (my $l = <ULIST>) {
$line++;
- $l =~ s/[\r\n]+$//; # remove eol
- $l =~ s/\s*\#.*$//; # remove comment
+ $l =~ s/[\r\n]+$//; # remove eol
+ $l =~ s/\s*\#.*$//; # remove comment
$l = &replaceSpecialChar($l);
next if ($l eq "");
my $use_curl = 0;
if ($l =~ s/^\s*UseCurl\s*//) {
$use_curl = 1;
}
- if (! defined($rUrls->{$l} )) {
+ if (!defined($rUrls->{$l})) {
$rUrls->{$l} = {$file => $line, count => 1, use_curl => $use_curl};
}
}
close(ULIST);
}
-sub parse_file($)
-{
- my($f) = @_;
- my $status = "out"; # outside of URL/href
+sub parse_file($) {
+ my ($f) = @_;
+ my $status = "out"; # outside of URL/href
return if ($f =~ /\/attic\//);
- if(open(FI, $f)) {
+ if (open(FI, $f)) {
my $line = 0;
-    while(my $l = <FI>) {
+    while (my $l = <FI>) {
$line++;
- $l =~ s/[\r\n]+$//; # Simulate chomp
+ $l =~ s/[\r\n]+$//; # Simulate chomp
if ($status eq "out") {
- # searching for "\begin_inset Flex URL"
- if($l =~ /^\s*\\begin_inset\s+Flex\s+URL\s*$/) {
- $status = "inUrlInset";
- }
- elsif ($l =~ /^\s*\\begin_inset\s+CommandInset\s+href\s*$/) {
- $status = "inHrefInset";
- }
- else {
- # Outside of url, check also
- if ($l =~ /"((ftp|http|https):\/\/[^ ]+)"/) {
- my $url = $1;
- handle_url($url, $f, "x$line");
- }
- }
+
+ # searching for "\begin_inset Flex URL"
+ if ($l =~ /^\s*\\begin_inset\s+Flex\s+URL\s*$/) {
+ $status = "inUrlInset";
+ }
+ elsif ($l =~ /^\s*\\begin_inset\s+CommandInset\s+href\s*$/) {
+ $status = "inHrefInset";
+ }
+ else {
+ # Outside of url, check also
+ if ($l =~ /"((ftp|http|https):\/\/[^ ]+)"/) {
+ my $url = $1;
+ handle_url($url, $f, "x$line");
+ }
+ }
}
else {
- if($l =~ /^\s*\\end_(layout|inset)\s*$/) {
- $status = "out";
- }
- elsif ($status eq "inUrlInset") {
- if ($l =~ /\s*([a-z]+:\/\/.+)\s*$/) {
- my $url = $1;
- $status = "out";
- handle_url($url, $f, "u$line");
- }
- }
- elsif ($status eq "inHrefInset") {
- if ($l =~ /^target\s+"([a-z]+:\/\/[^ ]+)"$/) {
- my $url = $1;
- $status = "out";
- handle_url($url, $f, "h$line");
- }
- }
+ if ($l =~ /^\s*\\end_(layout|inset)\s*$/) {
+ $status = "out";
+ }
+ elsif ($status eq "inUrlInset") {
+ if ($l =~ /\s*([a-z]+:\/\/.+)\s*$/) {
+ my $url = $1;
+ $status = "out";
+ handle_url($url, $f, "u$line");
+ }
+ }
+ elsif ($status eq "inHrefInset") {
+ if ($l =~ /^target\s+"([a-z]+:\/\/[^ ]+)"$/) {
+ my $url = $1;
+ $status = "out";
+ handle_url($url, $f, "h$line");
+ }
+ }
}
}
close(FI);
}
}
-sub handle_url($$$)
-{
- my($url, $f, $line) = @_;
+sub handle_url($$$) {
+ my ($url, $f, $line) = @_;
$url = &replaceSpecialChar($url);
- if(!defined($URLS{$url})) {
+ if (!defined($URLS{$url})) {
$URLS{$url} = {};
}
- if(!defined($URLS{$url}->{$f})) {
+ if (!defined($URLS{$url}->{$f})) {
$URLS{$url}->{$f} = [];
}
push(@{$URLS{$url}->{$f}}, $line);
}
+
+sub getnrjobs($$$) {
+ my ($tabsize, $actualidx, $nr_jobs) = @_;
+ my $maxidx = $tabsize - 1;
+ my $remaining = $maxidx - $actualidx;
+ if ($remaining <= 0) {
+ return (1);
+ }
+ if ($nr_jobs < 2) {
+ return ($remaining);
+ }
+ my $diff = 1 + int($remaining / (2 * $nr_jobs));
+ return $diff;
+}