Cmake build tests: check urls optimized

The original test took about ~50 minutes. Making the checks in parallel,
now it takes about 150 seconds.
This commit is contained in:
Kornel Benko 2024-10-06 18:04:00 +02:00
parent a16e4b206a
commit 3f48486a5c
4 changed files with 310 additions and 203 deletions

View File

@ -1,5 +1,6 @@
# -*- mode: perl; -*- # -*- mode: perl; -*-
package CheckURL; package CheckURL;
# file CheckURL.pm # file CheckURL.pm
# #
# This file is part of LyX, the document processor. # This file is part of LyX, the document processor.
@ -13,7 +14,8 @@ package CheckURL;
# #
use strict; use strict;
our(@EXPORT, @ISA); our (@EXPORT, @ISA);
BEGIN { BEGIN {
use Exporter (); use Exporter ();
@ISA = qw(Exporter); @ISA = qw(Exporter);
@ -25,16 +27,18 @@ sub check_http_url($$$$);
sub check_ftp_dir_entry($$); sub check_ftp_dir_entry($$);
sub check_ftp_url($$$$); sub check_ftp_url($$$$);
sub check_unknown_url($$$$); sub check_unknown_url($$$$);
sub check_url($$); sub check_url($$$$);
################ ################
sub check_http_url($$$$) my $fe;
{ my $fs;
sub check_http_url($$$$) {
require LWP::UserAgent; require LWP::UserAgent;
my ($protocol, $host, $path, $file) = @_; my ($protocol, $host, $path, $file) = @_;
my $ua = LWP::UserAgent->new; my $ua = LWP::UserAgent->new(timeout => 20);
my $getp = "/"; my $getp = "/";
if ($path ne "") { if ($path ne "") {
$getp .= $path; $getp .= $path;
@ -54,7 +58,7 @@ sub check_http_url($$$$)
$buf = $response->decoded_content; $buf = $response->decoded_content;
} }
else { else {
print " " . $response->status_line . ": "; print $fe " " . $response->status_line . ": ";
return 3; return 3;
} }
my @title = (); my @title = ();
@ -66,9 +70,9 @@ sub check_http_url($$$$)
$title =~ s/^ //; $title =~ s/^ //;
$title =~ s/ $//; $title =~ s/ $//;
push(@title, $title); push(@title, $title);
print "title = \"$title\": "; print $fe "title = \"$title\": ";
if ($title =~ /Error 404|Not Found/) { if ($title =~ /Error 404|Not Found/) {
print " Page reports 'Not Found' from \"$protocol://$host$getp\": "; print $fe " Page reports 'Not Found' from \"$protocol://$host$getp\": ";
$res = 3; $res = 3;
} }
} }
@ -79,39 +83,39 @@ sub check_http_url($$$$)
# returns 0, x if file does not match entry # returns 0, x if file does not match entry
# 1, x everything OK # 1, x everything OK
# 2, x if not accesible (permission) # 2, x if not accesible (permission)
sub check_ftp_dir_entry($$) sub check_ftp_dir_entry($$) {
{
my ($file, $e) = @_; my ($file, $e) = @_;
my $other = '---'; my $other = '---';
my $isdir = 0; my $isdir = 0;
#print "Checking '$file' against '$e'\n"; #print $fe "Checking '$file' against '$e'\n";
$file =~ s/^\///; $file =~ s/^\///;
$isdir = 1 if ($e =~ /^d/); $isdir = 1 if ($e =~ /^d/);
return(0,$isdir) if ($e !~ /\s$file$/); return (0, $isdir) if ($e !~ /\s$file$/);
if ($e =~ /^.[-rwx]{6}([-rwx]{3})\s/) { if ($e =~ /^.[-rwx]{6}([-rwx]{3})\s/) {
$other = $1; $other = $1;
} }
else { else {
#print "Invalid entry\n"; #print $fe "Invalid entry\n";
# Invalid entry # Invalid entry
return(0,$isdir); return (0, $isdir);
} }
return(2,$isdir) if ($other !~ /^r/); # not readable return (2, $isdir) if ($other !~ /^r/); # not readable
if ($isdir) { if ($isdir) {
#return(2,$isdir) if ($other !~ /x$/); # directory, but not executable #return(2,$isdir) if ($other !~ /x$/); # directory, but not executable
} }
return(1,$isdir); return (1, $isdir);
} }
sub check_ftp2_url($$$$) sub check_ftp2_url($$$$) {
{
my ($protocol, $host, $path, $file) = @_; my ($protocol, $host, $path, $file) = @_;
my $checkentry = 1; my $checkentry = 1;
print "\nhost $host\n";
print "path $path\n"; #print $fe "\nhost $host\n";
print "file $file\n"; #print $fe "path $path\n";
#print $fe "file $file\n";
my $url = "$protocol://$host"; my $url = "$protocol://$host";
$path =~ s/\/$//; $path =~ s/\/$//;
if (defined($file)) { if (defined($file)) {
@ -120,7 +124,8 @@ sub check_ftp2_url($$$$)
else { else {
$url = "$url/$path/."; $url = "$url/$path/.";
} }
print "curl $url, file = $file\n";
#print $fe "curl $url, file = $file\n";
my %listfiles = (); my %listfiles = ();
if (open(FFTP, "curl --anyauth -l $url|")) { if (open(FFTP, "curl --anyauth -l $url|")) {
while (my $l = <FFTP>) { while (my $l = <FFTP>) {
@ -130,44 +135,44 @@ sub check_ftp2_url($$$$)
close(FFTP); close(FFTP);
} }
if (%listfiles) { if (%listfiles) {
if (! defined($file)) { if (!defined($file)) {
return(0, "OK"); return (0, "OK");
} }
elsif (defined($listfiles{$file})) { elsif (defined($listfiles{$file})) {
return(0, "OK"); return (0, "OK");
} }
elsif (defined($listfiles{"ftpinfo.txt"})) { elsif (defined($listfiles{"ftpinfo.txt"})) {
return(0, "Probably a directory"); return (0, "Probably a directory");
} }
else { else {
return(1, "Not found"); return (1, "Not found");
} }
} }
else { else {
return(1, "Error"); return (1, "Error");
} }
} }
sub check_ftp_url($$$$) sub check_ftp_url($$$$) {
{
use Net::FTP; use Net::FTP;
my ($protocol, $host, $path, $file) = @_; my ($protocol, $host, $path, $file) = @_;
my $res = 0; my $res = 0;
my $message = ""; my $message = "";
my $ftp = Net::FTP->new($host, Debug => 0, Timeout => 120); my $ftp = Net::FTP->new($host, Debug => 0, Timeout => 20);
if(!$ftp) { if (!$ftp) {
return(3,"Cannot connect to $host"); return (3, "Cannot connect to $host");
} }
if (! $ftp->login("anonymous",'-anonymous@')) { if (!$ftp->login("anonymous", '-anonymous@')) {
$message = $ftp->message; $message = $ftp->message;
$res = 3; $res = 3;
} }
else { else {
my $rEntries; my $rEntries;
if ($path ne "") { if ($path ne "") {
#print "Path = $path\n";
#print $fe "Path = $path\n";
#if (!$ftp->cwd($path)) { #if (!$ftp->cwd($path)) {
# $message = $ftp->message; # $message = $ftp->message;
# $res = 3; # $res = 3;
@ -177,41 +182,43 @@ sub check_ftp_url($$$$)
else { else {
$rEntries = $ftp->dir(); $rEntries = $ftp->dir();
} }
if (! $rEntries) { if (!$rEntries) {
$res = 3; $res = 3;
$message = "Could not read directory \"$path\""; $message = "Could not read directory \"$path\"";
} }
elsif (defined($file)) { elsif (defined($file)) {
my $found = 0; my $found = 0;
my $found2 = 0; my $found2 = 0;
for my $f ( @{$rEntries}) { for my $f (@{$rEntries}) {
#print "Entry: $path $f\n";
my ($res1,$isdir) = check_ftp_dir_entry($file,$f); #print $fe "Entry: $path $f\n";
my ($res1, $isdir) = check_ftp_dir_entry($file, $f);
if ($res1 == 1) { if ($res1 == 1) {
$found = 1; $found = 1;
last; last;
} }
elsif ($res1 == 2) { elsif ($res1 == 2) {
# found, but not accessible # found, but not accessible
$found2 = 1; $found2 = 1;
$message = "Permission denied for '$file'"; $message = "Permission denied for '$file'";
} }
} }
if (! $found) { if (!$found) {
$res = 4; $res = 4;
if (! $found2) { if (!$found2) {
$message = "File or directory '$file' not found"; $message = "File or directory '$file' not found";
} }
} }
} }
} }
$ftp->quit; $ftp->quit;
#print "returning ($res,$message)\n";
return($res, $message); #print $fe "returning ($res,$message)\n";
return ($res, $message);
} }
sub check_unknown_url($$$$) sub check_unknown_url($$$$) {
{
use LWP::Simple; use LWP::Simple;
my ($protocol, $host, $path, $file) = @_; my ($protocol, $host, $path, $file) = @_;
@ -226,29 +233,32 @@ sub check_unknown_url($$$$)
$url .= "/$path"; $url .= "/$path";
} }
} }
if(defined($file)) { if (defined($file)) {
#print "Trying $url$file\n";
#print $fe "Trying $url$file\n";
$res = head("$url/$file"); $res = head("$url/$file");
if(! $res) { if (!$res) {
# try to check for directory '/'; # try to check for directory '/';
#print "Trying $url$file/\n"; #print $fe "Trying $url$file/\n";
$res = head("$url/$file/"); $res = head("$url/$file/");
} }
} }
else { else {
#print "Trying $url\n"; #print $fe "Trying $url\n";
$res = head($url); $res = head($url);
} }
return(! $res); return (!$res);
} }
# #
# Main entry # Main entry
sub check_url($$) sub check_url($$$$) {
{ my ($url, $use_curl, $fex, $fsx) = @_;
my($url,$use_curl) = @_; $fe = $fex;
$fs = $fsx;
my $file = undef; my $file = undef;
my ($protocol,$host,$path); my ($protocol, $host, $path);
my $res = 0; my $res = 0;
@ -258,9 +268,10 @@ sub check_url($$)
$host = $2; $host = $2;
$path = $3; $path = $3;
$path =~ s/^\///; $path =~ s/^\///;
if($path =~ s/\/([^\/]+)$//) { if ($path =~ s/\/([^\/]+)$//) {
$file = $1; $file = $1;
if($file =~ / /) { if ($file =~ / /) {
# Filename contains ' ', maybe invalid. Don't check # Filename contains ' ', maybe invalid. Don't check
$file = undef; $file = undef;
} }
@ -268,7 +279,7 @@ sub check_url($$)
} }
} }
else { else {
print " Invalid url '$url'"; print $fe " Invalid url '$url'";
return 2; return 2;
} }
if ($protocol =~ /^https?$/) { if ($protocol =~ /^https?$/) {
@ -286,7 +297,7 @@ sub check_url($$)
} }
else { else {
# it never should reach this point # it never should reach this point
print " What protocol is '$protocol'?"; print $fe " What protocol is '$protocol'?";
$res = check_unknown_url($protocol, $host, $path, $file); $res = check_unknown_url($protocol, $host, $path, $file);
return $res; return $res;
} }

View File

@ -2,3 +2,4 @@ http://www.uon.edu/doe
ftp://www.test.test ftp://www.test.test
http://www.test.test http://www.test.test
#proto://host.xx.ab/abcd #proto://host.xx.ab/abcd
http://example.com/%20foo

View File

@ -10,6 +10,7 @@ http://jasa.peerx-press.org/html/jasa/Using_LaTeX
http://spie.org/app/Publications/index.cfm?fuseaction=authinfo&type=proceedings http://spie.org/app/Publications/index.cfm?fuseaction=authinfo&type=proceedings
http://www.jstatsoft.org/downloads/JSSstyle.zip http://www.jstatsoft.org/downloads/JSSstyle.zip
http://www.photogrammetry.ethz.ch/tarasp_workshop/isprs.cls http://www.photogrammetry.ethz.ch/tarasp_workshop/isprs.cls
https://journals.aps.org/revtex
# The following ftp url is correct, but # The following ftp url is correct, but
# ftp commands like 'dir', 'get' do not work. # ftp commands like 'dir', 'get' do not work.

View File

@ -38,6 +38,8 @@ BEGIN {
unshift(@INC, "$p"); unshift(@INC, "$p");
} }
use warnings;
use Cwd qw(abs_path);
use CheckURL; use CheckURL;
use Try::Tiny; use Try::Tiny;
use locale; use locale;
@ -46,12 +48,17 @@ use POSIX qw(locale_h);
setlocale(LC_CTYPE, ""); setlocale(LC_CTYPE, "");
setlocale(LC_MESSAGES, "en_US.UTF-8"); setlocale(LC_MESSAGES, "en_US.UTF-8");
use File::Temp qw/ tempfile tempdir /;
use File::Spec;
use Fcntl qw(:flock SEEK_END);
# Prototypes # Prototypes
sub printNotUsedURLS($\%); sub printNotUsedURLS($\%);
sub replaceSpecialChar($); sub replaceSpecialChar($);
sub readUrls($\%); sub readUrls($\%);
sub parse_file($ ); sub parse_file($ );
sub handle_url($$$ ); sub handle_url($$$ );
sub printx($$$$);
########## ##########
my %URLS = (); my %URLS = ();
@ -65,8 +72,9 @@ my $summaryFile = undef;
my $checkSelectedOnly = 0; my $checkSelectedOnly = 0;
for my $arg (@ARGV) { for my $arg (@ARGV) {
die("Bad argument \"$arg\"") if ($arg !~ /=/); die("Bad argument \"$arg\"") if ($arg !~ /=/);
my ($type,$val) = split("=", $arg); my ($type, $val) = split("=", $arg);
if ($type eq "filesToScan") { if ($type eq "filesToScan") {
#The file should be a list of files to search in #The file should be a list of files to search in
if (open(FLIST, $val)) { if (open(FLIST, $val)) {
while (my $l = <FLIST>) { while (my $l = <FLIST>) {
@ -103,6 +111,8 @@ for my $arg (@ARGV) {
} }
my @urls = sort keys %URLS, keys %extraURLS; my @urls = sort keys %URLS, keys %extraURLS;
my @testvals = ();
# Tests # Tests
#my @urls = ("ftp://ftp.edpsciences.org/pub/aa/readme.html", "ftp://ftp.springer.de/pub/tex/latex/compsc/proc/author"); #my @urls = ("ftp://ftp.edpsciences.org/pub/aa/readme.html", "ftp://ftp.springer.de/pub/tex/latex/compsc/proc/author");
my $errorcount = 0; my $errorcount = 0;
@ -126,14 +136,76 @@ for my $u (@urls) {
if (defined($selectedURLS{$u})) { if (defined($selectedURLS{$u})) {
${selectedURLS}{$u}->{count} += 1; ${selectedURLS}{$u}->{count} += 1;
} }
next if ($checkSelectedOnly && ! defined($selectedURLS{$u})); next if ($checkSelectedOnly && !defined($selectedURLS{$u}));
$URLScount++; $URLScount++;
print "Checking '$u': "; push(@testvals, {u => $u, use_curl => $use_curl,});
}
# Ready to go multitasking
my ($vol, $dir, $file) = File::Spec->splitpath($summaryFile);
my $tempdir = tempdir("$dir/CounterXXXXXXX", CLEANUP => 1);
my $countfile = "$tempdir/counter";
my $counter = 0;
if (open(my $FO, '>', $countfile)) {
print {$FO} $counter;
close($FO);
}
else {
unlink($countfile);
die("Could not write to $countfile");
}
print "Using tempdir \"" . abs_path($tempdir) . "\"\n";
my @wait = ();
for (my $i = 0; $i < 10; $i++) { # Number of subprocesses
my $pid = fork();
if ($pid == 0) {
# I am child
open(my $fe, '>:encoding(UTF-8)', "$tempdir/xxxError$i");
open(my $fs, '>:encoding(UTF-8)', "$tempdir/xxxSum$i");
while (1) {
open(my $fh, '+<', $countfile) or die("cannot open $countfile");
flock($fh, LOCK_EX) or die "$i: Cannot lock $countfile - $!\n";
my $l = <$fh>; # get actual count number
my $diff = undef;
if (defined($testvals[$l + 150])) {
$diff = 5;
}
elsif (defined($testvals[$l + 50])) {
$diff = 3;
}
elsif (defined($testvals[$l + 20])) {
$diff = 2;
}
elsif (defined($testvals[$l])) {
$diff = 1;
}
else {
close($fs);
print $fe "NumberOfErrors $errorcount\n";
close($fe);
exit(0);
}
my $next = $l + $diff;
seek($fh, 0, 0);
truncate($fh, 0);
print $fh $next;
close($fh);
for (my $i = 0; $i < $diff; $i++) {
my $entryidx = $l + $i;
my $rentry = $testvals[$entryidx];
next if (!defined($rentry));
my $u = $rentry->{u};
my $use_curl = $rentry->{use_curl};
print $fe "Checking($entryidx) '$u': ";
my ($res, $prnt, $outSum); my ($res, $prnt, $outSum);
try { try {
$res = check_url($u, $use_curl); $res = check_url($u, $use_curl, $fe, $fs);
if ($res) { if ($res) {
print "Failed\n"; print $fe "Failed\n";
$prnt = ""; $prnt = "";
$outSum = 1; $outSum = 1;
} }
@ -147,7 +219,7 @@ for my $u (@urls) {
$outSum = 1; $outSum = 1;
$res = 700; $res = 700;
}; };
printx("$prnt", $outSum); printx("$prnt", $outSum, $fe, $fs);
my $printSourceFiles = 0; my $printSourceFiles = 0;
my $err_txt = "Error url:"; my $err_txt = "Error url:";
@ -157,23 +229,50 @@ for my $u (@urls) {
if ($res && defined($revertedURLS{$u})) { if ($res && defined($revertedURLS{$u})) {
$err_txt = "Failed url:"; $err_txt = "Failed url:";
} }
$res = ! $res if (defined($revertedURLS{$u})); $res = !$res if (defined($revertedURLS{$u}));
if ($res || $checkSelectedOnly) { if ($res || $checkSelectedOnly) {
printx("$err_txt \"$u\"\n", $outSum); printx("$err_txt \"$u\"\n", $outSum, $fe, $fs);
} }
if ($printSourceFiles) { if ($printSourceFiles) {
if (defined($URLS{$u})) { if (defined($URLS{$u})) {
for my $f(sort keys %{$URLS{$u}}) { for my $f (sort keys %{$URLS{$u}}) {
my $lines = ":" . join(',', @{$URLS{$u}->{$f}}); my $lines = ":" . join(',', @{$URLS{$u}->{$f}});
printx(" $f$lines\n", $outSum); printx(" $f$lines\n", $outSum, $fe, $fs);
} }
} }
if ($res ) { if ($res) {
$errorcount++; $errorcount++;
} }
} }
}
}
}
$wait[$i] = $pid;
} }
for (my $i = 0; $i < 10; $i++) {
my $p = $wait[$i];
if ($p > 0) {
waitpid($p, 0);
open(my $fe, '<', "$tempdir/xxxError$i");
while (my $l = <$fe>) {
if ($l =~ /^NumberOfErrors\s(\d+)/) {
$errorcount += $1;
}
else {
print $l;
}
}
close($fe);
open(my $fs, '<', "$tempdir/xxxSum$i");
while (my $l = <$fs>) {
print SFO $l;
}
close($fs);
}
}
unlink($countfile);
if (%URLS) { if (%URLS) {
printNotUsedURLS("Ignored", %ignoredURLS); printNotUsedURLS("Ignored", %ignoredURLS);
printNotUsedURLS("Selected", %selectedURLS); printNotUsedURLS("Selected", %selectedURLS);
@ -187,20 +286,18 @@ if (defined($summaryFile)) {
exit($errorcount); exit($errorcount);
############################################################################### ###############################################################################
sub printx($$) sub printx($$$$) {
{ my ($txt, $outSum, $fe, $fs) = @_;
my ($txt, $outSum) = @_; print $fe "$txt";
print "$txt";
if ($outSum && defined($summaryFile)) { if ($outSum && defined($summaryFile)) {
print SFO "$txt"; print $fs "$txt";
} }
} }
sub printNotUsedURLS($\%) sub printNotUsedURLS($\%) {
{
my ($txt, $rURLS) = @_; my ($txt, $rURLS) = @_;
my @msg = (); my @msg = ();
for my $u ( sort keys %{$rURLS}) { for my $u (sort keys %{$rURLS}) {
if ($rURLS->{$u}->{count} < 2) { if ($rURLS->{$u}->{count} < 2) {
my @submsg = (); my @submsg = ();
for my $f (sort keys %{$rURLS->{$u}}) { for my $f (sort keys %{$rURLS->{$u}}) {
@ -211,22 +308,20 @@ sub printNotUsedURLS($\%)
} }
} }
if (@msg) { if (@msg) {
print "\n$txt URLs not found in sources: " . join(' ',@msg) . "\n"; print "\n$txt URLs not found in sources: " . join(' ', @msg) . "\n";
} }
} }
sub replaceSpecialChar($) sub replaceSpecialChar($) {
{
my ($l) = @_; my ($l) = @_;
$l =~ s/\\SpecialChar(NoPassThru)?\s*(TeX|LaTeX|LyX)[\s]?/\2/; $l =~ s/\\SpecialChar(NoPassThru)?\s*(TeX|LaTeX|LyX)[\s]?/\2/;
return($l); return ($l);
} }
sub readUrls($\%) sub readUrls($\%) {
{
my ($file, $rUrls) = @_; my ($file, $rUrls) = @_;
die("Could not read file $file") if (! open(ULIST, $file)); die("Could not read file $file") if (!open(ULIST, $file));
my $line = 0; my $line = 0;
while (my $l = <ULIST>) { while (my $l = <ULIST>) {
$line++; $line++;
@ -238,27 +333,27 @@ sub readUrls($\%)
if ($l =~ s/^\s*UseCurl\s*//) { if ($l =~ s/^\s*UseCurl\s*//) {
$use_curl = 1; $use_curl = 1;
} }
if (! defined($rUrls->{$l} )) { if (!defined($rUrls->{$l})) {
$rUrls->{$l} = {$file => $line, count => 1, use_curl => $use_curl}; $rUrls->{$l} = {$file => $line, count => 1, use_curl => $use_curl};
} }
} }
close(ULIST); close(ULIST);
} }
sub parse_file($) sub parse_file($) {
{ my ($f) = @_;
my($f) = @_;
my $status = "out"; # outside of URL/href my $status = "out"; # outside of URL/href
return if ($f =~ /\/attic\//); return if ($f =~ /\/attic\//);
if(open(FI, $f)) { if (open(FI, $f)) {
my $line = 0; my $line = 0;
while(my $l = <FI>) { while (my $l = <FI>) {
$line++; $line++;
$l =~ s/[\r\n]+$//; # Simulate chomp $l =~ s/[\r\n]+$//; # Simulate chomp
if ($status eq "out") { if ($status eq "out") {
# searching for "\begin_inset Flex URL" # searching for "\begin_inset Flex URL"
if($l =~ /^\s*\\begin_inset\s+Flex\s+URL\s*$/) { if ($l =~ /^\s*\\begin_inset\s+Flex\s+URL\s*$/) {
$status = "inUrlInset"; $status = "inUrlInset";
} }
elsif ($l =~ /^\s*\\begin_inset\s+CommandInset\s+href\s*$/) { elsif ($l =~ /^\s*\\begin_inset\s+CommandInset\s+href\s*$/) {
@ -273,7 +368,7 @@ sub parse_file($)
} }
} }
else { else {
if($l =~ /^\s*\\end_(layout|inset)\s*$/) { if ($l =~ /^\s*\\end_(layout|inset)\s*$/) {
$status = "out"; $status = "out";
} }
elsif ($status eq "inUrlInset") { elsif ($status eq "inUrlInset") {
@ -296,15 +391,14 @@ sub parse_file($)
} }
} }
sub handle_url($$$) sub handle_url($$$) {
{ my ($url, $f, $line) = @_;
my($url, $f, $line) = @_;
$url = &replaceSpecialChar($url); $url = &replaceSpecialChar($url);
if(!defined($URLS{$url})) { if (!defined($URLS{$url})) {
$URLS{$url} = {}; $URLS{$url} = {};
} }
if(!defined($URLS{$url}->{$f})) { if (!defined($URLS{$url}->{$f})) {
$URLS{$url}->{$f} = []; $URLS{$url}->{$f} = [];
} }
push(@{$URLS{$url}->{$f}}, $line); push(@{$URLS{$url}->{$f}}, $line);