#! /usr/bin/env perl # -*- mode: perl; -*- # # file search_url.pl # script to search for url's in lyxfiles # and testing their validity. # # Syntax: search_url.pl [(filesToScan|(ignored|reverted|extra|selected)URLS)={path_to_control]* # Param value is a path to a file containing list of xxx: # filesToScan={xxx = lyx-file-names to be scanned for} # ignoredURLS={xxx = urls that are discarded from test} # revertedURLS={xxx = urls that should fail, to test the test with invalid urls} # extraURLS={xxx = urls which should be also checked} # # This file is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public # License along with this software; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Copyright (c) 2013 Kornel Benko # (c) 2013 Scott Kostyshak use strict; BEGIN { use File::Spec; my $p = File::Spec->rel2abs(__FILE__); $p =~ s/[\/\\]?[^\/\\]+$//; unshift(@INC, "$p"); } use CheckURL; $ENV{LC_ALL} = "en_US.UTF-8"; $ENV{LANG} = "en_US.UTF-8"; $ENV{LANGUAGE} = "en_US.UTF-8"; my %URLS = (); my %ignoredURLS = (); my %revertedURLS = (); my %extraURLS = (); my %selectedURLS = (); my $checkSelectedOnly = 0; for my $arg (@ARGV) { die("Bad argument \"$arg\"") if ($arg !~ /=/); my ($type,$val) = split("=", $arg); if ($type eq "filesToScan") { #The file should be a list of files to search in if (open(FLIST, $val)) { while (my $l = ) { chomp($l); &parse_file($l); } close(FLIST); } } elsif ($type eq "ignoredURLS") { &readUrls($val, \%ignoredURLS); } elsif ($type eq "revertedURLS") { &readUrls($val, \%revertedURLS); } elsif ($type eq "extraURLS") { &readUrls($val, \%extraURLS); } elsif ($type eq "selectedURLS") { $checkSelectedOnly = 1; &readUrls($val, \%selectedURLS); } else { die("Invalid argument \"$arg\""); } } my @urls = sort keys %URLS, keys %extraURLS; my $errorcount = 0; my $URLScount = 0; for my $u (@urls) { if (defined($selectedURLS{$u})) { ${selectedURLS}{$u}->{count} += 1; } if (defined($ignoredURLS{$u})) { $ignoredURLS{$u}->{count} += 1; next; } next if ($checkSelectedOnly && ! defined($selectedURLS{$u})); $URLScount++; print "Checking '$u'"; my $res = &check_url($u); if ($res) { print ": Failed\n"; } else { print ": OK\n"; } my $printSourceFiles = 0; my $err_txt = "Error url:"; if ($res || $checkSelectedOnly) { $printSourceFiles = 1; } if ($res && defined($revertedURLS{$u})) { $err_txt = "Failed url:"; } $res = ! $res if (defined($revertedURLS{$u})); if ($res || $checkSelectedOnly) { print "$err_txt \"$u\"\n"; } if ($printSourceFiles) { if (defined($URLS{$u})) { for my $f(sort keys %{$URLS{$u}}) { my $lines = ":" . join(',', @{$URLS{$u}->{$f}}); print " $f$lines\n"; } } if ($res ) { $errorcount++; } } } if (%URLS) { &printNotUsedURLS("Ignored", \%ignoredURLS); &printNotUsedURLS("Selected", \%selectedURLS); &printNotUsedURLS("KnownInvalid", \%extraURLS); } print "\n$errorcount URL-tests failed out of $URLScount\n\n"; exit($errorcount); ############################################################################### sub printNotUsedURLS($$) { my ($txt, $rURLS) = @_; my @msg = (); for my $u ( sort keys %{$rURLS}) { if ($rURLS->{$u}->{count} < 2) { my @submsg = (); for my $f (sort keys %{$rURLS->{$u}}) { next if ($f eq "count"); push(@submsg, "$f:" . $rURLS->{$u}->{$f}); } push(@msg, "\n $u\n " . join("\n ", @submsg) . "\n"); } } if (@msg) { print "\n$txt URLs not found in sources: " . join(' ',@msg) . "\n"; } } sub readUrls($$) { my ($file, $rUrls) = @_; die("Could not read file $file") if (! open(ULIST, $file)); my $line = 0; while (my $l = ) { $line++; $l =~ s/[\r\n]+$//; # remove eol $l =~ s/\s*\#.*$//; # remove comment next if ($l eq ""); if (! defined($rUrls->{$l} )) { $rUrls->{$l} = {$file => $line, count => 1}; } } close(ULIST); } sub parse_file($) { my($f) = @_; my $status = "out"; # outside of URL/href return if ($f =~ /\/attic\//); if(open(FI, $f)) { my $line = 0; while(my $l = ) { $line++; $l =~ s/[\r\n]+$//; # Simulate chomp if ($status eq "out") { # searching for "\begin_inset Flex URL" if($l =~ /^\s*\\begin_inset\s+Flex\s+URL\s*$/) { $status = "inUrlInset"; } elsif ($l =~ /^\s*\\begin_inset\s+CommandInset\s+href\s*$/) { $status = "inHrefInset"; } else { # Outside of url, check also if ($l =~ /"((ftp|http|https):\/\/[^ ]+)"/) { my $url = $1; &handle_url($url, $f, "x$line"); } } } else { if($l =~ /^\s*\\end_(layout|inset)\s*$/) { $status = "out"; } elsif ($status eq "inUrlInset") { if ($l =~ /\s*([a-z]+:\/\/.+)\s*$/) { my $url = $1; $status = "out"; &handle_url($url, $f, "u$line"); } } elsif ($status eq "inHrefInset") { if ($l =~ /^target\s+"([a-z]+:\/\/[^ ]+)"$/) { my $url = $1; $status = "out"; &handle_url($url, $f, "h$line"); } } } } close(FI); } } sub handle_url($$$) { my($url, $f, $line) = @_; if(!defined($URLS{$url})) { $URLS{$url} = {}; $URLS{$url}->{$f} = []; } push(@{$URLS{$url}->{$f}}, $line); }