2013-08-28 10:17:40 +00:00
|
|
|
#! /usr/bin/env perl
|
|
|
|
# -*- mode: perl; -*-
|
|
|
|
#
|
|
|
|
# file search_url.pl
|
|
|
|
# script to search for URLs in LyX files
|
|
|
|
# and testing their validity.
|
|
|
|
#
|
|
|
|
# Syntax: search_url.pl [(filesToScan|(ignored|reverted|extra|selected|knownToRegister)URLS|summaryFile)={path_to_control}]*
|
|
|
|
# Param value is a path to a file containing list of xxx:
|
|
|
|
# filesToScan={xxx = lyx-file-names to be scanned for}
|
|
|
|
# ignoredURLS={xxx = urls that are discarded from test}
|
|
|
|
# revertedURLS={xxx = urls that should fail, to test the test with invalid urls}
|
|
|
|
# extraURLS={xxx = urls which should be also checked}
|
|
|
|
#
|
|
|
|
# This file is free software; you can redistribute it and/or
|
|
|
|
# modify it under the terms of the GNU General Public
|
|
|
|
# License as published by the Free Software Foundation; either
|
|
|
|
# version 2 of the License, or (at your option) any later version.
|
|
|
|
#
|
|
|
|
# This software is distributed in the hope that it will be useful,
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
# General Public License for more details.
|
|
|
|
#
|
|
|
|
# You should have received a copy of the GNU General Public
|
|
|
|
# License along with this software; if not, write to the Free Software
|
|
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
#
|
|
|
|
# Copyright (c) 2013 Kornel Benko <kornel@lyx.org>
|
|
|
|
# (c) 2013 Scott Kostyshak <skotysh@lyx.org>
|
|
|
|
|
|
|
|
use strict;
|
2024-10-30 10:08:31 +00:00
|
|
|
use warnings;
|
2013-08-28 10:17:40 +00:00
|
|
|
|
2024-10-06 16:04:00 +00:00
|
|
|
BEGIN {
|
2013-08-28 10:17:40 +00:00
|
|
|
use File::Spec;
|
|
|
|
my $p = File::Spec->rel2abs(__FILE__);
|
|
|
|
$p =~ s/[\/\\]?[^\/\\]+$//;
|
|
|
|
unshift(@INC, "$p");
|
|
|
|
}
|
|
|
|
|
2024-10-06 16:04:00 +00:00
|
|
|
use Cwd qw(abs_path);
|
2013-08-28 10:17:40 +00:00
|
|
|
use CheckURL;
|
2014-01-08 12:36:30 +00:00
|
|
|
use Try::Tiny;
|
|
|
|
use locale;
|
|
|
|
use POSIX qw(locale_h);
|
2024-10-07 10:35:10 +00:00
|
|
|
use Readonly;
|
|
|
|
|
2024-10-30 10:08:31 +00:00
|
|
|
binmode(STDOUT, ":encoding(UTF-8)");
|
|
|
|
|
2024-10-07 10:35:10 +00:00
|
|
|
Readonly::Scalar my $NR_JOBS => 10;
|
2013-08-28 10:17:40 +00:00
|
|
|
|
2024-10-06 16:04:00 +00:00
|
|
|
setlocale(LC_CTYPE, "");
|
2014-01-08 12:36:30 +00:00
|
|
|
setlocale(LC_MESSAGES, "en_US.UTF-8");
|
2013-08-28 10:17:40 +00:00
|
|
|
|
2024-10-06 16:04:00 +00:00
|
|
|
use File::Temp qw/ tempfile tempdir /;
|
|
|
|
use File::Spec;
|
|
|
|
use Fcntl qw(:flock SEEK_END);
|
|
|
|
|
2014-01-24 10:00:07 +00:00
|
|
|
# Prototypes
|
|
|
|
sub printNotUsedURLS($\%);
|
2015-11-26 13:31:15 +00:00
|
|
|
sub replaceSpecialChar($);
|
2014-01-24 10:00:07 +00:00
|
|
|
sub readUrls($\%);
|
|
|
|
sub parse_file($ );
|
|
|
|
sub handle_url($$$ );
|
2024-10-06 16:04:00 +00:00
|
|
|
sub printx($$$$);
|
2024-10-07 10:35:10 +00:00
|
|
|
sub getnrjobs($$$);
|
2014-01-24 10:00:07 +00:00
|
|
|
##########
|
|
|
|
|
2024-10-06 16:04:00 +00:00
|
|
|
# Global state shared by the argument parser, the scheduler and the report
# helpers of this script.
my %URLS                = ();       # url => { source file => [line tags] }, filled by handle_url()
my %ignoredURLS         = ();       # urls that are skipped by the test
my %revertedURLS        = ();       # urls whose check result is inverted
my %extraURLS           = ();       # additional urls to be checked
my %selectedURLS        = ();       # if given, only these urls are checked
my %knownToRegisterURLS = ();       # urls skipped unless flagged with UseCurl
my %extraTestURLS       = ();       # derived urls which are already scheduled
my $summaryFile         = undef;    # path of the summary file, set once SFO is open

my $checkSelectedOnly = 0;

# Every argument has the form <type>=<path>.
for my $arg (@ARGV) {
  die("Bad argument \"$arg\"") if ($arg !~ /=/);

  # Split at the first '=' only, so that a path containing '=' stays intact.
  my ($type, $val) = split(/=/, $arg, 2);
  if ($type eq "filesToScan") {

    #The file should be a list of files to search in
    if (open(my $flist, '<', $val)) {
      while (my $l = <$flist>) {
        chomp($l);
        parse_file($l);
      }
      close($flist);
    }
    else {
      # Keep going (as before), but do not hide that nothing was scanned.
      warn("Could not read file list \"$val\": $!\n");
    }
  }
  elsif ($type eq "ignoredURLS") {
    readUrls($val, %ignoredURLS);
  }
  elsif ($type eq "revertedURLS") {
    readUrls($val, %revertedURLS);
  }
  elsif ($type eq "extraURLS") {
    readUrls($val, %extraURLS);
  }
  elsif ($type eq "selectedURLS") {
    $checkSelectedOnly = 1;
    readUrls($val, %selectedURLS);
  }
  elsif ($type eq "knownToRegisterURLS") {
    readUrls($val, %knownToRegisterURLS);
  }
  elsif ($type eq "summaryFile") {
    # SFO stays a bareword handle because other parts of the script print to it.
    if (open(SFO, '>:encoding(UTF8)', "$val")) {
      $summaryFile = $val;
    }
    else {
      # The location of the summary file is needed later for the scratch
      # directory, so fail here with a message naming the real cause.
      die("Could not write summary file \"$val\": $!");
    }
  }
  else {
    die("Invalid argument \"$arg\"");
  }
}
|
|
|
|
|
2024-10-06 16:04:00 +00:00
|
|
|
# Candidate urls: everything found in the scanned files plus the extra urls.
# A url present in both sets is kept once, so it is neither tested nor
# counted twice.
my %seenURLS = ();
my @urls     = grep { !$seenURLS{$_}++ } sort keys %URLS, keys %extraURLS;
my @testvals = ();    # work list: { u => url, use_curl => 0|1 [, extra => 1] }

# Tests
#my @urls = ("ftp://ftp.edpsciences.org/pub/aa/readme.html", "ftp://ftp.springer.de/pub/tex/latex/compsc/proc/author");
my $errorcount = 0;

my $URLScount = 0;

for my $u (@urls) {
  if (defined($ignoredURLS{$u})) {
    $ignoredURLS{$u}->{count} += 1;
    next;
  }
  my $use_curl = 0;
  if (defined($knownToRegisterURLS{$u})) {
    # Such urls are only tested when their list entry carries the UseCurl flag.
    next if (!$knownToRegisterURLS{$u}->{use_curl});
    $use_curl = 1;
  }
  if (defined($selectedURLS{$u})) {
    $selectedURLS{$u}->{count} += 1;
  }
  next if ($checkSelectedOnly && !defined($selectedURLS{$u}));
  $URLScount++;
  push(@testvals, {u => $u, use_curl => $use_curl,});

  # constructExtraTestUrl() is not defined in this file (presumably exported
  # by CheckURL). Its result goes into its own variable: $u aliases the
  # element of @urls and must not be overwritten.
  my $extra_url = constructExtraTestUrl($u);
  if ($extra_url ne $u) {
    if (!defined($selectedURLS{$extra_url})) {
      if (!defined($extraTestURLS{$extra_url})) {
        $extraTestURLS{$extra_url} = 1;    # omit multiple tests
        push(@testvals, {u => $extra_url, use_curl => $use_curl, extra => 1});
        $URLScount++;
      }
    }
  }
}
|
|
|
|
|
|
|
|
# Ready to go multitasking
# The scratch directory is created next to the summary file. Volume and
# directory are recombined with File::Spec, and an empty directory part (bare
# file name, or no summary file at all) maps to the current directory instead
# of producing a template in the file system root.
my ($vol, $dir, $file) = defined($summaryFile) ? File::Spec->splitpath($summaryFile) : ('', '', '');
my $tempbase = File::Spec->catpath($vol, $dir, '');
$tempbase = File::Spec->curdir() if ($tempbase eq '');
my $tempdir = tempdir("CounterXXXXXXX", DIR => $tempbase, CLEANUP => 1);

# The counter file holds the index of the next unclaimed entry of @testvals.
my $countfile = "$tempdir/counter";
my $counter   = 0;
if (open(my $FO, '>', $countfile)) {
  print {$FO} $counter;
  close($FO) or die("Could not write to $countfile");
}
else {
  unlink($countfile);
  die("Could not write to $countfile");
}

print "Using tempdir \"" . abs_path($tempdir) . "\"\n";
|
|
|
|
|
2024-10-09 17:17:16 +00:00
|
|
|
# Start up to $NR_JOBS worker processes. Every worker repeatedly claims a
# batch of entries of @testvals: it locks $countfile, reads the index of the
# next unclaimed entry, writes back the index behind its batch and releases
# the lock. Each worker logs to $tempdir/xxxError<i> and writes its summary
# lines to $tempdir/xxxSum<i>; the parent merges these files via readsublog().
my %wait = ();    # pid of a started worker => index <i> of its log files
for (my $i = 0; $i < $NR_JOBS; $i++) {    # Number of subprocesses
  my $pid = fork();
  if (!defined($pid)) {
    # fork() failed, there is no child. Without this check the undefined
    # value compares equal to 0, so the parent itself would run the worker
    # code below and finally call exit(0) without collecting any result.
    warn("Could not fork subprocess $i: $!\n");
    next;
  }
  if ($pid > 0) {
    $wait{$pid} = $i;
  }
  else {

    # I am child
    my $subprocess = $i;
    open(my $fe, '>:encoding(UTF-8)', "$tempdir/xxxError$i")
      or die("$i: Cannot write $tempdir/xxxError$i - $!\n");
    open(my $fs, '>:encoding(UTF-8)', "$tempdir/xxxSum$i")
      or die("$i: Cannot write $tempdir/xxxSum$i - $!\n");
    while (1) {
      open(my $fh, '+<', $countfile) or die("cannot open $countfile");
      flock($fh, LOCK_EX) or die "$i: Cannot lock $countfile - $!\n";
      my $start = <$fh>;    # get actual count number
      if (!defined($testvals[$start])) {
        # Nothing left to claim: report the error count of this worker as
        # the last line of its log and terminate.
        close($fs);
        print $fe "NumberOfErrors $errorcount\n";
        close($fe);
        exit(0);
      }
      my $diff = getnrjobs(scalar @testvals, $start, $NR_JOBS);
      if ($diff < 1) {
        $diff = 1;
      }
      my $next = $start + $diff;
      seek($fh, 0, 0);
      truncate($fh, 0);
      print $fh $next;
      close($fh);    # flushes the new index and releases the lock

      for (my $k = 0; $k < $diff; $k++) {
        my $entryidx = $start + $k;
        my $rentry   = $testvals[$entryidx];
        next if (!defined($rentry));
        my $u        = $rentry->{u};
        my $use_curl = $rentry->{use_curl};
        my $extra    = defined($rentry->{extra});

        print $fe "Checking($entryidx-$subprocess) '$u': time=" . time() . ' ';
        my ($res, $prnt, $outSum);
        try {
          $res = check_url($u, $use_curl, $fe, $fs);
          if ($res) {
            print $fe "Failed\n";
            $prnt   = "";
            $outSum = 1;
          }
          else {
            $prnt   = "OK\n";
            $outSum = 0;
          }
        }
        catch {
          $prnt   = "Failed, caught error: $_\n";
          $outSum = 1;
          $res    = 700;
        };
        printx("$prnt", $outSum, $fe, $fs);
        my $printSourceFiles = 0;
        my $err_txt;
        if ($extra) {
          $err_txt = "Extra_Error url:";
        }
        else {
          $err_txt = "Error url:";
        }

        if ($res || $checkSelectedOnly) {
          $printSourceFiles = 1;
        }
        if ($res && defined($revertedURLS{$u})) {
          $err_txt = "Failed url:";
        }

        # For reverted urls the meaning of the result is inverted.
        $res = !$res if (defined($revertedURLS{$u}));
        if ($res || $checkSelectedOnly) {
          printx("$err_txt \"$u\"\n", $outSum, $fe, $fs);
        }
        else {
          my $succes;
          if ($extra) {

            # This url is created
            $succes = "Extra_OK url:";
          }
          else {
            $succes = "OK url:";
          }
          printx("$succes \"$u\"\n", $outSum, $fe, $fs);
          $printSourceFiles = 1;
        }
        if ($printSourceFiles) {
          if (defined($URLS{$u})) {
            for my $f (sort keys %{$URLS{$u}}) {
              my $lines = ":" . join(',', @{$URLS{$u}->{$f}});
              printx(" $f$lines\n", $outSum, $fe, $fs);
            }
          }

          # NOTE(review): a reverted url that unexpectedly succeeds is printed
          # as error above, but it is not counted here because
          # $printSourceFiles was decided before the inversion - confirm
          # whether this is intended.
          if ($res) {
            $errorcount++;
          }
        }
      }
    }
  }
}

# Only the parent gets here (workers leave via exit or die).
die("Could not start any subprocess\n") if (!%wait);
|
|
|
|
|
2024-10-09 17:17:16 +00:00
|
|
|
# Merge the output of worker number $i into the output of the parent.
#
# The lines of $tempdir/xxxError$i are copied to STDOUT, except for lines of
# the form "NumberOfErrors <n>", whose <n> is added to $errorcount. The lines
# of $tempdir/xxxSum$i are copied to the summary handle SFO.
#
# Both files are written UTF-8 encoded by the worker, so both are decoded
# here. Reading the summary log without a decoding layer and printing it to
# the encoding handle SFO would encode non-ASCII characters a second time.
# A log which cannot be opened (e.g. the worker died early) is reported and
# skipped.
sub readsublog($) {
  my ($i) = @_;

  my $errlog = "$tempdir/xxxError$i";
  if (open(my $fe, '<:encoding(UTF-8)', $errlog)) {
    while (my $l = <$fe>) {
      if ($l =~ /^NumberOfErrors\s(\d+)/) {
        $errorcount += $1;
      }
      else {
        print $l;
      }
    }
    close($fe);
  }
  else {
    warn("Could not read $errlog: $!\n");
  }

  my $sumlog = "$tempdir/xxxSum$i";
  if (open(my $fs, '<:encoding(UTF-8)', $sumlog)) {
    while (my $l = <$fs>) {
      print SFO $l;
    }
    close($fs);
  }
  else {
    warn("Could not read $sumlog: $!\n");
  }
}
|
|
|
|
|
|
|
|
# Wait for all workers; the logs of a worker are merged as soon as it ends.
my $p;
do {
  $p = waitpid(-1, 0);
  my $status = $?;
  if (($p > 0) && defined($wait{$p}) && $wait{$p} >= 0) {
    if ($status != 0) {
      # Such a worker did not reach its regular exit(0), so its log has no
      # NumberOfErrors line and part of its batch may be untested.
      warn("Subprocess $wait{$p} (pid $p) ended abnormally, status $status\n");
    }
    readsublog($wait{$p});
    $wait{$p} = -1;    # mark as merged
  }
} until ($p < 0);
print "Started to protocol remaining subprocess-logs\n";

# Merge the logs of workers which were not seen by the loop above.
for my $pid (keys %wait) {
  if ($wait{$pid} >= 0) {
    readsublog($wait{$pid});
    $wait{$pid} = -1;
  }
}
print "Stopped to protocol remaining subprocess-logs\n";
unlink($countfile);

if (%URLS) {
  printNotUsedURLS("Ignored",      %ignoredURLS);
  printNotUsedURLS("Selected",     %selectedURLS);
  printNotUsedURLS("KnownInvalid", %extraURLS);
}

# SFO is only open if a summary file was given.
if (defined($summaryFile)) {
  print SFO "\n$errorcount URL-tests failed out of $URLScount\n\n";
  close(SFO) or warn("Could not close summary file \"$summaryFile\": $!\n");
}

# Only the low 8 bits of the exit status reach the caller; without the limit
# e.g. 256 failed tests would be reported as success.
exit($errorcount > 255 ? 255 : $errorcount);
|
|
|
|
|
|
|
|
###############################################################################
|
2024-10-06 16:04:00 +00:00
|
|
|
# Write $text to the log handle and, if requested, also to the summary handle.
#   $text       - text to be written
#   $to_summary - true if the text belongs into the summary as well
#   $log_fh     - handle which always receives the text
#   $sum_fh     - handle which receives the text only if $to_summary is true
#                 and a summary file is in use ($summaryFile is defined)
sub printx($$$$) {
  my ($text, $to_summary, $log_fh, $sum_fh) = @_;
  print {$log_fh} "$text";
  print {$sum_fh} "$text" if ($to_summary && defined($summaryFile));
}
|
2013-08-28 10:17:40 +00:00
|
|
|
|
2024-10-06 16:04:00 +00:00
|
|
|
# Print the entries of a url list whose {count} is still below 2, i.e. which
# were read from their list file but never counted again afterwards.
#   $label    - name of the list, used in the headline "<label> URLs:"
#   $url_info - reference to a hash: url => { count => n, <key> => <value>, ... }
# For each reported url all keys except "count" are shown as "<key>:<value>".
# Nothing is printed if no url qualifies.
sub printNotUsedURLS($\%) {
  my ($label, $url_info) = @_;

  my @unused = grep { $url_info->{$_}->{count} < 2 } sort keys %{$url_info};
  my @report = ();
  for my $url (@unused) {
    my $entry   = $url_info->{$url};
    my @details = map { join(':', $_, $entry->{$_}) }
      grep { $_ ne "count" } sort keys %{$entry};
    push(@report, "\n $url\n " . join("\n ", @details) . "\n");
  }
  print "\n$label URLs: " . join(' ', @report) . "\n" if (@report);
}
|
|
|
|
|
2024-10-06 16:04:00 +00:00
|
|
|
# Normalize a url candidate:
#  - the first occurrence of "\SpecialChar" (optionally "\SpecialCharNoPassThru")
#    followed by TeX, LaTeX or LyX is replaced by the plain name; one
#    whitespace character directly behind the name is removed as well
#  - every blank is replaced by "%20"
# Returns the converted string.
sub replaceSpecialChar($) {
  my ($text) = @_;
  $text =~ s{\\SpecialChar(?:NoPassThru)?\s*(TeX|LaTeX|LyX)\s?}{$1};

  # Limit -1 keeps trailing empty fields, so that trailing blanks are
  # converted, too.
  $text = join('%20', split(/ /, $text, -1));
  return ($text);
}
|
|
|
|
|
2024-10-06 16:04:00 +00:00
|
|
|
# Read a list of urls from $file into the hash referenced by $rUrls.
#
# Each line holds one url, optionally preceded by the keyword "UseCurl".
# Empty lines and lines starting with '#' are skipped. For every url seen
# for the first time an entry
#   { <file> => <line number>, count => 1, use_curl => 0|1 }
# is stored under the normalized url (see replaceSpecialChar()).
#
# The "UseCurl" keyword is removed before the url is normalized. Otherwise
# the blank behind the keyword is escaped first and the stored key becomes
# "%20<url>". Trailing white space (including a '\r' of DOS line ends) is
# removed for the same reason.
#
# Dies if $file cannot be opened.
sub readUrls($\%) {
  my ($file, $rUrls) = @_;

  open(my $ulist, '<:encoding(UTF-8)', $file) or die("Could not read file $file");
  print "Read urls from $file\n";
  my $line = 0;
  while (my $entry = <$ulist>) {
    $line++;
    chomp($entry);    # remove eol
    $entry =~ s/^\s+//;
    $entry =~ s/\s+$//;
    next if ($entry =~ /^\#/);    # discard comment lines
    next if ($entry eq "");
    my $use_curl = 0;
    if ($entry =~ s/^UseCurl\s*//) {
      $use_curl = 1;
    }
    next if ($entry eq "");       # keyword without url
    $entry = replaceSpecialChar($entry);
    if (!defined($rUrls->{$entry})) {
      $rUrls->{$entry} = {$file => $line, count => 1, use_curl => $use_curl};
    }
  }
  close($ulist);
}
|
|
|
|
|
2024-10-06 16:04:00 +00:00
|
|
|
# Scan the file $f for urls and register each of them with handle_url().
# The line tag passed to handle_url() is the line number prefixed by
#   "u" - url found inside of "\begin_inset Flex URL"
#   "h" - url found as target inside of "\begin_inset CommandInset href"
#   "x" - quoted ftp/http/https url found outside of these insets
# A file which cannot be opened is skipped without any message.
sub parse_file($) {
  my ($f) = @_;

  #return if ($f =~ /\/attic\//);
  open(my $in, '<:encoding(UTF-8)', $f) or return;

  my $state  = "out";    # outside of URL/href
  my $lineno = 0;
  while (my $text = <$in>) {
    $lineno++;
    chomp($text);

    if ($state ne "out") {
      # Inside of an inset: it ends with \end_layout or \end_inset, or with
      # the first url found.
      if ($text =~ /^\s*\\end_(layout|inset)\s*$/) {
        $state = "out";
      }
      elsif ($state eq "inUrlInset" && $text =~ /\s*([a-z]+:\/\/.+)\s*$/) {
        my $url = $1;
        $state = "out";
        handle_url($url, $f, "u$lineno");
      }
      elsif ($state eq "inHrefInset" && $text =~ /^target\s+"([a-z]+:\/\/[^ ]+)"$/) {
        my $url = $1;
        $state = "out";
        handle_url($url, $f, "h$lineno");
      }
      next;
    }

    # searching for "\begin_inset Flex URL"
    if ($text =~ /^\s*\\begin_inset\s+Flex\s+URL\s*$/) {
      $state = "inUrlInset";
    }
    elsif ($text =~ /^\s*\\begin_inset\s+CommandInset\s+href\s*$/) {
      $state = "inHrefInset";
    }
    elsif ($text =~ /"((ftp|http|https):\/\/[^ ]+)"/) {
      # Outside of url, check also
      my $url = $1;
      handle_url($url, $f, "x$lineno");
    }
  }
  close($in);
}
|
|
|
|
|
2024-10-06 16:04:00 +00:00
|
|
|
# Register one occurrence of a url in the global table %URLS.
#   $found  - url as found in the file (it is normalized with replaceSpecialChar())
#   $source - name of the file containing the url
#   $tag    - line tag of the occurrence
# Afterwards $URLS{<normalized url>}->{$source} is an array containing $tag.
sub handle_url($$$) {
  my ($found, $source, $tag) = @_;

  my $key = replaceSpecialChar($found);

  # The hash for the url and the array for the file are created by
  # autovivification if they do not exist yet.
  push(@{$URLS{$key}->{$source}}, $tag);
}
|
2024-10-07 10:35:10 +00:00
|
|
|
|
|
|
|
# Determine the number of entries a worker should claim at once.
#   $tabsize   - number of entries in the table
#   $actualidx - index of the first entry not yet claimed
#   $nr_jobs   - number of workers
# With L = ($tabsize - 1) - $actualidx the result is
#   1                              if L <= 0
#   L                              if there are less than 2 workers
#   1 + int(L / (3 * $nr_jobs))    otherwise
sub getnrjobs($$$) {
  my ($tabsize, $actualidx, $nr_jobs) = @_;

  my $left = ($tabsize - 1) - $actualidx;
  return (1)     if ($left <= 0);
  return ($left) if ($nr_jobs < 2);
  return 1 + int($left / (3 * $nr_jobs));
}
|