2013-08-28 12:17:40 +02:00
|
|
|
# -*- mode: perl; -*-
|
|
|
|
package CheckURL;
|
2024-10-06 18:04:00 +02:00
|
|
|
|
2013-08-28 12:17:40 +02:00
|
|
|
# file CheckURL.pm
|
|
|
|
#
|
|
|
|
# This file is part of LyX, the document processor.
|
|
|
|
# Licence details can be found in the file COPYING.
|
|
|
|
#
|
|
|
|
# authors: Kornel Benko <kornel@lyx.org>
|
|
|
|
# Scott Kostyshak <skotysh@lyx.org>
|
|
|
|
#
|
|
|
|
|
|
|
|
# Check if given URL exists and is accessible
|
|
|
|
#
|
|
|
|
use strict;
|
|
|
|
|
2024-10-06 18:04:00 +02:00
|
|
|
our (@EXPORT, @ISA);
|
|
|
|
|
2013-08-28 12:17:40 +02:00
|
|
|
BEGIN {
|
2024-10-06 18:04:00 +02:00
|
|
|
use Exporter ();
|
2013-08-28 12:17:40 +02:00
|
|
|
@ISA = qw(Exporter);
|
2024-10-30 11:08:31 +01:00
|
|
|
@EXPORT = qw(check_url constructExtraTestUrl);
|
2013-08-28 12:17:40 +02:00
|
|
|
}
|
|
|
|
|
2014-01-24 11:00:07 +01:00
|
|
|
# Prototypes
|
|
|
|
sub check_http_url($$$$);
|
|
|
|
sub check_ftp_dir_entry($$);
|
|
|
|
sub check_ftp_url($$$$);
|
|
|
|
sub check_unknown_url($$$$);
|
2024-10-06 18:04:00 +02:00
|
|
|
sub check_url($$$$);
|
2014-01-24 11:00:07 +01:00
|
|
|
################
|
|
|
|
|
2024-10-06 18:04:00 +02:00
|
|
|
my $fe;
|
|
|
|
my $fs;
|
|
|
|
|
|
|
|
sub check_http_url($$$$) {
|
2016-01-03 15:25:09 +01:00
|
|
|
require LWP::UserAgent;
|
2013-08-28 12:17:40 +02:00
|
|
|
|
|
|
|
my ($protocol, $host, $path, $file) = @_;
|
|
|
|
|
2024-10-06 18:04:00 +02:00
|
|
|
my $ua = LWP::UserAgent->new(timeout => 20);
|
2013-08-28 12:17:40 +02:00
|
|
|
my $getp = "/";
|
|
|
|
if ($path ne "") {
|
|
|
|
$getp .= $path;
|
|
|
|
}
|
|
|
|
if (defined($file)) {
|
|
|
|
if ($getp =~ /\/$/) {
|
|
|
|
$getp .= $file;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$getp .= "/$file";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
my $buf;
|
2016-01-03 15:25:09 +01:00
|
|
|
$ua->agent("Firefox/43.0");
|
|
|
|
my $response = $ua->get("$protocol://$host$getp");
|
|
|
|
if ($response->is_success) {
|
|
|
|
$buf = $response->decoded_content;
|
|
|
|
}
|
|
|
|
else {
|
2024-10-06 18:04:00 +02:00
|
|
|
print $fe " " . $response->status_line . ": ";
|
2013-08-28 12:17:40 +02:00
|
|
|
return 3;
|
|
|
|
}
|
2024-10-09 19:17:16 +02:00
|
|
|
my @atitle = ();
|
|
|
|
my %htitle = ();
|
2024-10-06 18:04:00 +02:00
|
|
|
my $res = 0;
|
2016-01-03 15:25:09 +01:00
|
|
|
while ($buf =~ s/\<title\>([^\<]*)\<\/title\>//i) {
|
2016-01-02 22:18:13 +01:00
|
|
|
my $title = $1;
|
2016-01-03 15:25:09 +01:00
|
|
|
$title =~ s/[\r\n]/ /g;
|
|
|
|
$title =~ s/ +/ /g;
|
|
|
|
$title =~ s/^ //;
|
|
|
|
$title =~ s/ $//;
|
2024-10-09 19:17:16 +02:00
|
|
|
if (! defined($htitle{$title})) {
|
|
|
|
push(@atitle, $title);
|
|
|
|
$htitle{$title} = 1;
|
|
|
|
}
|
2016-01-03 15:25:09 +01:00
|
|
|
if ($title =~ /Error 404|Not Found/) {
|
2024-10-06 18:04:00 +02:00
|
|
|
print $fe " Page reports 'Not Found' from \"$protocol://$host$getp\": ";
|
2016-01-03 15:25:09 +01:00
|
|
|
$res = 3;
|
2016-01-02 22:18:13 +01:00
|
|
|
}
|
2016-01-02 11:29:11 +01:00
|
|
|
}
|
2024-10-09 19:17:16 +02:00
|
|
|
|
|
|
|
print $fe "title = \"" . join(': ', @atitle) . "\": ";
|
2016-01-03 15:25:09 +01:00
|
|
|
return $res;
|
2013-08-28 12:17:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
# Returns ($err, $isdir)
|
|
|
|
# returns 0, x if file does not match entry
|
|
|
|
# 1, x everything OK
|
|
|
|
# 2, x if not accesible (permission)
|
2024-10-06 18:04:00 +02:00
|
|
|
sub check_ftp_dir_entry($$) {
|
2013-08-28 12:17:40 +02:00
|
|
|
my ($file, $e) = @_;
|
|
|
|
my $other = '---';
|
|
|
|
my $isdir = 0;
|
|
|
|
|
2024-10-06 18:04:00 +02:00
|
|
|
#print $fe "Checking '$file' against '$e'\n";
|
2013-08-28 12:17:40 +02:00
|
|
|
$file =~ s/^\///;
|
|
|
|
$isdir = 1 if ($e =~ /^d/);
|
2024-10-06 18:04:00 +02:00
|
|
|
return (0, $isdir) if ($e !~ /\s$file$/);
|
2013-08-28 12:17:40 +02:00
|
|
|
if ($e =~ /^.[-rwx]{6}([-rwx]{3})\s/) {
|
|
|
|
$other = $1;
|
|
|
|
}
|
|
|
|
else {
|
2024-10-06 18:04:00 +02:00
|
|
|
#print $fe "Invalid entry\n";
|
2013-08-28 12:17:40 +02:00
|
|
|
# Invalid entry
|
2024-10-06 18:04:00 +02:00
|
|
|
return (0, $isdir);
|
2013-08-28 12:17:40 +02:00
|
|
|
}
|
2024-10-06 18:04:00 +02:00
|
|
|
return (2, $isdir) if ($other !~ /^r/); # not readable
|
2013-08-28 12:17:40 +02:00
|
|
|
if ($isdir) {
|
2024-10-06 18:04:00 +02:00
|
|
|
|
2013-08-28 12:17:40 +02:00
|
|
|
#return(2,$isdir) if ($other !~ /x$/); # directory, but not executable
|
|
|
|
}
|
2024-10-06 18:04:00 +02:00
|
|
|
return (1, $isdir);
|
2013-08-28 12:17:40 +02:00
|
|
|
}
|
|
|
|
|
2024-10-06 18:04:00 +02:00
|
|
|
sub check_ftp2_url($$$$) {
|
2016-01-05 17:27:49 +01:00
|
|
|
my ($protocol, $host, $path, $file) = @_;
|
|
|
|
|
|
|
|
my $checkentry = 1;
|
2024-10-06 18:04:00 +02:00
|
|
|
|
|
|
|
#print $fe "\nhost $host\n";
|
|
|
|
#print $fe "path $path\n";
|
|
|
|
#print $fe "file $file\n";
|
2016-01-05 17:27:49 +01:00
|
|
|
my $url = "$protocol://$host";
|
|
|
|
$path =~ s/\/$//;
|
|
|
|
if (defined($file)) {
|
|
|
|
$url = "$url/$path/$file";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$url = "$url/$path/.";
|
|
|
|
}
|
2024-10-06 18:04:00 +02:00
|
|
|
|
|
|
|
#print $fe "curl $url, file = $file\n";
|
2016-01-05 17:27:49 +01:00
|
|
|
my %listfiles = ();
|
|
|
|
if (open(FFTP, "curl --anyauth -l $url|")) {
|
|
|
|
while (my $l = <FFTP>) {
|
|
|
|
chomp($l);
|
|
|
|
$listfiles{$l} = 1;
|
|
|
|
}
|
|
|
|
close(FFTP);
|
|
|
|
}
|
|
|
|
if (%listfiles) {
|
2024-10-06 18:04:00 +02:00
|
|
|
if (!defined($file)) {
|
|
|
|
return (0, "OK");
|
2016-01-05 17:27:49 +01:00
|
|
|
}
|
|
|
|
elsif (defined($listfiles{$file})) {
|
2024-10-06 18:04:00 +02:00
|
|
|
return (0, "OK");
|
2016-01-05 17:27:49 +01:00
|
|
|
}
|
|
|
|
elsif (defined($listfiles{"ftpinfo.txt"})) {
|
2024-10-06 18:04:00 +02:00
|
|
|
return (0, "Probably a directory");
|
2016-01-05 17:27:49 +01:00
|
|
|
}
|
|
|
|
else {
|
2024-10-06 18:04:00 +02:00
|
|
|
return (1, "Not found");
|
2016-01-05 17:27:49 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
2024-10-06 18:04:00 +02:00
|
|
|
return (1, "Error");
|
2016-01-05 17:27:49 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-10-06 18:04:00 +02:00
|
|
|
sub check_ftp_url($$$$) {
|
2013-08-28 12:17:40 +02:00
|
|
|
use Net::FTP;
|
|
|
|
|
|
|
|
my ($protocol, $host, $path, $file) = @_;
|
2024-10-06 18:04:00 +02:00
|
|
|
my $res = 0;
|
2013-08-28 12:17:40 +02:00
|
|
|
my $message = "";
|
|
|
|
|
2024-10-06 18:04:00 +02:00
|
|
|
my $ftp = Net::FTP->new($host, Debug => 0, Timeout => 20);
|
|
|
|
if (!$ftp) {
|
|
|
|
return (3, "Cannot connect to $host");
|
2013-08-28 12:17:40 +02:00
|
|
|
}
|
2024-10-06 18:04:00 +02:00
|
|
|
if (!$ftp->login("anonymous", '-anonymous@')) {
|
2013-08-28 12:17:40 +02:00
|
|
|
$message = $ftp->message;
|
2024-10-06 18:04:00 +02:00
|
|
|
$res = 3;
|
2013-08-28 12:17:40 +02:00
|
|
|
}
|
|
|
|
else {
|
|
|
|
my $rEntries;
|
|
|
|
if ($path ne "") {
|
2024-10-06 18:04:00 +02:00
|
|
|
|
|
|
|
#print $fe "Path = $path\n";
|
2013-08-28 12:17:40 +02:00
|
|
|
#if (!$ftp->cwd($path)) {
|
|
|
|
# $message = $ftp->message;
|
|
|
|
# $res = 3;
|
|
|
|
#}
|
|
|
|
$rEntries = $ftp->dir($path);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$rEntries = $ftp->dir();
|
|
|
|
}
|
2024-10-06 18:04:00 +02:00
|
|
|
if (!$rEntries) {
|
|
|
|
$res = 3;
|
2013-08-28 12:17:40 +02:00
|
|
|
$message = "Could not read directory \"$path\"";
|
|
|
|
}
|
|
|
|
elsif (defined($file)) {
|
2024-10-06 18:04:00 +02:00
|
|
|
my $found = 0;
|
2013-08-28 12:17:40 +02:00
|
|
|
my $found2 = 0;
|
2024-10-06 18:04:00 +02:00
|
|
|
for my $f (@{$rEntries}) {
|
|
|
|
|
|
|
|
#print $fe "Entry: $path $f\n";
|
|
|
|
my ($res1, $isdir) = check_ftp_dir_entry($file, $f);
|
|
|
|
if ($res1 == 1) {
|
|
|
|
$found = 1;
|
|
|
|
last;
|
|
|
|
}
|
|
|
|
elsif ($res1 == 2) {
|
|
|
|
|
|
|
|
# found, but not accessible
|
|
|
|
$found2 = 1;
|
|
|
|
$message = "Permission denied for '$file'";
|
|
|
|
}
|
2013-08-28 12:17:40 +02:00
|
|
|
}
|
2024-10-06 18:04:00 +02:00
|
|
|
if (!$found) {
|
|
|
|
$res = 4;
|
|
|
|
if (!$found2) {
|
|
|
|
$message = "File or directory '$file' not found";
|
|
|
|
}
|
2013-08-28 12:17:40 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
$ftp->quit;
|
2024-10-06 18:04:00 +02:00
|
|
|
|
|
|
|
#print $fe "returning ($res,$message)\n";
|
|
|
|
return ($res, $message);
|
2013-08-28 12:17:40 +02:00
|
|
|
}
|
|
|
|
|
2024-10-06 18:04:00 +02:00
|
|
|
sub check_unknown_url($$$$) {
|
2013-08-28 12:17:40 +02:00
|
|
|
use LWP::Simple;
|
|
|
|
|
|
|
|
my ($protocol, $host, $path, $file) = @_;
|
|
|
|
my $res = 1;
|
|
|
|
|
|
|
|
my $url = "$protocol://$host";
|
|
|
|
if ($path ne "") {
|
|
|
|
if ($path =~ /^\//) {
|
|
|
|
$url .= $path;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$url .= "/$path";
|
|
|
|
}
|
|
|
|
}
|
2024-10-06 18:04:00 +02:00
|
|
|
if (defined($file)) {
|
|
|
|
|
|
|
|
#print $fe "Trying $url$file\n";
|
2013-08-28 12:17:40 +02:00
|
|
|
$res = head("$url/$file");
|
2024-10-06 18:04:00 +02:00
|
|
|
if (!$res) {
|
|
|
|
|
2013-08-28 12:17:40 +02:00
|
|
|
# try to check for directory '/';
|
2024-10-06 18:04:00 +02:00
|
|
|
#print $fe "Trying $url$file/\n";
|
2013-08-28 12:17:40 +02:00
|
|
|
$res = head("$url/$file/");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
2024-10-06 18:04:00 +02:00
|
|
|
#print $fe "Trying $url\n";
|
2013-08-28 12:17:40 +02:00
|
|
|
$res = head($url);
|
|
|
|
}
|
2024-10-06 18:04:00 +02:00
|
|
|
return (!$res);
|
2013-08-28 12:17:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# Main entry
|
2024-10-06 18:04:00 +02:00
|
|
|
sub check_url($$$$) {
|
|
|
|
my ($url, $use_curl, $fex, $fsx) = @_;
|
2024-10-30 11:08:31 +01:00
|
|
|
$url =~ s/%20/ /g;
|
2024-10-06 18:04:00 +02:00
|
|
|
$fe = $fex;
|
|
|
|
$fs = $fsx;
|
2013-08-28 12:17:40 +02:00
|
|
|
my $file = undef;
|
2024-10-06 18:04:00 +02:00
|
|
|
my ($protocol, $host, $path);
|
2013-08-28 12:17:40 +02:00
|
|
|
|
|
|
|
my $res = 0;
|
|
|
|
|
|
|
|
# Split the url to protocol,host,path
|
|
|
|
if ($url =~ /^([a-z]+):\/\/([^\/]+)(.*)$/) {
|
|
|
|
$protocol = $1;
|
2024-10-06 18:04:00 +02:00
|
|
|
$host = $2;
|
|
|
|
$path = $3;
|
2013-08-28 12:17:40 +02:00
|
|
|
$path =~ s/^\///;
|
2024-10-06 18:04:00 +02:00
|
|
|
if ($path =~ s/\/([^\/]+)$//) {
|
2013-08-28 12:17:40 +02:00
|
|
|
$file = $1;
|
2024-10-06 18:04:00 +02:00
|
|
|
if ($file =~ / /) {
|
|
|
|
|
|
|
|
# Filename contains ' ', maybe invalid. Don't check
|
|
|
|
$file = undef;
|
2013-08-28 12:17:40 +02:00
|
|
|
}
|
|
|
|
$path .= "/";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
2024-10-06 18:04:00 +02:00
|
|
|
print $fe " Invalid url '$url'";
|
2013-08-28 12:17:40 +02:00
|
|
|
return 2;
|
|
|
|
}
|
|
|
|
if ($protocol =~ /^https?$/) {
|
2014-01-24 11:00:07 +01:00
|
|
|
return check_http_url($protocol, $host, $path, $file);
|
2013-08-28 12:17:40 +02:00
|
|
|
}
|
|
|
|
elsif ($protocol eq "ftp") {
|
|
|
|
my $message;
|
2016-01-05 17:27:49 +01:00
|
|
|
if ($use_curl) {
|
|
|
|
($res, $message) = check_ftp2_url($protocol, $host, $path, $file);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
($res, $message) = check_ftp_url($protocol, $host, $path, $file);
|
|
|
|
}
|
2013-08-28 12:17:40 +02:00
|
|
|
return $res;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
# it never should reach this point
|
2024-10-06 18:04:00 +02:00
|
|
|
print $fe " What protocol is '$protocol'?";
|
2014-01-24 11:00:07 +01:00
|
|
|
$res = check_unknown_url($protocol, $host, $path, $file);
|
2013-08-28 12:17:40 +02:00
|
|
|
return $res;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-10-30 11:08:31 +01:00
|
|
|
sub constructExtraTestUrl($) {
|
|
|
|
my ($url) = @_;
|
|
|
|
|
|
|
|
my $urlok = $url;
|
|
|
|
my $protokol;
|
|
|
|
if ($urlok =~ s/^(ftp|https?):\/\///) {
|
|
|
|
$protokol = $1;
|
|
|
|
if ($protokol eq 'http') {
|
|
|
|
$protokol = 'https';
|
|
|
|
}
|
|
|
|
if (($protokol eq 'ftp') && ($urlok =~ /ctan/)) {
|
|
|
|
$protokol = 'https';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
$urlok =~ s/^([^\/]+)//;
|
|
|
|
my $server = $1;
|
|
|
|
$urlok =~ s/^\///;
|
|
|
|
if ($server =~ /ctan/) {
|
|
|
|
$urlok =~ s/\/\/+/\//g;
|
|
|
|
$urlok =~ s/^ctan\///;
|
|
|
|
if ($urlok =~ /[\w][.](pdf|html|dvi)$/) {
|
|
|
|
if ($urlok =~ s/^(tex-archive|CTAN)\///) {
|
|
|
|
$server = 'mirrors.ctan.org';
|
|
|
|
}
|
|
|
|
elsif ($urlok =~ /(pgf)\//) {
|
|
|
|
$server = 'www.ctan.org';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if ($urlok =~ s/\/$//) {
|
|
|
|
$server = 'www.cpan.org';
|
|
|
|
if ($urlok ne '') {
|
|
|
|
if ("$urlok/" =~
|
|
|
|
/^(biblio|bibliography|digest|documentation|dviware|fonts|graphics|help|indexing|info|install|languages?|macros|obsolete|support|systems|tds|usergrps|web)\//
|
|
|
|
)
|
|
|
|
{
|
|
|
|
$urlok = 'tex-archive/' . $urlok;
|
|
|
|
}
|
|
|
|
if ("$urlok/" !~ /^(pkg|topic|tex-archive|author)\//) {
|
|
|
|
die("");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
my $url2;
|
|
|
|
if ($urlok eq '') {
|
|
|
|
$url2 = "$protokol://$server";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
$url2 = "$protokol://$server/$urlok";
|
|
|
|
}
|
|
|
|
return($url2);
|
|
|
|
}
|
|
|
|
|
2013-08-28 12:17:40 +02:00
|
|
|
1;
|