#!/usr/bin/perl -w eval 'exec i:/perllib/bin/perl -w -S $0 ${1+"$@"}' if 0; # not running under some shell =head1 NAME lwp-rget - Retrieve web documents recursively =head1 SYNOPSIS lwp-rget [--verbose] [--debug] [--queit] [--auth=USER:PASS] [--depth=N] [--hier] [--iis] [--keepext=mime/type[,mime/type]] [--limit=N] [--nospace] [--prefix=URL] [--referer=URL] [--sleep=N] [--tolower] [--noimg] [--filter=REG_EXPR] [--html_suff=SUFF] [--progress] [--ascii] [--filter-img] [--depth-first] [--take_query=ext[,ext]] [--nocharset] [--upload-time] [ ...] lwp-rget --version =head1 DESCRIPTION This program will retrieve a document and store it in a local file. It will follow any links found in the document and store these documents as well, patching links so that they refer to these local copies. This process continues until there are no more unvisited links or the process is stopped by the one or more of the limits which can be controlled by the command line arguments. This program is useful if you want to make a local copy of a collection of documents or want to do web reading off-line. All documents are stored as plain files in the current directory. The file names chosen are derived from the last component of URL paths. The options are: =over 3 =item --auth=USER:PASS Set the authentication credentials to user "USER" and password "PASS" if any restricted parts of the web site are hit. If there are restricted parts of the web site and authentication credentials are not available, those pages will not be downloaded. =item --depth=I Limit the recursive level. Embedded images are always loaded, even if they fall outside the I<--depth>. This means that one can use I<--depth=0> in order to fetch a single document together with all inline graphics. The default depth is 5. E.g., to fetch a document and all the documents mentioned in this document (and graphics therein) use C<--depth 1>. =item --depth-first Download in depth-first order; without this option the documents related to the top-level document are downloaded first, then documents related to them etc. =item --hier Download files into a hierarchy that mimics the web site structure. The default is to put all files in the current directory. =item --referer=I Set the value of the referer header for the initial request. The special value C<"NONE"> can be used to suppress the referer header in any of subsequent requests. =item --iis Sends an "Accept: */*" on all URL requests as a workaround for a bug in IIS 2.0. If no Accept MIME header is present, IIS 2.0 returns with a "406 No acceptable objects were found" error. Also converts any back slashes (\\) in URLs to forward slashes (/). =item --keepext=I Keeps the current extension for the list MIME types. Useful when downloading text/plain documents that shouldn't all be translated to *.txt files. Defaults to C. =item --limit=I Limit the number of documents to get. The default limit is 50. =item --nospace Changes spaces in all URLs to underscore characters (_). Useful when downloading files from sites serving URLs with spaces in them. Does not remove spaces from fragments, e.g., "file.html#somewhere in here". =item --prefix=I Limit the links to follow. Only URLs that start the prefix string are followed. The default prefix is set as the "directory" of the initial URL to follow. For instance if we start lwp-rget with the URL C, then prefix will be set to C. Use C<--prefix=''> if you don't want the fetching to be limited by any prefix. Use multiple options to allow following links from several locations. =item --filter=I Limit the links to follow. Only URLs which match the regular expression are followed. Should not be used together with C<--prefix>. =item --sleep=I Sleep I seconds before retrieving each document. This options allows you to go slowly, not loading the server you visiting too much. =item --tolower Translates all links to lowercase. Useful when downloading files from IIS since it does not serve files in a case sensitive manner. =item --noimg Do not download the images. =item --filter-img Treat images as links: do not download what does not match PREFIX. =item --ascii Save files as text. =item --html_suff=I Save HTML files with the given suffix. =item --verbose Make more noise while running. =item --debug Enable LWP debugging messages. =item --quiet Don't make any noise. =item --progress Show progress message while downloading. Without this option the files are downloaded into memory. =item --take-query Comma-separated list of file extensions which can take a query parameter (?STRING) meaning the position inside the file (e.g., F<.djvu> files take a page number as a parameter). Defaults to C. =time --upload-time Without this option the timestamp of the downloaded files will be set to the time in the C header. If used twice, the modification time will be also updated when URLs of links are changed to reflect new locations. =item --version Print program version number and quit. =item --help Print the usage message and quit. =back Before the program exits the name of the file, where the initial URL is stored, is printed on stdout. All used filenames are also printed on stderr as they are loaded. This printing can be suppressed with the I<--quiet> option. =head1 TODO If multiple targets are specified, one should disable "early translation" of HTML. =head1 SEE ALSO L, L =head1 AUTHOR Gisle Aas , Ilya Zakharevich =cut use strict; use Getopt::Long qw(GetOptions); use URI::URL qw(url); use LWP::MediaTypes qw(media_suffix); use HTML::Entities (); use vars qw($VERSION); use vars qw($MAX_DEPTH $MAX_DOCS @PREFIX $PREFIX $REFERER $VERBOSE $QUIET $SLEEP $HIER $AUTH $IIS $TOLOWER $NOSPACE %KEEPEXT $NOIMG $HTML_SUFFIX $ASCII $PROGRESS $FILTER_IMG $DEPTH_FIRST $TAKE_QUERY $KEEP_NOEXT %KEEP_NOEXT $KEEP_LOCAL $NAME_LIMIT $IMAGE_BY_DEPTH $CHARSET $SDBM_FILE $FILTER $SIGNAL $ASSUME_CHARSET $FIX_BACKSLASHES $SITE_DIR @FILTER_FILES $UPLOAD_TIME $COOKIE_JAR $CONN_CACHE); my $progname = $0; $progname =~ s|.*/||; # only basename left $progname =~ s/\.\w*$//; #strip extension if any $VERSION = sprintf("%d.%02d", q$Revision: 2.3 $ =~ /(\d+)\.(\d+)/); #$Getopt::Long::debug = 1; #$Getopt::Long::ignorecase = 0; # Defaults $MAX_DEPTH = 5; $MAX_DOCS = 50; $KEEPEXT{'OPT'} = 'text/plain,application/octet-stream,application/download'; $TAKE_QUERY = 'djvu,djv'; $NAME_LIMIT = 1e9; $CHARSET = 1; $FILTER = 1; $SIGNAL = 1; my $debug; GetOptions('version' => \&print_version, 'help' => \&usage, 'depth=i' => \$MAX_DEPTH, 'limit=i' => \$MAX_DOCS, 'verbose!' => \$VERBOSE, 'quiet!' => \$QUIET, 'debug!' => \$debug, 'sleep=i' => \$SLEEP, 'prefix:s' => \@PREFIX, 'filter:s' => \$PREFIX, 'referer:s'=> \$REFERER, 'hier' => \$HIER, 'auth=s' => \$AUTH, 'iis' => \$IIS, 'fix-backslashes' => \$FIX_BACKSLASHES, 'tolower' => \$TOLOWER, 'nospace' => \$NOSPACE, 'noimg' => \$NOIMG, 'html_suff=s'=> \$HTML_SUFFIX, 'keepext=s'=> \$KEEPEXT{'OPT'}, 'ascii' => \$ASCII, 'progress' => \$PROGRESS, 'filter-img'=> \$FILTER_IMG, 'depth-first!'=> \$DEPTH_FIRST, 'name-limit=i'=> \$NAME_LIMIT, 'take-query=s'=> \$TAKE_QUERY, 'keep-noext=s'=> \$KEEP_NOEXT, 'keep-local'=> \$KEEP_LOCAL, 'image-by-depth'=> \$IMAGE_BY_DEPTH, 'charset!' => \$CHARSET, 'translate!' => \$FILTER, 'sdbm-file=s' => \$SDBM_FILE, 'assume-charset=s'=> \$ASSUME_CHARSET, 'site-dir' => \$SITE_DIR, 'signal!' => \$SIGNAL, 'upload-time+' => \$UPLOAD_TIME, 'cooky-jar=s'=> \$COOKIE_JAR, 'cache-connections=i'=> \$CONN_CACHE, #'filter-files'=> \@FILTER_FILES, ) || usage(); $TAKE_QUERY =~ s/,/|/g if defined $TAKE_QUERY; sub print_version { require LWP; my $DISTNAME = 'libwww-perl-' . LWP::Version(); print <<"EOT"; This is lwp-rget version $VERSION ($DISTNAME) Copyright 1996-1998, Gisle Aas. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. EOT exit 0; } (my @start_url = map { /^\@file=(.*)/s ? do { open FF, "< $1" or die "error opening `$1' for read: $!"; my @l = ; close FF or die "error closing `$1' for read: $!"; chomp @l; @l } : $_ } @ARGV) || usage(); eval ' use LWP::Debug "+" ' if $debug; require LWP::UserAgent; my %ua_options; $ua_options{keep_alive} = $CONN_CACHE if $CONN_CACHE; $ua_options{cookie_jar} = $COOKIE_JAR if defined $COOKIE_JAR; my $ua = new LWP::UserAgent (%ua_options); $ua->agent("$progname/$VERSION " . $ua->agent); $ua->env_proxy; usage() if @PREFIX and defined $PREFIX; unless (@PREFIX or defined $PREFIX) { my @PREFIX1 = map url($_), @start_url; # limit to URLs below these eval { map $_->eparams(undef), @PREFIX1; map $_->equery(undef), @PREFIX1; }; for my $p (@PREFIX1) { $_ = $p->epath; s|[^/]+$||; $p->epath($_); } @PREFIX = map $_->as_string, @PREFIX1; } unless (defined $PREFIX) { $PREFIX = join '|', map quotemeta, @PREFIX; $PREFIX = "($PREFIX)" if @PREFIX > 1; $PREFIX = "^$PREFIX"; } %KEEPEXT = map { lc($_) => 1 } split(/\s*,\s*/, ($KEEPEXT{'OPT'}||'')); %KEEP_NOEXT = map { lc($_) => 1 } split(/\s*,\s*/, ($KEEP_NOEXT||'')); my $SUPPRESS_REFERER; $SUPPRESS_REFERER++ if ($REFERER || "") eq "NONE"; undef $REFERER if $SUPPRESS_REFERER; print <<"" if $VERBOSE; START = @start_url MAX_DEPTH = $MAX_DEPTH MAX_DOCS = $MAX_DOCS PREFIX = $PREFIX my $no_docs = 0; my %seen = (); # mapping from URL => local_file # When downloading a lot of files with a lot of URLs mentioned in # these files, @process_urls may become very big. Guard against # this... Keys for URLs destinated to be downloaded have value ''. # XXXX Need to guard $@process_urls against unnecessary growth also in # the cases of too much depth, and too many already downloaded files. if ($SDBM_FILE) { $SIG{INT} = sub {warn "Interrupted by SIGINT. Writing sdbm...\n"; exit 13} if $SIGNAL; # Run handlers require SDBM_File; require Fcntl; tie %seen, 'SDBM_File', $SDBM_FILE, Fcntl::O_RDWR()|Fcntl::O_CREAT(), 0666 or die "tie failed: $!"; eval 'END {untie %seen or warn "untie() failed: $!"}'; } my %seen_bad; # This is the main memory hog; a lot of effort is performed to avoid its growth my $process_urls = [map ['get', $_], @start_url]; my $to_get = @$process_urls; my %start_url = map +($_ => 1), @start_url; my %filter_files; my $utime_errors; my $depth = 0; my $req_referer = $REFERER; my $process_urls_extra; # Not used yet: for late filtering while (@$process_urls) { my $these_urls = $process_urls; $process_urls = []; while (@$these_urls) { my ($how, @what) = @{shift @$these_urls}; if ($how eq 'ref') { $REFERER = $what[0]; } elsif ($how eq 'get') { fetch($what[0], 'filtered', $REFERER, $depth, 0); # Do not postpone #print "$filename\n" unless $QUIET or $depth; } elsif ($how eq 'translate') { my $WAS_VERBOSE = $VERBOSE; local $VERBOSE = 0; my ($f, $base, $url, $charset) = @what; my $new = "$f-tmp$$"; my $mtime = (stat $f)[9]; rename $f, $new or warn("Error renaming `$f' -> `$new': $!"), next; #$new = $f; local $/; print STDERR "Translating URLs in $f\n" unless $QUIET; open IN, "< $new" or warn("Error opening `$new': $!"), next; binmode IN unless $ASCII; open(OUT, ">$f") || die "Can't save $f: $!"; binmode OUT unless $ASCII; print OUT filter_chunk(scalar , $base, $f, $url, \&new_link, $MAX_DEPTH+1, 1, undef, $charset); # while ; close OUT or warn("Error closing `$f': $!"); close IN or warn("Error closing `$new': $!"); unlink $new or warn("Error unlinking `$new': $!") unless $new eq $f; unless (($UPLOAD_TIME || 0) > 1) { # make sure the file has the same last modification time $utime_errors++ or warn "!!!* update timestamp `$f': $!\n" unless utime $mtime, $mtime, $f; } } else { die "Unknown process command '$how' for '@what'" } } $depth++; $process_urls = $process_urls_extra, $process_urls_extra = [] if not @$process_urls and $process_urls_extra and @$process_urls_extra; } sub downloader ($;$$$$$$$) { my ($find_name, $fh, $ascii, $name, $printn, $m, $url, $file, $ask) = (@_); my ($size, $length, $flength, $last_dur, $start_t, $msg) = (0); my @ani = qw(- \ | /); my $ani = 0; sub { unless(defined $file) { if ($_[1]->content_type eq 'text/ftp-dir-listing') { #$_[1]->content .= $_[0]; #return; } $file = &$find_name($_[1], $url); # Check if the file is already present if ($ask && -f $file && -t) { print $fh "Overwrite $file? [y] "; my $ans = ; exit if !defined($ans) || !($ans =~ /^y?\n/); } else { print $fh "Saving to '$file'...\n" if $ask; } open(FILE, ">$file") || die "Can't open $file: $!"; binmode FILE unless $ascii; $length = $_[1]->content_length; $flength = fbytes($length) if defined $length; $start_t = time; $last_dur = 0; print $printn "$file\n" if $printn; } $$name = $file if defined $name; $size += length($_[0]); print FILE $_[0]; my $dur = time - $start_t; if ($dur != $last_dur) { # don't update too often $last_dur = $dur; my $speed; my $show = ''; $speed = fbytes($size/$dur) . "/sec" if $dur > 3; if (defined $length) { my $perc = $size / $length if defined $length; my $secs_left = fduration($dur/$perc - $dur); $perc = int($perc*100); $show = "$perc% of $flength"; $show .= " (at $speed, $secs_left remaining)" if $speed; } else { $show = fbytes($size) . " received"; $show .= " (at $speed)" if $speed; } print $fh "\r$show" . (" " x (75 - length $show)); print $fh "$ani[$ani++]\b"; $ani %= @ani; $$m = 1; # Inform the caller } } } sub plain_url ($) { my $url = shift; # The $plain_url is a URL without the fragment (and possibly query) part my $plain_url = $url->clone; my $append = ''; $plain_url->frag(undef); if ( defined $url->query and defined $TAKE_QUERY and $url->path =~ /\.($TAKE_QUERY)$/io ) { $append = '?' . $url->query; $plain_url->query(undef); } my $frag = $url->frag; $append .= "#$frag" if defined $frag; # Translate URL to lowercase if $TOLOWER defined $plain_url = to_lower($plain_url) if (defined $TOLOWER); ($plain_url, $append); } sub postpone_plain_url { my ($plain_url, $url, $referer) = (shift, shift, shift); unless (exists $seen{$plain_url->as_string} or ++$to_get > $MAX_DOCS) { if (not $SUPPRESS_REFERER and defined $referer and (not defined $req_referer or $req_referer ne "$referer")) { push @$process_urls, ['ref', "$referer"]; } push @$process_urls, ['get', "$url"]; $seen{$plain_url->as_string} = ''; } } # Checks whether we need to actuall get the URL contents; returns the # possibly substituted (to a local file) URL. # if $postpone, arrange for the URL to be fetched later, and does no substitution. sub fetch { my($url, $type, $referer, $depth, $postpone, $cntref) = @_; # Fix http://sitename.com/../blah/blah.html to # http://sitename.com/blah/blah.html $url = $url->as_string if (ref($url)); while ($url =~ s#(https?://[^/]+/)\.\.\/#$1#) {} # Fix backslashes (\) in URL if $IIS defined $url = fix_backslashes($url) if $IIS or $FIX_BACKSLASHES; $url = url($url) unless ref($url); $type ||= 'a'; # Might be the background attribute $type = 'img' if ($type eq 'body' || $type eq 'td'); $depth ||= 0; # Print the URL before we start checking... if ($VERBOSE and not $postpone) { my $out = (" " x $depth) . $url . " "; $out .= "." x (60 - length($out)); print STDERR $out . " "; } # Can't get mailto things if ($url->scheme eq 'mailto') { print STDERR "*skipping mailto*\n" if $VERBOSE and not $postpone; return $url->as_string; } # Skip links if requested if ($type eq 'img' and $NOIMG) { print STDERR "*image*\n" if $VERBOSE and not $postpone; return $url->as_string; } # Check PREFIX, but not for links if ( $type ne 'filtered' and ($type ne 'img' or $FILTER_IMG) and $url->as_string !~ /$PREFIX/o ) { print STDERR "*outsider*\n" if $VERBOSE and not $postpone; return $url->as_string; } # The $plain_url is a URL without the fragment (and possibly query) part my ($plain_url, $append) = plain_url($url); # If we already have it, then there is nothing to be done my $seen = $seen{$plain_url->as_string}; $seen = $seen_bad{$plain_url->as_string} unless defined $seen and length $seen; if (defined $seen and length $seen) { $$cntref++ if $cntref and $seen ne $plain_url->as_string; $seen .= $append; $seen = protect_frag_spaces($seen); print STDERR "$seen (again)\n" if $VERBOSE and not $postpone; return $seen; } # Too much or too deep if ( $depth > $MAX_DEPTH and ($IMAGE_BY_DEPTH or $type ne 'img' and $type ne 'filtered') ) { print STDERR "*too deep*\n" if $VERBOSE and not $postpone; return $url; } if ($no_docs > $MAX_DOCS) { print STDERR "*too many*\n" if $VERBOSE and not $postpone; return $url; } if ($postpone) { $$cntref++ if $cntref; postpone_plain_url($plain_url, $url, $referer); return $url; } # Fetch document my $local; my $res; my $name; $no_docs++; if ($KEEP_LOCAL and $url->as_string =~ m(^file://(localhost)?/)i) { $local = 1; $name = $url->path; } else { sleep($SLEEP) if $SLEEP; my $req = HTTP::Request->new(GET => $url); # See: http://ftp.sunet.se/pub/NT/mirror-microsoft/kb/Q163/7/74.TXT $req->header ('Accept', '*/*') if $IIS; # GIF/JPG from IIS 2.0 $req->authorization_basic(split (/:/, $AUTH)) if (defined $AUTH); $req->referer($referer) if $referer && !$SUPPRESS_REFERER; if ($PROGRESS) { my $message; $res = $ua->request($req, downloader(\&name_finder, \*STDERR, $ASCII, \$name, \*STDERR, \$message)); close FILE; die $res->header('X-Died') if defined $res->header('X-Died'); print STDERR "\r", " " x 76, "\r" if $message; } else { $res = $ua->request($req); } } # Check outcome if ($local or $res->is_success) { my @ct = ($local ? ($name =~ /\.html?$/i ? 'text/html' : 'application/download') : $res->content_type); my $doc; if ($ct[0] eq 'text/ftp-dir-listing') { require File::Listing; # Actually, loaded already... # Based on LWP::Protocol::ftp 1.36 my @lst; if (defined $name) { open F, "< $name" or die "open `$name' for read: $!"; @lst = ; chomp @lst; close F or die "close `$name' for read: $!"; } else { @lst = split /\n/, $res->content; } my @f; # The last ditch effort is to get filenames assuming no spaces... for my $f (File::Listing::parse_dir(\@lst, 'GMT', undef, sub { my $l = shift; $l =~ s/.*[^\S\n]//; push @f, $l; ''} )) { my($name, $type, $size, $mtime, $mode) = @$f; push @f, $name if length $name; } # Now we "know" the list of files. my $sub_url = $url->clone; (my $path = $sub_url->path) =~ s,/$,,; if (not $HIER or $depth >= $MAX_DEPTH) { # Keep as HTML list $doc = "FTP File Listing\n"; $sub_url->path("$path/"); # We do not parse \n); $doc .= qq(\n); $doc .= "\n
    \n"; for my $f (@f) { # We do not parse $f\n); #$doc .= qq(
  • $f\n); } $doc .= "
\n"; $ct[0] = "text/html"; # Do not need to do anything else... $name = name_finder($res, $ct[0]) unless defined $name; open F, "> $name" or die "open `$name' for write: $!"; print F $doc; close F or die "close `$name' for write: $!"; } else { # Remove the listing, and fetch the files if (defined $name) { print STDERR "... unlink directory listing $name\n" if $VERBOSE; unlink $name or warn "unlink `$name': $!"; } $sub_url->path("$path/aaa.txt"); $name = find_name($sub_url, 'text/plain'); # Creates dir too $name =~ s,(.*)/(.*),$1,s; # Now manually arrange for the files to be downloaded: for my $f (@f) { $sub_url->path("$path/$f"); if ($DEPTH_FIRST) { fetch($sub_url, 'filtered', $url, $depth+1, 0); # Do not postpone } else { my ($plain) = plain_url($sub_url); postpone_plain_url($plain, $sub_url, $url); } } } } $seen{$plain_url->as_string} = $name; if ( not defined $name and (!$local and !$PROGRESS or (defined ($res->content_length) and $res->content_length == 0))) { $name = name_finder($res, $ct[0]); $doc = $res->content; $seen{$plain_url->as_string} = $name; print STDERR "$name\n" unless $QUIET; # Save an unprosessed version of the HTML document (or any file). # This both reserves the name used, and it also ensures that we # don't loose everything if this program is killed before # we finish. save($name, $doc); } elsif (not defined $name) { # XXX Should never happen warn("???* $url: undefined \$name"); require Data::Dumper; warn Data::Dumper::Dumper($res); return $url->as_string; } unless ($UPLOAD_TIME or $local) { if (my $lm = $res->last_modified) { # make sure the file has the same last modification time $utime_errors++ or warn "!!!* update timestamp `$name': $!\n" unless utime $lm, $lm, $name; } } $$cntref++ if $cntref and not $local; # If the file is HTML, then we look for internal links if ($ct[0] eq "text/html") { my $base = $local ? $url : $res->base; if ($PROGRESS or $local) { local $/ = undef; open IN, "< $name" or die "Can't open downloaded file $name"; $doc = ; close IN or die "Error closing the downloaded file $name for read"; } # If more than one charset is present, file has tags my (@charsets) = grep /^charset=/i, @ct[1..$#ct]; my $charset; if ( not $CHARSET ) {} # Do nothing elsif ( @charsets == 1 and not $local and $res->request->url->scheme eq 'http' # @ct has info obtained from the body of the message too and $res->headers->header('content_type') =~ /;\s*charset=/i) { $charset = $charsets[0]; } elsif (@charsets == 0 and $ASSUME_CHARSET) { $charset = "charset=$ASSUME_CHARSET"; } if ($DEPTH_FIRST) { # Do translation and recursing now save($name, filter_chunk($doc, $base, $name, "$url", \&new_link, $depth+1, 0, undef, $charset)); } else { my $cnt = 0; filter_chunk($doc, $base, $name, "$url", \&new_link, $depth+1, 1, \$cnt); # postpone saving and translation # If an URL was not downloaded yet, it won't be; # so it is safe to translate at this moment... push @$process_urls, ['translate', $name, $base, "$url", $charset] if $FILTER and ($cnt or $charset); #$filter_files{$name} = [$base, $name, "$url"]; } } return "$name$append"; } else { print STDERR "!!!* " . $url->as_string . ": " unless $VERBOSE or $QUIET; print STDERR "* ", $res->code . " " . $res->message . "\n" unless $QUIET; $seen_bad{$plain_url->as_string} = $plain_url->as_string; if ($PROGRESS and defined $name and not -f "$name-") { rename $name, "$name-" or warn "Error renaming unfinished download: `$name' => `$name-': $!"; } return $url->as_string; } } # unless $postpone, fetches the encountered URLs and returns a local URL # if postpone, does no translation, and arranges for a later download sub filter_chunk { my ($base, $name, $url, $cvt, $depth, $postpone, $cntref, $charset) = @_[1..8]; # Follow and substitute links... $_[0] =~ s/ ( # 1: what was matched <(img|a|body|area|frame|td)\b # 2: some interesting tag [^>]+ # still inside tag (not strictly correct) \b(?:src|href|background) # some link attribute \s*=\s* # = ) (?: (?! ["']? \# ) # Skip fragment references (")([^"]+)" | # 3: 4: value in double quotes OR (')([^']+)' | # 5: 6: value in single quotes OR ([^\s>]+) # 7: quoteless value ) / &$cvt($1, lc($2), $3||$5, HTML::Entities::decode($4||$6||$7||0), $base, $name, $url, $depth, $postpone, $cntref) /giex; # XXX # The regular expression above is not strictly correct. # It is not really possible to parse HTML with a single # regular expression, but it is faster. Tags that might # confuse us include: # # # # Process charset if ($charset) { < EOC } else { $_[0] } } # unless $postpone, fetches the URL and returns a local URL # if postpone, does no translation, and arranges for a later download sub new_link { my($pre, $type, $quote, $url, $base, $localbase, $referer, $depth, $postpone, $cntref) = @_; my $ini = $url; $url = protect_frag_spaces($url); my $was = url($url, $base)->abs; $url = fetch($was, $type, $referer, $depth, $postpone, $cntref); $url = url("file:$url", "file:$localbase")->rel unless $url =~ /^[.+\-\w]+:/; $url = unprotect_frag_spaces($url); $quote |= ''; # It is ' or " or undef $url = $ini unless $FILTER; return $pre . $quote . $url . $quote; } sub protect_frag_spaces { my ($url) = @_; $url = $url->as_string if (ref($url)); if ($url =~ m/^([^#]*#)(.+)$/) { my ($base, $frag) = ($1, $2); $frag =~ s/ /%20/g; $url = $base . $frag; } return $url; } sub unprotect_frag_spaces { my ($url) = @_; $url = $url->as_string if (ref($url)); if ($url =~ m/^([^#]*#)(.+)$/) { my ($base, $frag) = ($1, $2); $frag =~ s/%20/ /g; $url = $base . $frag; } return $url; } sub fix_backslashes { my ($url) = @_; my ($base, $frag); $url = $url->as_string if (ref($url)); if ($url =~ m/([^#]+)(#.*)/) { ($base, $frag) = ($1, $2); } else { $base = $url; $frag = ""; } $base =~ tr/\\/\//; $base =~ s/%5[cC]/\//g; # URL-encoded back slash is %5C return $base . $frag; } sub to_lower { my ($url) = @_; my $was_object = 0; if (ref($url)) { $url = $url->as_string; $was_object = 1; } if ($url =~ m/([^#]+)(#.*)/) { $url = lc($1) . $2; } else { $url = lc($url); } if ($was_object == 1) { return url($url); } else { return $url; } } sub translate_spaces { my ($url) = @_; my ($base, $frag); $url = $url->as_string if (ref($url)); if ($url =~ m/([^#]+)(#.*)/) { ($base, $frag) = ($1, $2); } else { $base = $url; $frag = ""; } $base =~ s/^ *//; # Remove initial spaces from base $base =~ s/ *$//; # Remove trailing spaces from base $base =~ tr/ /_/; $base =~ s/%20/_/g; # URL-encoded space is %20 return $base . $frag; } sub mkdirp { my($directory, $mode) = @_; my @dirs = split(/\//, $directory); my $path = shift(@dirs); # build it as we go my $result = 1; # assume it will work unless (-d $path) { $result &&= mkdir($path, $mode); } foreach (@dirs) { $path .= "/$_"; if ( ! -d $path) { $result &&= mkdir($path, $mode); } } return $result; } sub name_finder { my ($res, $ct) = (shift, shift); $ct = $res->content_type unless defined $ct; return find_name($res->request->url, $ct, $res->content_encoding); } sub find_name { my($url, $type, $enc) = @_; #print "find_name($url, $type)\n"; # Translate spaces in URL to underscores (_) if $NOSPACE defined $url = translate_spaces($url) if (defined $NOSPACE); # Translate URL to lowercase if $TOLOWER defined $url = to_lower($url) if (defined $TOLOWER); $url = url($url) unless ref($url); my $path = $url->path; # Protect against funny characters, some filesystems can bark on them $path =~ s/([|\"<>\\:?#\x00-\x1F\x80-\xFF])/ sprintf '@%X', ord $1 /ge; # trim path until only the basename is left $path =~ s|(.*/)||; my $dirname = $1 ? ".$1" : ''; if (!$HIER) { $dirname = ""; } if ($SITE_DIR) { eval { my $site = $url->host; my $port = $url->port; my $def = $url->default_port; $port = '' if $port == $def; $site .= "=port$port" if length $port; $dirname =~ s#^\./##; $dirname = "./$site/$dirname"; }; } if (length $dirname and not -d $dirname) { mkdirp($dirname, 0775); } my $extra = ""; # something to make the name unique my ($suffix, $keep_suffix, $check_gz); if ($type eq 'text/ftp-dir-listing') { $suffix = '.dirl'; } elsif ($KEEPEXT{lc($type)} and $path =~ /\./) { $suffix = ($path =~ m/\.(.*)/) ? ".$1" : ""; } elsif ($type eq "text/html" and defined $HTML_SUFFIX) { $suffix = ".$HTML_SUFFIX"; $check_gz = 1; } elsif ($KEEP_NOEXT{lc($type)} and not $path =~ /\./) { $suffix = ""; } else { $suffix = media_suffix($type); $suffix = "" unless defined $suffix; if (length $suffix) { $suffix = ".$suffix"; } else { $keep_suffix = 1; } $check_gz = 1 unless $keep_suffix; } if ($check_gz and $enc and $enc eq 'x-gzip') { $suffix .= '.gz'; } $path =~ s|\..*|| unless $keep_suffix; # trim suffix $path = "index" unless length $path; while (1) { # Construct a new file name if ( length "$path$extra$suffix" > $NAME_LIMIT and length "$extra$suffix" < $NAME_LIMIT ) { $path = substr $path, 0, $NAME_LIMIT - length "$extra$suffix"; } my $file = $dirname . $path . $extra . $suffix; # Check if it is unique return $file unless -e $file; # Try something extra unless ($extra) { $extra = "001"; next; } $extra++; } } sub save { my $name = shift; #print "save($name,...)\n"; open(FILE, ">$name") || die "Can't save $name: $!"; binmode FILE unless $ASCII; print FILE $_[0]; close(FILE); } sub fbytes { my $n = int(shift); if ($n >= 1024 * 1024) { return sprintf "%.3g MB", $n / (1024.0 * 1024); } elsif ($n >= 1024) { return sprintf "%.3g KB", $n / 1024.0; } else { return "$n bytes"; } } sub fduration { use integer; my $secs = int(shift); my $hours = $secs / (60*60); $secs -= $hours * 60*60; my $mins = $secs / 60; $secs %= 60; if ($hours) { return "${hours}h${mins}m"; } elsif ($mins >= 2) { return "${mins}min"; } else { $secs += $mins * 60; return "${secs}sec"; } } sub usage { print <<""; exit 1; Usage: $progname [options] ... Allowed options are: --auth=USER:PASS Set authentication credentials for web site --depth=N Maximum depth to traverse (default is $MAX_DEPTH) --hier Download into hierarchy (not all files into cwd) --referer=URI Set initial referer header (or "NONE") --iis Workaround IIS 2.0 bug by sending "Accept: */*" MIME header; translates backslashes (\\) to forward slashes (/) --fix-backslashes Convert backslashes in URLs to forward slashes --keepext=type Keep file extension for MIME types (comma-separated list) --keep_noext=type Likewise for files without extensions --limit=N A limit on the number documents to get (default is $MAX_DOCS) --nospace Translate spaces URLs (not #fragments) to underscores (_) --version Print version number and quit --verbose More output --debug Enable LWP debugging --quiet No output --sleep=SECS Sleep between gets, ie. go slowly --prefix=PREFIX Follow only URLs which begin with PREFIX (may be repeated) --filter=RE Follow only URLs which match the given regular expression --tolower Translate all URLs to lowercase (useful with IIS servers) --noimg Do not download images --filter-img Do not download images which do not match PREFIX --html_suff=SUFF Save HTML files with the given suffix --ascii Save files as text --progress Show progress message while downloading --depth-first Change the order of retrieval --take-query=LST Comma-separated list of file extensions which can take a query parameter (?STRING) meaning the position inside the file. --keep-local Edit local files (via file: URLs) in place --name-limit=LEN Limit length of local files to this length --image-by-depth Do not get images if they are too deep --nocharset Do not insert for HTTP charset --assume-charset=CHARSET Assume this charset if none is found --notranslate Do not modify downloaded files --sdbm-file=STEM Use SDBM file for persistent list of mirrored files --site-dir Create toplevel directory for each site --upload-time Do not set timestamp basing on the server's Last-Modified }