Name: Anonymous 2007-03-23 10:32 ID:wd9QhLeo
It's written in a totally inappropriate language and in bad style, but it works. I'm actually posting this for someone on /b/, but go ahead and discuss it if you wish.
#!/usr/bin/env php
<?php
// SETTINGS BEGIN HERE
// The directory where downloaded images will be saved (must exist before the script is run)
$cwd = '/home/foo';
// Filename extensions for files that are to be fetched
$exts = array('jpg', 'jpeg', 'gif', 'png');
// HTTP_USER_AGENT string sent to the remote server
$ua = 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en) AppleWebKit/417.9 (KHTML, like Gecko) Safari/417.9.2';
// Regular expressions (PCRE, '#'-delimited) for links to ignore.
// Literal dots are escaped so that e.g. "reporting.png" does not also match "reportingXpng".
$ignores = array(
    '#reporting\.png$#',
    '#adtypes\.png$#',
    '#/src\.cgi/#',
);
// SETTINGS END HERE
// Status constants returned by curl() when it is asked to save a file to disk;
// they double as the keys of the per-status counters in the main script.
define('CS_EXISTS', 2);  // target file was already present, nothing downloaded
define('CS_FAILURE', 1); // download attempted, but no file was produced
define('CS_SUCCESS', 0); // download attempted and the file now exists
// Function definitions
/**
 * Writes a string to standard output, terminated by a single "\n".
 *
 * @param string $str Text to print; when omitted, only the newline is written (a blank line).
 */
function echoln($str='') {
    print "{$str}\n";
}
/**
 * Fetches a URL by shelling out to the curl command-line tool.
 *
 * Two modes, selected by $output:
 *  - $output = false: the response body is returned as delivered by shell_exec()
 *    (a string, or null when the command produced no output or failed).
 *  - $output = true: the response is saved to $cwd/<basename of $url>. If that file
 *    already exists nothing is downloaded and CS_EXISTS is returned; otherwise
 *    CS_SUCCESS or CS_FAILURE is returned depending on whether the file exists
 *    after curl has run.
 *
 * Reads the globals $cwd (download directory) and $ua (User-Agent string).
 *
 * @param string $url    The URL to fetch.
 * @param bool   $output Whether to save the response to disk instead of returning it.
 * @return string|int|null Response body (or null) in return mode, a CS_* status in output mode.
 */
function curl($url, $output=false) {
    // We'll need these variables from outside the function scope
    global $cwd, $ua;
    // Flags: -f fail on HTTP errors, -s silent, -L follow redirects,
    // -R keep the remote timestamp, -A set the User-Agent
    $command = 'curl -fsLRA '.escapeshellarg($ua);
    // The URL goes after "--" so that a URL starting with "-" (URLs scraped from a
    // remote page are untrusted) cannot be interpreted by curl as an option.
    $urlArg = ' -- '.escapeshellarg($url);
    if(!$output) {
        // Return mode: files lying around in $cwd are irrelevant here, so there is
        // deliberately no "already exists" shortcut -- the caller wants the body.
        return shell_exec($command.$urlArg);
    }
    $target = $cwd.'/'.basename($url);
    if(file_exists($target)) {
        // Never overwrite or re-download something we already have
        return CS_EXISTS;
    }
    shell_exec($command.' -o '.escapeshellarg($target).$urlArg);
    // Success is judged by whether curl left a file behind
    return (file_exists($target) ? CS_SUCCESS : CS_FAILURE);
}
// Main script: fetch the thread page given as the first command-line argument,
// collect every <a href> that ends in one of the configured extensions, drop the
// links matching an ignore pattern, download the rest via curl() and print a summary.
// Variable initialization
$appName = basename($argv[0]);
$lLen = 0;
$links = array();
$counters = array(CS_SUCCESS => 0, CS_FAILURE => 0, CS_EXISTS => 0);
// Check if an argument was provided
// - if yes, continue execution
// - if not, exit with usage instructions
if(count($argv) > 1) {
    echo 'Retrieving thread source code...';
    // Get source code of thread page
    $page = curl($argv[1]);
    // Anything other than a non-empty string (null, '', a status code) means there
    // is no page source to scan, so stop here instead of reporting a bogus "DONE".
    if(!is_string($page) || $page === '') {
        echoln(' failed');
        echoln();
        echoln('Could not retrieve '.$argv[1]);
        exit(1);
    }
    echoln(' DONE');
    echoln();
    // Find links with the given extensions
    foreach($exts as $ext) {
        $regex = '#<a[^>]+href=["\']?([^"\']+\.'.preg_quote($ext, '#').')["\']?[ >'."\n".']#i';
        if(preg_match_all($regex, $page, $m)) {
            $links = array_merge($links, $m[1]);
        }
    }
    // Clear duplicate entries, then keep only the links no ignore regex matches
    $kept = array();
    foreach(array_unique($links) as $candidate) {
        foreach($ignores as $ignore) {
            if(preg_match($ignore, $candidate)) {
                // Ignored: move on to the next candidate link
                continue 2;
            }
        }
        $kept[] = $candidate;
        // Track the longest *kept* link only, for output formatting
        $lLen = max($lLen, strlen($candidate));
    }
    $links = $kept;
    // Count the amount of links and save the string length of the total, for output formatting
    $total = count($links);
    $tp = strlen((string) $total);
    // Pad every progress message to the longest one plus one space, so that the
    // status texts line up in a single column
    $width = $lLen + strlen(': Fetching ...') + 1;
    // Loop through the resulting list of links
    foreach($links as $linkN => $link) {
        // Tell the user what is being fetched
        echo str_pad((string) ($linkN+1), $tp, '0', STR_PAD_LEFT).'/'.$total.str_pad(': Fetching '.$link.'...', $width, ' ', STR_PAD_RIGHT);
        // Download and report the outcome
        $status = curl($link, true);
        if($status === CS_SUCCESS) {
            echoln('DONE');
        } elseif($status === CS_FAILURE) {
            echoln('failed');
        } elseif($status === CS_EXISTS) {
            echoln('file exists');
        }
        $counters[$status]++;
    }
    // Finish by displaying the total amount of links downloaded and the number of failures (if any)
    echoln();
    echoln('Successful: '.$counters[CS_SUCCESS]);
    echoln('Failed: '.$counters[CS_FAILURE]);
    echoln('Skipped: '.$counters[CS_EXISTS]);
    echoln();
    echoln('Total: '.$total);
} else {
    echoln('Usage: '.$appName.' [URL]');
}
?>