Sei sulla pagina 1di 3

<?php

// Simple recursive site spider. A request without the "url" query parameter
// prepares a fresh, numbered and dated output directory under "data" and
// remembers it in the session; a request with the parameter reads that
// directory back from the session and starts spidering.

// $_SESSION is only populated and persisted once the session is started.
// session_start() has to run before any output is sent to the client.
if (session_id() === '') {
    session_start();
}

$url = "http://www.najevtino.com"; // without trailing /

// the local filename to save the root html
$root_filename = "index.html";

// maximum level of spidering
$max_level = 3;

// check if the spidering is in progress; the parameter is only used as a
// flag here - the page that gets fetched below is fixed
$url_to_parse = isset($_GET["url"]) ? $_GET["url"] : null;

if ($url_to_parse === null || $url_to_parse === "") {
    // the spidering is not started, initialize variables and store them in
    // the session

    // local dir for saving data fetched from server
    // ex. "data/00000001-2008-10-15"
    $local_files_dir = "data";
    $today = date("Y-m-d");

    // dir() returns false (and the read loop would be fatal) when the base
    // directory does not exist yet, so create it on first use
    if ( !is_dir($local_files_dir) ) {
        mkdir($local_files_dir, 0777, true);
    }

    // find the highest run number already used; directory entries come back
    // in no particular order, so the maximum has to be tracked explicitly
    $last_index = 0;
    $d = dir($local_files_dir);
    if ($d !== false) {
        while (false !== ($entry = $d->read())) {
            $a = explode("-", $entry);
            if ( preg_match('/^\d+$/', $a[0]) ) {
                $last_index = max($last_index, (int) $a[0]);
            }
        }
        $d->close();
    }
    $last_index++;

    // create and prepare local dir for saving
    $local_files_path = $local_files_dir . "/"
        . str_pad($last_index, 8, "0", STR_PAD_LEFT) . "-" . $today;
    if ( !is_dir($local_files_path) ) {
        mkdir($local_files_path, 0777, true);
    }

    // store variables in session
    $_SESSION["local_files_path"] = $local_files_path;

} else {
    // the spidering is in progress, read variables from session
    if (isset($_SESSION["local_files_path"])) {
        $local_files_path = $_SESSION["local_files_path"];

        // now ready for parsing, go!!!
        get_and_parse_url($url . "/category/PC+Components", 1);
    } else {
        // nothing was prepared in this session, so there is nowhere to save
        echo "No spidering run has been initialized in this session; "
            . "load this page without the 'url' parameter first.";
    }
}

/**
 * Fetches one page, saves it below the local mirror directory and recurses
 * into the links of interest found in it.
 *
 * Reads the globals $url (site base URL without trailing slash),
 * $root_filename (file name used for the site root), $local_files_path
 * (mirror directory of the current run) and $max_level (recursion limit).
 * Progress information is echoed inside a <pre> element.
 *
 * @param string $url_to_parse Absolute URL of the page, expected to start with $url
 * @param int    $level        Current depth; nothing happens once it exceeds $max_level
 * @return void
 * @throws Exception when do_post_request() cannot fetch the page
 */
function get_and_parse_url($url_to_parse, $level) {

    global $url;
    global $root_filename;
    global $local_files_path;
    global $max_level;

    if ($level > $max_level) return;

    // path of the page relative to the site root (the "+ 1" skips the "/")
    $relative_url = substr($url_to_parse, strlen($url) + 1);
    if ($relative_url === false) {
        // PHP < 8 returns false instead of "" when the offset is past the end
        $relative_url = "";
    }

    // the path comes from fetched HTML; never let ".." segments steer the
    // saved file outside the mirror directory
    if (preg_match('#(^|/)\.\.(/|$)#', $relative_url)) return;

    $filename = $root_filename;
    $save_to_dir = $local_files_path;

    echo "<pre>";
    echo "get_and_parse_url('" . htmlentities($url_to_parse) . "', '"
        . htmlentities($save_to_dir) . "', $level)\n";
    echo "relative_url='" . htmlentities($relative_url) . "'\n";

    // split the relative path into a sub directory and a file name
    $last_slash = strrpos($relative_url, "/");
    if ($last_slash !== false) {
        $save_to_dir = $save_to_dir . "/" . substr($relative_url, 0, $last_slash);
        $filename = (string) substr($relative_url, $last_slash + 1);
        if ($filename === "") {
            // URL ends with "/": store it as the directory's index page
            $filename = $root_filename;
        }
    } else if ($relative_url !== "") {
        $filename = $relative_url;
    }
    if ( !is_dir($save_to_dir) ) {
        mkdir($save_to_dir, 0777, true);
    }

    // make sure the stored file carries an html extension
    if ( (substr($filename, -4) !== ".htm")
        && (substr($filename, -5) !== ".html") ) {
        $filename = $filename . ".html";
    }

    echo "save_to_dir='" . htmlentities($save_to_dir) . "'\n";
    echo "filename='" . htmlentities($filename) . "'\n";

    // get the html from remote server
    $html = do_post_request( $url_to_parse );

    // store fetched html to local dir
    file_put_contents( $save_to_dir . "/" . $filename, $html );

    // now parse the html to get more links and follow each of them
    foreach (spider_extract_links($html, $url) as $value) {
        // print the found links of interest
        echo htmlentities($value) . "\n";
        get_and_parse_url($value, $level + 1);
    }
    echo "</pre>";
}

/**
 * Collects the absolute URLs of all anchors of interest in a page.
 *
 * An anchor is of interest when its matched markup contains
 * "font-weight: bold;". Duplicates are removed.
 *
 * @param string $html     Page source
 * @param string $base_url Site base URL without trailing slash
 * @return array List of absolute URLs on the spidered site
 */
function spider_extract_links($html, $base_url) {
    // normalize attribute quoting so one pattern covers both quote styles
    $html = str_replace("\"", "'", $html);

    // get all <a>'s with regex
    preg_match_all("|<a[^>]+>(.*)</[^>]+>|U", $html, $matches);

    $links = array();
    foreach ($matches[0] as $anchor) {
        // filter the ones that we need
        if (strstr($anchor, "font-weight: bold;") === false) {
            continue;
        }
        // [^']* stops at the closing quote of the href; a greedy .* would
        // run on to the last quote of the whole match
        if (!preg_match("/href='([^']*)'/", $anchor, $m)) {
            continue;
        }
        // attribute values are entity-encoded in HTML (e.g. &amp;)
        $link = spider_resolve_link(trim(html_entity_decode($m[1], ENT_QUOTES)), $base_url);
        if ($link !== null) {
            $links[] = $link;
        }
    }
    return array_unique($links);
}

/**
 * Turns an href value into an absolute URL on the spidered site.
 *
 * Relative links are resolved against the site root, not against the page
 * they were found on.
 *
 * @param string $link     Raw href value
 * @param string $base_url Site base URL without trailing slash
 * @return string|null Absolute URL, or null when the link is empty, a pure
 *                     fragment, or points to another site or scheme
 */
function spider_resolve_link($link, $base_url) {
    if ($link === "" || $link[0] === "#") {
        return null;
    }

    // already absolute and on this site: the base must be followed by the
    // end of the string or a path/query/fragment delimiter, so that e.g.
    // "http://site.com.other.org" is not mistaken for the site itself
    $base_length = strlen($base_url);
    if (strncmp($link, $base_url, $base_length) === 0) {
        $next = substr($link, $base_length, 1);
        if ($next === false || $next === "" || strpos("/?#", $next) !== false) {
            return $link;
        }
    }

    // any other scheme (http:, mailto:, javascript:, ...) or a
    // protocol-relative "//host" link leaves the spidered site
    if (preg_match('#^([a-z][a-z0-9+.\-]*:|//)#i', $link)) {
        return null;
    }

    // fix relative links
    if ($link[0] !== "/") {
        $link = "/" . $link;
    }
    return $base_url . $link;
}

/**
 * Opens $url with an HTTP POST stream context and returns the response body.
 *
 * @param string            $url              Address to request
 * @param string|array|null $data             Request body; an array is
 *                                            URL-encoded with http_build_query()
 * @param string|null       $optional_headers Extra request headers, passed
 *                                            through as the context's 'header' option
 * @return string The complete response body
 * @throws Exception when the URL cannot be opened or the body cannot be read
 */
function do_post_request($url, $data = null, $optional_headers = null)
{
    $params = array('http' => array(
        'method' => 'POST',
        'content' => is_array($data) ? http_build_query($data) : $data,
    ));
    if ($optional_headers !== null) {
        $params['http']['header'] = $optional_headers;
    }
    $ctx = stream_context_create($params);

    // warnings are suppressed here because failures are reported through
    // exceptions; error_get_last() still sees the suppressed warning
    $fp = @fopen($url, 'rb', false, $ctx);
    if (!$fp) {
        $error = error_get_last();
        $reason = $error !== null ? $error['message'] : 'unknown error';
        throw new Exception("Problem with $url, $reason");
    }

    $response = @stream_get_contents($fp);
    // grab the failure reason before fclose() can overwrite it
    $error = $response === false ? error_get_last() : null;
    fclose($fp);

    if ($response === false) {
        $reason = $error !== null ? $error['message'] : 'unknown error';
        throw new Exception("Problem reading data from $url, $reason");
    }
    return $response;
}
?>

Potrebbero piacerti anche