LexZEUS Crawler v1.0 Source Code

  This php code is converted into HTML using 
     Colorizator v1.0 (c) 2001 by Alexander Yanuar Koentjara
                                  (lexzeus@hotmail.com) 

<?
/*
Program name : LexZEUS Crawler
Version      : 1.0.10032001  (10 - Mar - 2001)
Purpose      : Web Spider / Crawler, provide list of URL that will be used
               by LexZEUS Indexer v1.0
File name    : lexz_crawler.php
Author       : Alexander Yanuar Koentjara
               lexzeus@hotmail.com
               http://lexzeus.tripod.com

Terms of use : This program can be used freely (without notification).
               If you want to add/modify some functionality, please do so.
               You can also redistribute it freely.
               Any use/redistribution of this code (a portion of it, the complete
               code or a modified version) should retain the original author and the
               one who modified it (please don't forget to update the version number)

Syntax       : php -q lexz_crawler.php URL RESULTFILE.TXT [MAXNODE]

URL               The initial page you want to start crawling.
RESULTFILE.TXT    Crawling result will be stored here.
MAXNODE           Maximum nodes you want to crawl. The default is 100.
                  The maximum value is 10000
*/

// The message should be redirected to STDOUT or file ?
$log_to_file = 0;            # Put 1 if you want to log to file

$log_file_name = "log.txt"; # Put the directory and name as log file


// if you use proxy server to connect to the site, please specify
// the proxy setting (the setting can be seen from browser's connection setting):
$use_proxy    = 0;              # Put 1 means use proxy, 0 means not

$proxy_server = "127.0.0.1";  # Your Proxy IP address or host, for example "proxy.company.com"

$proxy_port   = 7777;           # Your Proxy Port, usually larger than 1024


// if you use proxy and you use basic authentication to connect to proxy server,
// please supply login and password below :
$use_basic_auth = 0;          # Put 1 means use authentication, 0 means not

$proxy_login    = "username"; # The login name is here

$proxy_password = "password"; # The password is here


// functions ...
function log_msg($txt, $stop=0)
{
  GLOBAL $fp, $log_to_file, $log_file_name;
  if ($log_to_file)
     {
     if (!$fp) $fp=fopen("$log_file_name","w");
     fputs($fp,$txt);
     }
  else
     print $txt;
  if ($stop) exit;
}

function servername($txt)
{
if (substr(strtoupper($txt),0,4)=="WWW.") $txt="HTTP://".$txt;
if (substr(strtoupper($txt),0,7)!="HTTP://") return 0;
eregi("^(http://([^/ ]+))",$txt,$arr);
return $arr[2];
}

function get_content($url)
{
  global $proxy_server, $proxy_port, $use_proxy,
         $use_basic_auth, $proxy_login, $proxy_password;

  $proxy_user = "";
  if ($use_proxy)
     {
     $fp = fsockopen($proxy_server,$proxy_port);
     if (!$fp) die("Can't connect to proxy server !!");

     if ($use_basic_auth)
        $proxy_user ="Proxy-Authorization: Basic ".
                      base64_encode("$proxy_login:$proxy_password").
                     "\r\n";
     }
  else
     {
     $host = servername($url);
     $fp = fsockopen($host,80);
     if (!$fp) die("Can't connect to $host !!");
     }

  fputs($fp,"GET $url HTTP/1.0\r\n". $proxy_user .
            "Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*\r\n".
            "Accept-Encoding: gzip, deflate\r\n\r\n");
  $data=fread($fp,4096);
  if (!eregi("text/html",$data))
     {
     fclose($fp);
     return 0;
     }

  while (!feof($fp)) $data.=fread($fp,4096);
  fclose($fp);

  for($i=0;$i<1500;$i++)
     if (substr($data,$i,4)=="\r\n\r\n") {$pos=$i+4;$i=2000;}
  return substr($data,$pos);
}

function get_href($txt)
{
  $result = array();

  preg_match_all("/(href|src|background)[[:space:]]*=".
                 "[[:space:]]*[\"']{0,1}([^\"'[:space:]>]*)/i",
                 $txt,$res,PREG_SET_ORDER);

  for ($i = 0; $i<count($res); $i++)
      if (trim($res[$i][2]))
         {
         // eliminate anchor
         $tmp = trim(ereg_replace("#.*","",$res[$i][2]));

         // don't crawl other than http
         if (eregi("(mailto|https|javascript|file|ftp|gopher):",$tmp))
            continue;

         $result["$tmp"]=1;
         }
  return $result;
}

function validate_path($path,$txt) {
// if $path is http://www.geocities.com/lexzeus/next/other/
// if $txt  is ../../pic/p1.jpg
// the result should be : http://www.geocities.com/lexzeus/pic/p1.jpg

$path = str_replace("\\","/",$path);
$txt = str_replace("\\","/",$txt);

$path=trim($path);
$txt =trim($txt);
if (substr($path,strlen($path)-1,1)=="/")
   $path=substr($path,0,strlen($path)-1);
if (strtoupper(substr($path,0,7))=="HTTP://")
   $path = substr($path,7);
if (strtoupper(substr($txt,0,7))=="HTTP://")
   return $txt;
$arr=split("/",$path);
if (substr($txt,0,1)=="/")
   { $idx=0; $txt=substr($txt,1); }
else
   {
   if (substr($txt,0,1)==".")
      { while (substr($txt,0,3)=="../") { $txt=substr($txt,3); $idx++;}
        $idx=count($arr)-$idx-1; }
   else
      $idx=count($arr)-1;
   }
$ret="http://";
for ($i=0;$i<=$idx;$i++) $ret.=$arr[$i]."/"; $ret.=$txt;
return $ret;
}

set_time_limit(0);

// Initializing parameter :
$root = $argv[1];
$fresult = str_replace("\\","/",$argv[2]);
$node = $argv[3];

if (!$node) $node=100;
if ($node+0!=$node) $node=100; // force it become number

if (strtoupper(substr($root,0,7))!="HTTP://") $root = "http://$root";
if (eregi("[a-z][/\\][a-z]+$",$root))
   $base_href=dirname($root);
else
   $base_href=$root;

if (!($base_href) || !$node || !trim($fresult))
   print   "Usage : crawl.php URL RESULTFILE.TXT [MAXNODE] \n\n".
           "URL                The initial page you want to start crawling.\n".
           "RESULTFILE.TXT     Crawling result will be stored here.\n".
           "MAXNODE            Maximum nodes you want to crawl. The default is 100. The maximum\n".
           "                   value is 10000\n\n";

// main program
$fresult = @fopen($fresult,"w");
if (!$fresult) log_msg("Cannot write to $argv[2]\n",1);

log_msg("URL      : $root\nMax Node : $node\nResult   : $argv[2]\nCookie   : $cookie\n\n");
log_msg("Start crawling ...\n\n");

$urls = array();
$unique_urls = array();

$urls[1] = $root;

$ct = 0;
$ct2 = 0;
$item = 1;
$origin_server = servername($root);

while (1)
{
  $ct++;
  $ct2++;

  if (!$urls[$ct])
     {
     fclose($fresult);
     log_msg("Crawling is finished.\n",1);
     }

  log_msg("\nNow crawling ".$urls[$ct]." ...\n");
  $htmlcode = get_content($urls[$ct]);
  if (!trim($htmlcode))
     {
     log_msg($urls[$ct]." is not a valid text/html mime type.\n");
     $ct++;
     continue;
     }

  fputs($fresult,$urls[$ct]."\n");
  $arr = get_href($htmlcode);
  if ($ct2>=$node)
     {
     $ct=99999999; // exit
     log_msg("Max node is reached\n");
     }

  if ($ct>1)
     $base_href=dirname($urls[$ct])."/";

  // queue the new urls
  while (list($key,$val)=each($arr))
      {
      $sname = servername($key);

      // Do not crawl if server host is different ...
      if ($sname) if ($origin_server!=$sname)
         {
         log_msg("*> $key won't be crawled: Different server host!\n");
         continue;
         }

      $url_name = validate_path($base_href,$key);

      if (!$unique_urls[$url_name])
         {
         $item++;
         $urls[$item]=$url_name;
         $unique_urls[$url_name]=1;
         }
      }
}

?>