Make your own free website on Tripod.com
  This php code is converted into HTML using 
     Colorizator v1.0 (c) 2001 by Alexander Yanuar Koentjara
                                  (lexzeus@hotmail.com) 

<?php
/*
    Program name : LexZEUS Crawler
    Version      : 1.0.10032001 (10 - Mar - 2001)
    Purpose      : Web Spider / Crawler, provide list of URL that will be
                   used by LexZEUS Indexer v1.0
    File name    : lexz_crawler.php
    Author       : Alexander Yanuar Koentjara
                   lexzeus@hotmail.com
                   http://lexzeus.tripod.com

    Term of use  : This program can be used freely (without notification).
                   If you want to add/modify some functionality, please do so.
                   You can also redistribute it freely.
                   Using/redistributing this portion/completed/modified code
                   should retain the original author and the one who modified
                   it (please don't forget to update the version number).

    Syntax       : php -q lexz_crawler.php URL RESULTFILE.TXT [MAXNODE]

                   URL             The initial page you want to start crawling.
                   RESULTFILE.TXT  Crawling result will be stored here.
                   MAXNODE         Maximum nodes you want to crawl. The default
                                   is 100. The maximum value is 10000.

    Note         : The full "<?php" open tag is used (instead of the short
                   "<?" tag) so the script is parsed even when the
                   short_open_tag ini setting is disabled.
*/

// Should messages be sent to STDOUT or to a file?
$log_to_file   = 0;          // Put 1 if you want to log to file
$log_file_name = "log.txt";  // Put the directory and name as log file

// If you use a proxy server to connect to the site, please specify
// the proxy setting (it can be seen in the browser's connection settings):
$use_proxy    = 0;            // Put 1 means use proxy, 0 means not
$proxy_server = "127.0.0.1";  // Your proxy IP address or host, for example "proxy.company.com"
$proxy_port   = 7777;         // Your proxy port, usually larger than 1024

// If you use a proxy and it requires basic authentication,
// please supply login and password below:
$use_basic_auth = 0;           // Put 1 means use authentication, 0 means not
$proxy_login    = "username";  // The login name is here
$proxy_password = "password";  // The password is here

// functions ...
/**
 * Writes a message to the log file or, when file logging is off, to STDOUT.
 *
 * Reads the globals $log_to_file and $log_file_name; the opened log handle is
 * kept in the global $fp so the file is opened only once. If the log file
 * cannot be opened, a notice is printed once and messages go to STDOUT
 * (the original passed the failed handle straight to fputs()).
 *
 * @param string $txt  Text to write (no newline is appended).
 * @param mixed  $stop When truthy, the script exits after writing.
 */
function log_msg($txt, $stop = 0)
{
    global $fp, $log_to_file, $log_file_name;
    static $log_open_failed = false;

    if ($log_to_file && !$fp && !$log_open_failed) {
        $fp = @fopen("$log_file_name", "w");
        if (!$fp) {
            $log_open_failed = true;
            print "Cannot write to log file $log_file_name, logging to STDOUT instead.\n";
        }
    }

    if ($log_to_file && $fp) {
        fputs($fp, $txt);
    } else {
        print $txt;
    }

    if ($stop) {
        exit;
    }
}

/**
 * Extracts the server part ("host" or "host:port") of an http URL.
 *
 * A string starting with "www." is treated as if it had an "http://" prefix.
 *
 * @param string $txt URL or link text.
 * @return mixed The server name as a string, or 0 when $txt is not an
 *               http:// URL (or has no host part).
 */
function servername($txt)
{
    $txt = (string) $txt;
    if (strncasecmp($txt, "www.", 4) == 0) {
        $txt = "http://" . $txt;
    }
    // PCRE replacement for the removed eregi(); the host ends at the first
    // "/", space, "?" or "#".
    if (!preg_match('#^http://([^/ ?\#]+)#i', $txt, $match)) {
        return 0;
    }
    return $match[1];
}

/**
 * Returns the "directory" part of an http URL, used as the base for
 * resolving relative links found on that page.
 *
 * Query string and fragment are ignored. Everything up to and including the
 * last "/" of the path is kept; a URL without any path ("http://host") is
 * returned unchanged.
 *
 * @param string $url Absolute http URL.
 * @return string Base URL, e.g. "http://host/dir/" for "http://host/dir/a.html".
 */
function url_base_dir($url)
{
    $url = preg_replace('/[?#].*$/s', '', (string) $url);
    $scheme_len = (strncasecmp($url, "http://", 7) == 0) ? 7 : 0;
    $last_slash = strrpos($url, "/");
    if ($last_slash === false || $last_slash < $scheme_len) {
        return $url; // only the slashes of "http://" were found: host root
    }
    return substr($url, 0, $last_slash + 1);
}

/**
 * Fetches a URL with a plain HTTP/1.0 GET and returns the response body.
 *
 * Connects through the proxy configured in the globals $use_proxy,
 * $proxy_server, $proxy_port (optionally with basic authentication from
 * $use_basic_auth, $proxy_login, $proxy_password), or directly to the
 * server named in the URL. Terminates the script with die() when the
 * connection cannot be opened. Redirects are not followed.
 *
 * @param string $url Absolute http URL.
 * @return mixed The body as a string, or 0 when the URL is not an http URL,
 *               the response has no complete header block, or its
 *               Content-Type header is not text/html.
 */
function get_content($url)
{
    global $proxy_server, $proxy_port, $use_proxy,
           $use_basic_auth, $proxy_login, $proxy_password;

    $url = (string) $url;
    if (strncasecmp($url, "www.", 4) == 0) {
        $url = "http://" . $url; // same normalisation servername() applies
    }
    $authority = servername($url);
    if (!$authority) {
        return 0;
    }

    $proxy_user = "";
    if ($use_proxy) {
        $fp = fsockopen($proxy_server, $proxy_port);
        if (!$fp) die("Can't connect to proxy server !!");
        if ($use_basic_auth) {
            $proxy_user = "Proxy-Authorization: Basic " .
                          base64_encode("$proxy_login:$proxy_password") . "\r\n";
        }
        $target = $url; // a proxy needs the absolute URL in the request line
    } else {
        // Honour an explicit ":port" instead of always connecting to port 80.
        $host = $authority;
        $port = 80;
        if (preg_match('/^(.*):([0-9]+)$/', $authority, $m)) {
            $host = $m[1];
            $port = (int) $m[2];
        }
        $fp = fsockopen($host, $port);
        if (!$fp) die("Can't connect to $host !!");

        // Talking to the origin server: send only the path and query.
        $target = (string) substr($url, 7 + strlen($authority));
        if ($target === "" || $target[0] != "/") {
            $target = "/" . $target;
        }
    }
    // A raw space would break the request line.
    $target = str_replace(" ", "%20", $target);

    // No Accept-Encoding header: this function cannot decode gzip/deflate,
    // so compressed responses must not be requested. The Host header is
    // needed by name-based virtual hosts.
    fputs($fp, "GET $target HTTP/1.0\r\n" .
               "Host: $authority\r\n" .
               $proxy_user .
               "Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*\r\n" .
               "\r\n");

    // Read until the blank line that ends the header block has arrived
    // (bounded, so a broken server cannot make this grow forever).
    $data = "";
    while (!preg_match('/\r?\n\r?\n/', $data) && strlen($data) < 65536 && !feof($fp)) {
        $chunk = fread($fp, 4096);
        if ($chunk === false) break;
        $data .= $chunk;
    }

    $parts = preg_split('/\r?\n\r?\n/', $data, 2);
    if (count($parts) < 2 ||
        !preg_match('/^Content-Type:[^\r\n]*text\/html/mi', $parts[0])) {
        // Not HTML (or no usable headers): do not download the rest.
        fclose($fp);
        return 0;
    }

    $body = $parts[1];
    while (!feof($fp)) {
        $chunk = fread($fp, 4096);
        if ($chunk === false) break;
        $body .= $chunk;
    }
    fclose($fp);

    return $body;
}

/**
 * Collects the link targets of all href, src and background attributes.
 *
 * Fragments ("#...") are stripped; links that become empty and links with a
 * URL scheme other than http (mailto:, https:, javascript:, ...) are skipped.
 *
 * @param string $txt HTML source.
 * @return array Map of link => 1 (the links are the keys, so duplicates
 *               collapse; note PHP turns numeric-looking keys into integers).
 */
function get_href($txt)
{
    $result = array();
    $found = preg_match_all(
        "/(href|src|background)[[:space:]]*=" .
        "[[:space:]]*[\"']{0,1}([^\"'[:space:]>]*)/i",
        $txt, $matches, PREG_SET_ORDER
    );
    if (!$found) {
        return $result;
    }

    foreach ($matches as $match) {
        // eliminate anchor
        $link = trim(preg_replace('/#.*/s', '', $match[2]));
        if ($link === "") {
            continue;
        }
        // don't crawl other than http; the scheme is only looked for at the
        // start so "page?u=ftp:x" is still accepted
        if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $link, $scheme) &&
            strtolower($scheme[1]) != "http") {
            continue;
        }
        $result[$link] = 1;
    }
    return $result;
}

/**
 * Resolves a link against a base directory URL.
 *
 * Example: $path = http://www.geocities.com/lexzeus/next/other/
 *          $txt  = ../../pic/p1.jpg
 *          gives   http://www.geocities.com/lexzeus/pic/p1.jpg
 *
 * $path is always treated as a directory. Backslashes in both arguments are
 * converted to "/". Leading "./" and "../" segments of $txt are resolved;
 * going above the host root stops at the root. Absolute http:// links are
 * returned unchanged, protocol-relative links ("//host/x") get "http:".
 *
 * @param string $path Base directory URL (with or without "http://").
 * @param string $txt  Link as found in the page.
 * @return string Absolute http URL.
 */
function validate_path($path, $txt)
{
    $path = trim(str_replace("\\", "/", (string) $path));
    $txt  = trim(str_replace("\\", "/", (string) $txt));

    if (strncasecmp($txt, "http://", 7) == 0) {
        return $txt;
    }
    if (substr($txt, 0, 2) == "//") {
        return "http:" . $txt;
    }

    if (strncasecmp($path, "http://", 7) == 0) {
        $path = (string) substr($path, 7);
    }
    $segments = explode("/", rtrim($path, "/"));
    $host = array_shift($segments); // what remains are the directories

    if (substr($txt, 0, 1) == "/") {
        // root-relative link: start again from the host
        $segments = array();
        $txt = (string) substr($txt, 1);
    }

    while (true) {
        if ($txt === "." || $txt === "..") {
            $txt .= "/"; // a bare dot segment names a directory
        }
        if (substr($txt, 0, 2) == "./") {
            $txt = (string) substr($txt, 2);
        } elseif (substr($txt, 0, 3) == "../") {
            $txt = (string) substr($txt, 3);
            array_pop($segments); // no-op once the host root is reached
        } else {
            break;
        }
    }

    $ret = "http://" . $host . "/";
    if ($segments) {
        $ret .= implode("/", $segments) . "/";
    }
    return $ret . $txt;
}

/**
 * Main program: crawls breadth-first from $root, staying on the same server,
 * and writes every URL that returned text/html to the result file.
 *
 * Ends the script through log_msg(..., 1) when crawling is finished or the
 * result file cannot be opened.
 *
 * @param string $root        Start URL ("http://" is prepended if missing).
 * @param string $result_name Path of the result file (one URL per line).
 * @param mixed  $node        Maximum number of URLs to fetch; values that are
 *                            not a positive number fall back to 100, values
 *                            above 10000 are reduced to 10000.
 */
function run_crawler($root, $result_name, $node)
{
    if (!is_numeric($node) || (int) $node <= 0) {
        $node = 100;
    }
    $node = min((int) $node, 10000);

    if (strncasecmp($root, "http://", 7) != 0) {
        $root = "http://$root";
    }

    $fresult = @fopen(str_replace("\\", "/", $result_name), "w");
    if (!$fresult) log_msg("Cannot write to $result_name\n", 1);

    log_msg("URL : $root\nMax Node : $node\nResult : $result_name\n\n");
    log_msg("Start crawling ...\n\n");

    $origin_server = (string) servername($root);
    $urls = array($root);             // crawl queue, in discovery order
    $unique_urls = array($root => 1); // everything ever queued, root included
    $fetched = 0;

    for ($ct = 0; isset($urls[$ct]); $ct++) {
        $current = $urls[$ct];
        $fetched++;

        log_msg("\nNow crawling " . $current . " ...\n");
        $htmlcode = get_content($current);

        if (!is_string($htmlcode) || trim($htmlcode) == "") {
            log_msg($current . " is not a valid text/html mime type.\n");
        } else {
            fputs($fresult, $current . "\n");

            if ($fetched < $node) {
                // queue the new urls
                $base_href = url_base_dir($current);
                foreach (get_href($htmlcode) as $key => $val) {
                    $key = (string) $key; // numeric-looking links come back as int keys
                    $url_name = validate_path($base_href, $key);

                    // Do not crawl if server host is different ...
                    $sname = servername($key);
                    if (!$sname) {
                        // relative or protocol-relative link: check where it resolves to
                        $sname = servername($url_name);
                    }
                    if ($sname && strcasecmp($origin_server, (string) $sname) != 0) {
                        log_msg("*> $key won't be crawled: Different server host!\n");
                        continue;
                    }

                    if (!isset($unique_urls[$url_name])) {
                        $urls[] = $url_name;
                        $unique_urls[$url_name] = 1;
                    }
                }
            }
        }

        if ($fetched >= $node) {
            log_msg("Max node is reached\n");
            break;
        }
    }

    fclose($fresult);
    log_msg("Crawling is finished.\n", 1);
}

// Run the crawler only when this file is the script being executed, so the
// functions above can be included elsewhere without starting a crawl.
if (isset($argv[0]) && realpath($argv[0]) === realpath(__FILE__)) {
    set_time_limit(0);

    if (!isset($argv[2]) || trim($argv[1]) == "" || trim($argv[2]) == "") {
        print "Usage : crawl.php URL RESULTFILE.TXT [MAXNODE] \n\n".
              "URL The initial page you want to start crawling.\n".
              "RESULTFILE.TXT Crawling result will be stored here.\n".
              "MAXNODE Maximum nodes you want to crawl. The default is 100. The maximum\n".
              " value is 10000\n\n";
    } else {
        run_crawler($argv[1], $argv[2], isset($argv[3]) ? $argv[3] : 100);
    }
}
?>