This PHP code was converted into HTML using Colorizator v1.0 (c) 2001 by Alexander Yanuar Koentjara (lexzeus@hotmail.com)
<?php
/*
   Program name : LexZEUS Indexer (in use as support after using LexZEUS Crawler v1.0)
   Version      : 1.0.10032001 (10 - Mar - 2001)
   Purpose      : Index the HTML pages into the database after LexZEUS Crawler has done its job
   File name    : lexz_indexer.php
   Author       : Alexander Yanuar Koentjara
                  lexzeus@hotmail.com
                  http://lexzeus.tripod.com

   Syntax       : php -q lexz_indexer.php [URL_LIST.TXT]

                  [URL_LIST.TXT] is an optional text file containing the URLs to be
                  indexed into the database. The URLs must be separated by new lines,
                  without commas. If URL_LIST.TXT is not found, the predefined
                  array of URLs will be used.

   Terms of use : This program can be used freely (without notification).
                  If you want to add/modify some functionality, please do so.
                  You can also redistribute it freely.
                  Using/redistributing this code (in part, complete or modified)
                  should retain the original author and the one who modified it
                  (please don't forget to update the version number).

   DATABASE required : MySQL from www.mysql.com

   TABLES required:

      TABLE TB_KEYWORD
         KWD_ID       bigint(10)    UNSIGNED  Not Null  auto_increment primary
         KEYWORD      varchar(100)            Not Null  indexed

      TABLE TB_KEYWORD_REFERENCE
         KWD_ID       bigint(10)    UNSIGNED  Not Null  index_2(1)
         URL_ID       bigint(10)    UNSIGNED  Not Null  index_1
         META_ID      int(3)        UNSIGNED  Null      index_2(2)
         COUNT        int(3)        UNSIGNED  Not Null

      TABLE TB_URL
         URL_ID       bigint(10)    UNSIGNED  Not Null  auto_increment primary
         URL_STRING   varchar(255)            Not Null  indexed first 5 chars
         TITLE        varchar(255)            Not Null
         DESCRIPTION  varchar(255)            Not Null

   Example of SEARCH query when searching "COMPUTER" or "PROGRAMMING" :

      SELECT TBU.URL_STRING, TBU.TITLE, TBU.DESCRIPTION, SUM(TBR.COUNT) AS RELEVANCE
      FROM   TB_KEYWORD TBK, TB_KEYWORD_REFERENCE TBR, TB_URL TBU
      WHERE  (TBK.KEYWORD = 'COMPUTER' OR TBK.KEYWORD = 'PROGRAMMING')
             AND TBK.KWD_ID = TBR.KWD_ID
             AND TBR.URL_ID = TBU.URL_ID
      GROUP BY TBR.URL_ID
      ORDER BY RELEVANCE DESC
*/

// NOTE(review): this script uses the legacy mysql_* extension, which was removed
// in PHP 7.0. It is kept here so the script stays usable on the PHP versions it
// was written for; moving to mysqli/PDO prepared statements is a separate change.

// connect to MySQL
$conn = @mysql_connect('localhost', 'user', 'password');
if (!$conn) die("Cannot connect to MySQL !");

// if you use proxy server to connect to the site, please specify
// the proxy setting (the setting can be seen from browser's connection setting):
$use_proxy    = 1;            # Put 1 means use proxy, 0 means not
$proxy_server = "127.0.0.1";  # Your Proxy IP address or host, for example "proxy.company.com"
$proxy_port   = 7777;         # Your Proxy Port, usually larger than 1024

// if you use proxy and you use basic authentication to connect to proxy server,
// please supply login and password below :
$use_basic_auth = 0;           # Put 1 means use authentication, 0 means not
$proxy_login    = "username";  # The login name is here
$proxy_password = "password";  # The password is here

// array of URL to be indexed (predefined URLs) :
$arr_page = array(
    "http://www.somewhere.page1.php",
    "http://www.somewhere.page2.htm",
    "http://www.somewhere.page3.cgi"
);

// A URL list file given on the command line replaces the predefined URLs.
// When the file cannot be read, the predefined list above is kept, as the
// header documents (previously $arr_page was overwritten with false).
if (isset($argv[1]) && $argv[1]) {
    $url_list = @file($argv[1]);
    if (is_array($url_list)) $arr_page = $url_list;
}

// exclude common english in indexer ... just add into array if necessary ...
// all words must be in CAPITAL ...
$exclude_words = array(
    "I", "AM", "WE", "ARE", "YOU", "YOUR", "OURS", "THEY", "THEIR", "THEM",
    "A", "AN", "MY", "HIS", "HER", "SHE", "HE", "IS", "WERE", "WAS",
    "TO", "FROM", "UNTIL", "WHAT", "WHERE", "WHO", "WHOM", "WHICH", "HAS", "HAVE",
    "WHY", "HAD", "BEEN", "BE", "THE", "OR", "AND", "FOR", "BY", "WILL",
    "WOULD", "SHALL", "SHOULD", "OUGHT", "MUST", "IF", "UPON", "ON", "IN", "ONTO",
    "THIS", "THESE", "THERE", "THOSE", "THAT", "BECAUSE", "SINCE", "AT", "YES", "NO",
    "SOME", "ANY", "ALTHOUGH", "THOUGH", "DO", "DOES", "NOT", "PLEASE", "WANT", "NEED",
    "WANTED", "NEEDED", "ANYONE", "ANYBODY", "ANYTHING", "ANYWHERE", "SOMETHING",
    "SOMEBODY", "SOMEWHERE", "SOMEONE", "NOBODY", "NOONE", "NOTHING", "NOWHERE"
);

// construct $arr_exclude : the excluded words as array keys, for direct lookup
$arr_exclude = array();
foreach ($exclude_words as $exclude_word) {
    $arr_exclude[$exclude_word] = 1;
}

/**
 * Returns the server part of a URL: a leading "http://" (any case) is removed,
 * then everything from the first "/" onwards is dropped.
 *
 * @param  string $url  URL, with or without the "http://" prefix
 * @return string       whatever precedes the first "/" (a ":port" suffix is kept)
 */
function server_name($url)
{
    if (strtoupper(substr($url, 0, 7)) == "HTTP://") $url = substr($url, 7);
    // "s" makes "." match newlines too, as it did in the POSIX ereg_replace used before
    return preg_replace('#/.*#s', '', $url);
}

/**
 * Downloads $url with a plain HTTP/1.0 GET, either through the configured proxy
 * or directly on port 80 of the URL's server, and returns the response body.
 *
 * The script terminates (die) when the socket cannot be opened.
 *
 * @param  string $url  absolute URL to fetch
 * @return mixed        the text following the first blank line of the response;
 *                      the whole response when no blank line is found;
 *                      0 when the first 4096 bytes do not contain "html"
 */
function get_content($url)
{
    global $proxy_server, $proxy_port, $use_proxy,
           $use_basic_auth, $proxy_login, $proxy_password;

    $host       = server_name($url);
    $proxy_user = "";

    if ($use_proxy) {
        $fp = fsockopen($proxy_server, $proxy_port);
        if (!$fp) die("Can't connect to proxy server !!");
        if ($use_basic_auth) {
            $proxy_user = "Proxy-Authorization: Basic " .
                          base64_encode("$proxy_login:$proxy_password") . "\r\n";
        }
    } else {
        $fp = fsockopen($host, 80);
        if (!$fp) die("Can't connect to $host !!");
    }

    // No "Accept-Encoding: gzip, deflate": the response is never decompressed
    // here, so a compressed body would be indexed as garbage.
    // "Host" lets name-based virtual hosts pick the right site.
    fputs($fp, "GET $url HTTP/1.0\r\n" .
               "Host: $host\r\n" .
               $proxy_user .
               "Accept: image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, */*\r\n\r\n");

    // The first chunk (headers + start of body) decides whether this is HTML.
    $data = fread($fp, 4096);
    if (!is_string($data) || !preg_match('/html/i', $data)) {
        fclose($fp);
        return 0;
    }

    while (!feof($fp)) {
        $chunk = fread($fp, 4096);
        // feof() stays false on a stalled or broken socket; stop instead of spinning
        if ($chunk === false || $chunk === '') break;
        $data .= $chunk;
    }
    fclose($fp);

    // The body starts after the first empty line. The previous byte-by-byte scan
    // gave up after 1500 bytes and then used an undefined offset.
    $pos = strpos($data, "\r\n\r\n");
    if ($pos === false) return $data;
    return substr($data, $pos + 4);
}

/**
 * Prints a log line; terminates the script when $flag is set.
 *
 * @param string $str   message to print
 * @param mixed  $flag  non-zero to abort the script after printing
 */
function to_log($str, $flag = 0)
{
    // IMPLEMENT YOUR LOGGING SYSTEM HERE ...
    print $str . "\n";
    // was testing the undefined variable $flag_exit, so the flag never had any effect
    if ($flag) die("\n--ERROR OCCURS, TERMINATED--\n");
}

/**
 * Returns $value as a quoted, escaped SQL string literal for connection $conn.
 */
function sql_quote($value, $conn)
{
    return "'" . mysql_real_escape_string((string) $value, $conn) . "'";
}

/**
 * Classifies the attribute text of a <META> tag.
 *
 * The content="..." value is removed before looking for the words KEYWORD /
 * DESCRIPTION, so that the text of the content cannot change the classification.
 *
 * @param  string $meta  attribute text of the tag (everything after "<META ")
 * @return string        'KEYWORD', 'DESCRIPTION' or '' for any other tag
 */
function meta_kind($meta)
{
    $attrs = preg_replace(
        '/content[[:space:]]*=[[:space:]]*("[^"]*"|\'[^\']*\'|[^[:space:]]*)/i',
        '',
        $meta
    );
    if (preg_match('/KEYWORD/i', $attrs))     return 'KEYWORD';
    if (preg_match('/DESCRIPTION/i', $attrs)) return 'DESCRIPTION';
    return '';
}

/**
 * Reduces the attribute text of a <META> tag to its content text: the name=...
 * attribute and the "content=" prefix are removed, all quotes are dropped,
 * whitespace runs are collapsed and ", " becomes ",".
 *
 * @param  string $meta  attribute text of the tag
 * @return string        trimmed content text
 */
function meta_content($meta)
{
    $meta = preg_replace('/name[[:space:]]*=[[:space:]]*["\']*[^"\' ]+["\']*[[:space:]]*/i', '', $meta);
    $meta = preg_replace('/content[[:space:]]*=[[:space:]]*/i', '', $meta);
    $meta = preg_replace('/["\']/', '', $meta);
    $meta = preg_replace('/[[:space:]]+/', ' ', $meta);
    $meta = str_replace(', ', ',', $meta);
    return trim($meta);
}

/**
 * Stores one page in TB_URL and indexes its words.
 *
 * A URL already present in TB_URL is re-indexed: its keyword references are
 * deleted and its title/description updated. Words are weighted by where they
 * appear (body 1, meta keywords 5, meta description 10, title 15).
 *
 * @param resource $conn       MySQL connection
 * @param string   $html_code  HTML source of the page
 * @param string   $url        URL of the page
 */
function index_to_db($conn, $html_code, $url)
{
    $META = array(
        "KEYWORD"          => 1,
        "META_KEYWORDS"    => 5,
        "META_DESCRIPTION" => 10,
        "TITLE"            => 15
    );

    $html_code = (string) $html_code;

    $title = "";
    if (preg_match('/<TITLE>([^<]+)/i', $html_code, $match)) $title = $match[1];
    if (!trim($title)) $title = "UNTITLED";

    // Meta tags are only looked for in the first 2000 bytes of the page.
    $htm2 = substr($html_code, 0, 2000);
    preg_match_all('/<META ([^>]*)/i', $htm2, $arr, PREG_SET_ORDER);

    $meta_k      = array();
    $description = "";
    foreach ($arr as $tag) {
        $kind = meta_kind($tag[1]);
        if ($kind == 'KEYWORD') {
            if (count($meta_k)) continue;      // only the first keywords tag is used
            $meta_k = explode(",", meta_content($tag[1]));
        } elseif ($kind == 'DESCRIPTION') {
            $description = meta_content($tag[1]);
        }
    }

    // <STYLE> and <SCRIPT> bodies are raw text that routinely contains ">",
    // so they are removed up to their closing tag, whatever lies in between.
    $txt = $html_code;
    foreach (array('STYLE', 'SCRIPT') as $tag_name) {
        $stripped = preg_replace('/<' . $tag_name . '[^>]*>.*?<\/' . $tag_name . '>/is', '', $txt);
        if (is_string($stripped)) $txt = $stripped;   // keep the text if PCRE gave up
    }
    foreach (array('OBJECT', 'APPLET', 'EMBED') as $tag_name) {
        $stripped = preg_replace('/<' . $tag_name . '[^>]*>[^>]*<\/' . $tag_name . '>/i', '', $txt);
        if (is_string($stripped)) $txt = $stripped;
    }
    $txt = preg_replace('/&[a-z]+;/i', '', $txt);
    $txt = strip_tags($txt);
    $txt = preg_replace('/[[:space:]]+/', ' ', $txt);

    // No meta description: use the beginning of the page text instead.
    if (!trim($description)) {
        $words = preg_split('/[, ]/', trim(substr($txt, 0, 100)));
        // The last word is only dropped when the text really was cut at 100 bytes.
        if (strlen($txt) > 100) array_pop($words);
        $description = join(" ", $words);
    }

    // better to strip out http:// so we don't have unnecessary string comparison
    // when looking for URL_STRING
    $url2 = preg_replace('/^http:\/\//i', '', $url);

    $sql = mysql_query("SELECT URL_ID FROM TB_URL WHERE URL_STRING=" . sql_quote($url2, $conn), $conn);
    $row = $sql ? mysql_fetch_array($sql) : false;
    if ($sql) mysql_free_result($sql);

    if ($row && $row["URL_ID"]) {
        // The URL exists in the database: re-index it.
        to_log("RE-INDEXing $url ...");
        $URL_ID = $row["URL_ID"];
        mysql_query("DELETE FROM TB_KEYWORD_REFERENCE WHERE URL_ID = $URL_ID", $conn);
        mysql_query("UPDATE TB_URL SET DESCRIPTION=" . sql_quote($description, $conn) .
                    ", TITLE=" . sql_quote($title, $conn) .
                    " WHERE URL_ID = $URL_ID", $conn);
    } else {
        // A new URL to be indexed ...
        to_log("INDEXing $url ...");
        mysql_query("INSERT INTO TB_URL (URL_STRING, TITLE, DESCRIPTION) VALUES (" .
                    sql_quote($url2, $conn) . ", " .
                    sql_quote($title, $conn) . ", " .
                    sql_quote($description, $conn) . ")", $conn);
        $URL_ID = mysql_insert_id($conn);
        if (!$URL_ID) {
            // without a URL_ID every keyword reference would point at URL 0
            to_log("Cannot store $url : " . mysql_error($conn));
            return;
        }
    }

    // index the title
    foreach (preg_split('/[, ]/', $title) as $word) {
        index_word($word, $conn, $URL_ID, $META["TITLE"]);
    }

    // index the meta description
    foreach (preg_split('/[, ]/', $description) as $word) {
        index_word($word, $conn, $URL_ID, $META["META_DESCRIPTION"]);
    }

    // index the meta keywords, each distinct keyword once
    // (the former key()/next() loop stopped at the first empty or "0" keyword)
    foreach (array_unique($meta_k) as $keyword) {
        index_word($keyword, $conn, $URL_ID, $META["META_KEYWORDS"]);
    }

    // index the body of page
    foreach (preg_split('/[, ]/', $txt) as $word) {
        index_word($word, $conn, $URL_ID, $META["KEYWORD"]);
    }
}

/**
 * Records one occurrence of $word for page $URL_ID.
 *
 * The word is trimmed, upper-cased and stripped of trailing ". , ! ?" and of
 * any ( ) { } ' " characters. Excluded words, words shorter than 2 bytes and
 * words longer than the KEYWORD column (100) are ignored. The keyword row is
 * created when missing; the (keyword, URL, meta) reference is created with
 * COUNT = $meta_id, or has $meta_id added to its COUNT when it already exists.
 *
 * @param string   $word     raw word taken from the page
 * @param resource $conn     MySQL connection
 * @param mixed    $URL_ID   TB_URL.URL_ID of the page
 * @param int      $meta_id  weight / META_ID of the place the word was found
 */
function index_word($word, $conn, $URL_ID, $meta_id)
{
    global $arr_exclude;

    if (strlen($word) > 255) return;

    // remove the . , ! ? at the end of the word
    $word = strtoupper(preg_replace('/[.,!?]+$/D', '', trim($word)));
    // remove the ( ) { } and quotes
    $word = preg_replace('/[(){}\'"]/', '', $word);

    // KEYWORD is varchar(100): a longer word would be stored truncated and then
    // never be found again by the lookup below.
    if (strlen($word) < 2 || strlen($word) > 100 || isset($arr_exclude[$word])) return;

    $kwd = sql_quote($word, $conn);

    // check if keyword exists ...
    $sql = mysql_query("SELECT KWD_ID FROM TB_KEYWORD WHERE KEYWORD=$kwd", $conn);
    $row = $sql ? mysql_fetch_array($sql) : false;
    if ($sql) mysql_free_result($sql);

    if ($row && $row["KWD_ID"]) {
        $KWD_ID = $row["KWD_ID"];

        $sql = mysql_query("SELECT KWD_ID FROM TB_KEYWORD_REFERENCE
                            WHERE KWD_ID=$KWD_ID AND URL_ID=$URL_ID AND META_ID=$meta_id", $conn);
        $row2 = $sql ? mysql_fetch_array($sql) : false;
        if ($sql) mysql_free_result($sql);

        if ($row2 && $row2["KWD_ID"]) {
            // the keyword reference is in database: update the count
            mysql_query("UPDATE TB_KEYWORD_REFERENCE SET COUNT=COUNT+$meta_id
                         WHERE KWD_ID=$KWD_ID AND URL_ID=$URL_ID AND META_ID=$meta_id", $conn);
        } else {
            // the keyword reference is not in database: insert it
            mysql_query("INSERT INTO TB_KEYWORD_REFERENCE (KWD_ID, URL_ID, META_ID, COUNT)
                         VALUES ($KWD_ID, $URL_ID, $meta_id, $meta_id)", $conn);
        }
    } else {
        mysql_query("INSERT INTO TB_KEYWORD (KEYWORD) VALUES ($kwd)", $conn);
        $KWD_ID = mysql_insert_id($conn);
        if (!$KWD_ID) return;   // keyword could not be stored: nothing to reference

        mysql_query("INSERT INTO TB_KEYWORD_REFERENCE (KWD_ID, URL_ID, META_ID, COUNT)
                     VALUES ($KWD_ID, $URL_ID, $meta_id, $meta_id)", $conn);
    }
}

// ---- main : fetch and index every URL of the list ----

if (!mysql_select_db("DB_SPIDER_SEARCH", $conn)) {
    die("Cannot connect to database !\n");
}

foreach ($arr_page as $page) {
    $url = chop($page);
    if ($url == "") continue;           // blank line in the URL list

    $html_code = get_content($url);
    if (!is_string($html_code) || $html_code == "") {
        // get_content() reports non-HTML resources with 0: do not index them
        to_log("SKIPPING $url (no HTML content) ...");
        continue;
    }
    index_to_db($conn, $html_code, $url);
}

@mysql_close($conn);
?>