<?php
// Start (or resume) the PHP session: the crawler below stores its link lists
// in $_SESSION so that they survive from one request to the next.
session_start();
class bot {
/** @var array Same-site URLs found on crawled pages that still have to be fetched. */
var $check_links = array();
/** @var array URLs whose content has been fetched and processed during this crawl. */
var $checked_links = array();
/** @var array URLs that point to a different host than the page they were found on. */
var $external_links = array();
/** @var array URLs skipped because they are already known or were indexed recently. */
var $passed_links = array();
/** @var array URLs rejected as not crawlable (e.g. javascript: links). */
var $invalid_links = array();
/** @var int Not read anywhere in this class. NOTE(review): presumably a planned crawl depth limit - confirm before removing. */
var $depth = 6;
/** @var array Rows of the url_index table that still have to be crawled (each row carries a 'site_url'). */
var $main_urls = array();
/** @var float|null microtime(true) taken at the start of the current request, or null before do_index() ran. */
var $page_gen = null;
/**
 * PHP 4-style constructor (a method named after its class); the body is empty.
 *
 * NOTE(review): PHP 8 no longer treats a same-named method as the constructor,
 * so there this is just an ordinary no-op method - harmless, because nothing
 * is initialised here.
 */
function bot() {
}
/**
 * Request entry point: runs the task named by the "task" GET parameter.
 *
 * Only whitelisted task names are dispatched. Anything else (no task, an
 * unknown task, a non-string value) resets the session and shows the start
 * form.
 *
 * @return void
 */
public function start() {
    global $getpost_mgr;

    // The task name comes straight from the query string. Calling
    // "$this->$task()" unchecked would let any visitor invoke every method
    // of this class - including the private ones - and would end in a fatal
    // error for names that do not exist, so only known entry points pass.
    $allowed_tasks = array("do_index", "show_form");

    $task = $getpost_mgr->get_key("task", "get");
    // Method names are case-insensitive in PHP; normalise before comparing.
    $task = is_string($task) ? strtolower($task) : "";

    if (in_array($task, $allowed_tasks, true)) {
        $this->$task();
    }
    else {
        session_destroy();
        $this->show_form();
    }
}
/**
 * Renders the crawler's start form.
 *
 * @return void
 */
private function show_form() {
    // The template is displayed through output(), which hands it to Smarty.
    $template = "grabber_form.tpl";
    $this->output($template);
}
/**
 * Crawls the queued base URLs ($this->main_urls), one per request.
 *
 * After each base URL has been fetched it is removed from the queue and
 * do_ajax("do") is called; do_ajax() ends the request while base URLs remain,
 * so the loop body normally runs once per request and only falls through
 * after the last entry.
 *
 * @return string Always "do_sub" - the next step is crawling the sub pages.
 */
private function process_main_urls() {
    // Both lists are restored from the session and may be missing there.
    if (!isset($this->main_urls) || !is_array($this->main_urls)) {
        $this->main_urls = array();
    }
    if (!is_array($this->check_links)) {
        $this->check_links = array();
    }

    foreach ($this->main_urls as $key => $main_url) {
        $site_url = $main_url['site_url'];
        // The URL ends up in an HTML response, so it has to be escaped.
        echo "Read Base Uri: " . htmlspecialchars((string) $site_url, ENT_QUOTES, 'UTF-8') . "<br />";
        $this->open_url_entry_base($site_url);
        unset($this->main_urls[$key]);
        // Persists the state; exits here as long as base URLs are left.
        $this->do_ajax("do");
    }

    $this->post_status();
    // Randomise the order in which the collected sub pages get crawled.
    shuffle($this->check_links);
    return "do_sub";
}
/**
 * Runs one step of the crawl, selected by the "work" GET parameter.
 *
 * The link lists are restored from the session first (do_ajax() stores them
 * there at the end of every step). Steps:
 *  - "start":  load the base URLs that are due and report the next step
 *  - "do":     crawl the base URLs, then continue with "do_sub"
 *  - "do_sub": crawl a batch of sub pages
 * Any other value (including "restart") does nothing.
 *
 * @return void
 */
private function do_index() {
    global $getpost_mgr;

    // Restore the crawl state. A key that is missing from the session (fresh
    // session, or destroyed by a previous step) falls back to an empty list,
    // so the properties are always arrays for count()/in_array()/foreach.
    $state_keys = array(
        'checked_links',
        'check_links',
        'main_urls',
        'invalid_links',
        'passed_links',
        'external_links'
    );
    foreach ($state_keys as $state_key) {
        $stored = isset($_SESSION[$state_key]) ? $_SESSION[$state_key] : null;
        $this->$state_key = is_array($stored) ? $stored : array();
    }

    $this->page_gen = microtime(true);

    $work_id = $getpost_mgr->get_key("work", "get");
    // Compare as a string so a non-string value cannot loosely match a case.
    $work_id = is_string($work_id) ? $work_id : "";

    switch ($work_id) {
        case 'start':
            $action = $this->getting_url_list();
            $this->do_ajax($action);
            break;
        case 'do':
            $this->process_main_urls();
            $this->do_ajax("do_sub");
            break;
        case 'do_sub':
            $action = $this->process_sub_pages();
            $this->post_status();
            $this->do_ajax($action);
            break;
        case 'restart':
        default:
            // Nothing to do.
            break;
    }
}
/**
 * Loads the base URLs from url_index that are due for re-indexing into
 * $this->main_urls and prints a short status line.
 *
 * @return string "do" when base URLs were found, "stop" when nothing is due.
 */
private function getting_url_list() {
    global $db;

    echo "<div>Getting URL´s that older than Yesterday</div>";

    // The cut-off is two days back (2 * 86400 s), despite the "Yesterday"
    // wording of the message above.
    $date = date('Y-m-d H:i:s', time() - (86400 * 2));

    // $date is generated locally, so it is safe to embed in the statement.
    $query = "SELECT * FROM url_index WHERE timestep <= '" . $date . "'";
    $result = $db->sqlquery($query, 1);

    if (is_array($result) && !empty($result)) {
        echo '<div style="color: green;">' . count($result) . " Base Links Found</div>";
        $this->main_urls = $result;
        return "do";
    }

    echo '<div style="color:orange;">Links uptodate!</div>';
    return "stop";
}
/**
 * Fetches one sub page and hands its content to check_content(), unless the
 * page was indexed recently.
 *
 * @param string $url Absolute URL of the page.
 * @return int|string 1 when the URL was skipped up front (already passed, or a
 *                    javascript: link), "do_nothing" when site_index holds an
 *                    entry younger than two days, "do_sub" when the page was
 *                    fetched and processed.
 */
private function open_url_entry($url) {
    global $db;

    // Already classified as "passed" during this crawl: nothing to fetch.
    if (is_array($this->passed_links) && in_array($url, $this->passed_links)) {
        $this->post_status();
        return 1;
    }

    $parsed = parse_url($url);
    if (is_array($parsed) && isset($parsed['scheme']) && $parsed['scheme'] == "javascript") {
        $this->invalid_links[] = $url;
        return 1;
    }

    // URLs are harvested from foreign pages, so they must be escaped before
    // they go into SQL. mysql_escape_string() is what the rest of this class
    // uses. NOTE(review): switch to bound parameters if $db supports them.
    $select_query = "SELECT * FROM site_index WHERE url = '" . mysql_escape_string($url) . "'";
    $select_result = $db->sqlquery($select_query, 1);

    $action = "insert";
    if (is_array($select_result) && isset($select_result[0]) && is_array($select_result[0])) {
        $time_entry = isset($select_result[0]['index_date'])
            ? strtotime($select_result[0]['index_date'])
            : false;
        $last_day = time() - (86400 * 2);

        // An entry younger than two days is up to date: remember it as
        // passed and do not fetch the page again.
        if ($time_entry > $last_day) {
            if (!is_array($this->passed_links) || !in_array($url, $this->passed_links)) {
                $this->passed_links[] = $url;
            }
            return "do_nothing";
        }
        $action = "update";
    }

    $agent = "Meine Browserkennung v1.0 :)";
    $header = array("Accept: text/vnd.wap.wml,*.*");

    $read = false;
    $website_base = curl_init($url);
    if ($website_base !== false) {
        curl_setopt($website_base, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($website_base, CURLOPT_USERAGENT, $agent);
        curl_setopt($website_base, CURLOPT_HTTPHEADER, $header);
        curl_setopt($website_base, CURLOPT_FOLLOWLOCATION, 1);
        $read = curl_exec($website_base);
        curl_close($website_base);
    }
    // A failed transfer yields false; treat it as an empty document.
    if ($read === false) {
        $read = "";
    }

    $this->check_content($url, $read, $action);
    return "do_sub";
}
/**
 * Fetches one base URL (a row of url_index) and hands its content to
 * check_content(), unless it was already handled or is still up to date.
 *
 * @param string $url Absolute base URL.
 * @return int|null 1 when the URL was skipped (already checked, javascript:
 *                  link, or url_index entry younger than two days); null
 *                  after the page was fetched and processed.
 */
private function open_url_entry_base($url) {
    global $db;

    // Already fetched during this crawl.
    if (is_array($this->checked_links) && in_array($url, $this->checked_links)) {
        $this->post_status();
        return 1;
    }

    $parsed = parse_url($url);
    if (is_array($parsed) && isset($parsed['scheme']) && $parsed['scheme'] == "javascript") {
        $this->invalid_links[] = $url;
        return 1;
    }

    // Never embed the URL unescaped. mysql_escape_string() is what the rest
    // of this class uses. NOTE(review): switch to bound parameters if $db
    // supports them.
    $select_query = "SELECT * FROM url_index WHERE site_url = '" . mysql_escape_string($url) . "'";
    $select_result = $db->sqlquery($select_query, 1);

    $action = "insert";
    if (is_array($select_result) && isset($select_result[0]) && is_array($select_result[0])) {
        $time_entry = isset($select_result[0]['timestep'])
            ? strtotime($select_result[0]['timestep'])
            : false;
        $last_day = time() - (86400 * 2);

        // Entry younger than two days: up to date, skip the download.
        if ($time_entry > $last_day) {
            $this->passed_links[] = $url;
            $this->post_status();
            return 1;
        }
        $action = "update";
    }

    $agent = "Meine Browserkennung v1.0 :)";
    $header = array("Accept: text/vnd.wap.wml,*.*");

    $read = false;
    $website_base = curl_init($url);
    if ($website_base !== false) {
        curl_setopt($website_base, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($website_base, CURLOPT_USERAGENT, $agent);
        curl_setopt($website_base, CURLOPT_HTTPHEADER, $header);
        curl_setopt($website_base, CURLOPT_FOLLOWLOCATION, 1);
        $read = curl_exec($website_base);
        curl_close($website_base);
    }
    // A failed transfer yields false; treat it as an empty document.
    if ($read === false) {
        $read = "";
    }

    $this->check_content($url, $read, $action);
}
/**
 * Returns the text between the first occurrence of $start and the next
 * occurrence of $end after it.
 *
 * @param string $string Text to search.
 * @param string $start  Opening delimiter.
 * @param string $end    Closing delimiter.
 * @return string The enclosed text, or "" when either delimiter is empty or
 *                cannot be found.
 */
function get_string_between($string, $start, $end){
    $string = (string) $string;
    $start = (string) $start;
    $end = (string) $end;

    if ($start === "" || $end === "") {
        return "";
    }

    $ini = strpos($string, $start);
    if ($ini === false) {
        return "";
    }
    $ini += strlen($start);

    // Without this check a missing closing delimiter produced a negative
    // length and substr() returned an arbitrary slice of the input.
    $stop = strpos($string, $end, $ini);
    if ($stop === false) {
        return "";
    }

    return substr($string, $ini, $stop - $ini);
}
/**
 * Collects the meta tags of a page plus its <title>.
 *
 * The meta tags are read with get_meta_tags(), which opens $url itself; the
 * title is taken from the already downloaded $content.
 *
 * @param string $url     Address (or file name) of the page.
 * @param string $content HTML source of the page.
 * @return array|false Meta tags keyed by lower-cased name plus a 'title' key
 *                     ("" when the page has no title); false when
 *                     get_meta_tags() could not read $url.
 */
function getMetaData($url, $content){
    // get_meta_tags() uses the default stream context: give up after 10 s.
    // Note that this changes the default context for the whole process.
    stream_context_set_default(
        array(
            'http' => array(
                'timeout' => 10
            )
        )
    );

    // Deliberately silenced: an unreachable page is an expected condition
    // and is reported to the caller through the false return value.
    $meta = @get_meta_tags($url);
    if (!is_array($meta)) {
        return $meta;
    }

    // Case-insensitive and tolerant of attributes on the tag. A page without
    // a title gets "" instead of an arbitrary slice of the document.
    $meta['title'] = "";
    if (preg_match('/<title\b[^>]*>(.*?)<\/title\s*>/is', (string) $content, $match)) {
        $meta['title'] = trim($match[1]);
    }

    return $meta;
}
/**
 * Writes the indexed data of one page to the site_index table.
 *
 * @param string $url     URL of the page (identifies the row).
 * @param string $content HTML source of the page; stored with tags stripped.
 * @param array  $meta    Meta data as returned by getMetaData(); the keys
 *                        'keywords', 'title' and 'description' are stored in
 *                        their own columns, the whole array is stored
 *                        serialized.
 * @param string $action  "update" to refresh the existing row, "insert" to
 *                        create one. Any other value does nothing.
 * @return void
 */
private function update_entry($url, $content, $meta ,$action) {
    global $db;

    $meta_serialized = serialize($meta);

    // Pages frequently lack one of these tags; store "" instead of reading
    // an undefined index.
    $keywords = (is_array($meta) && isset($meta['keywords'])) ? $meta['keywords'] : "";
    $title = (is_array($meta) && isset($meta['title'])) ? $meta['title'] : "";
    $description = (is_array($meta) && isset($meta['description'])) ? $meta['description'] : "";

    // Every value is escaped, including the URL, which comes from crawled
    // pages. Both statements store the same tag-stripped text: the UPDATE
    // used to remove every space and newline as well, which would have
    // glued all words together.
    $sql_url = mysql_escape_string($url);
    $sql_tags = mysql_escape_string($keywords);
    $sql_title = mysql_escape_string($title);
    $sql_desc = mysql_escape_string($description);
    $sql_content = mysql_escape_string(strip_tags($content));
    $sql_meta = mysql_escape_string($meta_serialized);

    if ($action == "update") {
        // Fixed: the statement had a stray ")" after NOW() and filtered on
        // an undefined variable, so it could never update a row.
        $query = "UPDATE site_index SET
            `tags` = '" . $sql_tags . "',
            `title` = '" . $sql_title . "',
            `meta_desc` = '" . $sql_desc . "',
            `content` = '" . $sql_content . "',
            `type` = 'forum',
            `meta_options_serialized` = '" . $sql_meta . "',
            `index_date` = NOW()
            WHERE `url` = '" . $sql_url . "'";
        $db->sqlquery($query);
    }
    elseif ($action == "insert") {
        $query = "INSERT INTO site_index (`url`,`tags`, `title`, `meta_desc`, `content`, `index_date`, `type`, `meta_options_serialized`)
            VALUES ('" .
            $sql_url . "', '" .
            $sql_tags . "', '" .
            $sql_title . "', '" .
            $sql_desc . "', '" .
            $sql_content . "', NOW(),'forum','" .
            $sql_meta . "')";
        $db->sqlquery($query);
    }
}
/**
 * Turns a link found on a page into an absolute URL and classifies it.
 *
 * Relative links are resolved against the page they were found on. A link is
 * "external" when its host differs from the page's host (a leading "www." is
 * ignored on both sides) and "passed" when it is already in one of the link
 * lists or site_index holds an entry younger than two days.
 *
 * @param string $current_url Absolute URL of the page the link was found on.
 * @param string $url_on_page The href value as written in the page.
 * @return array array("url" => string, "is_external" => 0|1, "is_passed" => 0|1)
 * @throws Exception code 4: empty link; code 1: link or page URL cannot be
 *                   parsed; code 2: page URL has no host; code 3: scheme that
 *                   cannot be crawled (javascript:, mailto:, ...).
 */
private function check_link($current_url, $url_on_page) {
    global $db;

    $invalid_scheme = array("javascript", "file", "mailto", "tel", "data");

    if (empty($url_on_page)) throw new Exception('$ur_on_page are empty!', 4);

    $parsed_url = parse_url($url_on_page);
    $parsed_current_url = parse_url($current_url);
    if (!is_array($parsed_url) || !is_array($parsed_current_url)) {
        throw new Exception('#001: invalid link', 1);
    }
    if (!isset($parsed_current_url['host'])) {
        throw new Exception('#002: Unable to check Link [uncomplete base]', 2);
    }

    // Scheme: the link's own, else the page's.
    if (!empty($parsed_url['scheme'])) {
        $scheme = strtolower($parsed_url['scheme']);
    }
    elseif (!empty($parsed_current_url['scheme'])) {
        $scheme = $parsed_current_url['scheme'];
    }
    else {
        $scheme = "http";
    }
    if (in_array($scheme, $invalid_scheme, true)) throw new Exception('#003: Invalid scheme!', 3);

    $link_path = isset($parsed_url['path']) ? $parsed_url['path'] : "";
    $query = (isset($parsed_url['query']) && $parsed_url['query'] !== "") ? $parsed_url['query'] : null;

    if (isset($parsed_url['host'])) {
        // Link names its own host: compare it with the page's host.
        $authority = $parsed_url['host'];
        if (isset($parsed_url['port'])) $authority .= ":" . $parsed_url['port'];

        $link_site = strtolower(preg_replace('/^www\./i', '', $parsed_url['host']));
        $page_site = strtolower(preg_replace('/^www\./i', '', $parsed_current_url['host']));
        $external_url = ($link_site === $page_site) ? 0 : 1;

        $path = ($link_path === "") ? "/" : $link_path;
    }
    else {
        // Link without host: it lives on the page's own site.
        $authority = $parsed_current_url['host'];
        if (isset($parsed_current_url['port'])) $authority .= ":" . $parsed_current_url['port'];
        $external_url = 0;

        $base_path = (isset($parsed_current_url['path']) && $parsed_current_url['path'] !== "")
            ? $parsed_current_url['path']
            : "/";

        if ($link_path === "") {
            // "?page=2" or "#top": same document as the page itself.
            $path = $base_path;
            if ($query === null && isset($parsed_current_url['query']) && $parsed_current_url['query'] !== "") {
                $query = $parsed_current_url['query'];
            }
        }
        elseif ($link_path[0] === "/") {
            $path = $link_path;
        }
        else {
            // "page.html", "../x": relative to the page's directory. These
            // used to be collapsed to "/" or glued onto the host name.
            $path = $this->resolve_relative_path($base_path, $link_path);
        }
    }

    $builded_url = $scheme . "://" . $authority . $path . ($query !== null ? "?" . $query : "");

    // Already known in this crawl?
    foreach (array($this->checked_links, $this->passed_links, $this->check_links) as $known_links) {
        if (is_array($known_links) && in_array($builded_url, $known_links)) {
            return array("url" => $builded_url, "is_external" => $external_url, "is_passed" => 1);
        }
    }

    // Indexed within the last two days? The URL comes from a foreign page
    // and must be escaped. NOTE(review): the second argument (1) is what the
    // other SELECT calls in this class pass to get rows back; it was missing
    // here - confirm against the $db wrapper.
    $pass_url = 0;
    $query_sql = "SELECT * FROM site_index WHERE url = '" . mysql_escape_string($builded_url) . "' LIMIT 1";
    $fetched = $db->sqlquery($query_sql, 1);
    // Fixed: the condition was inverted and only ran when NO row was found.
    if (is_array($fetched) && !empty($fetched[0]['index_date'])) {
        $time_entry = strtotime($fetched[0]['index_date']);
        $last_day = time() - (86400 * 2);
        if ($time_entry !== false && $time_entry >= $last_day) $pass_url = 1;
    }

    return array("url" => $builded_url, "is_external" => $external_url, "is_passed" => $pass_url);
}

/**
 * Resolves a relative path against the directory of a base path and removes
 * "." and ".." segments.
 *
 * @param string $base_path     Absolute path of the referring document.
 * @param string $relative_path Path that does not start with "/".
 * @return string Absolute path starting with "/".
 */
private function resolve_relative_path($base_path, $relative_path) {
    // Directory of the base document: everything up to the last "/".
    $slash = strrpos($base_path, "/");
    $directory = ($slash === false) ? "/" : substr($base_path, 0, $slash + 1);

    $parts = explode("/", $directory . $relative_path);
    $last_part = end($parts);

    $resolved = array();
    foreach ($parts as $part) {
        if ($part === ".") {
            continue;
        }
        if ($part === "..") {
            // Never climb above the root (the leading empty segment).
            if (count($resolved) > 1) array_pop($resolved);
            continue;
        }
        $resolved[] = $part;
    }
    // A path ending in "." or ".." names a directory: keep the trailing "/".
    if ($last_part === "." || $last_part === "..") {
        $resolved[] = "";
    }

    $path = implode("/", $resolved);
    if ($path === "" || $path[0] !== "/") {
        $path = "/" . $path;
    }
    return $path;
}
/**
 * Files a URL into exactly one of the link lists, without duplicates.
 *
 * The first matching rule wins: not valid -> invalid_links, passed ->
 * passed_links, external -> external_links, otherwise -> check_links.
 *
 * @param string $url         URL to file.
 * @param int    $is_external 1 when the URL points to another host.
 * @param int    $is_passed   1 when the URL does not need to be crawled.
 * @param int    $is_valid    0 when the URL is not crawlable.
 * @return int 0 only when the URL belongs into check_links and was already
 *             there; 1 in every other case.
 */
private function add_link($url, $is_external, $is_passed, $is_valid) {
    // Pick the list this URL belongs to.
    if ($is_valid == 0) {
        $bucket = 'invalid_links';
    }
    elseif ($is_passed == 1) {
        $bucket = 'passed_links';
    }
    elseif ($is_external == 1) {
        $bucket = 'external_links';
    }
    else {
        $bucket = 'check_links';
    }

    $already_listed = @in_array($url, $this->$bucket);
    if (!$already_listed) {
        $this->{$bucket}[] = $url;
    }

    // Only a duplicate in the crawl queue is reported as "nothing added".
    if ($bucket === 'check_links' && $already_listed) {
        return 0;
    }
    return 1;
}
/**
 * Processes a downloaded page: stores it in the index and files every link
 * found in it into the link lists.
 *
 * When no meta data can be read for the page, nothing is stored and the page
 * itself is recorded as invalid. In both cases the page is added to
 * checked_links.
 *
 * @param string $base_url URL the content was downloaded from.
 * @param string $content  HTML source of the page.
 * @param string $action   "insert" or "update", passed on to update_entry().
 * @return void
 */
private function check_content($base_url, $content, $action) {
    // Captures the href value of every <a ...>...</a> in group 2.
    $regexp = "<a\s[^>]*href=(\"??)([^\" >]*?)\\1[^>]*>(.*)<\/a>";
    $links = array();
    preg_match_all("/$regexp/siU", (string) $content, $links, PREG_SET_ORDER);
    if (!is_array($links)) {
        $links = array();
    }
    // (A leftover debugging die() used to stop the script right here, so no
    // page was ever indexed.)

    $meta = $this->getMetaData($base_url, $content);
    if (!is_array($meta)) {
        $this->checked_links[] = $base_url;
        $this->add_link($base_url, 1, 1, 0);
        return;
    }

    $this->update_entry($base_url, $content, $meta, $action);

    foreach ($links as $link) {
        try {
            $checked = $this->check_link($base_url, $link[2]);
            $this->add_link($checked['url'], $checked['is_external'], $checked['is_passed'], 1);
        }
        catch (Exception $e) {
            // Code 3 = scheme that cannot be crawled: remember the link as
            // invalid. All other failures (empty or unparsable link,
            // incomplete base URL) are skipped silently.
            if ($e->getCode() == 3) {
                $this->add_link($link[2], 1, 1, 0);
            }
        }
    }

    $this->checked_links[] = $base_url;
}
/**
 * Crawls one batch of queued sub pages ($this->check_links).
 *
 * Links already checked or passed are skipped. The batch ends after the
 * eleventh processed link (the counter is tested before it is incremented).
 *
 * @return string "do_sub" when the batch limit was hit and more requests are
 *                needed, "do" when the queue was empty or has been walked
 *                through completely (the session is destroyed in the latter
 *                case).
 */
private function process_sub_pages() {
    if (!is_array($this->check_links) || count($this->check_links) == 0) {
        $this->post_status();
        return "do";
    }

    $i = 0;
    foreach ($this->check_links as $item) {
        if (is_array($this->checked_links) && in_array($item, $this->checked_links)) {
            continue;
        }
        if (is_array($this->passed_links) && in_array($item, $this->passed_links)) {
            continue;
        }

        $parsed = parse_url($item);
        $scheme = (is_array($parsed) && isset($parsed['scheme'])) ? $parsed['scheme'] : "";
        if ($scheme == "javascript") {
            // Record it once; the link stays in the queue and would
            // otherwise be appended again on every request.
            if (!is_array($this->invalid_links) || !in_array($item, $this->invalid_links)) {
                $this->invalid_links[] = $item;
            }
            continue;
        }

        // The URL was harvested from a foreign page: escape it for HTML.
        echo "Read Subpageuri: " . htmlspecialchars((string) $item, ENT_QUOTES, 'UTF-8');
        $ret = $this->open_url_entry($item);
        if ($ret == "do_sub") {
            echo ' <span style="color: green;">Checked!</span><br />';
        }
        else {
            echo ' <span style="color: blue;">Passed!</span><br />';
        }

        if ($i >= 10) {
            return "do_sub";
        }
        $i++;
    }

    session_destroy();
    return "do";
}
/**
 * Prints the crawl statistics as HTML: the size of every link list, the last
 * entry of each list and the complete list of external links.
 *
 * @return void
 */
private function post_status() {
    // Work on local copies: a list may be missing (not an array) after a
    // session restore, and end() would move the property's internal pointer.
    // Every URL printed here was harvested from foreign pages and is
    // therefore escaped for HTML.
    $count = array();
    $last = array();
    foreach (array("check_links", "external_links", "invalid_links", "passed_links", "checked_links") as $name) {
        $list = is_array($this->$name) ? $this->$name : array();
        $count[$name] = count($list);
        $last[$name] = htmlspecialchars((string) end($list), ENT_QUOTES, 'UTF-8');
    }

    echo "<hr><div style=\"color: green;\" >" . $count["check_links"] . " internal Links found!</div>";
    echo " <div style=\"color: darkgreen;\" >" . $count["external_links"] . " external Links found!</div>";
    echo " <div style=\"color: red;\" >" . $count["invalid_links"] . " invalid Links!</div>";
    echo " <div style=\"color: blue;\" >" . $count["passed_links"] . " Links passed!</div>";
    echo "<div style=\"color: orange;\" >" . $count["checked_links"] . " Links checked!</div><hr />";

    echo "<div style=\"color: green;\" >Last Link in internal links: " . $last["check_links"] . "</div>";
    echo " <div style=\"color: darkgreen;\" >Last Link in external links: " . $last["external_links"] . "</div>";
    echo " <div style=\"color: red;\" >Last Link in invalid links: " . $last["invalid_links"] . "</div>";
    echo " <div style=\"color: blue;\" >Last Link in Passed links: " . $last["passed_links"] . "</div>";
    echo " <div style=\"color: orange;\" >Last Link in checked links: " . $last["checked_links"] . "</div>";

    echo "<hr /><div><div>List external Links</div>";
    if (is_array($this->external_links)) {
        foreach ($this->external_links as $ex_url) {
            echo "<div> >" . htmlspecialchars((string) $ex_url, ENT_QUOTES, 'UTF-8') . "</div>";
        }
    }
    echo "</div>";
}
/**
 * Saves the crawl state to the session and tells the client which step to
 * request next by printing "|<action>" and ending the script.
 *
 * For the action "do" the script only ends while base URLs are left in
 * $this->main_urls; with an empty queue the method returns so that the
 * caller can carry on.
 *
 * @param string $action   Next step for the client ("do", "do_sub", "stop", ...).
 * @param string $last_url Stored in the session under 'last'.
 * @return void
 */
private function do_ajax($action, $last_url ='') {
    // main_urls may never have been set in this request.
    $main_urls = (isset($this->main_urls) && is_array($this->main_urls)) ? $this->main_urls : array();

    $_SESSION['last'] = $last_url;
    $_SESSION['checked_links'] = $this->checked_links;
    $_SESSION['check_links'] = $this->check_links;
    $_SESSION['main_urls'] = $main_urls;
    $_SESSION['invalid_links'] = $this->invalid_links;
    $_SESSION['passed_links'] = $this->passed_links;
    $_SESSION['external_links'] = $this->external_links;

    // "do" with an empty base-URL queue: nothing to announce, keep running.
    if ($action === "do" && count($main_urls) < 1) {
        return;
    }

    echo "|" . $action;
    exit;
}
/**
 * Displays a template through the global Smarty instance.
 *
 * @param string $file Template to display.
 * @return void
 */
function output($file) {
    $GLOBALS['smarty']->display($file);
}
}
?>