PHP Link Grabber Class (first draft)


SUBMITTED BY: Guest

DATE: July 9, 2013, 8:23 a.m.

FORMAT: PHP

SIZE: 17.0 kB

HITS: 1132

  1. <?php
  2. session_start();
  3. class bot {
  4. var $check_links = array();
  5. var $checked_links = array();
  6. var $external_links = array();
  7. var $passed_links = array();
  8. var $invalid_links = array();
  9. var $depth = 6;
  /**
   * Method named after the class; PHP versions before 8.0 treat such a
   * method as the constructor.
   *
   * Intentionally empty: the link lists and the depth start out with the
   * property defaults declared on the class.
   */
  function bot()
  {
  }
  /**
   * Entry point: runs the task requested by the client, or resets the
   * session and shows the start form when no valid task is given.
   *
   * The task name is read through $getpost_mgr->get_key("task", "get")
   * (presumably the "task" GET parameter - confirm against that helper).
   * Only whitelisted task names are dispatched: calling $this->$task() with
   * the raw request value would let a client invoke any method of this
   * class, including the private ones.
   *
   * @return void
   */
  public function start() {
      global $getpost_mgr;

      // Methods a client is allowed to trigger by name.
      $allowed_tasks = array('do_index', 'show_form');

      $task = $getpost_mgr->get_key("task", "get");

      if (is_string($task) and in_array($task, $allowed_tasks, true)) {
          $this->$task();
          return;
      }

      // No task, or an unknown one: start over with a clean session.
      session_destroy();
      $this->show_form();
  }
  /**
   * Renders the grabber start form ("grabber_form.tpl") through output().
   *
   * @return void
   */
  private function show_form() {
      $this->output("grabber_form.tpl");
  }
  /**
   * Processes the queued base URLs, one per request.
   *
   * Each base URL is fetched via open_url_entry_base(), removed from the
   * queue, and then do_ajax("do") is called. do_ajax() stores the crawl
   * state in the session and ends the request while base URLs remain, so
   * the code after the loop only runs once the queue is empty.
   *
   * @return string Always "do_sub" (continue with the sub pages).
   */
  private function process_main_urls() {
      if (isset($this->main_urls) and is_array($this->main_urls)) {
          foreach ($this->main_urls as $key => $main_url) {
              // The URL comes from the database; escape it before echoing into HTML.
              echo "Read Base Uri: " . htmlspecialchars($main_url['site_url'], ENT_QUOTES) . "<br />";
              $this->open_url_entry_base($main_url['site_url']);
              unset($this->main_urls[$key]);
              $this->do_ajax("do");
          }
      }

      $this->post_status();

      // Randomise the order in which the collected sub pages get visited.
      if (is_array($this->check_links)) {
          shuffle($this->check_links);
      }

      return "do_sub";
  }
  /**
   * Runs one step of the crawl.
   *
   * Restores the crawl state that do_ajax() stored in the session, then
   * dispatches on the value returned by $getpost_mgr->get_key("work", "get"):
   *  - "start":  load the list of base URLs that are due,
   *  - "do":     process the base URLs,
   *  - "do_sub": process a batch of sub pages.
   * Every handled step ends in do_ajax(), which reports the next step to
   * the client.
   *
   * @return void
   */
  private function do_index() {
      global $getpost_mgr;

      // A fresh session has none of these keys. Fall back to empty arrays
      // instead of overwriting the lists with null.
      $state_keys = array(
          'checked_links',
          'check_links',
          'main_urls',
          'invalid_links',
          'passed_links',
          'external_links',
      );
      foreach ($state_keys as $state_key) {
          if (isset($_SESSION[$state_key]) and is_array($_SESSION[$state_key])) {
              $this->$state_key = $_SESSION[$state_key];
          }
          else {
              $this->$state_key = array();
          }
      }

      $this->page_gen = microtime(true);
      $work_id = $getpost_mgr->get_key("work", "get");

      switch ($work_id) {
          case 'start':
              $action = $this->getting_url_list();
              $this->do_ajax($action);
              break;
          case 'do':
              $this->process_main_urls();
              $this->do_ajax("do_sub");
              break;
          case 'do_sub':
              $action = $this->process_sub_pages();
              $this->post_status();
              $this->do_ajax($action);
              break;
          case 'restart':
              // Accepted, but nothing is implemented for it.
              break;
          default:
              break;
      }
  }
  /**
   * Loads the base URLs from url_index whose "timestep" is at least two
   * days old and stores them in $this->main_urls.
   *
   * @return string "do" when base URLs were found, "stop" otherwise.
   */
  private function getting_url_list() {
      global $db;

      echo "<div>Getting URL´s that older than Yesterday</div>";

      // Despite the message above, the cut-off is two days (2 * 86400 s) ago.
      $date = date('Y-m-d H:i:s', time() - (86400 * 2));

      $query = "SELECT * FROM
          url_index
          WHERE timestep <= '" . $date . "'";
      $result = $db->sqlquery($query, 1);

      if (is_array($result) and !empty($result)) {
          echo '<div style="color: green;">' . count($result) . " Base Links Found</div>";
          $this->main_urls = $result;
          return "do";
      }

      echo '<div style="color:orange;">Links uptodate!</div>';
      return "stop";
  }
  /**
   * Fetches a sub page and passes its content to check_content(), unless
   * the URL was already handled or was indexed within the last two days.
   *
   * @param string $url URL of the page to process.
   * @return int|string 1 when the URL is skipped before the database lookup
   *                    (already in passed_links, or a javascript: link),
   *                    "do_nothing" when site_index holds an entry newer
   *                    than two days, "do_sub" when the page was fetched.
   */
  private function open_url_entry($url) {
      global $db;

      $agent  = "Meine Browserkennung v1.0 :)";
      $header = array("Accept: text/vnd.wap.wml,*.*");

      if (!is_array($this->passed_links)) {
          $this->passed_links = array();
      }

      // Already classified as passed during this crawl.
      if (in_array($url, $this->passed_links)) {
          $this->post_status();
          return 1;
      }

      $parsed = parse_url($url);
      $scheme = (is_array($parsed) and isset($parsed['scheme'])) ? strtolower($parsed['scheme']) : '';
      if ($scheme == "javascript") {
          $this->invalid_links[] = $url;
          return 1;
      }

      // NOTE(review): mysql_escape_string() follows the escaping already used
      // elsewhere in this class, but it was removed in PHP 7. Switch to
      // prepared statements once the $db wrapper offers them.
      $select_query  = "SELECT * FROM site_index WHERE url = '" . mysql_escape_string($url) . "'";
      $select_result = $db->sqlquery($select_query, 1);

      $action   = "insert";
      $last_day = time() - (86400 * 2);

      if (is_array($select_result) and isset($select_result[0]) and is_array($select_result[0])) {
          $time_entry = isset($select_result[0]['index_date'])
              ? strtotime($select_result[0]['index_date'])
              : false;

          // An unparsable date (false) counts as outdated, so the entry is refreshed.
          if ($time_entry <= $last_day) {
              $action = "update";
          }
          else {
              // Indexed recently: remember it as passed and skip the download.
              if (!in_array($url, $this->passed_links)) {
                  $this->passed_links[] = $url;
              }
              return "do_nothing";
          }
      }

      $read = '';
      $website_base = curl_init($url);
      if ($website_base !== false) {
          curl_setopt($website_base, CURLOPT_RETURNTRANSFER, 1);
          curl_setopt($website_base, CURLOPT_USERAGENT, $agent);
          curl_setopt($website_base, CURLOPT_HTTPHEADER, $header);
          curl_setopt($website_base, CURLOPT_FOLLOWLOCATION, 1);
          $read = curl_exec($website_base);
          curl_close($website_base);

          // curl_exec() returns false on failure; continue with empty content.
          if ($read === false) {
              $read = '';
          }
      }

      $this->check_content($url, $read, $action);

      return "do_sub";
  }
  /**
   * Fetches a base URL and passes its content to check_content(), unless
   * the URL was already checked during this crawl or its url_index entry
   * is newer than two days.
   *
   * @param string $url Base URL (the "site_url" column of url_index).
   * @return int|null 1 when the URL was skipped, null after a fetch.
   */
  private function open_url_entry_base($url) {
      global $db;

      $agent  = "Meine Browserkennung v1.0 :)";
      $header = array("Accept: text/vnd.wap.wml,*.*");

      // Already fetched during this crawl.
      if (is_array($this->checked_links) and in_array($url, $this->checked_links)) {
          $this->post_status();
          return 1;
      }

      $parsed = parse_url($url);
      $scheme = (is_array($parsed) and isset($parsed['scheme'])) ? strtolower($parsed['scheme']) : '';
      if ($scheme == "javascript") {
          $this->invalid_links[] = $url;
          return 1;
      }

      // NOTE(review): mysql_escape_string() follows the escaping already used
      // elsewhere in this class, but it was removed in PHP 7. Switch to
      // prepared statements once the $db wrapper offers them.
      $select_query  = "SELECT * FROM url_index WHERE site_url = '" . mysql_escape_string($url) . "'";
      $select_result = $db->sqlquery($select_query, 1);

      $action   = "insert";
      $last_day = time() - (86400 * 2);

      if (is_array($select_result) and isset($select_result[0]) and is_array($select_result[0])) {
          $time_entry = isset($select_result[0]['timestep'])
              ? strtotime($select_result[0]['timestep'])
              : false;

          // An unparsable date (false) counts as outdated, so the entry is refreshed.
          if ($time_entry <= $last_day) {
              $action = "update";
          }
          else {
              $this->passed_links[] = $url;
              $this->post_status();
              return 1;
          }
      }

      $read = '';
      $website_base = curl_init($url);
      if ($website_base !== false) {
          curl_setopt($website_base, CURLOPT_RETURNTRANSFER, 1);
          curl_setopt($website_base, CURLOPT_USERAGENT, $agent);
          curl_setopt($website_base, CURLOPT_HTTPHEADER, $header);
          curl_setopt($website_base, CURLOPT_FOLLOWLOCATION, 1);
          $read = curl_exec($website_base);
          curl_close($website_base);

          // curl_exec() returns false on failure; continue with empty content.
          if ($read === false) {
              $read = '';
          }
      }

      $this->check_content($url, $read, $action);
  }
  /**
   * Returns the text between the first occurrence of $start and the next
   * occurrence of $end after it.
   *
   * @param string $string Text to search.
   * @param string $start  Opening marker.
   * @param string $end    Closing marker.
   * @return string The enclosed text, or "" when either marker is empty or
   *                cannot be found. A missing closing marker used to yield
   *                a negative length and therefore an arbitrary substring.
   */
  function get_string_between($string, $start, $end){
      if ($start === '' or $end === '') {
          return "";
      }

      $start_pos = strpos($string, $start);
      if ($start_pos === false) {
          return "";
      }
      $start_pos += strlen($start);

      $end_pos = strpos($string, $end, $start_pos);
      if ($end_pos === false) {
          return "";
      }

      return (string) substr($string, $start_pos, $end_pos - $start_pos);
  }
  /**
   * Collects the meta tags of a page and adds its title.
   *
   * The meta tags are read by get_meta_tags(), which opens $url itself;
   * the title is taken from the already downloaded $content. The title
   * search is case-insensitive and yields "" when the page has no
   * <title> element.
   *
   * Side effect: sets the default stream context's HTTP timeout to 10
   * seconds.
   *
   * @param string $url     Location of the page, read by get_meta_tags().
   * @param string $content Markup of the page.
   * @return array|false Meta tags keyed by name plus a "title" entry, or
   *                     false when get_meta_tags() fails.
   */
  function getMetaData($url, $content){
      stream_context_set_default(
          array(
              'http' => array(
                  'timeout' => 10
              )
          )
      );

      $meta = @get_meta_tags($url);
      if (!is_array($meta)) {
          return $meta;
      }

      $meta['title'] = '';
      if (is_string($content) and preg_match('/<title[^>]*>(.*?)<\/title>/is', $content, $title_match)) {
          $meta['title'] = $title_match[1];
      }

      return $meta;
  }
  /**
   * Writes the indexed data of a page to the site_index table.
   *
   * @param string $url     URL of the page; also the lookup key for updates.
   * @param string $content Markup of the page; tags are stripped before storing.
   * @param array  $meta    Meta data as returned by getMetaData(); the keys
   *                        "keywords", "title" and "description" are stored
   *                        in their own columns, the whole array serialized.
   * @param string $action  "update" to refresh the existing row, "insert" to
   *                        create one. Any other value does nothing.
   * @return void
   */
  private function update_entry($url, $content, $meta ,$action) {
      global $db;

      // NOTE(review): mysql_escape_string() was removed in PHP 7. Switch to
      // prepared statements once the $db wrapper offers them.
      $escaped_url  = mysql_escape_string($url);
      $escaped_tags = mysql_escape_string(isset($meta['keywords']) ? $meta['keywords'] : '');
      $escaped_title = mysql_escape_string(isset($meta['title']) ? $meta['title'] : '');
      $escaped_desc = mysql_escape_string(isset($meta['description']) ? $meta['description'] : '');
      $escaped_meta = mysql_escape_string(serialize($meta));
      $plain_text   = strip_tags((string) $content);

      if ($action == "update") {
          // NOTE(review): the update path also removes every space and newline
          // from the text, while the insert path does not - confirm which
          // form is intended.
          $escaped_content = mysql_escape_string(str_replace(array(" ", "\n"), "", $plain_text));

          // The previous query had a stray ")" after NOW() and filtered on an
          // undefined variable, so it could never update the intended row.
          $query = "UPDATE
              site_index
              SET
              `url` = '" . $escaped_url . "',
              `tags` = '" . $escaped_tags . "',
              `title` = '" . $escaped_title . "',
              `meta_desc` = '" . $escaped_desc . "',
              `content` = '" . $escaped_content . "',
              `type`= 'forum',
              `meta_options_serialized` = '" . $escaped_meta . "',
              `index_date` = NOW()
              WHERE
              url = '" . $escaped_url . "'";
          $db->sqlquery($query);
      }
      elseif ($action == "insert") {
          $query = "INSERT INTO site_index (`url`,`tags`, `title`, `meta_desc`, `content`, `index_date`, `type`, `meta_options_serialized`)
              VALUES ('" .
              $escaped_url . "', '" .
              $escaped_tags . "', '" .
              $escaped_title . "', '" .
              $escaped_desc . "', '" .
              mysql_escape_string($plain_text) . "', NOW(),'forum','" .
              $escaped_meta . "')";
          $db->sqlquery($query);
      }
  }
  /**
   * Turns a link found on a page into an absolute URL and classifies it.
   *
   * Fixes over the previous implementation:
   *  - relative paths are resolved against the directory of the current
   *    page instead of being replaced by "/" or glued onto the host name,
   *  - query- or fragment-only links resolve to the current page,
   *  - an explicit port is kept,
   *  - a link that is merely queued in check_links is no longer reported
   *    as passed (which kept it from ever being fetched),
   *  - the site_index lookup result is actually evaluated (the condition
   *    was inverted) and the URL is escaped in the query,
   *  - mailto:, tel: and data: links are rejected like javascript: and file:.
   *
   * @param string $current_url URL of the page the link was found on.
   * @param string $url_on_page Raw href value.
   * @return array "url" (absolute URL), "is_external" (0|1) and
   *               "is_passed" (0|1: already handled or indexed within the
   *               last two days).
   * @throws Exception Code 4: empty link; 1: unparsable URL; 2: current
   *                   URL without host; 3: scheme that cannot be crawled.
   */
  private function check_link($current_url, $url_on_page) {
      global $db;

      $invalid_scheme = array("javascript", "file", "mailto", "tel", "data");
      $external_url = 0;
      $pass_url = 0;

      if (empty($url_on_page)) throw new Exception('$ur_on_page are empty!', 4);

      $parsed_url = parse_url($url_on_page);
      $parsed_current_url = parse_url($current_url);
      if (!is_array($parsed_url) or !is_array($parsed_current_url)) {
          throw new Exception('#001: invalid link', 1);
      }
      if (!isset($parsed_current_url['host'])) {
          throw new Exception('#002: Unable to check Link [uncomplete base]', 2);
      }

      // A link without scheme inherits the scheme of the current page.
      if (!empty($parsed_url['scheme'])) {
          $scheme = strtolower($parsed_url['scheme']);
      }
      elseif (!empty($parsed_current_url['scheme'])) {
          $scheme = strtolower($parsed_current_url['scheme']);
      }
      else {
          $scheme = "http";
      }
      if (in_array($scheme, $invalid_scheme, true)) throw new Exception('#003: Invalid scheme!', 3);

      $has_query = (isset($parsed_url['query']) and $parsed_url['query'] !== '');
      $param_str = $has_query ? "?" . $parsed_url['query'] : "";
      $base_path = (isset($parsed_current_url['path']) and $parsed_current_url['path'] !== '')
          ? $parsed_current_url['path']
          : "/";

      if (isset($parsed_url['host'])) {
          $host = $parsed_url['host'];
          $port = isset($parsed_url['port']) ? ":" . $parsed_url['port'] : "";
          $path = (isset($parsed_url['path']) and $parsed_url['path'] !== '') ? $parsed_url['path'] : "/";

          // Same site when the hosts match, with or without a "www." prefix
          // on either side (previously only one direction was recognised).
          $link_host = strtolower($host);
          $base_host = strtolower($parsed_current_url['host']);
          $same_site = ($link_host == $base_host
              or "www." . $link_host == $base_host
              or $link_host == "www." . $base_host);
          $external_url = $same_site ? 0 : 1;
      }
      else {
          $host = $parsed_current_url['host'];
          $port = isset($parsed_current_url['port']) ? ":" . $parsed_current_url['port'] : "";

          if (!isset($parsed_url['path']) or $parsed_url['path'] === '') {
              // "?x=1" or "#top": refers to the current page.
              $path = $base_path;
              if (!$has_query and !empty($parsed_current_url['query'])) {
                  $param_str = "?" . $parsed_current_url['query'];
              }
          }
          elseif ($parsed_url['path'][0] == "/") {
              $path = $parsed_url['path'];
          }
          else {
              $path = $this->resolve_relative_path($base_path, $parsed_url['path']);
          }
      }

      $builded_url = $scheme . "://" . $host . $port . $path . $param_str;

      // Already fetched or skipped during this crawl.
      if (@in_array($builded_url, $this->checked_links) or @in_array($builded_url, $this->passed_links)) {
          $pass_url = 1;
          return array("url" => $builded_url, "is_external" => $external_url, "is_passed" => $pass_url);
      }

      // Already queued: report it unchanged so it stays in the queue.
      if (@in_array($builded_url, $this->check_links)) {
          return array("url" => $builded_url, "is_external" => $external_url, "is_passed" => $pass_url);
      }

      // NOTE(review): mysql_escape_string() follows the escaping already used
      // elsewhere in this class, but it was removed in PHP 7. The second
      // sqlquery() argument is passed as at the other SELECT call sites;
      // presumably it requests the fetched rows - confirm against the $db
      // wrapper.
      $query = "SELECT * FROM site_index WHERE url = '" . mysql_escape_string($builded_url) . "' LIMIT 1";
      $fetched = $db->sqlquery($query, 1);
      if (is_array($fetched) and isset($fetched[0]['index_date'])) {
          $time_entry = strtotime($fetched[0]['index_date']);
          $last_day = time() - (86400 * 2);
          if ($time_entry !== false and $time_entry >= $last_day) $pass_url = 1;
      }

      return array("url" => $builded_url, "is_external" => $external_url, "is_passed" => $pass_url);
  }

  /**
   * Resolves a relative path against the directory of a base path and
   * removes "." and ".." segments.
   *
   * @param string $base_path     Path of the current page, e.g. "/dir/index.php".
   * @param string $relative_path Path that does not start with "/".
   * @return string Absolute path starting with "/".
   */
  private function resolve_relative_path($base_path, $relative_path) {
      $last_slash = strrpos($base_path, "/");
      $directory = ($last_slash === false) ? "/" : substr($base_path, 0, $last_slash + 1);

      $segments = explode("/", ltrim($directory . $relative_path, "/"));
      $final_segment = end($segments);

      $resolved = array();
      foreach ($segments as $segment) {
          if ($segment === "..") {
              array_pop($resolved);
          }
          elseif ($segment !== ".") {
              $resolved[] = $segment;
          }
      }

      // A path ending in "." or ".." names a directory: keep the trailing slash.
      if ($final_segment === "." or $final_segment === "..") {
          $resolved[] = "";
      }

      return "/" . implode("/", $resolved);
  }
  /**
   * Files a URL into exactly one of the link lists, skipping duplicates.
   *
   * The flags are evaluated in this order: invalid, passed, external; a
   * URL matching none of them goes to the queue of links still to check.
   *
   * @param string $url         URL to store.
   * @param int    $is_external 1 when the URL points to another site.
   * @param int    $is_passed   1 when the URL needs no (further) visit.
   * @param int    $is_valid    0 when the URL is unusable.
   * @return int 0 only when the URL was meant for the check queue and is
   *             already in it; 1 in every other case.
   */
  private function add_link($url, $is_external, $is_passed, $is_valid) {
      // Pick the list this URL belongs to.
      if ($is_valid == 0) {
          $target = 'invalid_links';
      }
      elseif ($is_passed == 1) {
          $target = 'passed_links';
      }
      elseif ($is_external == 1) {
          $target = 'external_links';
      }
      else {
          $target = 'check_links';
      }

      $known = @in_array($url, $this->$target);
      if (!$known) {
          $this->{$target}[] = $url;
      }

      // Only a duplicate in the check queue is reported as "not added".
      return ($known and $target === 'check_links') ? 0 : 1;
  }
  /**
   * Indexes a downloaded page and collects the links it contains.
   *
   * When getMetaData() returns an array, the page is written to the index
   * via update_entry() and every <a href> is classified by check_link()
   * and stored by add_link(). Otherwise the page itself is recorded as
   * invalid. In both cases the URL is added to checked_links.
   *
   * A leftover debugging die() directly after the link extraction, which
   * aborted every run at this point, has been removed.
   *
   * @param string $base_url URL the content was downloaded from.
   * @param string $content  Markup of the page.
   * @param string $action   "insert" or "update", passed on to update_entry().
   * @return void
   */
  private function check_content($base_url, $content, $action) {
      $regexp = "<a\s[^>]*href=(\"??)([^\" >]*?)\\1[^>]*>(.*)<\/a>";
      $links = array();
      preg_match_all("/$regexp/siU", (string) $content, $links, PREG_SET_ORDER);
      if (!is_array($links)) {
          $links = array();
      }

      $meta = $this->getMetaData($base_url, $content);
      if (!is_array($meta)) {
          $this->checked_links[] = $base_url;
          $this->add_link($base_url, 1, 1, 0);
          return;
      }

      $this->update_entry($base_url, $content, $meta, $action);

      foreach ($links as $link) {
          try {
              // $link[2] is the href value captured by the pattern above.
              $checked = $this->check_link($base_url, $link[2]);
              $this->add_link($checked['url'], $checked['is_external'], $checked['is_passed'], 1);
          }
          catch (Exception $e) {
              // Code 3 (scheme that cannot be crawled) is recorded as an
              // invalid link; all other failures are skipped silently.
              if ($e->getCode() == 3) {
                  $this->add_link($link[2], 1, 1, 0);
              }
          }
      }

      $this->checked_links[] = $base_url;
  }
  /**
   * Visits queued sub pages, at most eleven per request.
   *
   * Links that are already in checked_links or passed_links are skipped.
   * Pages fetched here may queue further links; the session is therefore
   * only destroyed once no queued link is left unhandled. Previously it
   * was destroyed as soon as the loop finished, which dropped every link
   * discovered during the last batch.
   *
   * @return string "do_sub" while links remain to be visited, "do" when
   *                the queue is empty or completely handled.
   */
  private function process_sub_pages() {
      if (!is_array($this->check_links) or count($this->check_links) == 0) {
          $this->post_status();
          return "do";
      }

      foreach (array('checked_links', 'passed_links', 'invalid_links') as $list) {
          if (!is_array($this->$list)) {
              $this->$list = array();
          }
      }

      $i = 0;
      foreach ($this->check_links as $item) {
          if (in_array($item, $this->checked_links) or in_array($item, $this->passed_links)) {
              continue;
          }

          $parsed = parse_url($item);
          $scheme = (is_array($parsed) and isset($parsed['scheme'])) ? strtolower($parsed['scheme']) : '';
          if ($scheme == "javascript") {
              if (!in_array($item, $this->invalid_links)) {
                  $this->invalid_links[] = $item;
              }
              continue;
          }

          // Crawled URLs are untrusted; escape them before echoing into HTML.
          echo "Read Subpageuri: " . htmlspecialchars($item, ENT_QUOTES);
          $ret = $this->open_url_entry($item);
          if ($ret === "do_sub") {
              echo ' <span style="color: green;">Checked!</span><br />';
          }
          else {
              echo ' <span style="color: blue;">Passed!</span><br />';
          }

          // End the batch; the client requests the next one.
          if ($i >= 10) {
              return "do_sub";
          }
          $i++;
      }

      // Anything queued by the pages fetched above still needs a visit.
      foreach ($this->check_links as $item) {
          if (!in_array($item, $this->checked_links)
              and !in_array($item, $this->passed_links)
              and !in_array($item, $this->invalid_links)) {
              return "do_sub";
          }
      }

      session_destroy();
      return "do";
  }
  /**
   * Prints the crawl statistics as HTML: the size of every link list, the
   * last entry of every list, and all external links.
   *
   * The URLs originate from crawled pages and are escaped before output.
   *
   * @return void
   */
  private function post_status() {
      // property, colour, lead-in and text of the counter line,
      // lead-in and label of the "last link" line
      $rows = array(
          array('check_links',    'green',     '<hr>', ' internal Links found!</div>',   '',  'Last Link in internal links: '),
          array('external_links', 'darkgreen', ' ',    ' external Links found!</div>',   ' ', 'Last Link in external links: '),
          array('invalid_links',  'red',       ' ',    ' invalid Links!</div>',          ' ', 'Last Link in invalid links: '),
          array('passed_links',   'blue',      ' ',    ' Links passed!</div>',           ' ', 'Last Link in Passed links: '),
          array('checked_links',  'orange',    '',     ' Links checked!</div><hr />',    ' ', 'Last Link in checked links: '),
      );

      // Work on copies so a list that is not an array counts as empty.
      $lists = array();
      foreach ($rows as $row) {
          $lists[$row[0]] = is_array($this->{$row[0]}) ? $this->{$row[0]} : array();
      }

      foreach ($rows as $row) {
          echo $row[2] . "<div style=\"color: " . $row[1] . ";\" >" . count($lists[$row[0]]) . $row[3];
      }

      foreach ($rows as $row) {
          $entries = $lists[$row[0]];
          $last = count($entries) > 0 ? (string) end($entries) : '';
          echo $row[4] . "<div style=\"color: " . $row[1] . ";\" >" . $row[5]
              . htmlspecialchars($last, ENT_QUOTES) . "</div>";
      }

      echo "<hr /><div><div>List external Links</div>";
      foreach ($lists['external_links'] as $ex_url) {
          echo "<div> >" . htmlspecialchars((string) $ex_url, ENT_QUOTES) . "</div>";
      }
      echo "</div>";
  }
  /**
   * Stores the crawl state in the session and tells the client which step
   * to request next by printing "|" followed by the action name.
   *
   * For every action except "do" the request ends here (exit). For "do"
   * the request only ends while base URLs are still queued; otherwise the
   * method returns silently and the caller carries on.
   *
   * @param string $action   Next step, e.g. "do", "do_sub" or "stop".
   * @param string $last_url Stored in the session under "last".
   * @return void
   */
  private function do_ajax($action, $last_url ='') {
      $main_urls = (isset($this->main_urls) and is_array($this->main_urls)) ? $this->main_urls : array();

      $_SESSION['last'] = $last_url;
      $_SESSION['checked_links'] = $this->checked_links;
      $_SESSION['check_links'] = $this->check_links;
      $_SESSION['main_urls'] = $main_urls;
      $_SESSION['invalid_links'] = $this->invalid_links;
      $_SESSION['passed_links'] = $this->passed_links;
      $_SESSION['external_links'] = $this->external_links;

      // "do" with an empty base URL queue: nothing to announce.
      if ($action == "do" and count($main_urls) < 1) {
          return;
      }

      echo "|" . $action;
      exit;
  }
  /**
   * Displays a template through the global $smarty object.
   *
   * @param string $file Template name handed to $smarty->display().
   * @return void
   */
  function output($file) {
      $GLOBALS['smarty']->display($file);
  }
  458. }
  459. ?>

comments powered by Disqus