#!/usr/bin/env ruby
# encoding: UTF-8

require 'rubygems'
require 'time'
require 'date'
require 'hpricot'
require 'htmlentities'
require 'rexml/document'
require 'oauth'
require 'mime/types'
require 'socket'
require 'cgi'
require 'httpclient'
require 'net/ftp'
require 'net/smtp'
require 'digest/sha2.rb'
require 'net/http'
require 'uri'
require 'fileutils'
require 'mechanize'
require 'nokogiri'
require 'open-uri'

# Submits a query through Google's search form with Mechanize (optionally
# through an HTTP proxy) and collects the http URLs found in the result
# headings of the first page and of the following result pages.
class Seeker
  # User-agent strings; one is picked at random for every new agent.
  USER_AGENTS = {
    'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
    'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; de-at) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
    'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
    'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
    'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
    'Linux Firefox' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.1) Gecko/20100122 firefox/3.6.1',
    'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
    'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3'
  }.freeze

  # Returns a random Integer between 35 and 50 (inclusive); used as the
  # number of seconds to wait between two result-page requests.
  def aleat
    rand(35..50)
  end

  # Extracts the http URLs from the first ten <h3 class="r"> nodes of a page.
  #
  # query - a fetched page object responding to #parser (despite its name,
  #         this is a page, not a query string).
  #
  # Returns an Array with one Array of URL strings per heading.
  def parse_web(query)
    query.parser.xpath('//h3[@class="r"]').take(10).map do |cite|
      URI.extract(cite.to_s, ['http'])
    end
  end

  # Runs a search and walks the numbered result pages.
  #
  # query  - search terms, written into the form's "q" field as given.
  # max    - maximum number of additional result pages to fetch (pages
  #          2 .. max + 1, as far as a link with that number exists on the
  #          first result page).
  # domain - Google TLD suffix appended to "http://www.google.".
  # proxy  - "host:port" String; nil or "" means no proxy.
  #
  # Returns an Array holding one #parse_web result per fetched page, or
  # false when any request (or the agent setup) fails.
  def run(query, max, domain, proxy)
    puts " #{query} #{max} #{domain} #{proxy}"
    page_no = 2
    limit = page_no + max
    result = []
    formated_query = query
    begin
      addr, port = proxy.to_s.split(':')
      make_agent(addr, port)
      page = @agent.get "http://www.google.#{domain}"
      form = page.form_with(:action => '/search')
      # Only the query field carries a value; every other field is blanked.
      form.fields.each do |f|
        form[f.name] = f.name == 'q' ? formated_query : ''
      end
      page_res = form.submit
      result << parse_web(page_res)
      # Bounded by `limit` so the loop ends once enough pages were fetched;
      # it also ends as soon as the first result page has no link labelled
      # with the next page number.
      while page_no < limit
        next_link = page_res.links.find { |l| l.text == page_no.to_s }
        break unless next_link
        sleep aleat
        result << parse_web(@agent.get(next_link.href))
        page_no += 1
      end
      return result
    rescue Mechanize::ResponseCodeError => exception
      log_info exception.message
      return false
    rescue Net::HTTPBadResponse, Net::HTTP::Persistent::Error, Errno::ECONNRESET => e
      log_info "net exception "+ e.to_s
      return false
    rescue Timeout::Error => e
      log_info "timeout: "+e.to_s
      return false
    rescue => e
      log_info "unknown #{e.to_s}"
      return false
    end
  end

  private

  # Builds a fresh Mechanize agent in @agent with a random user agent.
  #
  # addr - proxy host; nil or "" disables the proxy.
  # port - proxy port (String or Integer); must be convertible with
  #        Integer() when addr is given, otherwise an error is raised.
  def make_agent(addr = nil, port = nil)
    use_proxy = !(addr.nil? || addr.empty?)
    # Validate the port before creating the agent so a bad proxy spec fails early.
    proxy_port = Integer(port) if use_proxy
    @agent = Mechanize.new do |a|
      # Set the raw header value: the table holds full UA strings, not
      # names of Mechanize's built-in aliases.
      a.user_agent = USER_AGENTS.values.sample
      a.max_history = 1
      a.open_timeout = 15
      a.read_timeout = 4
      a.keep_alive = true
      a.set_proxy(addr, proxy_port) if use_proxy
    end
  end

  # Writes msg to the agent's logger when both the agent and a logger exist;
  # otherwise does nothing, so error handlers never raise themselves.
  def log_info(msg)
    logger = @agent && @agent.log
    logger.info(msg) if logger
  end
end