#!/usr/bin/env ruby
# encoding: UTF-8
require 'rubygems'
require 'time'
require 'date'
require 'hpricot'
require 'htmlentities'
require 'rexml/document'
require 'oauth'
require 'mime/types'
require 'socket'
require 'cgi'
require 'httpclient'
require 'net/ftp'
require 'net/smtp'
require 'digest/sha2.rb'
require 'net/http'
require 'uri'
require 'fileutils'
require 'mechanize'
require 'nokogiri'
require 'open-uri'
# Scrapes Google web-search result pages with a Mechanize agent, optionally
# routed through an HTTP proxy.
class Seeker
  # User-agent strings; one is picked at random for every new agent.
  USER_AGENTS = [
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; de-at) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
    'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.1) Gecko/20100122 firefox/3.6.1',
    'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
    'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3'
  ].freeze

  # Maximum number of result headings read from a single page.
  RESULTS_PER_PAGE = 10

  # Text of the first pagination link that #run follows.
  FIRST_FOLLOW_UP_PAGE = 2

  # Returns a random Integer between 35 and 50 (inclusive). #run uses it as
  # the number of seconds to sleep between two page requests.
  def aleat
    rand(35..50)
  end

  # Extracts the URLs found inside the result headings of a page.
  #
  # query - an object whose #parser returns a document responding to #xpath
  #         (#run passes Mechanize pages).
  #
  # Returns an Array with one entry per <h3 class="r"> node (at most
  # RESULTS_PER_PAGE entries); each entry is the Array of http/https URL
  # strings found in that node's markup.
  def parse_web(query)
    query.parser.xpath('//h3[@class="r"]').take(RESULTS_PER_PAGE).map do |cite|
      # 'https' is listed too, otherwise secure result links are dropped.
      URI.extract(cite.to_s, %w[http https])
    end
  end

  # Submits a search on www.google.<domain> and collects the result URLs of
  # the first page plus up to +max+ follow-up pages.
  #
  # query  - search terms placed in the form's "q" field.
  # max    - maximum number of follow-up pages to fetch.
  # domain - suffix appended to "www.google." (for example "com").
  # proxy  - "address:port" String; nil or an empty String means no proxy.
  #
  # Returns an Array holding one #parse_web result per fetched page, or false
  # when a request or any other step fails (the failure is logged when the
  # agent has a logger).
  def run(query, max, domain, proxy)
    puts " #{query} #{max} #{domain} #{proxy}"
    addr, port = proxy.to_s.split(':')
    make_agent(addr, port)

    home = @agent.get("http://www.google.#{domain}")
    form = home.form_with(:action => '/search')
    unless form
      log_info "unknown search form not found on www.google.#{domain}"
      return false
    end

    # Only the query field is kept; every other form field is blanked.
    form.fields.each do |field|
      field.value = field.name == 'q' ? query : ''
    end

    page = form.submit
    result = [parse_web(page)]

    page_number = FIRST_FOLLOW_UP_PAGE
    limit = page_number + max
    # The bound is part of the loop condition so the loop always terminates,
    # and the next link is looked up on the most recently fetched page so that
    # pages not linked from the first result page stay reachable.
    while page_number < limit
      next_link = page.links.find { |link| link.text == page_number.to_s }
      break unless next_link

      sleep aleat
      page = @agent.get(next_link.href)
      result << parse_web(page)
      page_number += 1
    end
    result
  rescue Mechanize::ResponseCodeError => e
    log_info e.message
    false
  rescue Net::HTTPBadResponse, Net::HTTP::Persistent::Error, Errno::ECONNRESET => e
    log_info "net exception #{e}"
    false
  rescue Timeout::Error => e
    log_info "timeout: #{e}"
    false
  rescue => e
    log_info "unknown #{e}"
    false
  end

  private

  # Builds the Mechanize agent stored in @agent.
  #
  # addr - proxy address; nil or empty disables the proxy.
  # port - proxy port as Integer or decimal String; required when addr is set.
  #
  # Raises ArgumentError when addr is given without a usable port.
  def make_agent(addr = nil, port = nil)
    @agent = Mechanize.new do |a|
      # The string is set directly instead of going through an alias name,
      # so the choice does not depend on which aliases Mechanize defines.
      a.user_agent = USER_AGENTS.sample
      a.max_history = 1
      a.open_timeout = 15
      a.read_timeout = 4
      a.keep_alive = true
      unless addr.nil? || addr.empty?
        raise ArgumentError, "proxy port missing for #{addr}" if port.nil?
        # Base 10 is forced so a port with a leading zero is not read as octal.
        a.set_proxy(addr, port.is_a?(Integer) ? port : Integer(port.to_s, 10))
      end
    end
  end

  # Writes +message+ to the agent's logger; does nothing when the agent was
  # never created or has no logger.
  def log_info(message)
    logger = @agent && @agent.log
    logger.info(message) if logger
  end
end