#!/usr/bin/env ruby
# encoding: UTF-8
require 'rubygems'
require 'time'
require 'date'
require 'hpricot'
require 'htmlentities'
require 'rexml/document'
require 'oauth'
require 'mime/types'
require 'socket'
require 'cgi'
require 'multipart_body'
require 'httpclient'
require 'net/ftp'
require 'net/smtp'
require 'digest/sha2.rb'
require 'net/http'
require 'uri'
require 'fileutils'
require 'mechanize'
require 'nokogiri'
require 'open-uri'
# Scrapes the first Google (google.fr) result pages for a query and collects
# the result URLs. Designed to be mixed into an object; #initialize doubles
# as the entry point so inclusion + "construction" kicks off the scrape.
module Get_google_serp
  # Random delay, in MINUTES, used between successive page fetches to
  # throttle requests (0.15..0.75 min, i.e. roughly 9-45 seconds).
  #
  # Returns a Float.
  def aleat
    rand(0.15..0.75)
  end

  # Extracts up to 10 result URLs from a Google results page.
  #
  # query - a Mechanize::Page (anything whose #parser returns a Nokogiri
  #         document responding to #xpath).
  #
  # Prints each extracted URL list and returns an Array of Arrays of URL
  # Strings — one inner Array per <h3 class="r"> result heading.
  def parse_web(query)
    formated_tab = []
    query.parser.xpath('//h3[@class="r"]').take(10).each do |cite|
      # URI.extract is obsolete (warns since Ruby 2.2); DEFAULT_PARSER.extract
      # is the supported equivalent. Extract once instead of twice.
      urls = URI::DEFAULT_PARSER.extract(cite.to_s, ['http'])
      puts urls
      formated_tab << urls
    end
    formated_tab
  end

  # Fetches up to 10 Google result pages for +query+ and returns an Array of
  # per-page URL lists (see #parse_web). An ordinary method despite its name;
  # callers invoke it directly after including this module.
  def initialize(query)
    i = 2
    limit = i + 9
    result = []
    # URI.escape was removed in Ruby 3.0; CGI.escape correctly encodes a
    # query-string component ('cgi' is already required by this file).
    formated_query = CGI.escape(query)
    @agent = Mechanize.new
    page = @agent.get "http://www.google.fr/search?q=#{formated_query}"
    puts 'page 1'
    result << parse_web(page)
    while i < limit
      # Numeric#minutes is ActiveSupport, which is not loaded here; aleat is
      # a value in minutes, so convert to seconds for Kernel#sleep ourselves.
      sleep(aleat * 60)
      puts "page #{i}"
      # NOTE(review): pagination links are looked up on page 1 only — Google's
      # footer usually lists links 2..10 there, so this appears intentional.
      next_link = page.links.find { |l| l.text == i.to_s }
      break unless next_link # pagination link missing: stop early, don't crash

      next_page = @agent.get next_link.href
      result << parse_web(next_page)
      i += 1
    end
    result
  end
end
# Command-line entry point: scrapes Google result pages for +query+ and
# prints every collected URL list.
#
# query - the search string (typically ARGV[0]). Aborts with a usage message
#         when absent/empty — previously a nil query crashed deep inside the
#         URL-escaping call with an unhelpful NoMethodError.
def main(query)
  abort "usage: #{$PROGRAM_NAME} QUERY" if query.nil? || query.empty?
  # Top-level include mixes the scraper into Object, so the module's
  # (private) #initialize can be called here with an implicit receiver.
  include Get_google_serp
  puts initialize(query)
end
# Kick off the scrape with the first CLI argument (nil when none was given).
main ARGV[0]