#!/usr/bin/env ruby
# encoding: UTF-8
#
# Google SERP scraper: fetches up to 10 result pages from google.fr for the
# query given as ARGV[0], prints every result URL found in the
# <h3 class="r"> headline nodes, and pauses a random interval between pages
# to avoid hammering the server.
#
# NOTE(review): many of these requires appear unused by the code below
# (hpricot, oauth, ftp, smtp, ...); kept as-is rather than risk breaking an
# external expectation, but they are candidates for removal.
require 'rubygems'
require 'time'
require 'date'
require 'hpricot'
require 'htmlentities'
require 'rexml/document'
require 'oauth'
require 'mime/types'
require 'socket'
require 'cgi'
require 'multipart_body'
require 'httpclient'
require 'net/ftp'
require 'net/smtp'
require 'digest/sha2.rb'
require 'net/http'
require 'uri'
require 'fileutils'
require 'mechanize'
require 'nokogiri'
require 'open-uri'

module GetGoogleSerp
  # Random pause length in minutes (0.15..0.75 of a minute => 9..45 seconds),
  # used between successive page fetches.
  def aleat
    rand(0.15..0.75)
  end

  # Extracts http URLs from the result headline nodes (<h3 class="r">) of a
  # fetched Mechanize page, printing each batch as a side effect.
  #
  # @param page [Mechanize::Page] a fetched Google results page
  # @return [Array<Array<String>>] one array of URLs per headline (max 10)
  def parse_web(page)
    page.parser.xpath('//h3[@class="r"]').take(10).map do |cite|
      urls = URI.extract(cite.to_s, ['http'])
      puts urls
      urls
    end
  end

  # Fetches up to 10 SERP pages for +query+ and returns the parsed results.
  # (Was misleadingly named `initialize`, which Ruby treats as a private
  # constructor hook; renamed to a plain descriptive method.)
  #
  # Pagination links are looked up on the FIRST page only (the original
  # behavior): page 1 of a Google SERP links to pages 2..10 by text.
  #
  # @param query [String] raw search query
  # @return [Array<Array<Array<String>>>] parse_web results, one per page
  def fetch_serp(query)
    # BUG FIX: URI.escape was deprecated in 2.7 and removed in Ruby 3.0;
    # CGI.escape produces the query-string encoding we need here.
    formatted_query = CGI.escape(query)
    @agent = Mechanize.new
    first_page = @agent.get("http://www.google.fr/search?q=#{formatted_query}")
    puts 'page 1'
    results = [parse_web(first_page)]

    (2..10).each do |page_no|
      # BUG FIX: `aleat.minutes.sleep` requires ActiveSupport, which is not
      # loaded -> NoMethodError. Convert minutes to seconds explicitly.
      sleep(aleat * 60)
      puts "page #{page_no}"
      next_link = first_page.links.find { |l| l.text == page_no.to_s }
      break unless next_link # robustness: stop cleanly if the link is absent

      results << parse_web(@agent.get(next_link.href))
    end
    results
  end
end

include GetGoogleSerp # mix in at top level, not inside a method body

# Entry point: scrape the SERP for +query+ and print the collected URLs.
def main(query)
  abort 'usage: get_google_serp.rb QUERY' if query.nil? || query.empty?
  puts fetch_serp(query)
end

main(ARGV[0]) if __FILE__ == $PROGRAM_NAME