#!/usr/bin/env ruby
# encoding: UTF-8

require 'rubygems'
require 'time'
require 'date'
require 'hpricot'
require 'htmlentities'
require 'rexml/document'
require 'oauth'
require 'mime/types'
require 'socket' 
require 'cgi'
require 'httpclient'
require 'net/ftp'
require 'net/smtp'
require 'digest/sha2.rb'
require 'net/http' 
require 'uri'
require 'fileutils'
require 'mechanize'
require 'nokogiri'
require 'open-uri'

  # Scrapes Google search result pages through a Mechanize agent, optionally
  # routed through an HTTP proxy, and collects the URLs found in the result
  # headings of each page.
  class Seeker
    # User-agent strings; one is picked at random for every new agent.
    USER_AGENTS = {
      'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
      'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
      'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
      'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; de-at) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
      'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
      'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
      'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
      'Linux Firefox' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.1) Gecko/20100122 firefox/3.6.1',
      'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
      'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3'
    }.freeze

    # URL schemes extracted from result headings.
    URL_SCHEMES = %w[http https].freeze

    # Returns a random delay (whole seconds, 35..50 inclusive) used to space
    # out successive page requests.
    def aleat
      rand(35..50)
    end

    # Extracts URLs from the first ten <h3 class="r"> nodes of a page.
    #
    # query - a fetched page responding to #parser, whose parser responds to
    #         #xpath.
    #
    # Returns an Array with one Array of URL strings per heading (a heading
    # without any URL contributes an empty Array).
    def parse_web(query)
      query.parser.xpath('//h3[@class="r"]').take(10).map do |cite|
        URI.extract(cite.to_s, URL_SCHEMES)
      end
    end

    # Runs a search and walks the numbered pagination links.
    #
    # query  - the search terms, submitted as-is through the search form.
    # max    - how many additional result pages to fetch after the first one.
    # domain - Google TLD suffix appended to "http://www.google.".
    # proxy  - "host:port" String; nil or "" means a direct connection.
    #
    # Returns an Array holding one #parse_web result per fetched page, or
    # false when any request or setup step fails.
    def run(query, max, domain, proxy)
      puts " #{query} #{max} #{domain} #{proxy}"
      page_number = 2
      limit = page_number + max
      result = []
      begin
        addr, port = proxy.to_s.split(':')
        make_agent(addr, port)

        home = @agent.get "http://www.google.#{domain}"
        first_page = submit_search(home, query)
        result << parse_web(first_page)

        # The bound is checked first so the loop ends once `max` extra pages
        # were fetched, even if further numbered links exist. Links are looked
        # up on the first result page, as before.
        while page_number < limit
          label = page_number.to_s
          next_link = first_page.links.find { |l| l.text == label }
          break unless next_link

          sleep aleat
          result << parse_web(@agent.get(next_link.href))
          page_number += 1
        end
        result
      rescue Mechanize::ResponseCodeError => exception
        log_info exception.message
        false
      rescue Net::HTTPBadResponse, Net::HTTP::Persistent::Error, Errno::ECONNRESET => e
        log_info "net exception " + e.to_s
        false
      rescue Timeout::Error => e
        log_info "timeout: " + e.to_s
        false
      rescue => e
        log_info "unknown #{e.to_s}"
        false
      end
    end

    private

    # Fills the search form of +page+ with +query+ (every other field is
    # blanked) and submits it. Raises when the page has no '/search' form.
    def submit_search(page, query)
      form = page.form_with(:action => '/search')
      raise 'search form not found' unless form

      form.fields.each do |field|
        form[field.name] = field.name == 'q' ? query : ''
      end
      form.submit
    end

    # Logs through the agent's logger when both the agent and a logger exist,
    # so that error handling never raises on its own.
    def log_info(message)
      logger = @agent && @agent.log
      logger.info(message) if logger
    end

    # Builds @agent with a random user agent and short timeouts.
    #
    # addr - proxy host, or nil/"" for a direct connection.
    # port - proxy port (String or Integer); required when addr is given.
    #
    # Raises ArgumentError when a proxy host is given without a valid port,
    # rather than silently connecting without the requested proxy.
    def make_agent(addr = nil, port = nil)
      use_proxy = !addr.to_s.empty?
      if use_proxy && port.to_s.empty?
        raise ArgumentError, "proxy must be given as host:port, got #{addr.inspect}"
      end
      proxy_port = Integer(port.to_s, 10) if use_proxy

      @agent = Mechanize.new do |a|
        # Send one of the strings above directly instead of relying on
        # Mechanize knowing an alias with the same name.
        a.user_agent = USER_AGENTS.values.sample
        a.max_history = 1
        a.open_timeout = 15
        a.read_timeout = 4
        a.keep_alive = true
      end
      @agent.set_proxy(addr, proxy_port) if use_proxy
      @agent
    end
  end
