ruby google get serps


SUBMITTED BY: Guest

DATE: Sept. 28, 2014, 8:24 p.m.

FORMAT: Text only

SIZE: 4.6 kB

HITS: 890

  1. #!/usr/bin/env ruby
  2. # encoding: UTF-8
  3. require 'rubygems'
  4. require 'time'
  5. require 'date'
  6. require 'hpricot'
  7. require 'htmlentities'
  8. require 'rexml/document'
  9. require 'oauth'
  10. require 'mime/types'
  11. require 'socket'
  12. require 'cgi'
  13. require 'httpclient'
  14. require 'net/ftp'
  15. require 'net/smtp'
  16. require 'digest/sha2.rb'
  17. require 'net/http'
  18. require 'uri'
  19. require 'fileutils'
  20. require 'mechanize'
  21. require 'nokogiri'
  22. require 'open-uri'
  # Fetches Google result pages through a Mechanize agent (optionally behind an
  # HTTP proxy) and collects the http URLs found in the result headings.
  class Seeker
    # User-Agent header values; one is chosen at random for every new agent.
    # The keys are human-readable labels only.
    USER_AGENTS = {
      'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
      'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
      'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
      'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; de-at) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
      'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
      'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
      'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
      'Linux Firefox' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.1) Gecko/20100122 firefox/3.6.1',
      'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
      'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3'
    }.freeze

    # XPath matching the result headings that parse_web inspects.
    RESULT_HEADING_XPATH = '//h3[@class="r"]'.freeze

    # At most this many headings are read from a single page.
    HEADINGS_PER_PAGE = 10

    # Label of the first pagination link followed after the initial results page.
    FIRST_FOLLOW_UP_PAGE = 2

    # Random pause, in seconds, slept before each follow-up page request.
    #
    # @return [Integer] a value between 35 and 50, inclusive
    def aleat
      rand(35..50)
    end

    # Extracts the http URLs contained in the result headings of a page.
    #
    # @param query [#parser] a fetched page whose +parser+ responds to +xpath+
    # @return [Array<Array<String>>] one array of URLs per heading, for at most
    #   HEADINGS_PER_PAGE headings
    def parse_web(query)
      query.parser.xpath(RESULT_HEADING_XPATH).take(HEADINGS_PER_PAGE).map do |heading|
        URI.extract(heading.to_s, ['http'])
      end
    end

    # Runs a search and collects the URLs of the first results page plus up to
    # +max+ follow-up pages.
    #
    # @param query [String] search terms typed into the +q+ form field
    # @param max [Integer] maximum number of follow-up pages to fetch
    # @param domain [String] suffix appended to "http://www.google."
    # @param proxy [String, nil] "host:port" of an HTTP proxy; nil or empty
    #   means a direct connection
    # @return [Array<Array<Array<String>>>, false] parse_web output per page, or
    #   false when any step fails (the failure is logged if the agent has a logger)
    def run(query, max, domain, proxy)
      puts " #{query} #{max} #{domain} #{proxy}"
      addr, port = proxy.to_s.split(':')
      make_agent(addr, port)
      first_page = submit_search(@agent.get("http://www.google.#{domain}"), query)
      result = [parse_web(first_page)]

      page_no = FIRST_FOLLOW_UP_PAGE
      limit = page_no + max
      # Both conditions end the loop: the page budget is used up, or the first
      # results page offers no link labelled with the next page number.
      while page_no < limit
        link = first_page.links.find { |l| l.text == page_no.to_s }
        break unless link

        sleep aleat
        result << parse_web(@agent.get(link.href))
        page_no += 1
      end
      result
    rescue Mechanize::ResponseCodeError => e
      log_failure(e.message)
    rescue Net::HTTPBadResponse, Net::HTTP::Persistent::Error, Errno::ECONNRESET => e
      log_failure("net exception " + e.to_s)
    rescue Timeout::Error => e
      log_failure("timeout: " + e.to_s)
    rescue => e
      log_failure("unknown #{e.to_s}")
    end

    private

    # Builds a fresh Mechanize agent with a random User-Agent and stores it in
    # @agent.
    #
    # @param addr [String, nil] proxy host; nil or empty disables the proxy
    # @param port [String, Integer, nil] proxy port
    # @return [Mechanize] the configured agent
    # @raise [ArgumentError] if +port+ is given but is not numeric
    def make_agent(addr = nil, port = nil)
      @agent = Mechanize.new do |a|
        # Set the header string directly so the choice does not depend on the
        # alias table shipped with the installed Mechanize version.
        a.user_agent = USER_AGENTS.values.sample
        a.max_history = 1
        a.open_timeout = 15
        a.read_timeout = 4
        a.keep_alive = true
        a.set_proxy(addr, port && Integer(port)) if addr && !addr.empty?
      end
    end

    # Fills the search form found on +page+ and submits it. Every field other
    # than +q+ is blanked.
    #
    # @param page [#form_with] the search home page
    # @param query [String] search terms
    # @return [Object] the page returned by submitting the form
    # @raise [RuntimeError] if the page has no form posting to /search
    def submit_search(page, query)
      form = page.form_with(:action => '/search')
      raise 'search form not found' unless form

      form.fields.each do |field|
        form[field.name] = field.name == 'q' ? query : ''
      end
      form.submit
    end

    # Writes +message+ to the agent's logger when one is available.
    #
    # @param message [String]
    # @return [false] always, so rescue clauses can use it as their result
    def log_failure(message)
      logger = @agent && @agent.log
      logger.info(message) if logger
      false
    end
  end