ruby google get serps


SUBMITTED BY: Guest

DATE: Sept. 28, 2014, 8:24 p.m.

FORMAT: Text only

SIZE: 4.6 kB

HITS: 870

  1. #!/usr/bin/env ruby
  2. # encoding: UTF-8
  3. require 'rubygems'
  4. require 'time'
  5. require 'date'
  6. require 'hpricot'
  7. require 'htmlentities'
  8. require 'rexml/document'
  9. require 'oauth'
  10. require 'mime/types'
  11. require 'socket'
  12. require 'cgi'
  13. require 'httpclient'
  14. require 'net/ftp'
  15. require 'net/smtp'
  16. require 'digest/sha2.rb'
  17. require 'net/http'
  18. require 'uri'
  19. require 'fileutils'
  20. require 'mechanize'
  21. require 'nokogiri'
  22. require 'open-uri'
  # Fetches Google result pages with Mechanize, optionally through an HTTP
  # proxy, and collects the URLs found in the result headings.
  class Seeker
    # User-Agent header values; one is picked at random for every new agent.
    USER_AGENTS = {
      'Windows IE 6' => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
      'Windows IE 7' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
      'Windows Mozilla' => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
      'Mac Safari' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; de-at) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10',
      'Mac FireFox' => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
      'Mac Mozilla' => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
      'Linux Mozilla' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
      'Linux Firefox' => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.1) Gecko/20100122 firefox/3.6.1',
      'Linux Konqueror' => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
      'iPhone' => 'Mozilla/5.0 (iPhone; U; CPU like Mac OS X; en) AppleWebKit/420+ (KHTML, like Gecko) Version/3.0 Mobile/1C28 Safari/419.3'
    }.freeze

    # Picks the pause, in seconds, inserted between two result-page requests.
    #
    # @return [Integer] a random value between 35 and 50 inclusive
    def aleat
      rand(35..50)
    end

    # Extracts the URLs contained in the first ten result headings of a page.
    #
    # @param query [#parser] a fetched page whose +parser+ answers +xpath+
    # @return [Array<Array<String>>] one array of URLs per result heading
    def parse_web(query)
      query.parser.xpath('//h3[@class="r"]').take(10).map do |heading|
        # Accept https as well as http so that secure result links are kept.
        URI.extract(heading.to_s, %w[http https])
      end
    end

    # Submits +query+ through the search form of www.google.<domain> and walks
    # the numbered pagination links of the first result page.
    #
    # @param query [String] search terms, placed verbatim in the "q" field
    # @param max [Integer] number of additional result pages to fetch
    # @param domain [String] suffix appended to "www.google." (e.g. "com")
    # @param proxy [String, nil] "host:port" of the HTTP proxy; nil or an empty
    #   string means a direct connection
    # @return [Array, false] one +parse_web+ result per fetched page, or
    #   +false+ when the search form is missing or a request fails
    def run(query, max, domain, proxy)
      puts " #{query} #{max} #{domain} #{proxy}"
      page_no = 2
      limit = page_no + max

      addr, port = proxy.to_s.split(':', 2)
      make_agent(addr, port)

      home = @agent.get("http://www.google.#{domain}")
      form = home.form_with(:action => '/search')
      unless form
        log_failure 'search form not found'
        return false
      end

      # Only the query field is sent with a value; every other field is blanked.
      form.fields.each do |field|
        form[field.name] = field.name == 'q' ? query : ''
      end

      first_page = form.submit
      result = [parse_web(first_page)]

      # The page limit is part of the loop condition so that the loop ends
      # even when further pagination links are still present.
      while page_no < limit
        next_link = first_page.links.find { |link| link.text == page_no.to_s }
        break unless next_link

        sleep aleat
        result << parse_web(@agent.get(next_link.href))
        page_no += 1
      end

      result
    rescue Mechanize::ResponseCodeError => e
      log_failure e.message
      false
    rescue Net::HTTPBadResponse, Net::HTTP::Persistent::Error, Errno::ECONNRESET => e
      log_failure "net exception #{e}"
      false
    rescue Timeout::Error => e
      log_failure "timeout: #{e}"
      false
    rescue => e
      log_failure "unknown #{e}"
      false
    end

    private

    # Builds the Mechanize agent used by +run+ and stores it in @agent.
    #
    # @param addr [String, nil] proxy host; nil or empty disables the proxy
    # @param port [String, Integer, nil] proxy port, parsed as a decimal number
    # @return [Mechanize] the configured agent
    # @raise [ArgumentError] when +port+ is given but is not a decimal number
    def make_agent(addr = nil, port = nil)
      @agent = Mechanize.new do |a|
        # Send one of the strings listed above rather than relying on the
        # alias table shipped with the installed Mechanize version.
        a.user_agent = USER_AGENTS.values.sample
        a.max_history = 1
        a.open_timeout = 15
        a.read_timeout = 4
        a.keep_alive = true
      end
      if addr && !addr.empty?
        @agent.set_proxy(addr, port && Integer(port.to_s, 10))
      end
      @agent
    end

    # Writes +message+ to the agent's logger when both an agent and a logger
    # exist, so that failure handling never raises on its own.
    def log_failure(message)
      logger = @agent && @agent.log
      logger.info(message) if logger
    end
  end

comments powered by Disqus