ruby google get video serp


SUBMITTED BY: Guest

DATE: Sept. 28, 2014, 8:24 p.m.

FORMAT: Text only

SIZE: 1.7 kB

HITS: 866

  1. #!/usr/bin/env ruby
  2. # encoding: UTF-8
  3. require 'rubygems'
  4. require 'time'
  5. require 'date'
  6. require 'hpricot'
  7. require 'htmlentities'
  8. require 'rexml/document'
  9. require 'oauth'
  10. require 'mime/types'
  11. require 'socket'
  12. require 'cgi'
  13. require 'multipart_body'
  14. require 'httpclient'
  15. require 'net/ftp'
  16. require 'net/smtp'
  17. require 'digest/sha2.rb'
  18. require 'net/http'
  19. require 'uri'
  20. require 'fileutils'
  21. require 'mechanize'
  22. require 'nokogiri'
  23. require 'open-uri'
  # Scrapes Google (google.fr) web search result pages through Mechanize and
  # collects the URLs found inside each result heading (<h3 class="r">).
  #
  # The methods are meant to be mixed into an object. +initialize+ doubles as
  # the entry point of the scraper and returns the collected results.
  module Get_google_serp
    # Random pause between two page requests.
    #
    # The value is expressed in minutes (0.15..0.75, i.e. about 9 to 45
    # seconds); +initialize+ converts it to seconds before sleeping.
    # NOTE(review): the original comment announced "between 15 and 75 sec",
    # which does not match the range in the code - confirm the intended delay.
    #
    # @return [Float] a random number in 0.15..0.75
    def aleat
      rand(0.15..0.75)
    end

    # Extracts the URLs of the first ten result headings of a page.
    #
    # Each URL list is also printed to standard output as it is found.
    #
    # @param query [#parser] a fetched page whose +parser+ answers +xpath+
    #   (a Mechanize page in normal use)
    # @return [Array<Array<String>>] one array of URLs per result heading
    def parse_web(query)
      query.parser.xpath('//h3[@class="r"]').take(10).map do |cite|
        # Extract once and reuse; accept https as well as http, otherwise
        # secure result links are silently dropped.
        urls = URI.extract(cite.to_s, %w[http https])
        puts urls
        urls
      end
    end

    # Runs the search and walks result pages 1 to 10.
    #
    # @param query [String] the search terms
    # @return [Array<Array<Array<String>>>] the +parse_web+ output of each
    #   visited page, in page order
    def initialize(query)
      @agent = Mechanize.new
      # encode_www_form_component replaces URI.escape (removed in Ruby 3.0)
      # and also encodes "&" and "=", which would otherwise break the query.
      page = @agent.get("http://www.google.fr/search?q=#{URI.encode_www_form_component(query)}")
      puts 'page 1'
      result = [parse_web(page)]

      (2..10).each do |page_number|
        # Pagination links are looked up on the first result page.
        next_link = page.links.find { |link| link.text == page_number.to_s }
        # Fewer result pages than expected: stop instead of crashing on nil.
        break unless next_link

        # Throttle requests; aleat is in minutes, Kernel#sleep takes seconds.
        sleep(aleat * 60)
        puts "page #{page_number}"
        result << parse_web(@agent.get(next_link.href))
      end

      result
    end
  end
  # Runs a Google search for +query+ and prints every collected URL.
  #
  # @param query [String] the search terms; must not be nil or blank
  # @raise [ArgumentError] when no search terms are given
  # @return [nil]
  def main(query)
    raise ArgumentError, 'a search query is required' if query.to_s.strip.empty?

    # Mix the module into a throwaway object rather than calling `include` at
    # top level: that would add Get_google_serp#initialize to Object and
    # replace the constructor of every class that does not define its own.
    scraper = Object.new.extend(Get_google_serp)
    # initialize is always private, hence the explicit send.
    puts scraper.send(:initialize, query)
  end
  # Script entry point: the first command-line argument is the search query.
  main(ARGV.first)

comments powered by Disqus