ruby yellow page


SUBMITTED BY: Guest

DATE: Sept. 29, 2014, 5:40 p.m.

FORMAT: Text only

SIZE: 4.2 kB

HITS: 1968

  1. #!/usr/bin/env ruby
  2. # encoding: UTF-8
  3. require 'selenium/webdriver'
  4. require 'watir-webdriver'
  5. require 'nokogiri'
  # Scraper for a yellow-pages style directory site. A Watir-driven browser
  # performs the search and Nokogiri parses each rendered result page.
  module YellowPage
    # Runs one search (keywords + location) and writes every result card found
    # on every result page to an output stream as ';'-terminated fields.
    class Reader
      # Entry page of the directory. The scheme and host are obfuscated in this
      # listing and must be replaced with the real address before running.
      START_URL = "hxxp://xxx.zzzzzzzzzz.fr/".freeze

      # Separator used when a single field holds several values (the list of
      # external links of a result card).
      MULTI_VALUE_SEPARATOR = " ".freeze

      # Create a reader and start the browser it drives.
      #
      # quoiqui - search keywords ("what / who").
      # ou      - location to search in ("where").
      def initialize(quoiqui, ou)
        @quoiqui = quoiqui
        @ou = ou
        make_agent
      end

      # Release the browser. Call once the reader is no longer needed.
      def finalize
        kill_agent
      end

      # Open the start page, fill in the search form and submit it. When the
      # site shows an "allActivities" checkbox afterwards, tick it and confirm
      # the selection.
      def process
        @b.goto START_URL
        @b.text_field(:id, "quoiqui").set(@quoiqui)
        # The location may be given as an Integer; send it as text.
        @b.text_field(:id, "ou").set(@ou.to_s)
        sleep(5)
        @b.button(:id, "btnValidSearch").click
        sleep(10)

        all_activities = @b.checkbox(:id, "allActivities")
        return unless all_activities.exist?

        all_activities.set
        sleep(10)
        @b.button(:text, "Valider la sélection").when_present.click
      end

      # Crawl the given external links looking for e-mail addresses.
      #
      # urls - list of URLs to crawl.
      #
      # Prints and returns the list that the finders filled in.
      #
      # NOTE(review): FindMail is neither defined nor required in this file; it
      # has to be loaded by the caller. Finder is presumably declared inside
      # FindMail - confirm. The previous `include FindMail` inside this method
      # could not work: `include` is a Module method and is not callable from an
      # instance method.
      def find_more_infos(urls)
        mail_ls = []
        urls.each do |url|
          FindMail::Finder.new(url).curse_dimension(url, 0, mail_ls)
        end
        puts mail_ls
        mail_ls
      end

      # Collect the text of every anchor in +bloc+ whose title attribute is
      # "Lien externe" (external link), without duplicates.
      #
      # bloc - node responding to #css (a Nokogiri node in normal use).
      #
      # Returns an Array of Strings.
      def check_external(bloc)
        bloc.css("a")
            .select { |link| link['title'] == "Lien externe" }
            .map { |link| link.text }
            .uniq
      end

      # Parse the page currently shown by the browser and extract one Hash per
      # result card.
      #
      # Returns an Array of Hashes with the keys 'title', 'address', 'address2',
      # 'categorie', 'telephone' (Strings) and 'site_internet' (Array of
      # Strings).
      def get_infos_pages
        # Fixed pause before reading the page source.
        sleep(30)
        doc = Nokogiri::HTML.parse(@b.html)
        # Turn line breaks into spaces so multi-line text stays on one line.
        doc.css('br').each { |br| br.replace(" ") }

        doc.search('//li[@class="visitCard withVisual "]').map do |row|
          {
            'title'         => row.xpath('div[1]/h2/a/span[1]').text,
            'address'       => row.xpath('div[1]/div[3]/div[1]/p').text,
            'address2'      => row.xpath('div[1]/div[2]/div[1]/p').text,
            'categorie'     => row.xpath('div[1]/div[1]/ul/li').text,
            'telephone'     => row.xpath('div[1]/div[2]/div[2]/ul/li[1]/strong').text,
            'site_internet' => check_external(row)
            # E-mail lookup is disabled:
            # 'mail' => find_more_infos(check_external(row))
          }
        end
      end

      # Write one line per result: every non-empty field followed by ';'.
      #
      # file  - object responding to #write.
      # infos - Array of Hashes as returned by #get_infos_pages.
      #
      # Array values (the external links) are joined with
      # MULTI_VALUE_SEPARATOR. The values in +infos+ are never modified.
      def write_infos(file, infos)
        infos.each do |bloc|
          bloc.each_value do |value|
            field = Array(value).join(MULTI_VALUE_SEPARATOR)
            next if field.empty?

            file.write "#{field};"
          end
          file.write "\n"
        end
      end

      # Extract and write the current result page, then keep following the
      # "Page suivante" (next page) link until it no longer shows up.
      #
      # file - object responding to #write.
      #
      # Returns nil. Any other StandardError stops the crawl and is reported on
      # stderr rather than raised, so results written so far are kept. Signals
      # such as Interrupt are not intercepted.
      def scrape_all(file)
        write_infos(file, get_infos_pages)
        loop do
          @b.link(:text => "Page suivante").when_present.click
          write_infos(file, get_infos_pages)
        end
      rescue Watir::Wait::TimeoutError
        # The next-page link never became present: the last page was reached.
        nil
      rescue StandardError => e
        warn "scrape_all stopped: #{e.class}: #{e.message}"
        nil
      end

      private

      # Start the Firefox browser driven by this reader.
      def make_agent
        @b = Watir::Browser.new :firefox
      end

      # Close the browser if one was started.
      def kill_agent
        @b.close if @b
      end
    end
  end
  114. ########################
  115. # usage : ruby exec.rb plumber
  116. ########################
  # Command-line entry point: run one scrape per area code for the given
  # keywords, appending the results to "scrape_<keywords>_<area>.txt" in the
  # current directory.
  #
  # quoiqui    - search keywords; must be a non-blank string.
  # area_codes - enumerable of area codes to search in. Defaults to the single
  #              code 75.
  #
  # Raises ArgumentError when no keywords are given, before any browser is
  # started. Errors raised while scraping one area are printed and the next
  # area is processed; the browser is always closed afterwards.
  def main(quoiqui, area_codes = 75..75)
    if quoiqui.nil? || quoiqui.to_s.strip.empty?
      raise ArgumentError, "missing search keywords (usage: ruby exec.rb plumber)"
    end

    area_codes.each do |area_code|
      # Built by interpolation so the caller's string is never mutated.
      output_path = "scrape_#{quoiqui}_#{area_code}.txt"
      reader = YellowPage::Reader.new(quoiqui, area_code)
      begin
        # Block form closes the file even when scraping fails.
        File.open(output_path, 'a') do |fic_out|
          reader.process
          reader.scrape_all(fic_out)
        end
      rescue StandardError => e
        puts e.message
      ensure
        reader.finalize
      end
    end
  end
  # Start scraping only when this file is executed directly, so that loading
  # it from another script does not launch a browser.
  main(ARGV[0]) if __FILE__ == $PROGRAM_NAME

comments powered by Disqus