#!/usr/bin/env ruby
# encoding: UTF-8

require 'selenium/webdriver'
require 'watir-webdriver'
require 'nokogiri'

module YellowPage
  # Drives a Firefox browser (through Watir) over a directory web site:
  # submits a "what/who" + "where" search, walks every result page and
  # extracts one record per result card with Nokogiri.
  class Reader
    START_URL = "hxxp://xxx.zzzzzzzzzz.fr/".freeze

    # Creates a reader and immediately launches the browser.
    #
    # quoiqui:: keywords typed into the "quoiqui" (what/who) search field.
    # ou::      location typed into the "ou" (where) search field.
    def initialize(quoiqui, ou)
      @quoiqui = quoiqui
      @ou = ou
      make_agent
    end

    # Closes the browser opened by #initialize. Call it once scraping is over.
    def finalize
      kill_agent
    end

    # Opens the start page, fills both search fields and submits the form.
    # When the "allActivities" checkbox shows up after the search, it is ticked
    # and the selection is confirmed.
    #
    # The fixed sleeps give the pages time to load between interactions.
    def process
      @b.goto START_URL
      @b.text_field(:id, "quoiqui").set(@quoiqui)
      # to_s: callers may pass a numeric location (main passes an Integer).
      @b.text_field(:id, "ou").set(@ou.to_s)
      sleep(5)
      @b.button(:id, "btnValidSearch").click
      sleep(10)

      all_activities = @b.checkbox(:id, "allActivities")
      return unless all_activities.exist?

      all_activities.set
      sleep(10)
      @b.button(:text, "Valider la sélection").when_present.click
    end

    # Crawls the given external links looking for e-mail addresses.
    #
    # urls:: list of URLs to crawl.
    #
    # Prints the collected list and returns it.
    #
    # NOTE(review): FindMail / Finder are not defined in this file. The
    # original called `include FindMail` from inside this instance method,
    # which raises NoMethodError (include is a Module method, not an instance
    # method), so Finder is assumed to live at FindMail::Finder and
    # curse_dimension is presumed to append what it finds to the array it is
    # given -- confirm against the FindMail source.
    def find_more_infos(urls)
      mail_ls = []
      urls.each do |url|
        FindMail::Finder.new(url).curse_dimension(url, 0, mail_ls)
      end
      puts mail_ls
      mail_ls
    end

    # Returns the de-duplicated link texts of every <a> in +bloc+ whose title
    # attribute is exactly "Lien externe".
    #
    # bloc:: a Nokogiri node (one result card).
    def check_external(bloc)
      external_links = bloc.css("a").select { |link| link['title'] == "Lien externe" }
      external_links.map(&:text).uniq
    end

    # Parses the page currently displayed by the browser and returns an Array
    # with one Hash per result card. Every value is a String, except
    # 'site_internet' which is an Array of Strings (see #check_external).
    def get_infos_pages
      sleep(30) # let the result page finish loading before reading its HTML
      doc = Nokogiri::HTML.parse(@b.html)
      # Replace <br> by a space so multi-line fields stay readable as one line.
      doc.css('br').each { |br| br.replace(" ") }

      doc.search('//li[@class="visitCard withVisual "]').map do |row|
        {
          'title'         => row.xpath('div[1]/h2/a/span[1]').text,
          'address'       => row.xpath('div[1]/div[3]/div[1]/p').text,
          'address2'      => row.xpath('div[1]/div[2]/div[1]/p').text,
          'categorie'     => row.xpath('div[1]/div[1]/ul/li').text,
          'telephone'     => row.xpath('div[1]/div[2]/div[2]/ul/li[1]/strong').text,
          'site_internet' => check_external(row)
          # E-mail lookup is currently disabled:
          # 'mail' => find_more_infos(check_external(row))
        }
      end
    end

    # Writes one line per record to +file+; each non-empty field is followed
    # by ";". Empty fields are skipped, so lines may have differing numbers of
    # fields.
    #
    # file::  any object responding to #write.
    # infos:: Array of Hashes as returned by #get_infos_pages.
    #
    # Array values (e.g. 'site_internet') are joined with "," instead of being
    # written in their inspect form, and the records in +infos+ are left
    # unmodified.
    def write_infos(file, infos)
      infos.each do |bloc|
        fields = bloc.values.map { |value| Array(value).join(",") }
        line = fields.reject(&:empty?).map { |field| "#{field};" }.join
        file.write(line + "\n")
      end
    end

    # Extracts the current result page, then keeps following the
    # "Page suivante" link and extracting until that link no longer shows up.
    #
    # file:: output object handed to #write_infos.
    #
    # Best-effort, as before: a failure ends the scrape without raising, but
    # it is now reported on stderr instead of being silently discarded, and
    # only StandardError is intercepted so Ctrl-C / exit still work.
    def scrape_all(file)
      loop do
        write_infos(file, get_infos_pages)
        break unless next_page?
      end
    rescue StandardError => e
      warn "scrape_all stopped early: #{e.message}"
    end

    private

    # Launches the Firefox instance used for the whole session.
    def make_agent
      @b = Watir::Browser.new :firefox
    end

    # Shuts the browser down.
    def kill_agent
      @b.close
    end

    # Clicks "Page suivante". Returns true once clicked, false when the link
    # never became present within Watir's wait timeout (last page reached).
    def next_page?
      @b.link(:text => "Page suivante").when_present.click
      true
    rescue Watir::Wait::TimeoutError
      false
    end
  end
end

########################
# usage : ruby exec.rb plumber
########################

# Runs one scrape per area code and appends the results to
# "scrape_<keyword>_<area code>.txt" in the current directory.
#
# quoiqui::    search keyword (normally ARGV[0]).
# area_codes:: enumerable of French area codes to search; defaults to the
#              single code 75 the script has always used.
#
# Raises ArgumentError when no keyword is given, before any browser starts.
def main(quoiqui, area_codes = 75..75)
  if quoiqui.nil? || quoiqui.empty?
    raise ArgumentError, "usage: ruby #{File.basename($PROGRAM_NAME)} <keyword>"
  end

  area_codes.each do |area_code|
    reader = YellowPage::Reader.new(quoiqui, area_code)
    begin
      # Block form closes the file even when scraping fails part-way.
      File.open("scrape_#{quoiqui}_#{area_code}.txt", 'a') do |fic_out|
        reader.process
        reader.scrape_all(fic_out)
      end
    rescue StandardError => e
      puts e.message
    ensure
      # Always release the browser, including when the file cannot be opened.
      reader.finalize
    end
  end
end

main(ARGV[0])