#!/usr/bin/env ruby
# encoding: UTF-8
require 'selenium/webdriver'
require 'watir-webdriver'
require 'nokogiri'
module YellowPage
  # Drives a browser session (Watir + Firefox) against the directory site at
  # START_URL: fills in the search form, walks the result pages and writes the
  # extracted result cards to an output file.
  class Reader
    START_URL = "hxxp://xxx.zzzzzzzzzz.fr/"

    # Create a reader and open the browser.
    #
    # quoiqui:: keywords typed into the "quoiqui" search field.
    # ou::      location typed into the "ou" search field.
    def initialize(quoiqui, ou)
      @quoiqui = quoiqui
      @ou = ou
      make_agent
    end

    # Close the browser. This is an ordinary method: Ruby never calls it
    # automatically, so callers must invoke it themselves (typically from an
    # +ensure+ clause).
    def finalize
      kill_agent
    end

    # Open the start page, submit the search form and, when the site shows the
    # "allActivities" checkbox, tick it and validate the selection.
    def process
      @b.goto START_URL
      @b.text_field(:id, "quoiqui").set(@quoiqui)
      # The location may be given as an Integer (e.g. an area code); the text
      # field is always fed a String.
      @b.text_field(:id, "ou").set(@ou.to_s)
      sleep(5)
      @b.button(:id, "btnValidSearch").click
      sleep(10)

      all_activities = @b.checkbox(:id, "allActivities")
      return unless all_activities.exist?

      all_activities.set
      sleep(10)
      @b.button(:text, "Valider la sélection").when_present.click
    end

    # Crawl the given external links looking for e-mail addresses.
    #
    # urls:: list of URLs, each handed to a FindMail::Finder.
    #
    # Prints and returns the list that was passed to every
    # Finder#curse_dimension call (presumably filled in by that call --
    # verify against FindMail).
    #
    # NOTE(review): this used to call +include FindMail+ from inside the
    # instance method, which raises NoMethodError (+include+ is a Module
    # method). The constant is now referenced explicitly; this assumes Finder
    # is defined inside FindMail and that the file defining it has been
    # required -- confirm.
    def find_more_infos(urls)
      mail_ls = []
      urls.each do |url|
        FindMail::Finder.new(url).curse_dimension(url, 0, mail_ls)
      end
      puts mail_ls
      mail_ls
    end

    # Extract the external links of a result card.
    #
    # bloc:: node responding to +css+ (a Nokogiri node in get_infos_pages).
    #
    # Returns the de-duplicated *text* of every <a> whose title attribute is
    # exactly "Lien externe".
    def check_external(bloc)
      bloc.css("a")
          .select { |link| link['title'] == "Lien externe" }
          .map(&:text)
          .uniq
    end

    # Parse the page currently displayed by the browser.
    #
    # Returns one Hash per result card with the keys 'title', 'address',
    # 'address2', 'categorie', 'telephone' (Strings) and 'site_internet'
    # (Array of Strings).
    def get_infos_pages
      sleep(30) # fixed pause before reading the page source
      doc = Nokogiri::HTML.parse(@b.html)
      # Turn line breaks into spaces so multi-line fields stay on one line.
      doc.css('br').each { |br| br.replace(" ") }
      # The class attribute is matched literally, trailing space included.
      doc.search('//li[@class="visitCard withVisual "]').map do |row|
        {
          'title' => row.xpath('div[1]/h2/a/span[1]').text,
          'address' => row.xpath('div[1]/div[3]/div[1]/p').text,
          'address2' => row.xpath('div[1]/div[2]/div[1]/p').text,
          'categorie' => row.xpath('div[1]/div[1]/ul/li').text,
          'telephone' => row.xpath('div[1]/div[2]/div[2]/ul/li[1]/strong').text,
          'site_internet' => check_external(row)
          # e-mail lookup is disabled:
          # 'mail' => find_more_infos(check_external(row))
        }
      end
    end

    # Write the extracted records to +file+, one line per record.
    #
    # file::  IO-like object responding to +write+.
    # infos:: Array of Hashes as returned by get_infos_pages.
    #
    # Each non-empty value is written followed by ";". Array values (the
    # external links) are joined with ","; previously ";" was pushed into the
    # array and its +inspect+ form was written. Empty values are skipped
    # entirely, so columns are not positionally aligned between lines. The
    # records themselves are left untouched.
    def write_infos(file, infos)
      infos.each do |bloc|
        bloc.each_value do |value|
          field = Array(value).join(",")
          file.write("#{field};") unless field.empty?
        end
        file.write("\n")
      end
    end

    # Scrape the current result page, then keep following the
    # "Page suivante" link and scraping until that link stops showing up.
    #
    # file:: IO-like object the records are appended to.
    #
    # The end of the pagination is detected through the timeout raised by
    # +when_present+. Any other StandardError also stops the crawl (best
    # effort, as before) but is now reported on stderr instead of being
    # dropped. Non-StandardError exceptions such as Interrupt are no longer
    # swallowed.
    def scrape_all(file)
      write_infos(file, get_infos_pages)
      # The return value of +click+ is deliberately ignored: only the timeout
      # below ends the loop.
      loop do
        @b.link(:text => "Page suivante").when_present.click
        write_infos(file, get_infos_pages)
      end
    rescue Watir::Wait::TimeoutError
      # No "Page suivante" link appeared in time: last page reached.
    rescue StandardError => e
      warn "scrape_all stopped: #{e.class}: #{e.message}"
    end

    private

    # Start the Firefox browser used by every other method.
    def make_agent
      @b = Watir::Browser.new :firefox
    end

    # Close the browser started by make_agent.
    def kill_agent
      @b.close
    end
  end
end
########################
# usage : ruby exec.rb plumber
########################
# Scrape the results for +quoiqui+ in every area of +areas+, appending them to
# one "scrape_<quoiqui>_<area>.txt" file per area in the current directory.
#
# quoiqui:: search keywords.
# areas::   enumerable of area codes passed as the location; defaults to
#           the single code 75, as before.
#
# A StandardError raised while opening the output file, searching or scraping
# is printed and the next area is processed. The browser of each Reader is
# always closed, and Interrupt / SystemExit are no longer trapped.
def main(quoiqui, areas = 75..75)
  areas.each do |area|
    # The constant is referenced explicitly rather than mixing YellowPage
    # into Object with +include+.
    reader = YellowPage::Reader.new(quoiqui, area)
    begin
      # Block form closes the file even when scraping fails.
      File.open("scrape_#{quoiqui}_#{area}.txt", 'a') do |out|
        reader.process
        reader.scrape_all(out)
      end
    rescue StandardError => e
      puts e.message
    ensure
      reader.finalize
    end
  end
end
# Script entry point. Refuse to start without keywords: otherwise a browser
# would be launched before the run fails while building the output file name.
abort "usage: ruby #{File.basename($PROGRAM_NAME)} <keywords>" if ARGV[0].nil? || ARGV[0].empty?
main(ARGV[0])