ruby yellow page

SUBMITTED BY: Guest
DATE: Sept. 29, 2014, 5:40 p.m.
FORMAT: Text only
SIZE: 4.2 kB
Raw Download
HITS: 1968
Report
#!/usr/bin/env ruby
# encoding: UTF-8

require 'selenium/webdriver'
require 'watir-webdriver'
require 'nokogiri'

# Browser-driven scraper: runs a search on the directory site through Watir
# and parses the result pages with Nokogiri.
module YellowPage
  # Performs one search (keywords + location) and extracts the result cards
  # from each result page. Opens a browser on construction; call #finalize
  # to close it.
  class Reader
    START_URL = "hxxp://xxx.zzzzzzzzzz.fr/"

    # Create a reader and open the browser.
    #
    # quoiqui - value typed into the "quoiqui" search field.
    # ou      - value typed into the "ou" search field.
    def initialize(quoiqui, ou)
      @quoiqui = quoiqui
      @ou = ou
      make_agent
    end

    # Close the browser. This is a plain method that callers must invoke
    # themselves; Ruby does not call it automatically.
    def finalize
      kill_agent
    end

    # Open the start page, fill in both search fields and submit the search.
    # When the page then offers an "allActivities" checkbox, tick it and
    # confirm the selection.
    def process
      @b.goto START_URL
      @b.text_field(:id, "quoiqui").set(@quoiqui)
      @b.text_field(:id, "ou").set(@ou)
      sleep(5)
      @b.button(:id, "btnValidSearch").click
      sleep(10)

      all_activities = @b.checkbox(:id, "allActivities")
      return unless all_activities.exist?

      all_activities.set
      sleep(10)
      @b.button(:text, "Valider la sélection").when_present.click
    end

    # Crawl external links in order to find e-mail addresses.
    #
    # urls - list of URLs, each handed to a FindMail::Finder.
    #
    # Prints and returns the list that was passed to every finder.
    #
    # NOTE(review): FindMail is neither defined nor required in this file; it
    # must be loaded by the caller. Presumably Finder#curse_dimension appends
    # the addresses it finds to the list it receives - confirm.
    def find_more_infos(urls)
      mail_ls = []
      urls.each do |url|
        # Refer to the class through its namespace: `include` is a Module
        # method and cannot be called from inside an instance method.
        mech = FindMail::Finder.new(url)
        mech.curse_dimension(url, 0, mail_ls)
      end
      puts mail_ls
      mail_ls
    end

    # Collect the external links of one result block.
    #
    # bloc - Nokogiri node to search.
    #
    # Returns the de-duplicated texts of every <a> whose title attribute is
    # exactly "Lien externe".
    # NOTE(review): this gathers the link text, not the href attribute -
    # confirm that the text is what should be exported.
    def check_external(bloc)
      bloc.css("a")
          .select { |link| link['title'] == "Lien externe" }
          .map(&:text)
          .uniq
    end

    # Parse the page currently shown by the browser.
    #
    # Returns an Array with one Hash per result card. All values are Strings
    # except 'site_internet', which is an Array of Strings.
    def get_infos_pages
      sleep(30)
      doc = Nokogiri::HTML.parse(@b.html)
      # Turn line breaks into spaces so multi-line fields stay on one line.
      doc.css('br').each { |br| br.replace(" ") }

      doc.search('//li[@class="visitCard withVisual "]').map do |row|
        {
          'title' => row.xpath('div[1]/h2/a/span[1]').text,
          'address' => row.xpath('div[1]/div[3]/div[1]/p').text,
          'address2' => row.xpath('div[1]/div[2]/div[1]/p').text,
          'categorie' => row.xpath('div[1]/div[1]/ul/li').text,
          'telephone' => row.xpath('div[1]/div[2]/div[2]/ul/li[1]/strong').text,
          'site_internet' => check_external(row)
          # Disabled: 'mail' => find_more_infos(check_external(row))
        }
      end
    end

    # Write one line per result to file.
    #
    # file  - object responding to #write.
    # infos - Array of Hashes as returned by #get_infos_pages.
    #
    # Every non-empty value is written followed by ";". Values that are lists
    # (such as 'site_internet') are joined with "," into a single field. The
    # hashes in infos are left unmodified.
    def write_infos(file, infos)
      infos.each do |bloc|
        bloc.each_value do |value|
          field = Array(value).join(",")
          file.write "#{field};" unless field.empty?
        end
        file.write "\n"
      end
    end

    # Export the current result page, then keep following the
    # "Page suivante" link and exporting until the link no longer shows up.
    #
    # file - object responding to #write; receives the formatted results.
    def scrape_all(file)
      loop do
        write_infos(file, get_infos_pages)
        break unless @b.link(:text => "Page suivante").when_present.click
      end
    rescue Watir::Wait::TimeoutError
      # The "next page" link never became present: the last page was reached.
      # Any other error is left to the caller instead of being swallowed.
      nil
    end

    private

    # Start the Firefox browser used for the whole session.
    def make_agent
      @b = Watir::Browser.new :firefox
    end

    # Close the browser, if one was started.
    def kill_agent
      @b.close if @b
    end
  end
end

########################
# usage : ruby exec.rb plumber
########################

  # Run the scraper for one search term and append the results to
  # "scrape_<quoiqui>_<area>.txt" in the current directory, one file per area.
  #
  # quoiqui - search keywords (String); required.
  # areas   - area codes to search; each one is passed to the reader as the
  #           location. Defaults to [75].
  #
  # Raises ArgumentError when quoiqui is nil or empty, before any browser is
  # started. Errors raised while scraping one area are printed and do not
  # stop the remaining areas.
  def main(quoiqui, areas = [75])
    raise ArgumentError, "usage: ruby exec.rb <keywords>" if quoiqui.nil? || quoiqui.empty?

    areas.each do |area|
      reader = YellowPage::Reader.new(quoiqui, area)
      begin
        # Block form closes the file even when scraping fails.
        File.open("scrape_#{quoiqui}_#{area}.txt", 'a') do |fic_out|
          reader.process
          reader.scrape_all(fic_out)
        end
      rescue StandardError => e
        puts e.message
      ensure
        # Always release the browser, including when the file cannot be opened.
        reader.finalize
      end
    end
  end


# Script entry point: the first command-line argument is the search keyword.
main(ARGV.first)