#!/usr/bin/env ruby
# encoding: UTF-8
require 'net/http'
require 'uri'
require 'fileutils'
require 'logger'
require 'mechanize'
require 'nokogiri'
require './google_get_serp.rb'
require './word1_calculate_density.rb'
require './word2_calculate_density.rb'
require './word3_calculate_density.rb'
require './word4_calculate_density.rb'
require './search_proxy.rb'
require 'page_rankr'
require 'cgi'

module Page_analysis
  class Page_analyse

    def aleat
      #return rand(0.15..0.75)
      r = Random.new
      return r.rand(15..20)
    end

    # Only parses once
    def get_host_without_www2(url)
      url = "http://#{url}" unless url.start_with?('http')
      uri = URI.parse(url)
      host = uri.host.downcase
      host.start_with?('www.') ? host[4..-1] : host
    end

    # URL always gets parsed twice
    def get_host_without_www(url)
      url = "http://#{url}" if URI.parse(url).scheme.nil?
      host = URI.parse(url).host.downcase
      host.start_with?('www.') ? host[4..-1] : host
    end

    # Only parses twice if url doesn't start with a scheme
    def get_host_without_www(url)
      uri = URI.parse(url)
      uri = URI.parse("http://#{url}") if uri.scheme.nil?
      host = uri.host.downcase
      host.start_with?('www.') ? host[4..-1] : host
    end

    ### content
    def check_h1(body)
      #regex_h1 = '/<\s*h1[^>]*>(.*?)<\s*/\s*h1>/g'
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h1').map { |n| n.inner_text }
    end

    def check_h2(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h2').map { |n| n.inner_text }
    end

    def check_h3(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h3').map { |n| n.inner_text }
    end

    def check_h4(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h4').map { |n| n.inner_text }
    end

    def check_h5(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h5').map { |n| n.inner_text }
    end

    def check_h6(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h6').map { |n| n.inner_text }
    end

    def check_title(body) # ++ length between 10 and 70 chars
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('title').map { |n| n.inner_text }
    end

    def check_strong(body) #strong
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('strong').map { |n| n.inner_text }
    end

    def check_italic(body) #em
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('em').map { |n| n.inner_text }
    end

    ### meta
    def check_meta_description(page)
      puts page.at('meta[@name="description"]')[:content]
    end

    def check_meta_keywords(page)
      puts page.at('meta[@name="keywords"]')[:content]
    end

    def check_meta_content_type(page)
      puts page.at('meta[@http-equiv="Content-Type"]')[:content]
    end

    def check_meta_content(page) # encoding, indexable, ..
      puts page.at('meta[@name="Robots"]')[:content]
    end

    ### robots.txt & sitemap => retrieve the domain
    def check_spider(page)
      page_bot = page << "robots.txt"
      puts @agent.get page_bot
    end

    def check_sitemap(page)
      page_sitemap = page << "sitemap1.xml"
      puts @agent.get page_sitemap
    end

    ### links & imgs
    def check_links(page)
      page.links.each do |link|
        puts link
      end
    end

    def check_images(page)
      page.images.each do |img|
        puts img.alt
      end
    end

    ### duplicate content
    def check_duplicate_content(body)
      # incomplete stub: `proxy` is not defined in this scope; expects an "addr:port" string
      proxy_addr, proxy_port = proxy.split(':')
    end

    ### pagerank
    def check_pagerank(url)
      res = PageRankr.ranks(url, :google)
      puts res[:google]
    end

    ### backlink
    def check_link(body, url)
      # check that the page actually contains the text
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search(url).map { |n| n.inner_text }
    end

    def check_backlink(url, lang)
      myobj = Find_Proxy.new
      proxies = myobj.find_proxy(6)
      res = []
      exact_query = url
      proxies.each do |proxy|
        begin
          myobj = Find_Proxy.new
          proxy = myobj.find_proxy(1)
          puts proxy
          pp = Seeker.new
          tmp = pp.run(exact_query, 100, lang, proxy.first) # fr, com, de, nl, it, es
          raise if tmp == false
          res << tmp
          puts "done #{tmp.length}"
        rescue
          puts "retry"
          retry
        end
      end
      res = res.flatten
      puts res
      res = res.uniq
      return res.length
    end

    ### run main
    def run(url)
      begin
        myobj = Find_Proxy.new
        proxy = myobj.find_proxy(1)
        addr, port = proxy.first.split(':')
        make_agent(addr, port)
        #list_lang = ["com","fr","de","es","nl","it"]
        list_lang = ["fr"]
        #page = @agent.get url
        #check_h1(page.body)
        #check_title(page.body)
        #check_strong(page.body)
        #check_italic(page.body)
        #check_meta_description(page)
        #check_meta_keywords(page)
        #check_meta_content_type(page)
        #check_meta_content(page)
        #check_spider(page.uri.to_s)
        #check_sitemap(page.uri.to_s)
        #check_links(page)
        #check_images(page)
        #density_one(page.uri.to_s)
        #density_two(page.uri.to_s)
        #density_three(page.uri.to_s)
        #density_for(page.uri.to_s)
        #check_pagerank(page.uri.to_s)
        how = 0
        #@tab_vide = []
        list_lang.each do |lang|
          #check_backlink(page.uri.to_s)
          how = check_backlink(url, lang)
          puts "number #{how.to_s} for #{lang.to_s}"
        end
      rescue => e
        puts e
        #retry
      end
    end

    def make_agent(addr, port)
      @agent = Mechanize.new do |a|
        a.user_agent_alias = 'Mac Safari'
        a.max_history = 1
        a.open_timeout = 15
        a.read_timeout = 5
        a.keep_alive = false
        #a.log = Logger.new(STDOUT)
        a.log = Logger.new('log_analyse_page.txt')
        #a.log.level = Logger::INFO
        a.set_proxy(addr, port)
      end
    end
  end
end

def main(url)
  include Page_analysis
  tt = Page_analyse.new
  analyse_page = tt.run(url)
end

main ARGV[0]
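
# Example invocation, a sketch assuming this file is saved as page_analyse.rb
# (hypothetical name), that the local helpers required above (google_get_serp.rb,
# search_proxy.rb, word1..4_calculate_density.rb) define the Seeker and
# Find_Proxy classes used here, and that the mechanize, nokogiri and page_rankr
# gems are installed:
#
#   ruby page_analyse.rb http://example.com
#
# This calls Page_analyse#run on the URL, which searches for the exact URL
# through rotating proxies for each language in list_lang and prints the number
# of unique results found, e.g. "number 42 for fr".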