Ruby page content analysis: h1 to h6, title, description, links, image alt text, strong and italic, robots, and many others


SUBMITTED BY: Guest

DATE: Sept. 28, 2014, 8:18 p.m.

FORMAT: Text only

SIZE: 7.3 kB

HITS: 1074

#!/usr/bin/env ruby
# encoding: UTF-8
require 'net/http'
require 'uri'
require 'cgi'
require 'fileutils'
require 'logger'
require 'mechanize'
require 'nokogiri'
require 'page_rankr'
require './google_get_serp.rb'
require './word1_calculate_density.rb'
require './word2_calculate_density.rb'
require './word3_calculate_density.rb'
require './word4_calculate_density.rb'
require './search_proxy.rb'

module PageAnalysis
  class PageAnalyse
    # random pause length, presumably used to throttle requests
    def aleat
      #return rand(0.15..0.75)
      rand(15..20)
    end

    # Only parses once
    def get_host_without_www2(url)
      url = "http://#{url}" unless url.start_with?('http')
      uri = URI.parse(url)
      host = uri.host.downcase
      host.start_with?('www.') ? host[4..-1] : host
    end

    # URL always gets parsed twice
    def get_host_without_www1(url)
      url = "http://#{url}" if URI.parse(url).scheme.nil?
      host = URI.parse(url).host.downcase
      host.start_with?('www.') ? host[4..-1] : host
    end

    # Only parses twice if url doesn't start with a scheme
    def get_host_without_www(url)
      uri = URI.parse(url)
      uri = URI.parse("http://#{url}") if uri.scheme.nil?
      host = uri.host.downcase
      host.start_with?('www.') ? host[4..-1] : host
    end
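
    # For illustration, expected behaviour of the final variant above
    # (example.com is a hypothetical value):
    #   get_host_without_www("https://www.example.com/page") #=> "example.com"
    #   get_host_without_www("example.com")                  #=> "example.com"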

    ### content
    def check_h1(body)
      #regex_h1 = '/<\s*h1[^>]*>(.*?)<\s*/\s*h1>/g'
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h1').map { |n| n.inner_text }
    end

    def check_h2(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h2').map { |n| n.inner_text }
    end

    def check_h3(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h3').map { |n| n.inner_text }
    end

    def check_h4(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h4').map { |n| n.inner_text }
    end

    def check_h5(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h5').map { |n| n.inner_text }
    end

    def check_h6(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h6').map { |n| n.inner_text }
    end

    def check_title(body) # ++ length should be between 10 and 70 chars
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('title').map { |n| n.inner_text }
    end

    def check_strong(body) #strong
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('strong').map { |n| n.inner_text }
    end

    def check_italic(body) #em
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('em').map { |n| n.inner_text }
    end
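
    # For illustration, each check prints one matching element per line
    # (the HTML here is hypothetical):
    #   check_h1("<h1>Welcome</h1><h1>News</h1>")
    #   # Welcome
    #   # News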

    ### meta
    def check_meta_description(page)
      tag = page.at('meta[@name="description"]')
      puts tag[:content] if tag
    end

    def check_meta_keywords(page)
      tag = page.at('meta[@name="keywords"]')
      puts tag[:content] if tag
    end

    def check_meta_content_type(page)
      tag = page.at('meta[@http-equiv="Content-Type"]')
      puts tag[:content] if tag
    end

    # encoding, indexable, ...
    def check_meta_content(page)
      tag = page.at('meta[@name="Robots"]')
      puts tag[:content] if tag
    end

    ### robots.txt & sitemap => fetched from the domain root
    def check_spider(url)
      puts @agent.get(URI.join(url, '/robots.txt'))
    end

    def check_sitemap(url)
      puts @agent.get(URI.join(url, '/sitemap1.xml'))
    end

    ### links & imgs
    def check_links(page)
      page.links.each do |link|
        puts link
      end
    end

    def check_images(page)
      page.images.each do |img|
        puts img.alt
      end
    end
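
    # For illustration, against a fetched Mechanize page (hypothetical URL):
    #   page = @agent.get("http://example.com")
    #   check_links(page)   # prints each link found on the page
    #   check_images(page)  # prints each image's alt attribute (may be nil)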

    ### duplicate content (unfinished stub: `proxy` is never defined here)
    def check_duplicate_content(body)
      #proxy_addr, proxy_port = proxy.split(':')
    end

    ### pagerank
    def check_pagerank(url)
      res = PageRankr.ranks(url, :google)
      puts res[:google]
    end
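
    # For illustration (hypothetical URL; PageRankr's :google rank is the
    # 0-10 Google toolbar value):
    #   check_pagerank("http://example.com")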

    ### backlink
    # check that the page actually contains the text
    def check_link(body, url)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search(url).map { |n| n.inner_text }
    end

    def check_backlink(url, lang)
      myobj = Find_Proxy.new
      proxies = myobj.find_proxy(6)
      res = []
      exact_query = url
      proxies.each do |proxy|
        begin
          # a fresh proxy is fetched on each pass, replacing the loop variable
          myobj = Find_Proxy.new
          proxy = myobj.find_proxy(1)
          puts proxy
          pp = Seeker.new
          tmp = pp.run(exact_query, 100, lang, proxy.first) # fr, com, de, nl, it, es
          raise if tmp == false
          res << tmp
          puts "done #{tmp.length}"
        rescue
          puts "retry"
          retry
        end
      end
      res = res.flatten
      puts res
      res = res.uniq
      res.length
    end
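
    # For illustration (hypothetical URL), the return value is the number of
    # unique search results collected for the exact query:
    #   check_backlink("http://example.com", "fr") #=> e.g. 42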

    ### run main
    def run(url)
      myobj = Find_Proxy.new
      proxy = myobj.find_proxy(1)
      addr, port = proxy.first.split(':')
      make_agent(addr, port)
      #list_lang = ["com","fr","de","es","nl","it"]
      list_lang = ["fr"]
      #page = @agent.get url
      #check_h1(page.body)
      #check_title(page.body)
      #check_strong(page.body)
      #check_italic(page.body)
      #check_meta_description(page)
      #check_meta_keywords(page)
      #check_meta_content_type(page)
      #check_meta_content(page)
      #check_spider(page.uri.to_s)
      #check_sitemap(page.uri.to_s)
      #check_links(page)
      #check_images(page)
      #density_one(page.uri.to_s)
      #density_two(page.uri.to_s)
      #density_three(page.uri.to_s)
      #density_for(page.uri.to_s)
      #check_pagerank(page.uri.to_s)
      how = 0
      #@tab_vide = []
      list_lang.each do |lang|
        #check_backlink(page.uri.to_s)
        how = check_backlink(url, lang)
        puts "number #{how} for #{lang}"
      end
    rescue => e
      puts e
      #retry
    end

    def make_agent(addr, port)
      @agent = Mechanize.new do |a|
        a.user_agent_alias = 'Mac Safari'
        a.max_history = 1
        a.open_timeout = 15
        a.read_timeout = 5
        a.keep_alive = false
        #a.log = Logger.new(STDOUT)
        a.log = Logger.new('log_analyse_page.txt')
        #a.log.level = Logger::INFO
        a.set_proxy(addr, port)
      end
    end
  end
end

def main(url)
  analyser = PageAnalysis::PageAnalyse.new
  analyser.run(url)
end

main(ARGV[0])
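
# Usage, assuming this file is saved as page_analysis.rb (name hypothetical):
#   ruby page_analysis.rb http://example.com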