Ruby page content analysis: h1 to h6, title, description, links, image alt text, strong and italic, robots, and many others


SUBMITTED BY: Guest

DATE: Sept. 28, 2014, 8:18 p.m.

FORMAT: Text only

SIZE: 7.3 kB

HITS: 1074

#!/usr/bin/env ruby
# encoding: UTF-8
require 'net/http'
require 'uri'
require 'cgi'
require 'fileutils'
require 'logger'
require 'mechanize'
require 'nokogiri'
require 'page_rankr'
require './google_get_serp.rb'
require './word1_calculate_density.rb'
require './word2_calculate_density.rb'
require './word3_calculate_density.rb'
require './word4_calculate_density.rb'
require './search_proxy.rb'

module PageAnalysis
  class PageAnalyse
    # random pause length, presumably used to throttle requests
    def aleat
      #return rand(0.15..0.75)
      rand(15..20)
    end

    # Only parses once
    def get_host_without_www2(url)
      url = "http://#{url}" unless url.start_with?('http')
      uri = URI.parse(url)
      host = uri.host.downcase
      host.start_with?('www.') ? host[4..-1] : host
    end

    # URL always gets parsed twice
    def get_host_without_www1(url)
      url = "http://#{url}" if URI.parse(url).scheme.nil?
      host = URI.parse(url).host.downcase
      host.start_with?('www.') ? host[4..-1] : host
    end

    # Only parses twice if url doesn't start with a scheme
    def get_host_without_www(url)
      uri = URI.parse(url)
      uri = URI.parse("http://#{url}") if uri.scheme.nil?
      host = uri.host.downcase
      host.start_with?('www.') ? host[4..-1] : host
    end
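
    # For illustration, expected behaviour of the final variant above
    # (example.com is a hypothetical value):
    #   get_host_without_www("https://www.example.com/page") #=> "example.com"
    #   get_host_without_www("example.com")                  #=> "example.com"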

    ### content
    def check_h1(body)
      #regex_h1 = '/<\s*h1[^>]*>(.*?)<\s*/\s*h1>/g'
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h1').map { |n| n.inner_text }
    end

    def check_h2(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h2').map { |n| n.inner_text }
    end

    def check_h3(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h3').map { |n| n.inner_text }
    end

    def check_h4(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h4').map { |n| n.inner_text }
    end

    def check_h5(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h5').map { |n| n.inner_text }
    end

    def check_h6(body)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('h6').map { |n| n.inner_text }
    end

    def check_title(body) # ++ length should be between 10 and 70 chars
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('title').map { |n| n.inner_text }
    end

    def check_strong(body) #strong
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('strong').map { |n| n.inner_text }
    end

    def check_italic(body) #em
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search('em').map { |n| n.inner_text }
    end
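
    # For illustration, each check prints one matching element per line
    # (the HTML here is hypothetical):
    #   check_h1("<h1>Welcome</h1><h1>News</h1>")
    #   # Welcome
    #   # News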

    ### meta
    def check_meta_description(page)
      tag = page.at('meta[@name="description"]')
      puts tag[:content] if tag
    end

    def check_meta_keywords(page)
      tag = page.at('meta[@name="keywords"]')
      puts tag[:content] if tag
    end

    def check_meta_content_type(page)
      tag = page.at('meta[@http-equiv="Content-Type"]')
      puts tag[:content] if tag
    end

    # encoding, indexable, ...
    def check_meta_content(page)
      tag = page.at('meta[@name="Robots"]')
      puts tag[:content] if tag
    end

    ### robots.txt & sitemap => fetched from the domain root
    def check_spider(url)
      puts @agent.get(URI.join(url, '/robots.txt'))
    end

    def check_sitemap(url)
      puts @agent.get(URI.join(url, '/sitemap1.xml'))
    end

    ### links & imgs
    def check_links(page)
      page.links.each do |link|
        puts link
      end
    end

    def check_images(page)
      page.images.each do |img|
        puts img.alt
      end
    end
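
    # For illustration, against a fetched Mechanize page (hypothetical URL):
    #   page = @agent.get("http://example.com")
    #   check_links(page)   # prints each link found on the page
    #   check_images(page)  # prints each image's alt attribute (may be nil)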

    ### duplicate content (unfinished stub: `proxy` is never defined here)
    def check_duplicate_content(body)
      #proxy_addr, proxy_port = proxy.split(':')
    end

    ### pagerank
    def check_pagerank(url)
      res = PageRankr.ranks(url, :google)
      puts res[:google]
    end
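
    # For illustration (hypothetical URL; PageRankr's :google rank is the
    # 0-10 Google toolbar value):
    #   check_pagerank("http://example.com")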

    ### backlink
    # check that the page actually contains the text
    def check_link(body, url)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      puts doc.search(url).map { |n| n.inner_text }
    end

    def check_backlink(url, lang)
      myobj = Find_Proxy.new
      proxies = myobj.find_proxy(6)
      res = []
      exact_query = url
      proxies.each do |proxy|
        begin
          # a fresh proxy is fetched on each pass, replacing the loop variable
          myobj = Find_Proxy.new
          proxy = myobj.find_proxy(1)
          puts proxy
          pp = Seeker.new
          tmp = pp.run(exact_query, 100, lang, proxy.first) # fr, com, de, nl, it, es
          raise if tmp == false
          res << tmp
          puts "done #{tmp.length}"
        rescue
          puts "retry"
          retry
        end
      end
      res = res.flatten
      puts res
      res = res.uniq
      res.length
    end
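
    # For illustration (hypothetical URL), the return value is the number of
    # unique search results collected for the exact query:
    #   check_backlink("http://example.com", "fr") #=> e.g. 42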

    ### run main
    def run(url)
      myobj = Find_Proxy.new
      proxy = myobj.find_proxy(1)
      addr, port = proxy.first.split(':')
      make_agent(addr, port)
      #list_lang = ["com","fr","de","es","nl","it"]
      list_lang = ["fr"]
      #page = @agent.get url
      #check_h1(page.body)
      #check_title(page.body)
      #check_strong(page.body)
      #check_italic(page.body)
      #check_meta_description(page)
      #check_meta_keywords(page)
      #check_meta_content_type(page)
      #check_meta_content(page)
      #check_spider(page.uri.to_s)
      #check_sitemap(page.uri.to_s)
      #check_links(page)
      #check_images(page)
      #density_one(page.uri.to_s)
      #density_two(page.uri.to_s)
      #density_three(page.uri.to_s)
      #density_for(page.uri.to_s)
      #check_pagerank(page.uri.to_s)
      how = 0
      #@tab_vide = []
      list_lang.each do |lang|
        #check_backlink(page.uri.to_s)
        how = check_backlink(url, lang)
        puts "number #{how} for #{lang}"
      end
    rescue => e
      puts e
      #retry
    end

    def make_agent(addr, port)
      @agent = Mechanize.new do |a|
        a.user_agent_alias = 'Mac Safari'
        a.max_history = 1
        a.open_timeout = 15
        a.read_timeout = 5
        a.keep_alive = false
        #a.log = Logger.new(STDOUT)
        a.log = Logger.new('log_analyse_page.txt')
        #a.log.level = Logger::INFO
        a.set_proxy(addr, port)
      end
    end
  end
end

def main(url)
  analyser = PageAnalysis::PageAnalyse.new
  analyser.run(url)
end

main(ARGV[0])
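
# Usage, assuming this file is saved as page_analysis.rb (name hypothetical):
#   ruby page_analysis.rb http://example.com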