Ruby page content analysis: h1–h6 headings, title, meta description, links, image alt text, strong and italic text, robots, and more


SUBMITTED BY: Guest

DATE: Sept. 28, 2014, 8:18 p.m.

FORMAT: Text only

SIZE: 7.3 kB

HITS: 1052

  1. #!/usr/bin/env ruby
  2. # encoding: UTF-8
  3. require 'net/http'
  4. require 'uri'
  5. require 'fileutils'
  6. require 'logger'
  7. require 'mechanize'
  8. require 'nokogiri'
  9. require './google_get_serp.rb'
  10. require './word1_calculate_density.rb'
  11. require './word2_calculate_density.rb'
  12. require './word3_calculate_density.rb'
  13. require './word4_calculate_density.rb'
  14. require './search_proxy.rb'
  15. require 'page_rankr'
  16. require 'uri'
  17. require 'cgi'
  18. module Page_analysis
  19. class Page_analyse
  # Returns a random Integer between 15 and 20 (both inclusive).
  # (An earlier, disabled variant drew a Float from 0.15..0.75 instead.)
  #
  # @return [Integer]
  def aleat
    rand(15..20)
  end
  # Returns the lower-cased host of +url+ with a leading "www." removed.
  # The URL is parsed exactly once.
  #
  # A "http://" prefix is added only when +url+ does not already begin with
  # a "scheme://" part. Checking for the literal prefix "http" is not enough:
  # a bare host such as "httpbin.org" would be mistaken for a full URL.
  #
  # @param url [String] absolute URL, or a bare "host/path" string
  # @return [String] host name without the "www." prefix
  # @raise [ArgumentError] when no host can be extracted
  # @raise [URI::InvalidURIError] when +url+ is not parseable
  def get_host_without_www2(url)
    url = "http://#{url}" unless url =~ %r{\A[a-z][a-z0-9+.\-]*://}i
    host = URI.parse(url).host
    raise ArgumentError, "no host in #{url.inspect}" if host.nil? || host.empty?

    host.downcase.sub(/\Awww\./, '')
  end
  # Returns the lower-cased host of +url+ with a leading "www." removed.
  #
  # This is a single definition: defining the method twice in a row only
  # keeps the last one, so the earlier variant was dead code.
  #
  # A "http://" prefix is added when +url+ has no "scheme://" part. Relying on
  # URI#scheme alone is not enough, because "example.com:8080" parses with
  # scheme "example.com" and no host.
  #
  # @param url [String] absolute URL, or a bare "host[:port]/path" string
  # @return [String] host name without the "www." prefix
  # @raise [ArgumentError] when no host can be extracted
  # @raise [URI::InvalidURIError] when +url+ is not parseable
  def get_host_without_www(url)
    has_scheme = url =~ %r{\A[a-z][a-z0-9+.\-]*://}i
    uri = URI.parse(has_scheme ? url : "http://#{url}")
    host = uri.host
    raise ArgumentError, "no host in #{url.inspect}" if host.nil? || host.empty?

    host.downcase.sub(/\Awww\./, '')
  end
  45. ### content
  46. def check_h1(body)
  47. #regex_h1 = '/<\s*h1[^>]*>(.*?)<\s*/\s*h1>/g'
  48. doc = Nokogiri::HTML::DocumentFragment.parse(body)
  49. puts doc.search('h1').map { |n| n.inner_text }
  50. end
  51. def check_h2(body)
  52. doc = Nokogiri::HTML::DocumentFragment.parse(body)
  53. puts doc.search('h2').map { |n| n.inner_text }
  54. end
  55. def check_h3(body)
  56. doc = Nokogiri::HTML::DocumentFragment.parse(body)
  57. puts doc.search('h3').map { |n| n.inner_text }
  58. end
  59. def check_h4(body)
  60. doc = Nokogiri::HTML::DocumentFragment.parse(body)
  61. puts doc.search('h4').map { |n| n.inner_text }
  62. end
  63. def check_h5(body)
  64. doc = Nokogiri::HTML::DocumentFragment.parse(body)
  65. puts doc.search('h5').map { |n| n.inner_text }
  66. end
  67. def check_h6(body)
  68. doc = Nokogiri::HTML::DocumentFragment.parse(body)
  69. puts doc.search('h6').map { |n| n.inner_text }
  70. end
  71. def check_title(body) # ++ longueur entre 10 et 70 char
  72. doc = Nokogiri::HTML::DocumentFragment.parse(body)
  73. puts doc.search('title').map { |n| n.inner_text }
  74. end
  75. def check_strong(body) #strong
  76. doc = Nokogiri::HTML::DocumentFragment.parse(body)
  77. puts doc.search('strong').map { |n| n.inner_text }
  78. end
  79. def check_italic(body) #em
  80. doc = Nokogiri::HTML::DocumentFragment.parse(body)
  81. puts doc.search('em').map { |n| n.inner_text }
  82. end
  ### meta

  # Prints the +content+ attribute of the first node matched by one of
  # +selectors+ (tried in order). Prints nothing when no selector matches,
  # rather than calling +[]+ on nil.
  #
  # @param page [#at] parsed page (anything responding to +at(selector)+)
  # @param selectors [Array<String>] selectors handed to +page.at+
  # @return [nil]
  def print_meta_content(page, *selectors)
    selectors.each do |selector|
      node = page.at(selector)
      next unless node

      puts node[:content]
      return nil
    end
    nil
  end
  private :print_meta_content

  # Prints the page's meta description, if present.
  def check_meta_description(page)
    print_meta_content(page, 'meta[@name="description"]')
  end

  # Prints the page's meta keywords, if present.
  def check_meta_keywords(page)
    print_meta_content(page, 'meta[@name="keywords"]')
  end

  # Prints the http-equiv Content-Type meta value, if present.
  def check_meta_content_type(page)
    print_meta_content(page, 'meta[@http-equiv="Content-Type"]')
  end

  # Prints the robots meta directives (indexability etc.), if present.
  # The lower-case spelling is tried as a fallback; the attribute-value
  # comparison is presumably case-sensitive -- confirm against Nokogiri.
  def check_meta_content(page)
    print_meta_content(page, 'meta[@name="Robots"]', 'meta[@name="robots"]')
  end
  ### robots.txt & sitemap (resolved against the site root)

  # Fetches robots.txt from the root of the site serving +page+ and prints
  # whatever the agent returns.
  #
  # The target is built with URI.join, so the caller's string is not modified
  # and the file is requested from the site root, not appended to the page path.
  #
  # @param page [String] absolute URL of any page on the site
  # @return [nil]
  # @raise [URI::Error] when +page+ is not an absolute, parseable URL
  def check_spider(page)
    robots_url = URI.join(page, '/robots.txt').to_s
    puts @agent.get(robots_url)
  end
  # Fetches sitemap1.xml from the root of the site serving +page+ and prints
  # whatever the agent returns.
  #
  # The target is built with URI.join, so the caller's string is not modified
  # and the file is requested from the site root, not appended to the page path.
  #
  # @param page [String] absolute URL of any page on the site
  # @return [nil]
  # @raise [URI::Error] when +page+ is not an absolute, parseable URL
  def check_sitemap(page)
    sitemap_url = URI.join(page, '/sitemap1.xml').to_s
    puts @agent.get(sitemap_url)
  end
  ### links & imgs

  # Prints every link of +page+, one per line.
  #
  # @param page [#links] page object exposing a +links+ collection
  # @return [Object] the collection returned by +page.links+
  def check_links(page)
    page.links.each { |anchor| puts anchor }
  end
  # Prints the +alt+ value of every image of +page+, one per line.
  #
  # @param page [#images] page object exposing an +images+ collection
  # @return [Object] the collection returned by +page.images+
  def check_images(page)
    page.images.each { |image| puts image.alt }
  end
  ### duplicate content

  # Placeholder for duplicate-content detection: nothing is checked yet and
  # +body+ is not used. It only obtains a proxy and splits it into its parts.
  #
  # The proxy is fetched from Find_Proxy, the same way #run does it; the
  # previous body read a local variable that was never assigned.
  # NOTE(review): confirm this is the intended proxy source.
  #
  # @param body [String] page HTML (currently unused)
  # @return [Array<String>] the proxy string split on ":" (address, port)
  def check_duplicate_content(body)
    Find_Proxy.new.find_proxy(1).first.split(':')
  end
  ### pagerank

  # Looks up the Google rank of +url+ through PageRankr and prints it.
  #
  # @param url [String] URL to rank
  # @return [nil]
  def check_pagerank(url)
    puts PageRankr.ranks(url, :google)[:google]
  end
  125. ### backlink
  126. def check_link(body,url) # check si la page contient bien le texte
  127. doc = Nokogiri::HTML::DocumentFragment.parse(body)
  128. puts doc.search(url).map { |n| n.inner_text }
  129. end
  # Counts the distinct results that Seeker returns for +url+ in the given
  # search locale, going through a freshly picked proxy.
  #
  # Each attempt asks Find_Proxy for one proxy and runs the query through it.
  # A falsy result or any StandardError triggers another attempt with a new
  # proxy, up to +max_attempts+ times; after that the last error is re-raised
  # instead of looping forever.
  #
  # @param url [String] query sent to the search engine
  # @param lang [String] search locale, e.g. fr, com, de, nl, it, es
  # @param max_attempts [Integer] upper bound on proxy attempts
  # @return [Integer] number of unique results
  # @raise [StandardError] the last failure once +max_attempts+ is reached
  def check_backlink(url, lang, max_attempts = 5)
    attempts = 0
    begin
      attempts += 1
      proxy = Find_Proxy.new.find_proxy(1)
      puts proxy
      found = Seeker.new.run(url, 100, lang, proxy.first)
      raise 'backlink search failed' unless found

      puts "fin #{found.length}"
    rescue StandardError
      raise if attempts >= max_attempts

      puts "retry"
      retry
    end

    # Results may be nested; flatten before de-duplicating.
    backlinks = [found].flatten
    puts backlinks
    backlinks.uniq.length
  end
  ### run main

  # Runs the analysis for +url+: picks a proxy, builds the Mechanize agent on
  # it, then prints the backlink count for every configured search locale.
  # Any StandardError is printed and swallowed (best-effort script).
  #
  # @param url [String] URL to analyse
  # @return [Array<String>, nil] the locale list on success, nil after an error
  def run(url)
    proxy = Find_Proxy.new.find_proxy(1)
    addr, port = proxy.first.split(':')
    make_agent(addr, port)

    # Full locale list, currently disabled: ["com","fr","de","es","nl","it"]
    list_lang = ["fr"]

    # Disabled page checks, kept for reference:
    #page = @agent.get url
    #check_h1(page.body)
    #check_title(page.body)
    #check_strong(page.body)
    #check_italic(page.body)
    #check_meta_description(page)
    #check_meta_keywords(page)
    #check_meta_content_type(page)
    #check_meta_content(page)
    #check_spider(page.uri.to_s )
    #check_sitemap(page.uri.to_s )
    #check_links(page)
    #check_images(page)
    #density_one(page.uri.to_s)
    #density_two(page.uri.to_s)
    #density_three(page.uri.to_s)
    #density_for(page.uri.to_s)
    #check_pagerank(page.uri.to_s)

    list_lang.each do |lang|
      how = check_backlink(url, lang)
      puts "number #{how.to_s} for #{lang.to_s}"
    end
  rescue StandardError => e
    # `=> e` binds the exception; a bare `rescue e` would try to evaluate `e`
    # as an exception class.
    puts e
  end
  # Creates the Mechanize agent used for all requests and stores it in @agent.
  # The agent identifies as 'Mac Safari', keeps one history entry, disables
  # keep-alive, logs to log_analyse_page.txt and goes through the given proxy.
  #
  # @param addr [String] proxy address
  # @param port [String, Integer] proxy port
  # @return [Mechanize] the configured agent
  def make_agent(addr, port)
    @agent = Mechanize.new do |browser|
      browser.user_agent_alias = 'Mac Safari'
      browser.keep_alive = false
      browser.max_history = 1
      browser.open_timeout = 15
      browser.read_timeout = 5
      # Alternative log targets, disabled:
      #browser.log = Logger.new(STDOUT)
      #browser.log.level = Logger::INFO
      browser.log = Logger.new('log_analyse_page.txt')
      browser.set_proxy(addr, port)
    end
  end
  208. end
  209. end
  # Command-line entry point: analyses +url+ with Page_analysis::Page_analyse.
  #
  # The class is referenced by its qualified name rather than by including
  # the module into the top-level object. A missing or empty argument prints
  # a usage line to stderr instead of starting an analysis on nil.
  #
  # @param url [String, nil] URL to analyse (first command-line argument)
  # @return [Object, nil] whatever Page_analyse#run returns; nil without a URL
  def main(url)
    if url.nil? || url.empty?
      warn "usage: #{$PROGRAM_NAME} URL"
      return
    end

    Page_analysis::Page_analyse.new.run(url)
  end

  main ARGV[0]

comments powered by Disqus