#!/usr/bin/env ruby
# encoding: UTF-8

require 'net/http' 
require 'uri'
require 'fileutils'
require 'logger'
require 'mechanize'
require 'nokogiri'
require './google_get_serp.rb'
require './word1_calculate_density.rb'
require './word2_calculate_density.rb'
require './word3_calculate_density.rb'
require './word4_calculate_density.rb'
require './search_proxy.rb'
require 'page_rankr'
require 'uri'
require 'cgi'

module Page_analysis
  class Page_analyse

    # Picks a random whole number from 15 to 20, both ends included.
    #
    # Returns an Integer.
    def aleat
      # Earlier variant, kept for reference: rand(0.15..0.75)
      Random.new.rand(15..20)
    end

   # Only parses once
    # Extracts the lower-cased host name of +url+, without a leading "www.".
    #
    # The URL is treated as already carrying a scheme only when it begins
    # with "<scheme>://". Anything else -- including bare hosts whose name
    # merely starts with "http", such as "httpbin.org" -- gets "http://"
    # prepended before the single URI.parse call.
    #
    # url - String, with or without a scheme.
    #
    # Returns the host as a String.
    # Raises ArgumentError when the parsed URL has no host.
    # Raises URI::InvalidURIError when the URL cannot be parsed.
    def get_host_without_www2(url)
      url = "http://#{url}" unless url =~ %r{\A[a-z][a-z0-9+.\-]*://}i
      host = URI.parse(url).host
      raise ArgumentError, "no host found in #{url.inspect}" if host.nil?

      host = host.downcase
      host.start_with?('www.') ? host[4..-1] : host
    end

   # URL always gets parsed twice
    # Returns the lower-cased host of +url+ with a leading "www." removed.
    # "http://" is prepended when the first parse finds no scheme, so the
    # URL goes through URI.parse twice on every call.
    #
    # NOTE: another method with this same name is defined further down in
    # this class; Ruby keeps the later definition, so this one is replaced
    # when the file is loaded.
    def get_host_without_www(url)
      scheme_missing = URI.parse(url).scheme.nil?
      url = "http://#{url}" if scheme_missing
      hostname = URI.parse(url).host.downcase
      return hostname unless hostname.start_with?('www.')

      hostname[4..-1]
    end

   # Only parses twice if url doesn't start with a scheme
    # Returns the lower-cased host of +url+ with a leading "www." removed.
    #
    # The URL is parsed once; it is parsed a second time, with "http://"
    # prepended, only when the first parse yields no host.
    #
    # url - String, with or without a scheme.
    #
    # Returns the host as a String.
    # Raises ArgumentError when no host can be extracted.
    # Raises URI::InvalidURIError when the URL cannot be parsed.
    def get_host_without_www(url)
      uri = URI.parse(url)
      # Test the host rather than the scheme: "example.com:8080" parses as
      # scheme "example.com" with no host at all, and a scheme-less
      # "example.com" parses as a bare path. Both need the http:// retry.
      uri = URI.parse("http://#{url}") if uri.host.nil?
      host = uri.host
      raise ArgumentError, "no host found in #{url.inspect}" if host.nil? || host.empty?

      host = host.downcase
      host.start_with?('www.') ? host[4..-1] : host
    end

   ### content
    # Prints the inner text of every <h1> element found in +body+.
    #
    # body - markup handed to Nokogiri's HTML fragment parser.
    def check_h1(body)
      # Regex approach tried earlier: '/<\s*h1[^>]*>(.*?)<\s*/\s*h1>/g'
      fragment = Nokogiri::HTML::DocumentFragment.parse(body)
      headings = fragment.search('h1').map(&:inner_text)
      puts headings
    end

    # Prints the inner text of every <h2> element found in +body+.
    #
    # body - markup handed to Nokogiri's HTML fragment parser.
    def check_h2(body)
      fragment = Nokogiri::HTML::DocumentFragment.parse(body)
      headings = fragment.search('h2').map(&:inner_text)
      puts headings
    end

    # Prints the inner text of every <h3> element found in +body+.
    #
    # body - markup handed to Nokogiri's HTML fragment parser.
    def check_h3(body)
      fragment = Nokogiri::HTML::DocumentFragment.parse(body)
      headings = fragment.search('h3').map(&:inner_text)
      puts headings
    end

    # Prints the inner text of every <h4> element found in +body+.
    #
    # body - markup handed to Nokogiri's HTML fragment parser.
    def check_h4(body)
      fragment = Nokogiri::HTML::DocumentFragment.parse(body)
      headings = fragment.search('h4').map(&:inner_text)
      puts headings
    end

    # Prints the inner text of every <h5> element found in +body+.
    #
    # body - markup handed to Nokogiri's HTML fragment parser.
    def check_h5(body)
      fragment = Nokogiri::HTML::DocumentFragment.parse(body)
      headings = fragment.search('h5').map(&:inner_text)
      puts headings
    end

    # Prints the inner text of every <h6> element found in +body+.
    #
    # body - markup handed to Nokogiri's HTML fragment parser.
    def check_h6(body)
      fragment = Nokogiri::HTML::DocumentFragment.parse(body)
      headings = fragment.search('h6').map(&:inner_text)
      puts headings
    end

    # Prints the inner text of every <title> element found in +body+.
    # TODO: also check that the length is between 10 and 70 characters.
    #
    # body - markup handed to Nokogiri's HTML fragment parser.
    def check_title(body)
      fragment = Nokogiri::HTML::DocumentFragment.parse(body)
      titles = fragment.search('title').map(&:inner_text)
      puts titles
    end

    # Prints the inner text of every <strong> element found in +body+.
    #
    # body - markup handed to Nokogiri's HTML fragment parser.
    def check_strong(body)
      fragment = Nokogiri::HTML::DocumentFragment.parse(body)
      emphasised = fragment.search('strong').map(&:inner_text)
      puts emphasised
    end

    # Prints the inner text of every <em> element found in +body+.
    #
    # body - markup handed to Nokogiri's HTML fragment parser.
    def check_italic(body)
      fragment = Nokogiri::HTML::DocumentFragment.parse(body)
      emphasised = fragment.search('em').map(&:inner_text)
      puts emphasised
    end

   ### meta
    # Prints the content attribute of the page's <meta name="description">
    # tag. Prints nothing when the page has no such tag, instead of failing
    # with NoMethodError on nil.
    #
    # page - object responding to #at(selector); presumably a Mechanize page.
    def check_meta_description(page)
      node = page.at('meta[@name="description"]')
      puts node[:content] if node
    end

    # Prints the content attribute of the page's <meta name="keywords"> tag.
    # Prints nothing when the page has no such tag, instead of failing with
    # NoMethodError on nil.
    #
    # page - object responding to #at(selector); presumably a Mechanize page.
    def check_meta_keywords(page)
      node = page.at('meta[@name="keywords"]')
      puts node[:content] if node
    end

    # Prints the content attribute of the page's
    # <meta http-equiv="Content-Type"> tag. Prints nothing when the page has
    # no such tag, instead of failing with NoMethodError on nil.
    #
    # page - object responding to #at(selector); presumably a Mechanize page.
    def check_meta_content_type(page)
      node = page.at('meta[@http-equiv="Content-Type"]')
      puts node[:content] if node
    end

    # Prints the content attribute of the page's robots meta tag (encoding,
    # indexability, ...). Prints nothing when the page has no such tag,
    # instead of failing with NoMethodError on nil.
    #
    # The selector compares the attribute value literally, so the common
    # lowercase spelling name="robots" is tried when "Robots" is not found.
    #
    # page - object responding to #at(selector); presumably a Mechanize page.
    def check_meta_content(page)
      node = page.at('meta[@name="Robots"]') || page.at('meta[@name="robots"]')
      puts node[:content] if node
    end

   ### robots.txt & sitemap          => retrieve the domain
    # Fetches the site's robots.txt through @agent and prints the result.
    #
    # The robots.txt URL is resolved against the root of the site that
    # +page+ belongs to, and +page+ itself is left untouched. Previously the
    # file name was appended in place to the caller's string, producing
    # URLs such as ".../page.htmlrobots.txt".
    #
    # page - absolute URL of any page on the site, as a String.
    #
    # Raises URI::BadURIError when +page+ is not an absolute URL.
    def check_spider(page)
      robots_url = URI.join(page, '/robots.txt').to_s
      puts @agent.get(robots_url)
    end

    # Fetches the site's sitemap1.xml through @agent and prints the result.
    #
    # The sitemap URL is resolved against the root of the site that +page+
    # belongs to, and +page+ itself is left untouched. Previously the file
    # name was appended in place to the caller's string, producing URLs such
    # as ".../page.htmlsitemap1.xml".
    #
    # NOTE(review): the file name "sitemap1.xml" is kept as it was; confirm
    # it is the intended name rather than the conventional "sitemap.xml".
    #
    # page - absolute URL of any page on the site, as a String.
    #
    # Raises URI::BadURIError when +page+ is not an absolute URL.
    def check_sitemap(page)
      sitemap_url = URI.join(page, '/sitemap1.xml').to_s
      puts @agent.get(sitemap_url)
    end


   ### links & imgs   
    # Prints every entry of page.links, one puts call per entry.
    #
    # page - object exposing #links; presumably a Mechanize page.
    def check_links(page)
      page.links.each { |entry| puts entry }
    end

    # Prints the alt value of every entry of page.images.
    #
    # page - object exposing #images; presumably a Mechanize page.
    def check_images(page)
      page.images.each { |image| puts image.alt }
    end

   ### duplicate content
    # Unfinished duplicate-content check: it only splits a "host:port" style
    # value into two locals and does nothing with them or with +body+.
    #
    # FIXME: `proxy` is not defined anywhere in this file. Unless a method of
    # that name is provided by one of the required files, calling this
    # raises NameError.
    def check_duplicate_content(body)
     # Both locals are assigned and never read.
     proxy_addr, proxy_port = proxy.split(':') 
    end

   ### pagerank
    # Asks PageRankr for the :google rank of +url+ and prints it.
    #
    # url - the address passed straight to PageRankr.ranks.
    def check_pagerank(url)
      puts PageRankr.ranks(url, :google)[:google]
    end

   ### backlink
    # Checks whether the page links to +url+: prints the inner text of every
    # <a> element in +body+ whose href contains +url+.
    #
    # Previously +url+ itself was handed to #search as a selector, so the
    # lookup never looked at link targets at all.
    #
    # body - markup handed to Nokogiri's HTML fragment parser.
    # url  - String expected to appear in the href of a link.
    def check_link(body, url)
      doc = Nokogiri::HTML::DocumentFragment.parse(body)
      # to_s guards against anchors without an href attribute.
      matching = doc.search('a').select { |anchor| anchor['href'].to_s.include?(url) }
      puts matching.map { |anchor| anchor.inner_text }
    end


    # Runs a Seeker search for +url+ through a proxy and counts the distinct
    # results it returns.
    #
    # Fixes over the previous version:
    # * the leftover `proxies.each do |proxy|` line is gone -- `proxies` was
    #   never assigned and its block was never closed, which left the whole
    #   file unparsable;
    # * results are really accumulated (`res.flatten << tmp` appended to a
    #   throw-away copy, so the count was always 0);
    # * retries are bounded instead of looping forever.
    #
    # url          - the query handed to Seeker#run.
    # lang         - search variant, e.g. fr, com, de, nl, it, es.
    # max_attempts - how many times the search is tried before giving up;
    #                a fresh proxy is requested for every attempt.
    #
    # Returns the number of unique results as an Integer.
    # Re-raises the last error once max_attempts tries have failed.
    def check_backlink(url, lang, max_attempts = 10)
      exact_query = url
      res = []
      attempts = 0

      begin
        proxy = Find_Proxy.new.find_proxy(1)
        puts proxy
        seeker = Seeker.new
        # 100 is passed through as-is; presumably the number of results requested.
        tmp = seeker.run(exact_query, 100, lang, proxy.first)
        raise "search for #{exact_query.inspect} failed" if tmp == false

        # Report before storing so a failure here cannot leave a bad entry behind.
        puts "fin #{tmp.length}"
        res << tmp
      rescue StandardError
        attempts += 1
        raise if attempts >= max_attempts

        puts "retry"
        retry
      end

      res = res.flatten
      puts res
      res.uniq.length
    end


   ### run main
   # Main entry point: picks a proxy, builds the HTTP agent and prints the
   # backlink count of +url+ for every language in list_lang.
   #
   # Any StandardError raised on the way is printed rather than propagated.
   # (The previous `rescue e` evaluated the undefined name `e` as an
   # exception class, so every failure turned into a NameError.)
   #
   # url - the address to analyse.
   def run(url)
     begin
       proxy = Find_Proxy.new.find_proxy(1)
       addr, port = proxy.first.split(':')
       make_agent(addr, port)
       #list_lang = ["com","fr","de","es","nl","it"]
       list_lang = ["fr"]

       # Checks below are currently disabled.
       #page = @agent.get url
       #check_h1(page.body)
       #check_title(page.body)
       #check_strong(page.body)
       #check_italic(page.body)
       #check_meta_description(page)
       #check_meta_keywords(page)
       #check_meta_content_type(page)
       #check_meta_content(page)

       #check_spider(page.uri.to_s )
       #check_sitemap(page.uri.to_s )
       #check_links(page)
       #check_images(page)
       #density_one(page.uri.to_s)
       #density_two(page.uri.to_s)
       #density_three(page.uri.to_s)
       #density_for(page.uri.to_s)
       #check_pagerank(page.uri.to_s)
       #@tab_vide = []
       list_lang.each do |lang|
         #check_backlink(page.uri.to_s)
         how = check_backlink(url, lang)
         puts "number #{how} for #{lang}"
       end
     rescue StandardError => e
       puts e
       #retry
     end
   end

   # Builds the Mechanize agent used for HTTP requests, routes it through
   # the given proxy and stores it in @agent.
   #
   # addr - proxy address.
   # port - proxy port.
   #
   # Returns the configured agent.
   def make_agent(addr, port)
     @agent = Mechanize.new do |browser|
       browser.set_proxy(addr, port)
       browser.user_agent_alias = 'Mac Safari'
       browser.keep_alive = false
       browser.max_history = 1
       browser.open_timeout = 15
       browser.read_timeout = 5
       # Requests are logged to a file; the alternatives are kept for reference.
       #browser.log = Logger.new(STDOUT)
       browser.log = Logger.new('log_analyse_page.txt')
       #browser.log.level = Logger::INFO
     end
   end
    
  end
end

  # Mixes Page_analysis into the top level, then runs a fresh Page_analyse
  # on +url+.
  #
  # Returns whatever Page_analyse#run returns.
  def main(url)
    include Page_analysis

    Page_analyse.new.run(url)
  end

# Script entry point: the first command-line argument is the URL to analyse.
# Stop with a usage message instead of analysing a nil URL.
abort "usage: #{$PROGRAM_NAME} URL" if ARGV.empty?
main ARGV[0]
    

