proxy scraper


SUBMITTED BY: uuuuuu

DATE: Oct. 24, 2016, 9:35 p.m.

FORMAT: Text only

SIZE: 2.4 kB

HITS: 455

  import re
  from contextlib import closing

  import eventlet
  from eventlet.green import urllib2
  def _parseProxyRow(src, i, classes):
      """Build one proxy record from the table row whose IP cell is ``src[i]``.

      ``src`` is the page split into lines and ``classes`` maps the CSS class
      names collected from the row's <style> block to "none" or "inline".
      The remaining cells are read at fixed line offsets after the IP cell.

      Raises IndexError when the page ends before the row does, or when a
      cell lacks the delimiter it is split on.
      """
      ip = src[i]
      # Rewrite class attributes as inline styles so that hidden fragments
      # can be matched by a single pattern below.
      for className, display in classes.items():
          ip = ip.replace("class=\"%s\"" % (className), "style=\"display:%s\"" % (display))
      # Drop every hidden <div>/<span> together with the digits it contains.
      ip = re.sub(r"<(div|span) style=\"display:none\">[\.0-9]+</(div|span)>", r"", ip)
      # Drop purely numeric class attributes so their digits do not survive
      # the final filter.
      ip = re.sub(r"class=\"[0-9]+\"", r"", ip)
      # Whatever digits and dots remain form the address.
      ip = re.sub(r"[^0123456789\.]", r"", ip)

      return {
          "ip": ip,
          # The port cell is read from its own line; strip it so stray
          # indentation or a trailing "\r" does not end up in "ip:port".
          "port": src[i + 2].replace("</td>", "").strip(),
          "country": src[i + 4].split("/> ")[1].split("<")[0],
          # Both timings are taken from the text between ":" and "%".
          "responseTime": src[i + 7].split(":")[1].split("%")[0],
          "connectionTime": src[i + 11].split(":")[1].split("%")[0],
          "type": src[i + 16].split(">")[1].split("<")[0],
          "anonymity": src[i + 17].split(">")[1].split("<")[0],
      }


  def getProxies(url):
      """Download one proxy-list page and extract the proxies it lists.

      The page is scanned line by line. Each row starts with a
      "<td><span><style>" line, followed by CSS rules that mark classes as
      "display:none" or "display:inline"; the line that closes the style
      block is treated as the IP cell and decoded with those rules.

      Args:
          url: address of the page to fetch.

      Returns:
          A ``(url, proxies)`` tuple, where ``proxies`` is a list of dicts
          with the string-valued keys "ip", "port", "country",
          "responseTime", "connectionTime", "type" and "anonymity".
          Rows that are truncated or do not have the expected layout are
          skipped instead of aborting the whole page.
      """
      # Close the response explicitly instead of leaving it to the GC.
      with closing(urllib2.urlopen(url)) as response:
          src = response.read().split("\n")

      inStyle = False
      classes = {}
      proxies = []
      for i, line in enumerate(src):
          # Start of a row's style block.
          if "<td><span><style>" in line:
              inStyle = True
              continue

          if inStyle:
              # The closing line is still checked for rules below.
              if "</style>" in line:
                  inStyle = False
              # Characters 1-4 of the line are used as the class name
              # (presumably a rule of the form ".abcd{display:none}" --
              # TODO confirm against the live markup).
              if "display:none" in line:
                  classes[line[1:5]] = "none"
              if "display:inline" in line:
                  classes[line[1:5]] = "inline"

          # Only a line that has just closed a style block with at least one
          # collected rule is an IP cell.
          if inStyle or not classes:
              continue

          try:
              proxies.append(_parseProxyRow(src, i, classes))
          except IndexError:
              # Truncated page or unexpected cell layout: skip this row so
              # one bad row does not discard the rest of the page.
              pass
          # The rules belong to this row only; reset them (also after a
          # skipped row, so following lines are not mistaken for IP cells).
          classes = {}
      return url, proxies
  # Scrape listing pages 1-12 concurrently on a pool of 12 green threads and
  # gather every proxy found into one flat list.
  proxies = []
  pool = eventlet.GreenPool(12)
  pageUrls = ["https://hidemyass.com/proxy-list/" + str(page) for page in range(1, 13)]
  for url, proxyList in pool.imap(getProxies, pageUrls):
      proxies.extend(proxyList)

  # Report: one "ip:port" line per proxy, then the total.
  for proxy in proxies:
      print("%s:%s" % (proxy["ip"], proxy["port"]))
  print("\nGot %s proxies!" % (len(proxies)))

comments powered by Disqus