1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
|
import gzip
import re
import urllib.request
import zlib
def ungzip(data):
    """Return *data* gunzipped, or unchanged when it is not a valid gzip stream.

    Progress messages are printed to stdout.

    Args:
        data: Raw response body as bytes.

    Returns:
        The decompressed bytes when *data* is valid gzip; otherwise *data*
        itself, untouched.
    """
    try:
        print('decompressing.....')
        data = gzip.decompress(data)
        print('decompress done!')
    except (OSError, EOFError, zlib.error):
        # OSError covers gzip.BadGzipFile (wrong magic number / bad CRC),
        # EOFError a truncated stream, zlib.error a corrupt deflate payload.
        # Anything else (KeyboardInterrupt, TypeError from a non-bytes
        # argument, ...) is a real problem and is allowed to propagate.
        print('without compress!')
    return data
class Proxy:
    """Pool of proxies scraped from an HTML listing page.

    Constructing an instance immediately downloads ``url`` and fills
    ``ipPool``.  ``startProxy`` then installs one pool entry as the
    process-wide urllib HTTP proxy.

    Attributes:
        ipPool: List of dicts with the string keys 'host', 'port',
            'country' and 'protocol'.
        url: Listing page the pool is scraped from.
    """

    # Only gzip is advertised: ungzip() is the sole decoder applied to the
    # body, so a deflate- or sdch-encoded reply could not be decoded.
    _HEADERS = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    }

    # Matches five consecutive table cells: IPv4 address, port, a word
    # (stored as 'country'), a class="country" cell reading 透明
    # ("transparent"), and the protocol.  Groups 1, 5, 6 and 7 carry the
    # address, port, word and protocol; groups 2-4 are octet sub-groups.
    _HOST_PATTERN = re.compile(
        r'\s*<td>(((2[0-4]\d|25[0-5]|[01]?\d\d?)\.){3}(2[0-4]\d|25[0-5]|[01]?\d\d?))</td>\s*\n\s*<td>(\d{2,5})</td>\s*\n\s*<td>(\w+)</td>\s*\n\s*<td class="country">透明</td>\s*\n\s*<td>(HTTP|HTTPS)</td>'
    )

    def __init__(self, url='http://www.xicidaili.com/'):
        """Remember *url* and populate the pool from it (performs a request)."""
        self.ipPool = []
        self.url = url
        self.getipPool()

    @classmethod
    def _parse_hosts(cls, html_str):
        """Return one host dict per listing row found in *html_str*."""
        return [
            {
                'host': match.group(1),
                'port': match.group(5),
                'country': match.group(6),
                'protocol': match.group(7),
            }
            for match in cls._HOST_PATTERN.finditer(html_str)
        ]

    def getipPool(self):
        """Download ``self.url`` and append every proxy found to ``ipPool``.

        Returns:
            ``self.ipPool`` itself, extended in place.

        Raises:
            urllib.error.URLError: If the page cannot be fetched.
            UnicodeDecodeError: If the (decompressed) body is not UTF-8.
        """
        req = urllib.request.Request(self.url, headers=dict(self._HEADERS))
        # The context manager closes the connection even if read() fails.
        with urllib.request.urlopen(req) as response:
            res_bytes = ungzip(response.read())
        html_str = res_bytes.decode(encoding='utf-8')
        self.ipPool.extend(self._parse_hosts(html_str))
        return self.ipPool

    def startProxy(self, host_num=None):
        """Install one pool entry as the global urllib proxy for ``http``.

        Args:
            host_num: Index into ``ipPool``.  When ``None`` (the default)
                the pool is printed and the index is read from stdin.

        Raises:
            IndexError: If the pool is empty or the index is out of range.
            ValueError: If the selection is not an integer.
        """
        if not self.ipPool:
            # Fail before prompting: no answer could be valid.
            raise IndexError('proxy pool is empty; nothing to select')
        if host_num is None:
            self.printIpPool()
            host_num = input('Please select a host:').strip()
        index = int(host_num)
        try:
            chosen = self.ipPool[index]
        except IndexError:
            raise IndexError(
                'no proxy at index {}; pool has {} entries'.format(
                    index, len(self.ipPool))
            ) from None
        proxy = {'http': chosen['host'] + ':' + chosen['port']}
        proxy_support = urllib.request.ProxyHandler(proxy)
        opener = urllib.request.build_opener(proxy_support)
        # install_opener makes every later urllib.request.urlopen() call in
        # this process use the proxy.
        urllib.request.install_opener(opener)

    def printIpPool(self):
        """Print every pool entry prefixed by its index."""
        for i, entry in enumerate(self.ipPool):
            print(i, ":", entry)
def main():
    """Route urllib through a user-chosen proxy and report what ip.cn sees.

    Builds a ``Proxy`` pool, lets the user pick an entry, fetches
    http://ip.cn/ through it, saves the page to ``proxy.html`` in the
    current directory and prints the location and IP address found in
    the page.
    """
    url = "http://ip.cn/"
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        # Only gzip is advertised because ungzip() is the only decoder
        # applied to the body below.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    }

    proxyhost = Proxy()
    proxyhost.startProxy()

    req = urllib.request.Request(url, headers=headers)
    # Context managers close the connection and the file even on error.
    with urllib.request.urlopen(req) as response:
        res_bytes = ungzip(response.read())
    html_str = res_bytes.decode(encoding='utf-8')
    with open("proxy.html", 'w', encoding="utf-8") as file_html:
        file_html.write(html_str)

    pattern = r'<code>(((2[0-4]\d|25[0-5]|[01]?\d\d?)\.){3}(2[0-4]\d|25[0-5]|[01]?\d\d?))</code>'
    pattern2 = r'所在地理位置:<code>([\s\w]*)</code>'
    # re.search returns None instead of raising when the page layout does
    # not match, so a changed page yields "not found" rather than a crash.
    locate_match = re.search(pattern2, html_str)
    ip_match = re.search(pattern, html_str)
    my_locate = locate_match.group(1) if locate_match else 'not found'
    my_ip = ip_match.group(1) if ip_match else 'not found'
    print('my locate:', my_locate)
    print('my ip:', my_ip)


if __name__ == '__main__':
    main()
|