python+beautifulsoup
爬取112报修单信息,写入文档保存。
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape repair tickets from the 112 service site and save them to a text file.

The script logs in, queries the "ByMe" ticket list, then visits every ticket's
detail page and appends one "~"-separated line per ticket to OUTPUT_FILE.
"""
import http.cookiejar as cookielib
import random
import re
import sys
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://112.efoxconn.com'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
OUTPUT_FILE = 'my_check_list.txt'

# NOTE(review): credentials should not live in source control; they are kept
# here only as defaults so existing no-argument calls to login_post() still work.
DEFAULT_USERNAME = 'H2605177'
DEFAULT_PASSWORD = '123'

# Headers sent with the ticket-list query and with every ticket detail request.
BYME_HEADERS = {
    'Host': '112.efoxconn.com',
    'Referer': 'http://112.efoxconn.com/UserCenter/ByMe',
    'User-Agent': USER_AGENT,
}

# Characters removed from every scraped cell: CR, LF, NUL and plain spaces.
_STRIP_TABLE = str.maketrans('', '', '\r\n\000 ')

# Shared session; its cookies are persisted to disk by login_post().
s = requests.session()
s.cookies = cookielib.LWPCookieJar(filename="LoginCookies-112.txt")


def _clean_text(tag):
    """Return the text of a BeautifulSoup tag with CR, LF, NUL and spaces removed."""
    return tag.get_text().translate(_STRIP_TABLE)


def _texts_at(tags, indexes=(0,)):
    """Return the cleaned text of the tags whose position is listed in *indexes*."""
    return [_clean_text(tag) for i, tag in enumerate(tags) if i in indexes]


def _fetch_soup(url, num):
    """Download the detail page BASE_URL + url + "/" + num and parse it.

    The request carries BYME_HEADERS (the original code built these headers
    but never passed them to the request) and is followed by a random pause
    of up to two seconds.
    """
    response = s.get(BASE_URL + url + "/" + num, headers=BYME_HEADERS)
    time.sleep(random.random() * 2)
    return BeautifulSoup(response.text, features='html.parser')


def login_post(username=DEFAULT_USERNAME, password=DEFAULT_PASSWORD):
    """Log in through the shared session and save the session cookies to disk.

    Args:
        username: account name posted to the login form.
        password: password posted to the login form.

    Prints the HTTP status code of the login request; redirects are not
    followed.
    """
    header = {
        'Origin': 'http://112.efoxconn.com',
        'Referer': 'http://112.efoxconn.com/Home/Index',
        'User-Agent': USER_AGENT,
    }
    login_url = 'http://112.efoxconn.com/Login/Index'
    login_data = {'username': username, 'password': password}

    r = s.post(login_url, data=login_data, headers=header, allow_redirects=False)
    time.sleep(random.random() * 2)
    print("登陆状态:", r.status_code)
    s.cookies.save()


def get_bynum_1(url, num):
    """Return handler, applicant and satisfaction-rating fields of one ticket.

    Args:
        url: path of the ticket's detail page (joined onto BASE_URL).
        num: ticket number, appended to the path.

    Returns:
        A list of cleaned strings: the "value" cells at positions 4-7 and
        11-13 among the first 16, followed by the rating text, or "未评价"
        when the page has no rating cell.
    """
    soup = _fetch_soup(url, num)
    value_cells = soup.find_all("td", class_=re.compile("value"), limit=16)
    fields = _texts_at(value_cells, (4, 5, 6, 7, 11, 12, 13))

    # The rating cell is identified by two attributes at once.
    rating_cells = soup.find_all("td", attrs={"class": "r-value", "colspan": "7"})
    fields.append(_clean_text(rating_cells[0]) if rating_cells else "未评价")
    return fields


def get_bynum_2(url, num):
    """Return the request-content fields of one ticket.

    Args:
        url: path of the ticket's detail page (joined onto BASE_URL).
        num: ticket number, appended to the path.

    Returns:
        A list of cleaned strings. Each selector below contributes its
        matching element(s) only when present, so the length varies with
        the ticket type.
    """
    soup = _fetch_soup(url, num)
    fields = []
    # Communication ticket: request-info title.
    fields += _texts_at(soup.find_all("td", class_=re.compile("nettype-frame-title")))
    # Special ticket: request-info title.
    fields += _texts_at(soup.find_all("div", class_=re.compile("model-header")))
    # Communication ticket: quantity (new extension / extension relocation).
    fields += _texts_at(soup.find_all("td", class_=re.compile("color-blue r-value")))
    # Communication ticket: repair quantity (network point down / phone down).
    fields += _texts_at(soup.find_all("span", style=re.compile("color: blue; font-weight: bold;")))
    # Communication ticket: title of function-setting requests.
    fields += _texts_at(soup.find_all("div", style=re.compile("padding-left: 3px;")))
    # Request description under the "Other" heading (communication and special tickets).
    fields += _texts_at(soup.find_all("div", class_=re.compile("tb-memo")))
    # Special ticket: request description (second matching cell).
    fields += _texts_at(soup.find_all("td", class_=re.compile("r-value tb-memo"), limit=2), (1,))
    # Material requisition ticket: first two cells with rowspan="1".
    fields += _texts_at(soup.find_all("td", rowspan="1", limit=2), (0, 1))
    return fields


def get_bynum_3(url, num):
    """Return the sign-off progress table of one ticket.

    Args:
        url: path of the ticket's detail page (joined onto BASE_URL).
        num: ticket number, appended to the path.

    Returns:
        A list with one entry per "pro-item" table row; each entry is the
        list of that row's cleaned cell texts.
    """
    soup = _fetch_soup(url, num)
    rows = soup.find_all("tr", class_=re.compile("pro-item"))
    return [[_clean_text(cell) for cell in row.find_all("td")] for row in rows]


def _locate_ticket(fields):
    """Pick (url, number, extra) out of one split list record.

    Two field orders are handled: when the second field is "SO_NBR" the url is
    the last field containing "/Sign"; otherwise fixed positions are used.
    Returns None when the record does not contain the expected fields, so the
    caller never works with values left over from a previous record.
    """
    try:
        if fields[1] == "SO_NBR":
            sign_urls = [field for field in fields if "/Sign" in field]
            if not sign_urls:
                return None
            return sign_urls[-1], fields[3], fields[13]
        return fields[13], fields[18], fields[28]
    except IndexError:
        return None


def get_byme():
    """Query the ticket list and append one line per ticket to OUTPUT_FILE.

    Each line is "url ~ number ~ field ~ ... ~" followed by a newline, where
    the fields are four values taken from the sign-off table, then the results
    of get_bynum_1 and get_bynum_2. Records that cannot be parsed, or whose
    sign-off table is too short, are reported on stdout and skipped.
    """
    byme_url = 'http://112.efoxconn.com/UserCenter/ByMe'
    post_data = {'sonbr': '', 'formtype': '', 'startdate': '', 'enddate': '', 'page': '1', 'pagesize': '100'}

    r = s.post(byme_url, data=post_data, headers=BYME_HEADERS, allow_redirects=False)
    time.sleep(random.random() * 3)
    print("获取查询结果:", r.status_code)
    print('*************************************************************')

    # Take everything between the outermost square brackets of the response,
    # then split it into one string per record and drop the empty pieces.
    bracketed = str(re.findall(r"\[(.+)\]", r.text))
    records = [item for item in re.split(r"\{|\},|\[\'|\'\]", bracketed) if item != '']

    for index, record in enumerate(records):
        print("报修单序号:" + str(index))
        ticket = _locate_ticket(re.split(r"[,\"]", str(record)))
        if ticket is None:
            print("跳过:无法从记录中解析出单号和链接")
        else:
            url, num, extra = ticket
            print(url, num, extra)
            applicant = get_bynum_1(url, num)
            content = get_bynum_2(url, num)
            approvals = get_bynum_3(url, num)
            try:
                summary = [approvals[-1][-3], approvals[1][-1], approvals[-2][2], approvals[-1][-1]]
            except IndexError:
                # Fewer sign-off rows or cells than the fixed positions need.
                print("跳过:签核记录不完整")
            else:
                line_fields = [url, num] + summary + applicant + content
                # Joining keeps the "field ~ field ~" layout without removing
                # brackets, quotes or commas from the scraped text itself.
                line = " ~ ".join(line_fields) + " ~\n"
                with open(OUTPUT_FILE, 'a', encoding='utf-8', errors='ignore') as f:
                    f.write(line)
        print('#############################################################')


login_post()
get_byme()