Python Web Scraping Basics
Date: 2023-05-12 02:37:00
1. Overview
Crawl target:
https://www.gsmchoice.com/zh-cn/catalogue/
Crawl depth: 3 levels
Data to collect: specification data for every phone brand and model
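The three levels map onto the site's URL hierarchy: catalogue page → brand page → model page. As a quick orientation, here is a minimal reachability check, using the nec example URLs that appear later in this post:

import requests

# Level 1: the full brand catalogue; Level 2: one brand; Level 3: one model.
LEVEL_1 = "https://www.gsmchoice.com/zh-cn/catalogue/"
LEVEL_2 = "https://www.gsmchoice.com/zh-cn/catalogue/nec/"
LEVEL_3 = "https://www.gsmchoice.com/zh-cn/catalogue/nec/mediasxn06e/"

for url in (LEVEL_1, LEVEL_2, LEVEL_3):
    print(requests.get(url, timeout=10).status_code, url)  # expect 200 for each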
2. Environment
- Python 3.9.7
- beautifulsoup4 4.10.0
- bs4 0.0.1
- certifi 2021.5.30
- charset-normalizer 2.0.6
- idna 3.2
- lxml 4.6.3
- pip 21.2.4
- requests 2.26.0
- setuptools 57.4.0
- soupsieve 2.2.1
- urllib3 1.26.7
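To reproduce this environment it is enough to pin the three packages the scripts import directly; most of the remaining entries above are pulled in as transitive dependencies:

pip install beautifulsoup4==4.10.0 lxml==4.6.3 requests==2.26.0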
3. Code
To crawl the level-3 pages we first have to collect their URLs:
'''
* @Description: crawl phone info from the level-3 pages of www.gsmchoice.com
* @Param: url level info
* @return: phone_info
* @Author: zhangjinke@corp.netease.com
* @Date: 2021-09-22
* the level-3 pages follow an obvious pattern, so the re library is barely needed
'''
import random
import re

import requests
from bs4 import BeautifulSoup


'''
in : level-1 url
out: level-2 urls, e.g. https://www.gsmchoice.com/zh-cn/catalogue/nec/
'''
def craw_lev1(base_url, url):
    li = []
    req_headers = dict()
    user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
                       "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
                       "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
                       "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
                       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
                       "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
                       "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
                       ]
    req_headers['User-Agent'] = random.choice(user_agent_list)
    req_obj = requests.get(url, headers=req_headers)
    bresp = BeautifulSoup(req_obj.text, 'lxml')
    CatalogueBrands = bresp.find(id='CatalogueBrands')
    a = CatalogueBrands.find_all('a')
    for item in a:
        # hrefs inside this container are verified unique, so append without dedup
        if "https" in item['href']:
            li.append(item['href'])
        else:
            li.append(base_url + item['href'])
    return li


'''
in : level-2 url
out: level-3 urls, e.g. https://www.gsmchoice.com/zh-cn/catalogue/nec/mediasxn06e/
'''
def craw_lev2(url):
    base_url = "https://www.gsmchoice.com/"
    base_url3 = []
    factory = url.split('/')[-3]
    reg_key = 'href="/zh-cn/catalogue/' + factory + r'/\w*'
    req_obj = requests.get(url)
    soup = BeautifulSoup(req_obj.text, 'html.parser')
    containers = soup.find_all('div', class_='phone-container phone-container--left')
    if len(containers) == 2:
        soup_a = containers[0].find_all('a') + containers[1].find_all('a')
    else:
        soup_a = containers[0].find_all('a')
    reg = re.compile(reg_key)
    for i in soup_a:
        x = reg.findall(str(i))[0]
        base_url3.append(base_url + str(x).split('"/')[1])
    return base_url3


def page_num(u):
    # the third-from-last <b> on a brand page holds its total phone count
    req_obj = requests.get(u)
    soup = BeautifulSoup(req_obj.text, 'html.parser')
    b = soup.find_all('b')
    num = re.findall(r"\d+", str(b[-3]))[0]
    return num


if __name__ == '__main__':
    base_url = "https://www.gsmchoice.com"
    url_lev1 = "https://www.gsmchoice.com/zh-cn/catalogue/"  # 410 brands
    url_lev2 = craw_lev1(base_url, url_lev1)
    # check the phone count of each level-2 page:
    # print(craw_lev1(base_url, url)[i], page_num(craw_lev1(base_url, url)[i]))
    # walk the level-2 (brand) pagination and collect the level-3 (brand + model) urls
    with open("/Users/zjk/IdeaProjects/test_Python/resource/craw_results.txt", 'a', encoding="utf-8") as file:
        for iu in url_lev2:
            url_lev3 = []
            intn = int(page_num(iu))
            if intn % 40 == 0:  # 40 models per page
                n = intn // 40
            else:
                n = intn // 40 + 1
            # crawl the level-3 urls on every page of this brand
            for x in range(0, n):
                # e.g. real_url = https://www.gsmchoice.com/zh-cn/catalogue/huawei/models/80
                real_url = iu + "models/" + str(x * 40)
                status_code = requests.get(real_url).status_code
                url_lev3 += craw_lev2(real_url)
                print(str(status_code) + " - crawled: " + real_url)
            for m in url_lev3:
                file.write(m + "\n")
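The page loop above hand-rolls a ceiling division (the site lists 40 models per page). For reference, a compact sketch of the same pagination using math.ceil, plus a shared requests.Session and a short delay between requests, which is gentler on the site; level3_page_urls is a hypothetical helper, and the model count 120 is made up for the example:

import math
import time
import requests

def level3_page_urls(brand_url, total_models, page_size=40):
    # one paged URL per block of 40 models,
    # e.g. .../catalogue/huawei/models/0, .../models/40, ...
    pages = math.ceil(total_models / page_size)
    return [brand_url + "models/" + str(i * page_size) for i in range(pages)]

session = requests.Session()  # reuses the TCP connection across requests
for page in level3_page_urls("https://www.gsmchoice.com/zh-cn/catalogue/huawei/", 120):
    resp = session.get(page, timeout=10)
    print(resp.status_code, page)
    time.sleep(0.5)  # be polite to the server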
Iterate over the level-3 URLs saved in the file and crawl the phone specs we want.
[Single-threaded]
# coding:utf-8
'''
* @Description: crawl the data of one given level-3 url; one record per line, fields separated by \t\t\t
* @Param: url_lev3
* @return: spec data for one phone model
* @Author: zhangjinke@corp.netease.com
* @Date: 2021-09-23
'''
import random
import re
import unicodedata
from multiprocessing.pool import ThreadPool

import requests
from bs4 import BeautifulSoup

# import logging
# logging.captureWarnings(True)
from requests.packages import urllib3
urllib3.disable_warnings()

# NOTE: the HTML tag literals inside craw_cell1's string operations were
# swallowed when this post was rendered; the '<br/>', '<img', '<a' and
# '<span' fragments below are a best-effort reconstruction, not verbatim.

def get_soup(url_lev3):
    real_sout_li = []
    req_headers = dict()
    user_agent_list = ["Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
                       "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
                       "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
                       "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
                       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
                       "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
                       "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
                       ]
    req_headers['User-Agent'] = random.choice(user_agent_list)
    # pull the two raw html blocks [soup_1, soup_2] out of the page
    req_obj = requests.get(url_lev3, headers=req_headers)
    req_obj.encoding = req_obj.apparent_encoding
    soup = BeautifulSoup(req_obj.text, 'lxml')
    soup_t = soup.find_all(class_='PhoneData YesDict')
    real_sout_li.append(str(soup_t[0]))
    for sou in soup_t:
        html_text = str(sou)
        if '加速度计' in html_text:  # the sensor table always mentions the accelerometer
            real_sout_li.append(html_text)
    # turn the html fragments back into soup objects
    soup_one = BeautifulSoup(real_sout_li[0], 'lxml')
    if len(real_sout_li) == 1:
        return [soup_one, 'null']
    soup_two = BeautifulSoup(real_sout_li[1], 'lxml')
    return [soup_one, soup_two]

def craw_cell1(soup_1):
    # extract keys and values separately with regex, then clean them up
    # keys
    item = re.sub(r'\t*|\n*|\[|\]', '',
                  unicodedata.normalize('NFKC', str(soup_1.find_all(class_='phoneCategoryName')).replace('\xa0', '')))
    key_item = str(item).replace('<td class="phoneCategoryName">', '').replace('</td>', '')
    key_li = key_item.split(', ')
    # values
    item_v = re.sub(r'\t*|\n*|\[|\]', '', str(soup_1.find_all(class_='phoneCategoryValue')))
    item_v_li = item_v.split(', ')
    for index in range(len(item_v_li)):
        if '<br/>' in item_v_li[index]:
            # multi-line plain value: join the lines with spaces, strip tags
            item_v_li[index] = re.sub(r'(.*)ue">|<[^>]*>|\xa0', '', item_v_li[index].replace('<br/>', ' '))
        elif ('<img' in item_v_li[index]) and ('<a' in item_v_li[index]):
            # tick/cross icon followed by a link: keep yes/no plus the link text
            ss = re.sub(r'.*<img[^>]*> ', '', item_v_li[index])
            cont = ss.split('">')[1].split('</a>')[0]
            if 'tick' in item_v_li[index]:
                item_v_li[index] = 'yes ' + cont
            elif 'cross' in item_v_li[index]:
                item_v_li[index] = 'no ' + cont
            else:
                item_v_li[index] = 'unknown'
        elif '<a' in item_v_li[index]:
            # bare link: keep only the link text
            item_v_li[index] = item_v_li[index].split('</a>')[0].split('">')[-1]
        elif '<img' in item_v_li[index]:
            # bare icon: map tick / question / cross to yes / unknown / no
            mark = item_v_li[index].split('<img class="')[1].split('">')[0]
            if mark == 'tick':
                item_v_li[index] = 'yes'
            elif mark == 'question':
                item_v_li[index] = 'unknown'
            else:
                item_v_li[index] = 'no'
        elif '<span' in item_v_li[index]:
            item_v_li[index] = item_v_li[index].split('</span>')[0].split('">')[-1]
        else:
            item_v_li[index] = item_v_li[index].split('">')[-1].replace('<br/>', ' ')
        # sweep up any dirty characters the branches above missed
        item_v_li[index] = re.sub(r'<[^>]*>|\xa0', '', item_v_li[index])
    if len(key_li) == len(item_v_li):
        res_li = {}
        for ind in range(0, len(key_li)):
            res_li[key_li[ind]] = item_v_li[ind]
        return res_li

def craw_cell2(soup_2):
    # parse the sensor table (soup_2) into a {name: yes/no/unknown} dict
    res_li = {}
    sub = re.sub(r'\t*|\n*|\xa0', '', str(soup_2))
    findall_key = re.findall(r'me">(.+?)<', sub)
    # ... the rest of craw_cell2 and the __main__ driver are cut off in the original post

Sample output of the sensor dict (truncated):
{..., '…传感器': 'no', '光传感器': 'yes', '磁力仪': 'yes', '陀螺仪': 'yes', '晴雨表': 'no', '高度表': 'no', '重力感应器': 'yes', '霍尔…
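The script above imports ThreadPool but runs single-threaded. Since crawling is I/O-bound, the same per-URL pipeline can be fanned out over a small thread pool; a minimal sketch, where crawl_one is a hypothetical wrapper around the get_soup/craw_cell1 functions above and craw_results.txt is the URL list written by the first script:

from multiprocessing.pool import ThreadPool

def crawl_one(url):
    # fetch one level-3 page and return its main spec dict (may be None)
    soup_1, soup_2 = get_soup(url)
    return url, craw_cell1(soup_1)

if __name__ == '__main__':
    with open("craw_results.txt", encoding="utf-8") as f:
        urls = [line.strip() for line in f if line.strip()]
    pool = ThreadPool(8)  # 8 workers overlap the network waits
    for url, specs in pool.imap_unordered(crawl_one, urls):
        print(url, len(specs) if specs else 0)
    pool.close()
    pool.join()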