
Web scraper draft

Date: 2022-09-12 08:30:01

from urllib import request
from bs4 import BeautifulSoup
import logging, os

base_url = 'http://www.xgyw.cc'
url_list = [base_url + '/Xgyw']    # '+' was missing in the draft
h_list = []
path = r'E:\python\0425\pics'
for i in range(2, 5):
    url_list.append('http://www.xgyw.cc/Xgyw/page_%s.html' % i)
print(url_list)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36'}

# Parse the list pages and collect article links
def get_Hrefs():
    maxtrynum = 5
    global hrefs_list
    hrefs_list = []
    for url in url_list:
        print('Parsing list page: [%s]' % url)
        for tries in range(maxtrynum):
            try:
                req = request.Request(url=url, headers=headers)
                res = request.urlopen(req)
                # strip stray 0xd0 characters left over from the page encoding
                # (the draft's '\0xd0' was a typo for '\xd0')
                html = res.read().decode('UTF-8', 'ignore').replace(u'\xd0', '')
                # html = res.read().decode('gbk', 'ignore')
                # print(html)
                soup = BeautifulSoup(html, 'html.parser')
                # hrefs = soup.find_all('div', class_='biank1')
                hrefs = soup.select('a[href^="/Xgyw/Xgyw"]')
                # print(hrefs)
                for each_href in hrefs:
                    hre = each_href.get('href')
                    # print(hre)
                    hrefs_list.append(base_url + hre)    # '+' was missing in the draft
                break
            except:
                if tries < (maxtrynum - 1):
                    continue
                else:
                    logging.error("Has tried %d times to access url %s, all failed!", maxtrynum, url)
                    break
    return hrefs_list
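The same request/decode/retry dance is repeated verbatim in every function below. As a design note, it could be pulled into one shared helper; a minimal sketch, reusing the headers dict above (the fetch_html name and retries parameter are my own, not part of the draft):

def fetch_html(url, retries=5):
    """Fetch a URL with up to `retries` attempts; return decoded HTML or None."""
    for t in range(retries):
        try:
            req = request.Request(url=url, headers=headers)
            res = request.urlopen(req)
            return res.read().decode('UTF-8', 'ignore').replace(u'\xd0', '')
        except Exception:
            if t < retries - 1:
                continue
            logging.error("Has tried %d times to access url %s, all failed!", retries, url)
    return None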

# De-duplicate a list while preserving order
def dedupe(items):    # parameter renamed from 'list' to avoid shadowing the builtin
    seen = []
    for i in items:
        if i not in seen:
            yield i
            seen.append(i)
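Because the collected hrefs are plain strings (hashable), testing membership against a set instead of a list keeps the dedupe linear rather than quadratic; an alternative sketch (dedupe_fast is a hypothetical name, not from the draft):

def dedupe_fast(items):
    seen = set()    # O(1) membership test, vs. O(n) for a list
    for i in items:
        if i not in seen:
            yield i
            seen.add(i)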
# Parse an article page and collect its picture sub-page links
def get_pages_hrefs(href):
    times = 5
  # for href in href_list:
    for t in range(times):
        try:
            print('Found page: %s' % href)
            req = request.Request(url=href, headers=headers)
            res = request.urlopen(req)
            html = res.read().decode('UTF-8', 'ignore').replace(u'\xd0', '')
          # print(html)
            soup = BeautifulSoup(html, 'html.parser')
            pages = soup.select('a[href^="/Xgyw/Xgyw"]')
          # print(pages)
            for each in pages:
                addr = each.get('href')
               # print(addr)
                h_list.append(base_url + addr)
            break
        except:
            if t < (times - 1):
                continue
            else:
                logging.error("Has tried %d times to access url %s, all failed!", times, href)
                break
    return h_list

# Parse image URLs on each page & save the images
def parser_pics(page_list):    # parameter renamed from 'list' to avoid shadowing the builtin
    n = 5
    for each_list in page_list:
        for t in range(n):
            try:
                print('Parsing image page: %s' % each_list)
                req = request.Request(url=each_list, headers=headers)
                res = request.urlopen(req)
                html = res.read().decode('UTF-8', 'ignore').replace(u'\xd0', '')
                # print(html)
                soup = BeautifulSoup(html, 'html.parser')
                pics = soup.select('img[src^="/uploadfile"]')
                # print(pics)
                for each_pic in pics:
                    srcs = each_pic.get('src')
                    print("Resolved image url:", base_url + srcs)
                    save_pics(base_url + srcs)
                break
            except:
                if t < (n - 1):
                    continue
                else:
                    logging.error("Has tried %d times to access url %s, all failed!", n, each_list)
                    break

# Save an image to disk
def save_pics(pic):
    fileName = path + os.sep + pic.split("/")[-1]
    if not os.path.exists(fileName):
        with open(fileName, "wb") as f:
            print("Saving:", fileName)
            f.write(request.urlopen(pic).read())
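Note that save_pics itself has no retry or timeout, so a single failed image download raises out of the whole run; a hedged variant (the save_pics_safe name and the 10-second timeout are my own assumptions, not from the draft):

def save_pics_safe(pic):
    fileName = path + os.sep + pic.split("/")[-1]
    if os.path.exists(fileName):
        return
    try:
        # timeout value is an arbitrary assumption
        data = request.urlopen(pic, timeout=10).read()
    except Exception:
        logging.error("Failed to download %s", pic)
        return
    with open(fileName, "wb") as f:
        print("Saving:", fileName)
        f.write(data)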



if __name__=='__main__':
    for i in dedupe(get_Hrefs()):
        get_pages_hrefs(i)
    print(h_list)
    parser_pics(h_list)
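Fetching every list page, article page, and image back-to-back can hammer the site; a short pause between requests is the usual courtesy. A minimal sketch of the same main loop with a delay (the 1-second value is an arbitrary choice):

import time

if __name__ == '__main__':
    for i in dedupe(get_Hrefs()):
        get_pages_hrefs(i)
        time.sleep(1)    # arbitrary 1s pause so the crawl stays polite
    print(h_list)
    parser_pics(h_list)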







