02_解析

时间：2023-11-21 15:37:01 yl1雨量传感器

1、xpath的基本使用

from lxml import etree

# XPath parsing. lxml offers two entry points:
#   (1) a local file            -> etree.parse
#   (2) a server response body  -> etree.HTML(response.read().decode('utf-8'))

# Parse a local HTML file.
tree = etree.parse('70_xpath的使用.html')
print(tree)

# General form: tree.xpath('<xpath expression>')

# Every <li> directly under a <ul> in <body>.
# li_list = tree.xpath('//body/ul/li')

# Every <li> that has an id attribute; text() returns the tag's text content.
# li_list = tree.xpath('//ul/li[@id]/text()')

# The <li> whose id is "l1" (mind the nested quotes).
# li_list = tree.xpath('//ul/li[@id="l1"]/text()')

# The class attribute value of the <li> whose id is "l1".
li = tree.xpath('//ul/li[@id="l1"]/@class')

# Text of every <li> whose id contains "l".
li_list = tree.xpath('//ul/li[contains(@id,"l")]/text()')

# Text of every <li> whose id starts with "c".
li_list = tree.xpath('//ul/li[starts-with(@id,"c")]/text()')

# The <li> whose id is "l1" AND whose class is "c1".
# li_list = tree.xpath('//ul/li[@id="l1" and @class="c1"]/text()')

# Two ways to match id "l1" or "l2": union two full paths with `|`,
# or combine the conditions with `or` inside a single predicate.
# li_list = tree.xpath('//ul/li[@id="l1"]/text() | //ul/li[@id="l2"]/text()')
li_list = tree.xpath('//ul/li[@id="l1" or @id="l2"]/text() ')

# Show the matches and how many there are.
print(li_list)
print(len(li_list))

2、获取百度网站的“百度一下”

# Steps:
#   (1) fetch the page source
#   (2) parse the server response with etree.HTML
#   (3) print the extracted value

import urllib.request

from lxml import etree

url = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

request = urllib.request.Request(url=url,headers=headers)
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')

# Parse the page source returned by the server.
tree = etree.HTML(content)

# xpath() returns a list; here it holds the value attribute of the
# <input id="su"> element.
result = tree.xpath('//input[@id="su"]/@value')
print(result)
# print(content)

3、站长素材

#（1）请求对象的定制
#（2）获取网页的源码
#（3）下载

#需求 下载前十页的图片
#https://sc.chinaz.com/tupian/fengjingtupian.html
#https://sc.chinaz.com/tupian/fengjingtupian_2.html
#https://sc.chinaz.com/tupian/fengjingtupian_3.html
import os
import urllib.request
from lxml import etree
def create_request(page):
    """Build the request object for one page of the landscape-picture listing.

    Page 1 lives at ``fengjingtupian.html``; every other page number is
    appended to the file name as ``fengjingtupian_<page>.html``.

    Args:
        page: Page number of the listing.

    Returns:
        A ``urllib.request.Request`` for that page carrying a browser
        User-Agent header.
    """
    listing_url = 'https://sc.chinaz.com/tupian/fengjingtupian.html'
    if page != 1:
        listing_url = 'https://sc.chinaz.com/tupian/fengjingtupian_' + str(page) + '.html'

    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
    return urllib.request.Request(url=listing_url, headers={'User-Agent': user_agent})
def get_content(request):
    """Open *request* and return the response body decoded as UTF-8.

    Args:
        request: Anything ``urllib.request.urlopen`` accepts; callers in this
            file pass the ``Request`` built by ``create_request``.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the connection even if reading or decoding
    # fails; previously the response was never closed.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')

def down_load(content):
    """Download every image referenced on one listing page into ./fengjing/.

    Image names come from the ``alt`` attribute and image addresses from the
    ``src2`` attribute of the ``<img>`` tags inside ``<div id="container">``.
    Each image is saved as ``./fengjing/<name>.jpg``.

    Args:
        content: HTML source of a listing page (as returned by get_content).

    Raises:
        urllib.error.URLError: If an image cannot be downloaded.
    """
    tree = etree.HTML(content)
    name_list = tree.xpath('//div[@id="container"]//a/img/@alt')
    # The image address is read from ``src2`` rather than ``src``
    # (presumably a lazy-loading attribute on this site -- TODO confirm).
    src_list = tree.xpath('//div[@id="container"]//a/img/@src2')

    # urlretrieve does not create missing directories, so make sure the
    # target folder exists before the first download.
    os.makedirs('./fengjing/', exist_ok=True)

    # zip() pairs names with addresses and stops at the shorter list, so a
    # tag missing one of the two attributes can no longer cause an IndexError.
    for name, src in zip(name_list, src_list):
        # The addresses are protocol-relative ("//host/path"), so add a scheme.
        url = 'https:' + src
        # The alt text is scraped from the page; replace path separators so it
        # cannot point outside the download folder.
        safe_name = name.replace('/', '_').replace('\\', '_')
        urllib.request.urlretrieve(url=url, filename='./fengjing/' + safe_name + '.jpg')

if __name__ == '__main__':
    # Ask for the inclusive range of listing pages to download.
    start_page = int(input('请输入开始页码：'))
    end_page = int(input('请输入结束页码：'))

    # For each page: build the request, fetch the HTML, download its images.
    for page in range(start_page, end_page + 1):
        down_load(get_content(create_request(page)))

    # Report how many files the download folder now contains.
    downloaded = os.listdir('./fengjing/')
    print(len(downloaded))

4、jsonpath基本使用

import json
import jsonpath

# Load the sample JSON document used by the jsonpath examples below.
# The context manager closes the file as soon as parsing is finished.
with open('073_jsonpath.json','r',encoding='utf-8') as fp:
    obj = json.load(fp)

#书店所有书的作者
# author_list = jsonpath.jsonpath(obj,'$.store.book[*].author')
# print(author_list)

#所有的作者(可能不是书店的作者 也能获取 )
# author_list = jsonpath.jsonpath(obj,'$..author')
# print(author_list)

#store下面的所有元素
# tag_list = jsonpath.jsonpath(obj,'$.store.*')
# print(tag_list)

#store里面所有东西的price
# price_list = jsonpath.jsonpath(obj,'$.store..price')
# print(price_list)

#最后一本书
# book = jsonpath.jsonpath(obj,'$..book[(@.length-1)]')
# print(book)

#前面两本书
# book_list = jsonpath.jsonpath(obj,'$..book[:2]')
# print(book_list)

#条件过滤需要在（）的前面添加一个？
#过滤出所有包含isbn的书
# book_list = jsonpath.jsonpath(obj,'$..book[?(@.isbn)]')
# print(book_list)

#哪本书超过了10块
# book_list = jsonpath.jsonpath(obj,'$..book[?(@.price>10)]')
# print(book_list)

5、jsonpath解析淘票票实例

import urllib.request

# JSONP endpoint that returns the city list; the response is wrapped in a
# call to the callback named by the jsoncallback query parameter.
url = 'https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1653230282698_97&jsoncallback=jsonp98&action=cityAction&n_s=new&event_submit_doGetAllRegion=true'

# Request headers copied from a browser session.
# NOTE(review): the cookie below is a hard-coded session value; it will
# expire and should not live in source control.
headers = {
    # The HTTP/2 pseudo-headers are left disabled.
    # ':authority': ' dianying.taobao.com',
    # ':method': ' GET',
    # ':path': ' /?spm=a1z21.3046609.city.1.32c0112aR5PquL&city=110100',
    # ':scheme': ' https',
    'accept': ' text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    # Left disabled: the body is decoded directly as UTF-8 text below, which
    # presumably would not work on a compressed response.
    # 'accept-encoding': ' gzip, deflate, br',
    'accept-language': ' zh-CN,zh;q=0.9',
    'cookie': ' cna=4O+LGOddyxgCAWp2iaSwYabo; t=9988905f6e39406f85d1ff46ab7e47ef; cookie2=14ed630db5c6825c1b6fae715eaa2e35; v=0; _tb_token_=78e0a5316ee5b; xlly_s=1; isg=BMTEtqdgej0fxM5xy0_6TS5flUK23ehHbBVWCN5nLQ9SCWDTBuzN14brSaHRESCf; l=eBPK3OElLmkvtDusBO5whurza77OBQAfGsPzaNbMiInca1yl1FG88NCh_Bb9RdtjgtfeXetPSMVCeRhw-Ozd0dlxdgF-1NKDnYvp-; tfstk=cDDhBQi5FXPCf3PiGpwBFNR3Ml8Aal6zoYksQHj0xIdUygDaYs4F_syp6h4Hle55.',
    'referer':'https://dianying.taobao.com/',
    'sec-ch-ua': ' " Not A;Brand";v="99", "Chromium";v="101", "Google Chrome";v="101"',
    'sec-ch-ua-mobile': ' ?0',
    'sec-ch-ua-platform': ' "Windows"',
    'sec-fetch-dest': ' document',
    'sec-fetch-mode': ' navigate',
    'sec-fetch-site': ' same-origin',
    'sec-fetch-user': ' ?1',
    'upgrade-insecure-requests': ' 1',
    'user-agent': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36'
}

request = urllib.request.Request(url=url,headers=headers)
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(request) as response:
    content = response.read().decode('utf-8')
# print(content)

# Strip the JSONP wrapper: keep what lies between the first "(" and the LAST
# ")". Cutting at the first ")" would truncate the payload whenever a value
# inside the JSON contains a parenthesis.
content = content[content.index('(') + 1:content.rindex(')')]
# print(content)

# Save the bare JSON so it can be loaded with json.load afterwards.
with open('123.json', 'w', encoding='utf-8') as fp:
    fp.write(content)

import json
import jsonpath

# Load the JSON saved by the download step above. That step writes
# '123.json', so read the same file rather than a differently named one the
# script never creates. The context manager closes the file after parsing.
with open('123.json','r',encoding='utf-8') as fp:
    obj = json.load(fp)

# Collect every regionName value anywhere in the document.
city_list = jsonpath.jsonpath(obj,'$..regionName')
print(city_list)

6、bs4的基本使用

from bs4 import BeautifulSoup

# Parse a local file to walk through the basic bs4 syntax.
# open() uses a platform-dependent default encoding when none is given (the
# original note cites gbk), so the encoding is specified explicitly.
# BeautifulSoup reads the whole file in its constructor, so the handle can be
# closed right afterwards.
with open('075_bs4的基本使用.html',encoding='utf-8') as fp:
    soup = BeautifulSoup(fp,'lxml')
# print(soup)

#通过标签名查找节点
#找到的是第一个符合条件的数据
# print(soup.a)
#获取标签的属性和属性值
# print(soup.a.attrs)

#bs4的一些函数
#(1)find
#返回的是第一个符合条件的数据
# print(soup.find('a'))

#根据title的值来找到对应的标签对象
# print(soup.find('a',title='a2'))

#根据class的值来找到对应的标签对象 注意：class需要添加下划线
# print(soup.find('a',class_='a1'))

#（2）find_all:返回的是一个列表 并且返回了所有的a标签
# print(soup.findAll('a'))

#如果想获取的是多个标签的数据 那么需要在find_all的参数中添加的是列表的数据
# print(soup.findAll(['a','span']))

#limit的作用是查找的前几个数据
# print(soup.findAll('li',limit=2))

#（3）select(推荐)
#select方法返回的是一个列表 并且会返回多个数据
# print(soup.select('a'))

#可以通过'.'来代表class 我们把这种操作叫做类选择器
# print(soup.select('.a1'))
# print(soup.select('#l1'))

#属性选择器---通过属性来寻找对象的标签
#查找到li标签中有id的标签
# print(soup.select('li[id]'))

#查找到li标签中id为l2的标签
# print(soup.select('li[id="l2"]'))

#层级选择器
# 后代选择器
#找到的是div下面的li
# print(soup.select('div li'))

# 子代选择器
# 某标签的第一级子标签
# 注意:很多计算机编程语言中 如果不加空格不会输出内容 但是在bs4中 不会报错 会显示内容
# print(soup.select('div > ul > li'))

#找到a标签和li标签的所有对象
# print(soup.select('a,li'))

# Node information
# Reading a node's content:
# obj = soup.select('#d1')[0]
# When the tag holds only text, both .string and get_text() work. When the
# tag contains child tags in addition to text, .string no longer returns the
# data while get_text() still does, so get_text() is the recommended choice.
# print(obj.string)
# print(obj.get_text())

# Node attributes
# select() returns a list; take the first element matching id "p1".
obj = soup.select('#p1')[0]
# name is the tag's name
# print(obj.name)
# attrs returns the attributes as a dictionary
# print(obj.attrs)

# Reading a single attribute: all three forms below return the class value.
# Only the subscript form raises KeyError when the attribute is absent.
print(obj.attrs.get('class'))
print(obj.get('class'))
print(obj['class'])

7、bs4获取xbk数据

import urllib.request
from bs4 import BeautifulSoup

# Fetch the Starbucks menu page and print the name of every product on it.
url = 'https://www.starbucks.com.cn/menu/'
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(url) as response:
    content = response.read().decode('utf-8')
# print(content)

soup = BeautifulSoup(content,'lxml')

# Equivalent XPath: //ul[@class="grid padded-3 product"]//strong/text()
# The attribute selector matches the full class string exactly.
name_list = soup.select('ul[class="grid padded-3 product"] strong')
# print(name_list)
for name in name_list:
    # get_text() also works if the <strong> tag contains nested tags.
    # print(name.string)
    print(name.get_text())

锐单商城拥有海量元器件数据手册、IC替代型号，打造电子元器件IC百科大全！

02_解析

1、xpath的基本使用

相关文章