import urllib,re
import urllib.request
from bs4 import BeautifulSoup
# NOTE(review): local-file loading kept from an earlier offline test.
#file = open('./aatest.html', 'rb')
root = 'https://health.china.com/toutiao/13003338/20220517/42305957.html'
# Fetch the article page and parse it into a soup.
# Fixes vs. original:
#  - do not append '/' to a URL that already ends in '.html' (that made a
#    different, almost certainly invalid URL);
#  - close the HTTP response via a context manager instead of leaking it;
#  - name the parser explicitly so BeautifulSoup does not warn and does not
#    pick whichever parser happens to be installed.
with urllib.request.urlopen(root) as resp:
    bs = BeautifulSoup(resp.read(), "html.parser")
#file = open('https://www.runoob.com/', 'rb')
#html = file.read()
#bs = BeautifulSoup(html,"html.parser") # parse with an explicit parser
#print(bs.prettify()) # pretty-print the HTML structure
print(bs.title) # the <title> tag
#print(bs.title.name) # tag name of <title>
#print(bs.title.string) # text inside <title>
print(bs.head)  # first <head> tag and its contents
print(bs.p)  # first <p> tag
#print(bs.div) # first <div> tag and all of its contents
#print(bs.div["id"]) # id attribute of the first <div>
#print(bs.a)
#print(bs.find_all("a")) # all <a> tags
#print(bs.find(id="u1")) # the element with id="u1"
# Print the href of every <a> tag; protocol-relative links ("//...")
# are skipped, matching the original behavior.
for item in bs.find_all("a"):
    s = item.get("href")
    # Fix: <a> tags without an href attribute make get() return None,
    # which made the original s[0:2] slice raise TypeError.
    if s is None:
        continue
    if not s.startswith("//"):
        print(s)
    else:
        #print(s[2:])
        #print("https:"+s)
        pass
# print(item.get("href")) # iterate all <a> tags and print each href value
#for item in bs.find_all("a"):
#    print(item.get_text())
#for item in bs.find_all("p"):
#    print(item.get_text())
# Dump every article-notice block three ways: its text content, the raw
# tag, and any nested <p> tags.
for notice in bs.find_all(name='div', attrs={"class": "article_notice"}):
    print(notice.get_text())
    print(notice)
    print(notice.select('p'))

# Single-element lookups with find(): by id, then by class.
child = bs.find("div", {"id": "chan_breadcrumbs"})
print(child)
child2 = bs.find("div", class_="article_notice")
print(child2)
#[0].get_text()

# The same lookups expressed as CSS selectors.
print(bs.select('.article_notice')) # class selector
print(bs.select('#chan_breadcrumbs')) # id selector
#print(bs.select('p'))
#print(bs.select('a'))