"""
人立晚风月照中
独散步长廊
月浸在池塘
欢欣充满了心上
静听乐悠扬
越觉乐洋洋
夜鸟高枝齐和唱
月照彩云上
熏风轻掠
如入山荫心向往
🎵 苏妙玲《彩云追月》
"""
import time
import requests
from playwright._impl._errors import TimeoutError
from playwright.sync_api import sync_playwright
from loguru import logger
from scrapy import Selector
import re
from urllib.parse import urlparse
def extract_html_text(html):
    """Pull every Chinese (CJK) character out of an HTML page's <body>.

    Markup, whitespace and all non-CJK text are discarded; returns ''
    when the page contains no Chinese characters.
    """
    logger.info(f"Worker-0-run-extract_html_text")
    sel = Selector(text=html)
    # Grab body text nodes as whitespace-free tokens, then glue them together.
    fragments = sel.xpath('//body//text()').re(r'\S+|\n+')
    joined = ''.join(fragments)
    # Keep only characters in the basic CJK Unified Ideographs range.
    cjk_runs = re.findall('[\u4e00-\u9fa5]+', joined)
    result = ''.join(cjk_runs)
    return result.strip() if result else ''
def extract_html(url, wait_seconds=5):
    """Render *url* in headless Chromium via Playwright and return the page HTML.

    Args:
        url: the page to load.
        wait_seconds: how long to pause after navigation so client-side
            scripts can populate the DOM (default 5, matching the
            original behavior).

    Returns:
        The fully rendered page source as a string.

    Raises:
        playwright TimeoutError when navigation exceeds the 15s default
        timeout set on the page.
    """
    with sync_playwright() as p:
        # Headless Chromium; the executable path is machine-specific (macOS).
        browser = p.chromium.launch(
            headless=True,  # set to False to watch the browser while debugging
            executable_path=r'/Applications/Chromium.app/Contents/MacOS/Chromium'
            # executable_path=r'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
        )
        try:
            context = browser.new_context()
            page = context.new_page()
            page.set_default_timeout(15000)
            page.goto(url)
            # Let JS-rendered content settle before reading the DOM.
            time.sleep(wait_seconds)
            html = page.content()
            page.close()
            return html
        finally:
            # Always release the browser, even when goto/content raises.
            browser.close()
def crawl(url):
    """Fetch a Weibo post page and print its share/comment/like counts.

    The counts are read from the post footer's aria-label attribute,
    which holds a comma-separated triple.

    Returns:
        (share, comment, like) as strings on success; None when the
        page could not be fetched or the footer attribute is missing.
    """
    html = ''
    try:
        html = extract_html(url)
    except TimeoutError:
        logger.info("页面访问超时,检查网络连接或者网站是否正常")
    except Exception:
        logger.error("页面访问异常")
    # Bail out early: an empty page would make extract_first() return None
    # and the split below would raise AttributeError.
    if not html:
        return None
    response = Selector(text=html)
    values = response.xpath("//footer/@aria-label").extract_first()
    if not values:
        logger.error("footer aria-label not found; page layout may have changed")
        return None
    share, comment, like = values.split(',')
    print(share, comment, like)
    return share, comment, like
def is_valid_http_url(url):
    """Return True when *url* has an http or https scheme, else False."""
    scheme = urlparse(url).scheme
    return scheme in {"http", "https"}
def request_url(wId):
    """Query Weibo's status API for post *wId* and print engagement counts.

    Prints reposts_count, comments_count and attitudes_count (likes)
    from the JSON payload.

    Raises:
        requests.HTTPError: on a non-2xx response (e.g. expired cookie).
        requests.Timeout: when the server does not respond within 10s.
        KeyError: when the payload lacks the expected count fields.
    """
    url = f'https://weibo.com/ajax/statuses/show?id={wId}&locale=zh-CN'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        # NOTE(review): hard-coded session cookie will expire; move to config.
        "Cookie": "SUB=_2AkMRHTAuf8NxqwFRmfscyW7na4Rzzw3EieKnQcH1JRMxHRl-yT9kqnBetRB6Op0ewXqJGg99xI9PHf9GLxIl4ywMtbjK;"
    }
    # timeout: requests waits forever by default; fail fast instead.
    resp = requests.get(url, headers=headers, timeout=10)
    # Surface HTTP errors instead of a cryptic JSON decode failure below.
    resp.raise_for_status()
    data = resp.json()
    reposts_count = data['reposts_count']
    comments_count = data['comments_count']
    attitudes_count = data['attitudes_count']
    print(reposts_count, comments_count, attitudes_count)
if __name__ == '__main__':
    # Example post URL; the post id is the last path segment, query stripped.
    url = 'https://weibo.com/1731986465/Oe4y0u9Pn?refer_flag=1001030103_'
    crawl(url)
    post_id = url.split('/')[-1].split('?')[0]
    request_url(post_id)