import requests
from bs4 import BeautifulSoup
from django.utils import timezone

from core.models import Website, Article


def crawl_xinhua_article(url, website):
    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    # A timeout and a status check keep a slow or failing request from hanging
    # the crawler or from saving an error page as article content.
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()
    resp.encoding = "utf-8"  # Xinhua article pages are served as UTF-8
    soup = BeautifulSoup(resp.text, "html.parser")

    # Extract the title
    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "Untitled"

    # Extract the article body
    content_tag = soup.find("span", id="detailContent")
    paragraphs = content_tag.find_all("p") if content_tag else []
    content_html = "".join(str(p) for p in paragraphs)  # keep the <p> tags' HTML structure

    # Skip insertion if the article has already been saved
    if Article.objects.filter(url=url).exists():
        print(f"Article already exists, skipping: {url}")
        return

    Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        pub_date=timezone.now(),
    )
    print(f"Saved article: {title}")
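

# Usage sketch (not part of the original module): one way the function above might
# be driven from a Xinhua section list page. The helper name, the `limit` parameter,
# and the link filter (absolute URLs ending in ".htm") are assumptions for
# illustration only; adjust them to the real list-page markup.
def crawl_xinhua_list(list_url, website, limit=20):
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(list_url, headers=headers, timeout=10)
    resp.raise_for_status()
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")

    seen = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Follow each absolute, article-looking link at most once.
        if href.startswith("http") and href.endswith(".htm") and href not in seen:
            seen.add(href)
            crawl_xinhua_article(href, website)
            if len(seen) >= limit:
                break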