xinhua_base

This commit is contained in:
2025-08-11 13:10:23 +08:00
parent b43443551f
commit 4e5e35b4fa
20 changed files with 427 additions and 1 deletions

35
core/utils.py Normal file
View File

@@ -0,0 +1,35 @@
import requests
from bs4 import BeautifulSoup
from django.utils import timezone
from core.models import Website, Article
def crawl_xinhua_article(url, website, *, timeout=10):
    """Download one article page and store it as an ``Article`` row.

    The page is fetched with ``requests``, parsed with BeautifulSoup, and
    the title and body paragraphs are extracted and saved. Nothing is
    returned; progress is reported with ``print``.

    Args:
        url: Address of the article page. Also used as the de-duplication
            key: if an ``Article`` with this ``url`` already exists the
            function returns without downloading anything.
        website: Value stored in the new article's ``website`` field.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            so a stalled server cannot block the crawler forever.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` on
            connection failures or when the timeout expires.
    """
    # Check for an existing row first so that already-stored articles do
    # not cost a network round trip.
    if Article.objects.filter(url=url).exists():
        print(f"文章已存在,跳过: {url}")
        return

    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=timeout)
    if not resp.ok:
        # Do not parse and store an error page (4xx/5xx) as an article.
        print(f"请求失败({resp.status_code}),跳过: {url}")
        return

    # Decode the body as UTF-8 regardless of what requests detected.
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")

    # Title: text of <span class="title">, or a placeholder when absent.
    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "无标题"

    # Body: every <p> inside <span id="detailContent">, serialized back to
    # HTML so the paragraph markup is preserved; empty when absent.
    content_tag = soup.find("span", id="detailContent")
    paragraphs = content_tag.find_all("p") if content_tag else []
    content_html = "".join(str(p) for p in paragraphs)

    Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        # NOTE: this is the time of crawling, not a date parsed from the page.
        pub_date=timezone.now(),
    )
    print(f"已保存文章:{title}")