xinhua_base
This commit is contained in:
35
core/utils.py
Normal file
35
core/utils.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from django.utils import timezone
|
||||
from core.models import Website, Article
|
||||
|
||||
def crawl_xinhua_article(url, website):
    """Fetch one Xinhua news article page and persist it as an Article.

    Skips the network request entirely when the URL has already been
    crawled.  Parses the title and body paragraphs from the Xinhua
    article markup and stores them via the Django ORM.

    Args:
        url: Absolute URL of the Xinhua article page.
        website: Website model instance this article belongs to.

    Returns:
        None.  Prints a skip/saved message as a side effect.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP response.
    """
    # Check for duplicates BEFORE the HTTP round-trip: an already-seen
    # URL then costs one DB query instead of a full page download.
    if Article.objects.filter(url=url).exists():
        print(f"文章已存在,跳过: {url}")
        return

    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    # timeout keeps the crawler from hanging forever on a dead host;
    # raise_for_status surfaces 4xx/5xx instead of silently parsing an
    # error page and saving it as an article.
    resp = requests.get(url, headers=headers, timeout=15)
    resp.raise_for_status()
    resp.encoding = 'utf-8'  # Xinhua serves UTF-8; requests may mis-detect
    soup = BeautifulSoup(resp.text, "html.parser")

    # Extract the title; fall back to a placeholder when the tag is absent.
    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "无标题"

    # Extract the body: keep the <p> tags so paragraph structure survives
    # in the stored HTML.
    content_tag = soup.find("span", id="detailContent")
    paragraphs = content_tag.find_all("p") if content_tag else []
    content_html = "".join(str(p) for p in paragraphs)

    # NOTE(review): pub_date is the crawl time, not the article's real
    # publication date — parse it from the page if accuracy matters.
    Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        pub_date=timezone.now(),
    )
    print(f"已保存文章:{title}")
|
||||
Reference in New Issue
Block a user