xinhua_base
This commit is contained in:
35
core/utils.py
Normal file
35
core/utils.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from django.utils import timezone
|
||||
from core.models import Website, Article
|
||||
|
||||
def crawl_xinhua_article(url, website):
    """Fetch one Xinhua news article page and persist it as an Article.

    Skips the network request entirely when the URL has already been
    crawled.  Parses the title and body paragraphs from the Xinhua
    article markup and stores them via the Django ORM.

    Args:
        url: Absolute URL of the Xinhua article page.
        website: Website model instance this article belongs to.

    Returns:
        None.  Prints a skip/saved message as a side effect.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP response.
    """
    # Check for duplicates BEFORE the HTTP round-trip: an already-seen
    # URL then costs one DB query instead of a full page download.
    if Article.objects.filter(url=url).exists():
        print(f"文章已存在,跳过: {url}")
        return

    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    # timeout keeps the crawler from hanging forever on a dead host;
    # raise_for_status surfaces 4xx/5xx instead of silently parsing an
    # error page and saving it as an article.
    resp = requests.get(url, headers=headers, timeout=15)
    resp.raise_for_status()
    resp.encoding = 'utf-8'  # Xinhua serves UTF-8; requests may mis-detect
    soup = BeautifulSoup(resp.text, "html.parser")

    # Extract the title; fall back to a placeholder when the tag is absent.
    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "无标题"

    # Extract the body: keep the <p> tags so paragraph structure survives
    # in the stored HTML.
    content_tag = soup.find("span", id="detailContent")
    paragraphs = content_tag.find_all("p") if content_tag else []
    content_html = "".join(str(p) for p in paragraphs)

    # NOTE(review): pub_date is the crawl time, not the article's real
    # publication date — parse it from the page if accuracy matters.
    Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        pub_date=timezone.now(),
    )
    print(f"已保存文章:{title}")
|
||||
Reference in New Issue
Block a user