import requests
from bs4 import BeautifulSoup

from django.utils import timezone

from core.models import Website, Article


def crawl_xinhua_article(url, website):
    """Fetch one Xinhua news article page and persist it as an Article.

    Parameters:
        url: absolute URL of the article page to crawl.
        website: Website model instance the article belongs to.

    Side effects:
        Creates an Article row (skipped if one with this URL already
        exists) and prints progress to stdout.

    Raises:
        requests.RequestException: on network failure or timeout.
        requests.HTTPError: on a 4xx/5xx response (via raise_for_status).
    """
    # Duplicate check BEFORE the HTTP request: the original fetched the
    # page first and only then discovered it was already stored, wasting
    # a full network round trip per known URL.
    if Article.objects.filter(url=url).exists():
        print(f"文章已存在,跳过: {url}")
        return

    headers = {"User-Agent": "Mozilla/5.0"}
    # timeout keeps a stalled server from hanging the crawler forever.
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail fast on error responses instead of parsing a 404/500 page
    # and saving it as a bogus "无标题" article.
    resp.raise_for_status()
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")

    # Title: Xinhua article pages wrap it in <span class="title">.
    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "无标题"

    # Body: join the <p> elements as raw HTML to preserve paragraph
    # structure for later rendering.
    content_tag = soup.find("span", id="detailContent")
    paragraphs = content_tag.find_all("p") if content_tag else []
    content_html = "".join(str(p) for p in paragraphs)

    Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        pub_date=timezone.now(),
    )
    print(f"已保存文章:{title}")