import os

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from django.utils import timezone
from django.conf import settings

from core.models import Article


def download_media(url, save_dir):
    """Download a media file into save_dir and return its path relative to MEDIA_ROOT."""
    try:
        resp = requests.get(url, timeout=15)
        resp.raise_for_status()
    except Exception as e:
        print(f"Download failed: {url}, error: {e}")
        return None

    filename = url.split("/")[-1].split("?")[0]
    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, filename)

    # Avoid overwriting an existing file by appending a numeric suffix.
    base, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(filepath):
        filename = f"{base}_{counter}{ext}"
        filepath = os.path.join(save_dir, filename)
        counter += 1

    with open(filepath, "wb") as f:
        f.write(resp.content)

    # Return a path relative to MEDIA_ROOT so it is easy to store in the database and serve.
    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")


def crawl_xinhua_article(url, website):
    """Crawl a single Xinhua article page, download its images, and save it as an Article."""
    # Skip articles that were already crawled before doing any network or disk work.
    if Article.objects.filter(url=url).exists():
        print(f"Article already exists, skipping: {url}")
        return

    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=15)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")

    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "Untitled"

    content_tag = soup.find("span", id="detailContent")
    if not content_tag:
        print(f"No article body found, skipping: {url}")
        return

    # Build a filesystem-safe directory name from the article title.
    safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
    save_dir = os.path.join(settings.MEDIA_ROOT, "articles", safe_title)
    os.makedirs(save_dir, exist_ok=True)

    media_files = []
    for img in content_tag.find_all("img"):
        src = img.get("src")
        if not src:
            continue
        # Resolve relative image paths against the article URL to avoid broken links.
        if not src.startswith("http"):
            src = urljoin(url, src)
        local_rel_path = download_media(src, save_dir)
        if local_rel_path:
            # Point the <img> tag at the locally stored copy.
            img["src"] = settings.MEDIA_URL + local_rel_path
            media_files.append(local_rel_path)

    content_html = str(content_tag)

    Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        pub_date=timezone.now(),
        media_files=media_files,
    )
    print(f"Saved article and images: {title}")


def crawl_xinhua_list(list_url, website):
    """Crawl a Xinhua list page and crawl every article it links to."""
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(list_url, headers=headers, timeout=15)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")

    # Collect article links; Xinhua article URLs under /legal/ end with "c.html".
    article_urls = set()
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
            article_urls.add(href)

    print(f"Found {len(article_urls)} article links on the list page")
    for article_url in article_urls:
        print("Article link:", article_url)
        crawl_xinhua_article(article_url, website)
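

# Usage sketch (not part of the original module; names are assumptions): this assumes
# a core.models.Website model with a `name` field exists and that this file lives at
# core/utils.py, as suggested by the original import. Adjust the model lookup and the
# list URL to your project, e.g. from a Django management command or `manage.py shell`:
#
#   from core.models import Website
#   from core.utils import crawl_xinhua_list
#
#   website, _ = Website.objects.get_or_create(name="Xinhua")
#   crawl_xinhua_list("https://www.news.cn/legal/", website)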