import os

import requests
from bs4 import BeautifulSoup
from django.conf import settings
from django.utils import timezone
from urllib.parse import urljoin

from core.models import Article


def download_media(url, save_dir):
    """Download a media file into save_dir and return its local path, or None on failure."""
    try:
        resp = requests.get(url, timeout=15)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Download failed: {url}, error: {e}")
        return None

    # Derive a filename from the URL path, dropping any query string;
    # fall back to a placeholder if the URL ends with a slash.
    filename = url.split("/")[-1].split("?")[0] or "media"
    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, filename)

    # Avoid overwriting an existing file by appending an incrementing counter.
    base, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(filepath):
        filename = f"{base}_{counter}{ext}"
        filepath = os.path.join(save_dir, filename)
        counter += 1

    with open(filepath, "wb") as f:
        f.write(resp.content)
    return filepath


def crawl_xinhua_article(url, website):
    # Check for duplicates before any network work, so we don't download
    # images for an article we are going to skip anyway.
    if Article.objects.filter(url=url).exists():
        print(f"Article already exists, skipping: {url}")
        return

    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch page: {url}, error: {e}")
        return
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")

    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "Untitled"

    content_tag = soup.find("span", id="detailContent")
    if not content_tag:
        print("Article body not found")
        return

    imgs = content_tag.find_all("img")
    media_files = []

    # Build a filesystem-safe directory name from the article title.
    safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
    save_dir = os.path.join(settings.MEDIA_ROOT, "articles", safe_title)
    os.makedirs(save_dir, exist_ok=True)

    for img in imgs:
        src = img.get("src")
        print("Original image src =", src)
        if not src:
            continue
        # Join against the article page URL so relative paths resolve
        # to a complete, absolute image URL.
        src = urljoin(url, src)
        print("Resolved image URL =", src)
        local_path = download_media(src, save_dir)
        if local_path:
            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT).replace("\\", "/")
            # Rewrite the <img> tag to point at the locally served copy.
            img["src"] = settings.MEDIA_URL + rel_path
            media_files.append(rel_path)

    content_html = str(content_tag)

    Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        pub_date=timezone.now(),
        media_files=media_files,
    )
    print(f"Saved article and images: {title}")
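

# Usage sketch (assumptions, not part of the original module): this shows one
# way crawl_xinhua_article might be driven. The settings module path
# "config.settings", the core.models.Website model's "name" lookup field, and
# the article URL below are all hypothetical placeholders.
if __name__ == "__main__":
    import django

    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings")  # assumed settings path
    django.setup()

    from core.models import Website  # assumed model in the same app as Article

    site = Website.objects.get(name="Xinhua")  # hypothetical lookup field
    crawl_xinhua_article("https://www.news.cn/example/article.html", site)  # placeholder URL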