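"""Crawler utilities for Xinhua (news.cn): fetch an article page, download
its inline images under MEDIA_ROOT, and store the result as an Article."""
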
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from django.conf import settings
from django.utils import timezone

from core.models import Article


def download_media(url, save_dir):
    """Download a media file and return its path relative to MEDIA_ROOT."""
    try:
        resp = requests.get(url, timeout=15)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Download failed: {url}, error: {e}")
        return None

    # Derive a filename from the URL, dropping any query string; fall back
    # to a generic name when the URL ends with a slash.
    filename = url.split("/")[-1].split("?")[0] or "media"
    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, filename)

    # Avoid overwriting an existing file by appending a counter.
    base, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(filepath):
        filename = f"{base}_{counter}{ext}"
        filepath = os.path.join(save_dir, filename)
        counter += 1

    with open(filepath, "wb") as f:
        f.write(resp.content)

    # Return a path relative to MEDIA_ROOT; this is convenient to store in
    # the database and to build display URLs from.
    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")


def crawl_xinhua_article(url, website):
    # Check for duplicates up front so we don't re-download media for
    # articles that are already stored.
    if Article.objects.filter(url=url).exists():
        print(f"Article already exists, skipping: {url}")
        return

    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=15)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")

    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "Untitled"

    content_tag = soup.find("span", id="detailContent")
    if not content_tag:
        print(f"No article body found, skipping: {url}")
        return

    imgs = content_tag.find_all("img")
    media_files = []

    # Build a filesystem-safe directory name from the article title.
    safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
    save_dir = os.path.join(settings.MEDIA_ROOT, "articles", safe_title)
    os.makedirs(save_dir, exist_ok=True)

    for img in imgs:
        src = img.get("src")
        if not src:
            continue

        # Resolve relative paths against the article URL to avoid broken links.
        if not src.startswith("http"):
            src = urljoin(url, src)

        local_rel_path = download_media(src, save_dir)
        if local_rel_path:
            # Point the tag at the local copy so the stored HTML stays valid.
            img["src"] = settings.MEDIA_URL + local_rel_path
            media_files.append(local_rel_path)

    content_html = str(content_tag)

    Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        pub_date=timezone.now(),
        # Assumes Article.media_files can hold a list of paths
        # (e.g. a JSONField).
        media_files=media_files,
    )
    print(f"Saved article and images: {title}")


def crawl_xinhua_list(list_url, website):
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(list_url, headers=headers, timeout=15)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")

    # Collect links that match the Xinhua article URL pattern.
    article_urls = set()
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
            article_urls.add(href)

    print(f"Found {len(article_urls)} article links on the list page")
    for article_url in article_urls:
        print("Article link:", article_url)
        crawl_xinhua_article(article_url, website)
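

# A minimal usage sketch (not part of the crawler itself) that wires the two
# functions together. The `Website` model and its `name` field are
# assumptions about core.models; adjust to the actual schema.
def demo_crawl():
    """Crawl the legal-news list page once.

    Call from a Django shell so the ORM is configured, e.g.:
        >>> from core.utils import demo_crawl
        >>> demo_crawl()
    """
    from core.models import Website  # hypothetical model; adjust as needed

    website, _ = Website.objects.get_or_create(name="新华网")
    crawl_xinhua_list("https://www.news.cn/legal/", website)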