green_classroom/core/utils.py

import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from django.conf import settings
from django.utils import timezone

from core.models import Article
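
# Assumed context (not stated in this file): Article comes from core.models,
# and its media_files attribute is assumed to be a JSON-style field that can
# store a list of MEDIA_ROOT-relative paths.
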
def download_media(url, save_dir):
    """Download a media file and return its path relative to MEDIA_ROOT."""
    try:
        resp = requests.get(url, timeout=15)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"Download failed: {url}, error: {e}")
        return None
    filename = url.split("/")[-1].split("?")[0]
    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, filename)
    # Avoid overwriting an existing file by appending a counter to the name.
    base, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(filepath):
        filename = f"{base}_{counter}{ext}"
        filepath = os.path.join(save_dir, filename)
        counter += 1
    with open(filepath, "wb") as f:
        f.write(resp.content)
    # Return a path relative to MEDIA_ROOT, which is convenient both for
    # storing in the database and for rendering in templates.
    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")
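
# Usage sketch for download_media (the URL and directory below are
# illustrative, not part of this project). The return value is a
# MEDIA_ROOT-relative path, so a caller can build a browser-facing URL by
# prefixing settings.MEDIA_URL:
#
#   rel = download_media(
#       "https://example.com/pic.jpg",                        # hypothetical URL
#       os.path.join(settings.MEDIA_ROOT, "articles", "demo"),
#   )
#   if rel:
#       print(settings.MEDIA_URL + rel)  # e.g. /media/articles/demo/pic.jpg
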
def crawl_xinhua_article(url, website):
    """Crawl a single Xinhua article page, localize its images, and save it."""
    # Check for duplicates first, so an existing article is skipped before
    # any media gets downloaded.
    if Article.objects.filter(url=url).exists():
        print(f"Article already exists, skipping: {url}")
        return
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers, timeout=15)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")
    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "Untitled"
    content_tag = soup.find("span", id="detailContent")
    if not content_tag:
        print(f"No article body found, skipping: {url}")
        return
    imgs = content_tag.find_all("img")
    media_files = []
    # Build a filesystem-safe directory name from the article title.
    safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
    save_dir = os.path.join(settings.MEDIA_ROOT, "articles", safe_title)
    os.makedirs(save_dir, exist_ok=True)
    for img in imgs:
        src = img.get("src")
        if not src:
            continue
        # Resolve relative image paths against the article URL to avoid
        # broken links.
        if not src.startswith("http"):
            src = urljoin(url, src)
        local_rel_path = download_media(src, save_dir)
        if local_rel_path:
            # Rewrite the <img> tag so the stored HTML points at the local copy.
            img["src"] = settings.MEDIA_URL + local_rel_path
            media_files.append(local_rel_path)
    content_html = str(content_tag)
    Article.objects.create(
        website=website,
        title=title,
        url=url,
        content=content_html,
        pub_date=timezone.now(),
        media_files=media_files,
    )
    print(f"Saved article and images: {title}")
def crawl_xinhua_list(list_url, website):
    """Crawl a Xinhua list page and process every article link found on it."""
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(list_url, headers=headers, timeout=15)
    resp.encoding = "utf-8"
    soup = BeautifulSoup(resp.text, "html.parser")
    # Collect unique article URLs matching the Xinhua legal-news pattern.
    article_urls = set()
    for link in soup.find_all("a", href=True):
        href = link["href"]
        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
            article_urls.add(href)
    print(f"Found {len(article_urls)} article links on the list page")
    for article_url in article_urls:
        print("Article link:", article_url)
        crawl_xinhua_article(article_url, website)
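
# Manual run sketch (assumes a configured Django environment, e.g. inside
# `python manage.py shell`, and that this project defines a Website model in
# core.models; both the import and the lookup below are assumptions):
#
#   from core.models import Website
#   from core.utils import crawl_xinhua_list
#
#   site = Website.objects.first()
#   crawl_xinhua_list("https://www.news.cn/legal/", site)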