Add download media

2025-08-11 13:28:32 +08:00
parent 4e5e35b4fa
commit da1b8d98e4
4 changed files with 63 additions and 9 deletions

.gitignore

@@ -174,3 +174,11 @@ cython_debug/
# PyPI configuration file
.pypirc
#####################################
#
# jimmy.fang: ignore data/media/
#
#####################################
data/media/
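The crawler below writes downloads under settings.MEDIA_ROOT, so this ignore rule suggests MEDIA_ROOT points at data/media/. The settings themselves are not part of this commit; a minimal sketch of a configuration consistent with the rule (names and paths here are assumptions):

# settings.py -- assumed configuration, not shown in this diff
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent

MEDIA_ROOT = BASE_DIR / "data" / "media"   # downloaded article images end up here
MEDIA_URL = "/media/"                      # prefix used when rewriting <img src> in the crawler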


@@ -6,7 +6,6 @@ class Command(BaseCommand):
    help = 'Example command for crawling Xinhuanet (新华网) articles'

    def handle(self, *args, **options):
        # Assumes a "新华网" Website instance was already created in the admin
        website_name = "新华网"
        try:
            website = Website.objects.get(name=website_name)
@@ -14,7 +13,6 @@ class Command(BaseCommand):
            self.stdout.write(self.style.ERROR(f"Website '{website_name}' does not exist; please create it in the admin first"))
            return
        # List of article URLs to crawl; multiple articles can be handled in a loop
        urls = [
            "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html",
        ]
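The loop that consumes urls falls outside this hunk's context. A sketch of how handle() presumably continues, taking only the crawl_xinhua_article(url, website) signature from the crawler file below; everything else here is an assumption:

        # Assumed tail of handle(): crawl each URL in turn (not shown in this hunk)
        for article_url in urls:
            crawl_xinhua_article(article_url, website)
            self.stdout.write(self.style.SUCCESS(f"Crawled: {article_url}"))

Assuming the command module is named crawl_xinhua.py (its path is not visible in this view), it would be run with python manage.py crawl_xinhua.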


@@ -1,7 +1,33 @@
import os
import requests
from bs4 import BeautifulSoup
from django.utils import timezone
from django.conf import settings
from urllib.parse import urljoin
from core.models import Website, Article


def download_media(url, save_dir):
    """Download a single media file into save_dir and return its local path."""
    try:
        resp = requests.get(url, timeout=15)
        resp.raise_for_status()
    except Exception as e:
        print(f"Download failed: {url}, error: {e}")
        return None

    filename = url.split("/")[-1].split("?")[0]
    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, filename)

    # If a file with this name already exists, append a counter to avoid overwriting it
    base, ext = os.path.splitext(filename)
    counter = 1
    while os.path.exists(filepath):
        filename = f"{base}_{counter}{ext}"
        filepath = os.path.join(save_dir, filename)
        counter += 1

    with open(filepath, "wb") as f:
        f.write(resp.content)
    return filepath


def crawl_xinhua_article(url, website):
    headers = {
@@ -11,16 +37,37 @@ def crawl_xinhua_article(url, website):
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, "html.parser")

    # Extract the title
    title_tag = soup.find("span", class_="title")
    title = title_tag.get_text(strip=True) if title_tag else "Untitled"

    # Extract the article body
    content_tag = soup.find("span", id="detailContent")
    if not content_tag:
        print("Article body not found")
        return

    # Download every image in the body and rewrite its src to a local media path
    imgs = content_tag.find_all("img")
    media_files = []
    safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
    save_dir = os.path.join(settings.MEDIA_ROOT, "articles", safe_title)
    os.makedirs(save_dir, exist_ok=True)
    for img in imgs:
        src = img.get("src")
        print("Original image src =", src)
        if not src:
            continue
        # Join against the article page URL to guarantee an absolute image URL
        src = urljoin(url, src)
        print("Resolved image URL =", src)
        local_path = download_media(src, save_dir)
        if local_path:
            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
            media_files.append(rel_path.replace("\\", "/"))

    # Keep the body's HTML structure, now pointing at the local images
    content_html = str(content_tag)

    # Skip articles that are already in the database
    if Article.objects.filter(url=url).exists():
        print(f"Article already exists, skipping: {url}")
        return
@@ -31,5 +78,6 @@ def crawl_xinhua_article(url, website):
        url=url,
        content=content_html,
        pub_date=timezone.now(),
        media_files=media_files,
    )
    print(f"Saved article and images: {title}")