Add media download

2025-08-11 13:28:32 +08:00
parent 4e5e35b4fa
commit da1b8d98e4
4 changed files with 63 additions and 9 deletions

.gitignore

@@ -174,3 +174,11 @@ cython_debug/
 # PyPI configuration file
 .pypirc
+#####################################
+#
+# jimmy.fang: ignore data/media/
+#
+#####################################
+data/media/
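
Note: the crawler change further down saves downloads under settings.MEDIA_ROOT and builds public links from settings.MEDIA_URL, which is the tree this ignore rule covers. A minimal sketch of the settings this implies (assumed; settings.py is not part of this commit):

    # Hypothetical settings.py excerpt, not shown in this commit
    from pathlib import Path

    BASE_DIR = Path(__file__).resolve().parent.parent

    MEDIA_ROOT = BASE_DIR / "data" / "media"  # matches the data/media/ ignore rule (assumed)
    MEDIA_URL = "/media/"                     # prefix applied to rewritten <img> src values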

@@ -6,7 +6,6 @@ class Command(BaseCommand):
     help = 'Example: crawl Xinhuanet articles'

     def handle(self, *args, **options):
-        # Assumes a "新华网" Website instance was already created in the admin
         website_name = "新华网"
         try:
             website = Website.objects.get(name=website_name)
@@ -14,7 +13,6 @@ class Command(BaseCommand):
             self.stdout.write(self.style.ERROR(f"Website '{website_name}' does not exist; create it in the admin first"))
             return

-        # List the article URLs to crawl here; loop to handle multiple pages
         urls = [
             "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html",
         ]
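
With the comments gone, the whole command now reads roughly as follows. The file path, the import for crawl_xinhua_article, the except clause, and the final loop are not visible in this diff, so they are assumed:

    # Approximate shape of the management command after this commit; the import
    # path for crawl_xinhua_article is a guess, and the loop at the end is assumed.
    from django.core.management.base import BaseCommand
    from core.models import Website
    from core.crawler import crawl_xinhua_article  # assumed module path

    class Command(BaseCommand):
        help = 'Example: crawl Xinhuanet articles'

        def handle(self, *args, **options):
            website_name = "新华网"
            try:
                website = Website.objects.get(name=website_name)
            except Website.DoesNotExist:
                self.stdout.write(self.style.ERROR(
                    f"Website '{website_name}' does not exist; create it in the admin first"))
                return
            urls = [
                "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html",
            ]
            for article_url in urls:
                crawl_xinhua_article(article_url, website)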


@@ -1,7 +1,33 @@
+import os
 import requests
 from bs4 import BeautifulSoup
 from django.utils import timezone
-from core.models import Website, Article
+from django.conf import settings
+from urllib.parse import urljoin
+from core.models import Article
+
+def download_media(url, save_dir):
+    try:
+        resp = requests.get(url, timeout=15)
+        resp.raise_for_status()
+    except Exception as e:
+        print(f"Download failed: {url}, error: {e}")
+        return None
+
+    filename = url.split("/")[-1].split("?")[0]
+    os.makedirs(save_dir, exist_ok=True)
+    filepath = os.path.join(save_dir, filename)
+
+    base, ext = os.path.splitext(filename)
+    counter = 1
+    while os.path.exists(filepath):
+        filename = f"{base}_{counter}{ext}"
+        filepath = os.path.join(save_dir, filename)
+        counter += 1
+
+    with open(filepath, "wb") as f:
+        f.write(resp.content)
+    return filepath
+
 def crawl_xinhua_article(url, website):
     headers = {
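
download_media, added above, names each file after the last path segment of the URL, strips any query string, and appends _1, _2, and so on when the name is already taken. A quick illustration with a hypothetical image URL:

    # Naming logic used by download_media, shown on a hypothetical URL
    url = "https://img.news.cn/photos/2025/photo.jpg?size=large"
    filename = url.split("/")[-1].split("?")[0]
    print(filename)  # photo.jpg
    # If photo.jpg already exists in save_dir, the while loop above retries
    # with photo_1.jpg, photo_2.jpg, ... until it finds an unused name.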
@@ -11,16 +37,37 @@ def crawl_xinhua_article(url, website):
     resp.encoding = 'utf-8'
     soup = BeautifulSoup(resp.text, "html.parser")

-    # Extract the title
     title_tag = soup.find("span", class_="title")
     title = title_tag.get_text(strip=True) if title_tag else "Untitled"

-    # Extract the body
     content_tag = soup.find("span", id="detailContent")
-    paragraphs = content_tag.find_all("p") if content_tag else []
-    content_html = "".join(str(p) for p in paragraphs)  # keep the <p> tags' HTML structure
+    if not content_tag:
+        print("Article body not found")
+        return
+
+    imgs = content_tag.find_all("img")
+    media_files = []
+    safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
+    save_dir = os.path.join(settings.MEDIA_ROOT, "articles", safe_title)
+    os.makedirs(save_dir, exist_ok=True)
+
+    for img in imgs:
+        src = img.get("src")
+        print("original img src =", src)
+        if not src:
+            continue
+        # Join against the article page URL to guarantee a complete image link
+        src = urljoin(url, src)
+        print("resolved img URL =", src)
+        local_path = download_media(src, save_dir)
+        if local_path:
+            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
+            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
+            media_files.append(rel_path.replace("\\", "/"))
+
+    content_html = str(content_tag)

-    # Do not insert a duplicate if the article already exists
     if Article.objects.filter(url=url).exists():
         print(f"Article already exists, skipping: {url}")
         return
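
The image loop above resolves every src against the article page URL before downloading, so relative, root-relative, protocol-relative, and absolute links all come out usable. This is standard urljoin behavior, shown here with hypothetical image paths:

    from urllib.parse import urljoin

    page = "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html"

    print(urljoin(page, "images/photo.jpg"))
    # https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/images/photo.jpg
    print(urljoin(page, "/2025/photo.jpg"))          # https://www.news.cn/2025/photo.jpg
    print(urljoin(page, "//img.news.cn/photo.jpg"))  # https://img.news.cn/photo.jpg
    print(urljoin(page, "https://cdn.example.com/photo.jpg"))  # absolute URLs pass through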
@@ -31,5 +78,6 @@ def crawl_xinhua_article(url, website):
         url=url,
         content=content_html,
         pub_date=timezone.now(),
+        media_files=media_files
     )
-    print(f"Saved article: {title}")
+    print(f"Saved article and images: {title}")