Add download media
.gitignore (vendored): 8 additions
@@ -174,3 +174,11 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+
+#####################################
+#
+# jimmy.fang: ignore data/media/
+#
+#####################################
+
+data/media/
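
The new rule keeps crawled media out of version control. For context, a minimal sketch of the Django media settings this layout assumes (settings.py is not part of this commit, so the exact values here are assumptions):

# settings.py (sketch; values assumed, not shown in this commit)
from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent.parent

# download_media() writes under MEDIA_ROOT, which the new .gitignore entry excludes
MEDIA_ROOT = BASE_DIR / "data" / "media"
MEDIA_URL = "/media/"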
core/management/commands/__init__.py (new empty file)
@@ -6,7 +6,6 @@ class Command(BaseCommand):
     help = '爬取新华网文章示例'
 
     def handle(self, *args, **options):
-        # Assumes the "新华网" Website instance was already created in the admin
         website_name = "新华网"
         try:
             website = Website.objects.get(name=website_name)
@@ -14,7 +13,6 @@ class Command(BaseCommand):
             self.stdout.write(self.style.ERROR(f"网站 '{website_name}' 不存在,请先后台创建"))
             return
 
-        # Put the article URLs to crawl here; multiple articles can be looped over
         urls = [
             "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html",
         ]
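
The rest of handle() is not shown in these hunks; it presumably iterates over urls and calls the crawler shown in the next file. A minimal sketch of that loop, under that assumption (the import of crawl_xinhua_article into the command module is likewise assumed):

        # Hypothetical continuation of handle(); not part of the visible diff
        for url in urls:
            self.stdout.write(f"Crawling {url}")
            crawl_xinhua_article(url, website)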
@@ -1,7 +1,33 @@
+import os
 import requests
 from bs4 import BeautifulSoup
 from django.utils import timezone
-from core.models import Website, Article
+from django.conf import settings
+from urllib.parse import urljoin
+from core.models import Article
 
+def download_media(url, save_dir):
+    try:
+        resp = requests.get(url, timeout=15)
+        resp.raise_for_status()
+    except Exception as e:
+        print(f"下载失败:{url},错误:{e}")
+        return None
+
+    filename = url.split("/")[-1].split("?")[0]
+    os.makedirs(save_dir, exist_ok=True)
+    filepath = os.path.join(save_dir, filename)
+
+    base, ext = os.path.splitext(filename)
+    counter = 1
+    while os.path.exists(filepath):
+        filename = f"{base}_{counter}{ext}"
+        filepath = os.path.join(save_dir, filename)
+        counter += 1
+
+    with open(filepath, "wb") as f:
+        f.write(resp.content)
+    return filepath
+
 def crawl_xinhua_article(url, website):
     headers = {
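
download_media() returns the local path of the saved file, or None when the request fails; duplicate filenames are resolved by appending _1, _2, ... before the extension. A standalone usage sketch (the URL and directory below are placeholders, not taken from this commit):

# Hypothetical example for illustration only
path = download_media("https://www.news.cn/example/pic.jpg", "/tmp/media_test")
if path:
    print("saved to", path)
else:
    print("download failed")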
@@ -11,16 +37,37 @@ def crawl_xinhua_article(url, website):
     resp.encoding = 'utf-8'
     soup = BeautifulSoup(resp.text, "html.parser")
 
-    # Extract the title
     title_tag = soup.find("span", class_="title")
     title = title_tag.get_text(strip=True) if title_tag else "无标题"
 
-    # Extract the body
     content_tag = soup.find("span", id="detailContent")
-    paragraphs = content_tag.find_all("p") if content_tag else []
-    content_html = "".join(str(p) for p in paragraphs)  # keep the <p> tags' HTML structure
+    if not content_tag:
+        print("没有找到正文")
+        return
+
+    imgs = content_tag.find_all("img")
+    media_files = []
+
+    safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
+    save_dir = os.path.join(settings.MEDIA_ROOT, "articles", safe_title)
+    os.makedirs(save_dir, exist_ok=True)
+
+    for img in imgs:
+        src = img.get("src")
+        print("原始图片 src =", src)
+        if not src:
+            continue
+        # Join against the article page URL so relative src values become full image links
+        src = urljoin(url, src)
+        print("拼接后图片 URL =", src)
+        local_path = download_media(src, save_dir)
+        if local_path:
+            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
+            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
+            media_files.append(rel_path.replace("\\", "/"))
+
+    content_html = str(content_tag)
 
-    # Skip the insert if the article already exists
     if Article.objects.filter(url=url).exists():
         print(f"文章已存在,跳过: {url}")
         return
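
urljoin() is what turns relative or protocol-relative src values into absolute URLs that download_media() can fetch. A quick worked example (the image paths are made up for illustration):

from urllib.parse import urljoin

base = "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html"
urljoin(base, "../../images/pic.jpg")   # -> "https://www.news.cn/legal/images/pic.jpg"
urljoin(base, "//img.news.cn/pic.jpg")  # -> "https://img.news.cn/pic.jpg"
urljoin(base, "https://cdn.example.com/pic.jpg")  # absolute URLs pass through unchanged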
@@ -31,5 +78,6 @@ def crawl_xinhua_article(url, website):
         url=url,
         content=content_html,
         pub_date=timezone.now(),
+        media_files=media_files
     )
-    print(f"已保存文章:{title}")
+    print(f"已保存文章及图片:{title}")
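
Article.objects.create() now receives media_files, a list of image paths relative to MEDIA_ROOT, so the Article model presumably has a matching field. A minimal sketch of what that field could look like (an assumption; the model definition is not part of this commit, and only the fields relevant here are shown):

from django.db import models

class Article(models.Model):
    url = models.URLField(unique=True)
    content = models.TextField()
    pub_date = models.DateTimeField()
    # Relative paths (under MEDIA_ROOT) of downloaded images; JSONField type is an assumption
    media_files = models.JSONField(default=list, blank=True)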