From 969d46b07026ef2b667857709e43f49bbb4b9e8d Mon Sep 17 00:00:00 2001
From: yuangyaa
Date: Mon, 11 Aug 2025 13:52:52 +0800
Subject: [PATCH] Crawl article URLs discovered on the Xinhua list page

---
 core/management/commands/crawl_xinhua.py | 19 ++++-----
 core/utils.py                            | 47 ++++++++++++++++-------
 core/views.py                            |  2 +-
 3 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/core/management/commands/crawl_xinhua.py b/core/management/commands/crawl_xinhua.py
index b810425..559f77f 100644
--- a/core/management/commands/crawl_xinhua.py
+++ b/core/management/commands/crawl_xinhua.py
@@ -1,21 +1,18 @@
 from django.core.management.base import BaseCommand
 from core.models import Website
-from core.utils import crawl_xinhua_article
+from core.utils import crawl_xinhua_list
 
 class Command(BaseCommand):
-    help = 'Crawl a sample Xinhua News article'
+    help = 'Batch-crawl Xinhua News articles'
 
     def handle(self, *args, **options):
-        website_name = "新华网"
+        list_url = "https://www.news.cn/legal/index.html"
         try:
-            website = Website.objects.get(name=website_name)
+            website = Website.objects.get(base_url="https://www.news.cn/")
         except Website.DoesNotExist:
-            self.stdout.write(self.style.ERROR(f"Website '{website_name}' does not exist; create it in the admin first"))
+            self.stdout.write(self.style.ERROR("Website https://www.news.cn/ does not exist; add it in the admin first"))
             return
 
-        urls = [
-            "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html",
-        ]
-
-        for url in urls:
-            crawl_xinhua_article(url, website)
+        self.stdout.write(f"Crawling article list page: {list_url}")
+        crawl_xinhua_list(list_url, website)
+        self.stdout.write(self.style.SUCCESS("Batch crawl finished"))
diff --git a/core/utils.py b/core/utils.py
index 02fe2ac..d7ca2f6 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -1,9 +1,9 @@
 import os
 import requests
 from bs4 import BeautifulSoup
+from urllib.parse import urljoin
 from django.utils import timezone
 from django.conf import settings
-from urllib.parse import urljoin
 from core.models import Article
 
 def download_media(url, save_dir):
@@ -27,12 +27,12 @@ def download_media(url, save_dir):
     with open(filepath, "wb") as f:
         f.write(resp.content)
 
-    return filepath
+
+    # Return a path relative to MEDIA_ROOT: easier to store in the DB and to serve
+    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")
 
 def crawl_xinhua_article(url, website):
-    headers = {
-        "User-Agent": "Mozilla/5.0"
-    }
+    headers = {"User-Agent": "Mozilla/5.0"}
     resp = requests.get(url, headers=headers)
     resp.encoding = 'utf-8'
     soup = BeautifulSoup(resp.text, "html.parser")
@@ -42,7 +42,7 @@ def crawl_xinhua_article(url, website):
 
     content_tag = soup.find("span", id="detailContent")
     if not content_tag:
-        print("Article body not found")
+        print(f"Article body not found, skipping: {url}")
         return
 
     imgs = content_tag.find_all("img")
@@ -54,17 +54,17 @@ def crawl_xinhua_article(url, website):
 
     for img in imgs:
         src = img.get("src")
-        print("original image src =", src)
         if not src:
             continue
-        # Join against the article URL so the image link is absolute
-        src = urljoin(url, src)
-        print("joined image URL =", src)
-        local_path = download_media(src, save_dir)
-        if local_path:
-            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
-            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
-            media_files.append(rel_path.replace("\\", "/"))
+
+        # Resolve relative image paths against the article URL
+        if not src.startswith("http"):
+            src = urljoin(url, src)
+
+        local_rel_path = download_media(src, save_dir)
+        if local_rel_path:
+            img["src"] = settings.MEDIA_URL + local_rel_path
+            media_files.append(local_rel_path)
 
     content_html = str(content_tag)
 
@@ -81,3 +81,20 @@ def crawl_xinhua_article(url, website):
         media_files=media_files
     )
     print(f"Saved article and images: {title}")
+
+def crawl_xinhua_list(list_url, website):
+    headers = {"User-Agent": "Mozilla/5.0"}
+    resp = requests.get(list_url, headers=headers)
+    resp.encoding = 'utf-8'
+    soup = BeautifulSoup(resp.text, "html.parser")
+
+    article_urls = set()
+    for link in soup.find_all("a", href=True):
+        href = link["href"]
+        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
+            article_urls.add(href)
+
+    print(f"Found {len(article_urls)} article links on the list page")
+    for article_url in article_urls:
+        print("Article link:", article_url)
+        crawl_xinhua_article(article_url, website)
diff --git a/core/views.py b/core/views.py
index 956c854..d1f39b5 100644
--- a/core/views.py
+++ b/core/views.py
@@ -7,7 +7,7 @@ def article_list(request):
     View that renders the article list
     """
     articles = Article.objects.all().order_by('-created_at')
-    paginator = Paginator(articles, 10)  # show 10 articles per page
+    paginator = Paginator(articles, 20)  # show 20 articles per page
     page_number = request.GET.get('page')
     page_obj = paginator.get_page(page_number)
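
A quick way to sanity-check the link-discovery filter added in crawl_xinhua_list, without hitting the live site: the sketch below factors the same startswith/endswith check into a standalone helper and runs it on inline HTML. extract_article_urls is a hypothetical name introduced here for illustration only; it is not part of the patch.

    # Minimal standalone sketch; assumes only that bs4 is installed.
    # extract_article_urls is a hypothetical helper mirroring the filter in
    # crawl_xinhua_list, but it takes pre-fetched HTML so no network is needed.
    from bs4 import BeautifulSoup

    def extract_article_urls(html):
        soup = BeautifulSoup(html, "html.parser")
        urls = set()
        for link in soup.find_all("a", href=True):
            href = link["href"]
            # Same filter as the patch: absolute /legal/ links ending in c.html
            if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
                urls.add(href)
        return urls

    sample = '<a href="https://www.news.cn/legal/20250811/0a1b2c/c.html">story</a>'
    print(extract_article_urls(sample))
    # -> {'https://www.news.cn/legal/20250811/0a1b2c/c.html'}

One caveat of this filter: it only matches absolute URLs, so any relative hrefs on the list page are silently skipped; applying urljoin(list_url, href) before the check would catch those as well. The end-to-end pipeline runs via the management command, i.e. python manage.py crawl_xinhua.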