Add Find weburl

2025-08-11 13:52:52 +08:00
parent da1b8d98e4
commit 969d46b070
3 changed files with 44 additions and 27 deletions

View File

@@ -1,21 +1,18 @@
 from django.core.management.base import BaseCommand
 from core.models import Website
-from core.utils import crawl_xinhua_article
+from core.utils import crawl_xinhua_list


 class Command(BaseCommand):
-    help = 'Example: crawl a Xinhua Net article'
+    help = 'Batch-crawl Xinhua Net articles'

     def handle(self, *args, **options):
-        website_name = "新华网"
+        list_url = "https://www.news.cn/legal/index.html"
         try:
-            website = Website.objects.get(name=website_name)
+            website = Website.objects.get(base_url="https://www.news.cn/")
         except Website.DoesNotExist:
-            self.stdout.write(self.style.ERROR(f"Website '{website_name}' does not exist; please create it in the admin first"))
+            self.stdout.write(self.style.ERROR("Website https://www.news.cn/ does not exist; please add it in the admin first"))
             return
-        urls = [
-            "https://www.news.cn/legal/20250721/f340f7be3d5b4b938cbd6b9889b6fbdc/c.html",
-        ]
-        for url in urls:
-            crawl_xinhua_article(url, website)
+        self.stdout.write(f"Start crawling the article list page: {list_url}")
+        crawl_xinhua_list(list_url, website)
+        self.stdout.write(self.style.SUCCESS("Batch crawl finished"))
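
For reference, a minimal sketch of driving this command from code. The command name "crawl_xinhua" is an assumption here, since the file name under core/management/commands/ is not shown in this diff:

from django.core.management import call_command

# Hypothetical command name; substitute the actual file name of this management command.
call_command("crawl_xinhua")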

View File

@@ -1,9 +1,9 @@
 import os
 import requests
 from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
 from django.utils import timezone
 from django.conf import settings
+from urllib.parse import urljoin
 from core.models import Article

 def download_media(url, save_dir):
@@ -27,12 +27,12 @@ def download_media(url, save_dir):
     with open(filepath, "wb") as f:
         f.write(resp.content)
-    return filepath
+    # Return a relative path, convenient for storing in the database and displaying
+    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")


 def crawl_xinhua_article(url, website):
-    headers = {
-        "User-Agent": "Mozilla/5.0"
-    }
+    headers = {"User-Agent": "Mozilla/5.0"}
     resp = requests.get(url, headers=headers)
     resp.encoding = 'utf-8'
     soup = BeautifulSoup(resp.text, "html.parser")
@@ -42,7 +42,7 @@ def crawl_xinhua_article(url, website):
     content_tag = soup.find("span", id="detailContent")
     if not content_tag:
-        print("Article body not found")
+        print(f"Article body not found, skipping article: {url}")
         return

     imgs = content_tag.find_all("img")
@@ -54,17 +54,17 @@ def crawl_xinhua_article(url, website):
     for img in imgs:
         src = img.get("src")
         print("original image src =", src)
         if not src:
             continue
-        # Join against the article page URL as the base to ensure a complete image link
+        # Join relative paths against the article URL here to avoid broken image links
         if not src.startswith("http"):
             src = urljoin(url, src)
         print("joined image URL =", src)
-        local_path = download_media(src, save_dir)
-        if local_path:
-            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
-            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
-            media_files.append(rel_path.replace("\\", "/"))
+        local_rel_path = download_media(src, save_dir)
+        if local_rel_path:
+            img["src"] = settings.MEDIA_URL + local_rel_path
+            media_files.append(local_rel_path)

     content_html = str(content_tag)
@@ -81,3 +81,23 @@ def crawl_xinhua_article(url, website):
         media_files=media_files
     )
     print(f"Saved article and images: {title}")
+
+
+def crawl_xinhua_list(list_url, website):
+    headers = {"User-Agent": "Mozilla/5.0"}
+    resp = requests.get(list_url, headers=headers)
+    resp.encoding = 'utf-8'
+    soup = BeautifulSoup(resp.text, "html.parser")
+
+    article_urls = set()
+    for link in soup.find_all("a", href=True):
+        href = link["href"]
+        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
+            article_urls.add(href)
+
+    print(f"Found {len(article_urls)} article links on the list page")
+    for url in article_urls:
+        print("article link:", url)
+
+    # crawl_xinhua_article is defined above in this same module, so no extra import is needed
+    for article_url in article_urls:
+        crawl_xinhua_article(article_url, website)
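
Note that the link filter above only keeps absolute hrefs. A minimal sketch of a variant that also resolves relative links against the list page URL; collect_article_urls is a hypothetical helper, not part of this commit:

from urllib.parse import urljoin

from bs4 import BeautifulSoup


def collect_article_urls(soup: BeautifulSoup, base_url: str) -> set:
    """Collect candidate article links, resolving relative hrefs against base_url."""
    urls = set()
    for link in soup.find_all("a", href=True):
        href = urljoin(base_url, link["href"])  # absolute hrefs pass through unchanged
        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
            urls.add(href)
    return urls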

View File

@@ -7,7 +7,7 @@ def article_list(request):
     View function that displays the article list
     """
     articles = Article.objects.all().order_by('-created_at')
-    paginator = Paginator(articles, 10)  # show 10 articles per page
+    paginator = Paginator(articles, 20)  # show 20 articles per page
     page_number = request.GET.get('page')
     page_obj = paginator.get_page(page_number)
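
As a side note, paginator.get_page() (unlike Paginator.page()) does not raise on bad input, which is why the view can pass request.GET.get('page') straight through. A quick standalone illustration, not part of this commit:

from django.core.paginator import Paginator

paginator = Paginator(range(45), 20)      # 45 items, 20 per page -> 3 pages
print(paginator.get_page(None).number)    # missing ?page= falls back to page 1
print(paginator.get_page("abc").number)   # non-integer input falls back to page 1
print(paginator.get_page(99).number)      # out-of-range pages fall back to the last page (3)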