Add support for full-site crawling

commit 6d80326a4e (parent 969d46b070)
2025-08-11 14:33:32 +08:00
2 changed files with 76 additions and 30 deletions

core/management/commands/crawl_full_site.py (new file)

@@ -0,0 +1,20 @@
# core/management/commands/crawl_full_site.py
from django.core.management.base import BaseCommand
from core.models import Website
from core.utils import full_site_crawler


class Command(BaseCommand):
    help = "全站递归爬取 www.news.cn"

    def handle(self, *args, **kwargs):
        website, created = Website.objects.get_or_create(
            name="www.news.cn",
            defaults={
                'article_list_url': 'https://www.news.cn/',
                'article_selector': 'a'
            }
        )
        start_url = "https://www.news.cn/"
        self.stdout.write(f"开始全站爬取: {start_url}")
        full_site_crawler(start_url, website, max_pages=500)
        self.stdout.write("爬取完成")

core/utils.py

@@ -1,7 +1,9 @@
 # core/utils.py
 import os
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
+from collections import deque
 from django.utils import timezone
 from django.conf import settings
 from core.models import Article
@@ -27,11 +29,13 @@ def download_media(url, save_dir):
     with open(filepath, "wb") as f:
         f.write(resp.content)
-    # 返回相对路径,方便存数据库和展示
-    return os.path.relpath(filepath, settings.MEDIA_ROOT).replace("\\", "/")
+    return filepath

-def crawl_xinhua_article(url, website):
+def process_article(url, website):
+    if Article.objects.filter(url=url).exists():
+        print(f"文章已存在,跳过: {url}")
+        return
     headers = {"User-Agent": "Mozilla/5.0"}
     resp = requests.get(url, headers=headers)
     resp.encoding = 'utf-8'
@@ -42,7 +46,7 @@ def crawl_xinhua_article(url, website):
     content_tag = soup.find("span", id="detailContent")
     if not content_tag:
-        print(f"没有找到正文,跳过文章: {url}")
+        print("没有找到正文,跳过:", url)
         return
     imgs = content_tag.find_all("img")
@@ -56,22 +60,16 @@ def crawl_xinhua_article(url, website):
         src = img.get("src")
         if not src:
             continue
-        # 这里用文章URL作为基准拼接相对路径避免错误
         if not src.startswith("http"):
             src = urljoin(url, src)
-        local_rel_path = download_media(src, save_dir)
-        if local_rel_path:
-            img["src"] = settings.MEDIA_URL + local_rel_path
-            media_files.append(local_rel_path)
+        local_path = download_media(src, save_dir)
+        if local_path:
+            rel_path = os.path.relpath(local_path, settings.MEDIA_ROOT)
+            img["src"] = settings.MEDIA_URL + rel_path.replace("\\", "/")
+            media_files.append(rel_path.replace("\\", "/"))
     content_html = str(content_tag)
-    if Article.objects.filter(url=url).exists():
-        print(f"文章已存在,跳过: {url}")
-        return
     article = Article.objects.create(
         website=website,
         title=title,
@@ -82,22 +80,50 @@ def crawl_xinhua_article(url, website):
     )
     print(f"已保存文章及图片:{title}")

-def crawl_xinhua_list(list_url, website):
-    headers = {"User-Agent": "Mozilla/5.0"}
-    resp = requests.get(list_url, headers=headers)
-    resp.encoding = 'utf-8'
-    soup = BeautifulSoup(resp.text, "html.parser")
-    article_urls = set()
-    for link in soup.find_all("a", href=True):
-        href = link["href"]
-        if href.startswith("https://www.news.cn/legal/") and href.endswith("c.html"):
-            article_urls.add(href)
-    print(f"在列表页找到 {len(article_urls)} 篇文章链接")
-    for url in article_urls:
-        print("文章链接:", url)
-    from core.utils import crawl_xinhua_article
-    for article_url in article_urls:
-        crawl_xinhua_article(article_url, website)
+def is_valid_url(url, base_netloc):
+    try:
+        parsed = urlparse(url)
+        if parsed.scheme not in ("http", "https"):
+            return False
+        if parsed.netloc != base_netloc:
+            return False
+        return True
+    except Exception:
+        return False

+def full_site_crawler(start_url, website, max_pages=1000):
+    headers = {"User-Agent": "Mozilla/5.0"}
+    visited = set()
+    queue = deque([start_url])
+    base_netloc = urlparse(start_url).netloc
+    pages_crawled = 0
+    while queue and pages_crawled < max_pages:
+        url = queue.popleft()
+        if url in visited:
+            continue
+        print(f"正在爬取:{url}")
+        visited.add(url)
+        try:
+            resp = requests.get(url, headers=headers, timeout=15)
+            resp.raise_for_status()
+        except Exception as e:
+            print(f"请求失败:{url},错误:{e}")
+            continue
+        resp.encoding = 'utf-8'
+        soup = BeautifulSoup(resp.text, "html.parser")
+        # 如果是文章页面,则调用文章处理
+        if soup.find("span", id="detailContent"):
+            process_article(url, website)
+            pages_crawled += 1
+        # 扩展队列,发现新链接
+        for link in soup.find_all("a", href=True):
+            href = urljoin(url, link["href"])
+            if href not in visited and is_valid_url(href, base_netloc):
+                queue.append(href)
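
The new full_site_crawler is a breadth-first traversal: visited ensures each URL is fetched at most once, is_valid_url keeps the queue on the start domain, and pages_crawled counts only pages that contained a <span id="detailContent"> body, so pages used purely for link discovery do not consume the max_pages budget. A minimal sketch of exercising it by hand from `python manage.py shell`, reusing the same Website record the management command creates (the small max_pages value is illustrative, chosen for a quick test run):

# Illustrative sketch: a small manual run of the crawler from a Django shell.
from core.models import Website
from core.utils import full_site_crawler

website, _ = Website.objects.get_or_create(
    name="www.news.cn",
    defaults={"article_list_url": "https://www.news.cn/", "article_selector": "a"},
)

# Breadth-first crawl from the front page; pages without a
# <span id="detailContent"> body are only mined for further links.
full_site_crawler("https://www.news.cn/", website, max_pages=20)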