add multithreading && Xinhuanet

This commit is contained in:
2025-07-21 21:16:27 +08:00
parent c750d77eab
commit 61688f4bff
13 changed files with 264 additions and 75 deletions

View File

@@ -1,13 +1,7 @@
from django.contrib import admin
from .models import Article, SourceSite
from .models import Article
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
list_display = ('title', 'category', 'publish_date', 'source', 'is_verified')
list_filter = ('category', 'source', 'is_verified')
search_fields = ('title', 'content')
@admin.register(SourceSite)
class SourceSiteAdmin(admin.ModelAdmin):
list_display = ('name', 'url', 'is_active')
search_fields = ('name', 'url')
list_display = ('title', 'url', 'crawled')
search_fields = ('title',)

View File

@@ -1,29 +1,32 @@
# xinhua.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
import pytz # make sure pytz is installed; used for timezone handling
def parse_xinhua_article(url: str):
"""
Parse a Xinhuanet article page and return a dict with the title and body.
"""
headers = {
"User-Agent": "Mozilla/5.0"
}
BASE_URL = "https://www.news.cn/"
HEADERS = {
"User-Agent": "Mozilla/5.0"
}
def parse_xinhua_article(url: str, time_range_days: int = None):
"""
Parse a Xinhuanet article page and return a dict with the title and body.
If time_range_days is given, only keep articles published within that window.
"""
try:
resp = requests.get(url, headers=headers, timeout=10)
resp = requests.get(url, headers=HEADERS, timeout=10)
resp.encoding = 'utf-8'
except requests.RequestException as e:
print(f"❌ 请求失败:{e}")
print(f"❌ 请求失败:{e} URL: {url}")
return None
soup = BeautifulSoup(resp.text, "html.parser")
# Extract the title
title_tag = soup.find("span", class_="title")
title = title_tag.get_text(strip=True) if title_tag else "无标题"
# Extract the body text
content_tag = soup.find("span", id="detailContent")
if not content_tag:
print(f"❌ 没找到 detailContent: {url}")
@@ -32,26 +35,90 @@ def parse_xinhua_article(url: str):
paragraphs = content_tag.find_all("p")
content = "\n".join(p.get_text(strip=True) for p in paragraphs)
# Sanity check on the extracted content
if len(content.strip()) < 50:
print(f"⚠️ Content too short: {url}")
return None
# Extract the publish time (assumed format: YYYY-MM-DD)
publish_time_tag = soup.find("span", class_="publish-time")
if publish_time_tag:
publish_time_str = publish_time_tag.get_text(strip=True)
try:
publish_time = datetime.strptime(publish_time_str, "%Y-%m-%d").replace(tzinfo=pytz.utc)
except ValueError:
print(f"❌ 无法解析时间:{publish_time_str} URL: {url}")
return None
else:
print(f"❌ 页面未找到发布时间:{url}")
return None
# Enforce the time range, if one was given
if time_range_days is not None:
cutoff_time = datetime.now(pytz.utc) - timedelta(days=time_range_days)
if publish_time < cutoff_time:
print(f"⏰ 文章超出时间范围:{url}")
return None
return {
"url": url,
"title": title,
"content": content
"content": content,
"publish_time": publish_time
}
def crawl_xinhua_green():
def crawl_xinhua_green(time_range_days: int = None):
"""
Entry point for the Xinhuanet green-content crawler.
Crawl all Xinhuanet channels and their articles and save them to the database (multithreaded).
Optional time_range_days: only crawl articles published within the last N days.
"""
print("✅ 开始爬取新华网绿色内容...")
# 示例 URL实际应根据需求进行扩展
test_url = "https://www.xinhua.net.cn/example-article"
result = parse_xinhua_article(test_url)
if result:
print("✅ 爬取成功:", result['title'])
else:
print("❌ 爬取失败")
print("✅ 开始爬取新华网栏目列表...")
channels = get_channel_urls()
print(f"共找到 {len(channels)} 个频道")
all_articles = []
# Fetch each channel's article links concurrently
with ThreadPoolExecutor(max_workers=5) as executor:
future_to_channel = {
executor.submit(get_article_urls_from_channel, ch_url): ch_url
for ch_url in channels
}
for future in as_completed(future_to_channel):
ch_url = future_to_channel[future]
try:
articles = future.result()
print(f"\n➡️ 抓取频道:{ch_url}")
print(f" 该频道找到 {len(articles)} 篇文章")
# 并发解析每篇文章
with ThreadPoolExecutor(max_workers=5) as article_executor:
article_futures = {
article_executor.submit(parse_xinhua_article, art_url, time_range_days): art_url
for art_url in articles
}
for article_future in as_completed(article_futures):
article = article_future.result()
if article:
print(f" ✔️ 文章:{article['title']}")
# 更新或创建文章,并标记 crawled=True
Article.objects.update_or_create(
url=article['url'],
defaults={
'title': article['title'],
'content': article['content'],
'crawled': True # mark as crawled
}
)
all_articles.append(article)
else:
print(f" ❌ 文章解析失败:{article_futures[article_future]}")
except Exception as exc:
print(f"❌ 频道 {ch_url} 抓取时发生异常:{exc}")
print(f"\n✅ 爬取结束,共抓取文章 {len(all_articles)}")
return all_articles
if __name__ == "__main__":
crawl_xinhua_green(time_range_days=7) # example: only crawl articles from the last 7 days
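Note: crawl_xinhua_green() calls get_channel_urls() and get_article_urls_from_channel(), which are not shown in this diff. A minimal sketch of what such helpers could look like, assuming channel links are scraped from the www.news.cn homepage and article links are pages ending in .htm (both assumptions, not the project's actual selectors):

    def get_channel_urls():
        # Hypothetical helper: collect channel links from the homepage navigation.
        resp = requests.get(BASE_URL, headers=HEADERS, timeout=10)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")
        channels = set()
        for a in soup.find_all("a", href=True):
            full_url = urljoin(BASE_URL, a["href"])
            # Assumption: channel pages live under news.cn and are not the homepage itself.
            if full_url.startswith(BASE_URL) and full_url.rstrip("/") != BASE_URL.rstrip("/"):
                channels.add(full_url)
        return list(channels)

    def get_article_urls_from_channel(channel_url):
        # Hypothetical helper: collect article links (pages ending in .htm) from one channel page.
        try:
            resp = requests.get(channel_url, headers=HEADERS, timeout=10)
            resp.encoding = 'utf-8'
        except requests.RequestException:
            return []
        soup = BeautifulSoup(resp.text, "html.parser")
        return list({
            urljoin(channel_url, a["href"])
            for a in soup.find_all("a", href=True)
            if a["href"].endswith(".htm")
        })

One detail of the threading layout above: the inner ThreadPoolExecutor runs inside the as_completed loop, so articles are parsed five at a time for one channel at a time, while any remaining channel-list fetches continue in the outer pool.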

View File

@@ -1,10 +1,10 @@
# src/green_classroom/apps/collector/management/commands/crawl_xinhua.py
from django.core.management.base import BaseCommand
from green_classroom.apps.collector.crawler.xinhua import crawl_xinhua_green
class Command(BaseCommand):
help = "抓取新华网绿色发展相关资料"
help = '爬取新华网文章并保存到数据库'
def handle(self, *args, **kwargs):
crawl_xinhua_green()
self.stdout.write("开始爬取...")
articles = crawl_xinhua_green()
self.stdout.write(f"爬取完成,共抓取 {len(articles)} 篇文章")

View File

@@ -0,0 +1,39 @@
# Generated by Django 5.1 on 2025-07-21 12:52
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('collector', '0001_initial'),
]
operations = [
migrations.RenameField(
model_name='article',
old_name='is_verified',
new_name='crawled',
),
migrations.RemoveField(
model_name='article',
name='category',
),
migrations.RemoveField(
model_name='article',
name='created_at',
),
migrations.RemoveField(
model_name='article',
name='publish_date',
),
migrations.RemoveField(
model_name='article',
name='source',
),
migrations.AlterField(
model_name='article',
name='title',
field=models.CharField(max_length=255),
),
]
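This migration renames is_verified to crawled and drops category, created_at, publish_date, and source, so data in those columns is gone once it runs. Applying it is the standard step:

    python manage.py migrate collector          # apply the new migration
    python manage.py showmigrations collector   # optional: confirm it is marked as applied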

View File

@@ -10,21 +10,10 @@ class SourceSite(models.Model):
return self.name
class Article(models.Model):
CATEGORY_CHOICES = [
('政策', '政策'),
('案例', '案例'),
('新闻', '新闻'),
('科研', '科研'),
]
title = models.CharField(max_length=300)
url = models.URLField(unique=True)
publish_date = models.DateField(null=True, blank=True)
title = models.CharField(max_length=255)
content = models.TextField()
category = models.CharField(max_length=100, choices=CATEGORY_CHOICES)
source = models.ForeignKey(SourceSite, on_delete=models.SET_NULL, null=True)
created_at = models.DateTimeField(auto_now_add=True)
is_verified = models.BooleanField(default=False)
crawled = models.BooleanField(default=False) # make sure this field exists
def __str__(self):
return self.title
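Since url stays unique and crawled is now a plain boolean, quick sanity checks from manage.py shell look like this (a sketch with a made-up URL):

    from green_classroom.apps.collector.models import Article

    Article.objects.filter(crawled=True).count()                       # articles fetched so far
    Article.objects.filter(url="https://www.news.cn/x.htm").exists()   # cheap dedup check (hypothetical URL)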

View File

@@ -1,9 +1,10 @@
from django.urls import path
from . import views
from green_classroom.apps.collector import views
app_name = 'collector'
app_name = 'collector' # make sure the namespace is correct
urlpatterns = [
path('', views.article_list, name='article_list'),
path('<int:pk>/', views.article_detail, name='article_detail'),
path('delete_all_articles/', views.delete_all_articles, name='delete_all_articles'),
path('article/<int:article_id>/', views.article_detail, name='article_detail'),
path('articles/', views.list_articles, name='article_list'), # add this line
]
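The run_crawler view and its run_crawler.html template appear later in this commit, but no route for them shows up in either urls.py. A possible entry for this urlpatterns list (path and name are assumptions, not part of the diff):

    path('run_crawler/', views.run_crawler, name='run_crawler'),  # hypothetical route, not in this commit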

View File

@@ -1,20 +1,51 @@
from django.shortcuts import render, get_object_or_404, redirect  # redirect is used by delete_all_articles below
from .models import Article
from django.db.models import Q
from django.core.management import call_command
from green_classroom.apps.collector.models import Article
import os
from django.conf import settings
from django.template import TemplateDoesNotExist
def article_list(request):
query = request.GET.get('q', '')
articles = Article.objects.filter(is_verified=True)
def list_articles(request):
"""
View that lists all articles.
"""
articles = Article.objects.all()
return render(request, 'collector/article_list.html', {'articles': articles})
if query:
articles = articles.filter(Q(title__icontains=query) | Q(content__icontains=query))
def article_detail(request, article_id):
"""
Show the full content of a single article.
"""
article = get_object_or_404(Article, id=article_id)
return render(request, 'collector/article_detail.html', {'article': article})
return render(request, 'collector/article_list.html', {
'articles': articles.order_by('-publish_date')[:100]
})
def run_crawler(request):
result = []
if request.method == 'POST':
# Run the crawler command and capture its output
from io import StringIO
output = StringIO()
call_command('crawl_xinhua', stdout=output)
result.append(output.getvalue())
def article_detail(request, pk):
article = get_object_or_404(Article, pk=pk, is_verified=True)
return render(request, 'collector/article_detail.html', {
'article': article
})
# Debug: print the template path
template_path = os.path.join(settings.BASE_DIR, 'templates', 'collector', 'run_crawler.html')
print(f"🔍 Looking for template file: {template_path}")
# Debug: check whether the template exists
try:
with open(template_path, 'r', encoding='utf-8') as f:
print("✅ Template file exists")
except FileNotFoundError:
print("❌ Template file not found; check the path")
return render(request, 'collector/run_crawler.html', {'output': result})
def delete_all_articles(request):
"""
View that deletes all articles.
"""
if request.method == 'POST':
Article.objects.all().delete()
return redirect('collector:article_list')
return render(request, 'collector/delete_all_articles.html')
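Since call_command('crawl_xinhua') in run_crawler runs synchronously, the HTTP request blocks until the whole crawl finishes. A minimal sketch of pushing the work to a background thread instead (a hypothetical variant, not part of this commit; it drops the captured output and only reports that the crawl was started):

    import threading
    from django.core.management import call_command
    from django.shortcuts import render

    def run_crawler_async(request):
        # Hypothetical alternative to run_crawler: start the crawl in a daemon
        # thread so the response returns immediately instead of blocking.
        if request.method == 'POST':
            threading.Thread(
                target=call_command,
                args=('crawl_xinhua',),
                daemon=True,
            ).start()
            return render(request, 'collector/run_crawler.html',
                          {'output': ['Crawl started in the background']})
        return render(request, 'collector/run_crawler.html', {'output': []})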

View File

@@ -11,6 +11,7 @@ https://docs.djangoproject.com/en/5.1/ref/settings/
"""
from pathlib import Path
import os
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
@@ -55,7 +56,7 @@ ROOT_URLCONF = 'green_classroom.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [],
'DIRS': [os.path.join(BASE_DIR, 'templates')], # ✅ make sure this line is present
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
@@ -122,3 +123,6 @@ STATIC_URL = 'static/'
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
# Raise the maximum number of form fields so the admin can display large data sets
DATA_UPLOAD_MAX_NUMBER_FIELDS = 10240

View File

@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
<title>{{ article.title }}</title>
</head>
<body>
<h1>{{ article.title }}</h1>
<p>{{ article.content }}</p>
<a href="{% url 'collector:article_list' %}">返回文章列表</a>
</body>
</html>

View File

@@ -0,0 +1,19 @@
<!DOCTYPE html>
<html>
<head>
<title>Article List</title>
</head>
<body>
<h1>All Articles</h1>
<ul>
{% for article in articles %}
<li>
<a href="{% url 'collector:article_detail' article_id=article.id %}">
<strong>{{ article.title }}</strong>
</a><br>
{{ article.content|truncatewords:50 }}
</li>
{% endfor %}
</ul>
</body>
</html>

View File

@@ -0,0 +1,15 @@
<!DOCTYPE html>
<html>
<head>
<title>Delete All Articles</title>
</head>
<body>
<h1>Confirm deleting all articles</h1>
<p>Are you sure you want to delete all articles? This action cannot be undone.</p>
<form method="post">
{% csrf_token %}
<button type="submit">Delete all articles</button>
</form>
<a href="{% url 'collector:article_list' %}">Cancel</a>
</body>
</html>

View File

@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html>
<head>
<title>Run Crawler</title>
</head>
<body>
<h1>Run Crawler</h1>
<form method="post">
{% csrf_token %}
<button type="submit">Start crawling</button>
</form>
{% if output %}
<h2>Output:</h2>
<pre>{{ output }}</pre>
{% endif %}
</body>
</html>

View File

@@ -1,7 +1,9 @@
from django.contrib import admin
from django.urls import path, include
from django.contrib import admin # newly added import (note: duplicates the import above)
from green_classroom.apps.collector import views
urlpatterns = [
path('admin/', admin.site.urls),
path('collector/', include('green_classroom.apps.collector.urls')),
path('collector/', include('green_classroom.apps.collector.urls', namespace='collector')),
path('articles/', views.list_articles, name='article_list'),
path('admin/', admin.site.urls), # add this line to restore access to the admin site
]
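Given the 'collector/' prefix and the namespaced include above, the named routes reverse as follows (a quick reference, assuming only the URL patterns shown in this commit):

    from django.urls import reverse

    reverse('collector:article_list')                               # -> '/collector/articles/'
    reverse('collector:article_detail', kwargs={'article_id': 1})   # -> '/collector/article/1/'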