add multithreading && Xinhuanet

This commit is contained in:
2025-07-21 21:16:27 +08:00
parent c750d77eab
commit 61688f4bff
13 changed files with 264 additions and 75 deletions

View File

@@ -1,13 +1,7 @@
from django.contrib import admin
from .models import Article, SourceSite
from .models import Article
@admin.register(Article)
class ArticleAdmin(admin.ModelAdmin):
list_display = ('title', 'category', 'publish_date', 'source', 'is_verified')
list_filter = ('category', 'source', 'is_verified')
search_fields = ('title', 'content')
@admin.register(SourceSite)
class SourceSiteAdmin(admin.ModelAdmin):
list_display = ('name', 'url', 'is_active')
search_fields = ('name', 'url')
list_display = ('title', 'url', 'crawled')
search_fields = ('title',)

View File

@@ -1,29 +1,32 @@
# xinhua.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
import pytz # make sure pytz is installed; used for timezone handling
def parse_xinhua_article(url: str):
"""
Parse a Xinhuanet article page and return a dict with the title and body.
"""
headers = {
"User-Agent": "Mozilla/5.0"
}
BASE_URL = "https://www.news.cn/"
HEADERS = {
"User-Agent": "Mozilla/5.0"
}
def parse_xinhua_article(url: str, time_range_days: int = None):
"""
Parse a Xinhuanet article page and return a dict with the title and body.
If time_range_days is given, only keep articles published within that window.
"""
try:
resp = requests.get(url, headers=headers, timeout=10)
resp = requests.get(url, headers=HEADERS, timeout=10)
resp.encoding = 'utf-8'
except requests.RequestException as e:
print(f"❌ 请求失败:{e}")
print(f"❌ 请求失败:{e} URL: {url}")
return None
soup = BeautifulSoup(resp.text, "html.parser")
# Extract the title
title_tag = soup.find("span", class_="title")
title = title_tag.get_text(strip=True) if title_tag else "无标题"
# Extract the body text
content_tag = soup.find("span", id="detailContent")
if not content_tag:
print(f"❌ 没找到 detailContent: {url}")
@@ -32,26 +35,90 @@ def parse_xinhua_article(url: str):
paragraphs = content_tag.find_all("p")
content = "\n".join(p.get_text(strip=True) for p in paragraphs)
# Sanity check on the extracted content
if len(content.strip()) < 50:
print(f"⚠️ Content too short: {url}")
return None
# Extract the publish time (assumed format: YYYY-MM-DD)
publish_time_tag = soup.find("span", class_="publish-time")
if publish_time_tag:
publish_time_str = publish_time_tag.get_text(strip=True)
try:
publish_time = datetime.strptime(publish_time_str, "%Y-%m-%d").replace(tzinfo=pytz.utc)
except ValueError:
print(f"❌ 无法解析时间:{publish_time_str} URL: {url}")
return None
else:
print(f"❌ 页面未找到发布时间:{url}")
return None
# Enforce the time range, if one was given
if time_range_days is not None:
cutoff_time = datetime.now(pytz.utc) - timedelta(days=time_range_days)
if publish_time < cutoff_time:
print(f"⏰ 文章超出时间范围:{url}")
return None
return {
"url": url,
"title": title,
"content": content
"content": content,
"publish_time": publish_time
}
def crawl_xinhua_green():
def crawl_xinhua_green(time_range_days: int = None):
"""
Entry point for the Xinhuanet green-content crawler.
Crawl all Xinhuanet channels and their articles and save them to the database (multithreaded).
Optional time_range_days: only crawl articles published within the last N days.
"""
print("✅ 开始爬取新华网绿色内容...")
# 示例 URL实际应根据需求进行扩展
test_url = "https://www.xinhua.net.cn/example-article"
result = parse_xinhua_article(test_url)
if result:
print("✅ 爬取成功:", result['title'])
else:
print("❌ 爬取失败")
print("✅ 开始爬取新华网栏目列表...")
channels = get_channel_urls()
print(f"共找到 {len(channels)} 个频道")
all_articles = []
# Fetch each channel's article links concurrently
with ThreadPoolExecutor(max_workers=5) as executor:
future_to_channel = {
executor.submit(get_article_urls_from_channel, ch_url): ch_url
for ch_url in channels
}
for future in as_completed(future_to_channel):
ch_url = future_to_channel[future]
try:
articles = future.result()
print(f"\n➡️ 抓取频道:{ch_url}")
print(f" 该频道找到 {len(articles)} 篇文章")
# 并发解析每篇文章
with ThreadPoolExecutor(max_workers=5) as article_executor:
article_futures = {
article_executor.submit(parse_xinhua_article, art_url, time_range_days): art_url
for art_url in articles
}
for article_future in as_completed(article_futures):
article = article_future.result()
if article:
print(f" ✔️ 文章:{article['title']}")
# 更新或创建文章,并标记 crawled=True
Article.objects.update_or_create(
url=article['url'],
defaults={
'title': article['title'],
'content': article['content'],
'crawled': True # mark as crawled
}
)
all_articles.append(article)
else:
print(f" ❌ 文章解析失败:{article_futures[article_future]}")
except Exception as exc:
print(f"❌ 频道 {ch_url} 抓取时发生异常:{exc}")
print(f"\n✅ 爬取结束,共抓取文章 {len(all_articles)}")
return all_articles
if __name__ == "__main__":
crawl_xinhua_green(time_range_days=7) # example: only crawl articles from the last 7 days
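Note: crawl_xinhua_green() calls get_channel_urls() and get_article_urls_from_channel(), which are not shown in this diff. A minimal sketch of what such helpers could look like, assuming channel links are scraped from the www.news.cn homepage and article links are pages ending in .htm (both assumptions, not the project's actual selectors):

    def get_channel_urls():
        # Hypothetical helper: collect channel links from the homepage navigation.
        resp = requests.get(BASE_URL, headers=HEADERS, timeout=10)
        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, "html.parser")
        channels = set()
        for a in soup.find_all("a", href=True):
            full_url = urljoin(BASE_URL, a["href"])
            # Assumption: channel pages live under news.cn and are not the homepage itself.
            if full_url.startswith(BASE_URL) and full_url.rstrip("/") != BASE_URL.rstrip("/"):
                channels.add(full_url)
        return list(channels)

    def get_article_urls_from_channel(channel_url):
        # Hypothetical helper: collect article links (pages ending in .htm) from one channel page.
        try:
            resp = requests.get(channel_url, headers=HEADERS, timeout=10)
            resp.encoding = 'utf-8'
        except requests.RequestException:
            return []
        soup = BeautifulSoup(resp.text, "html.parser")
        return list({
            urljoin(channel_url, a["href"])
            for a in soup.find_all("a", href=True)
            if a["href"].endswith(".htm")
        })

One detail of the threading layout above: the inner ThreadPoolExecutor runs inside the as_completed loop, so articles are parsed five at a time for one channel at a time, while any remaining channel-list fetches continue in the outer pool.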

View File

@@ -1,10 +1,10 @@
# src/green_classroom/apps/collector/management/commands/crawl_xinhua.py
from django.core.management.base import BaseCommand
from green_classroom.apps.collector.crawler.xinhua import crawl_xinhua_green
class Command(BaseCommand):
help = "抓取新华网绿色发展相关资料"
help = '爬取新华网文章并保存到数据库'
def handle(self, *args, **kwargs):
crawl_xinhua_green()
self.stdout.write("开始爬取...")
articles = crawl_xinhua_green()
self.stdout.write(f"爬取完成,共抓取 {len(articles)} 篇文章")

View File

@@ -0,0 +1,39 @@
# Generated by Django 5.1 on 2025-07-21 12:52
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('collector', '0001_initial'),
]
operations = [
migrations.RenameField(
model_name='article',
old_name='is_verified',
new_name='crawled',
),
migrations.RemoveField(
model_name='article',
name='category',
),
migrations.RemoveField(
model_name='article',
name='created_at',
),
migrations.RemoveField(
model_name='article',
name='publish_date',
),
migrations.RemoveField(
model_name='article',
name='source',
),
migrations.AlterField(
model_name='article',
name='title',
field=models.CharField(max_length=255),
),
]
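This migration renames is_verified to crawled and drops category, created_at, publish_date, and source, so data in those columns is gone once it runs. Applying it is the standard step:

    python manage.py migrate collector          # apply the new migration
    python manage.py showmigrations collector   # optional: confirm it is marked as applied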

View File

@@ -10,21 +10,10 @@ class SourceSite(models.Model):
return self.name
class Article(models.Model):
CATEGORY_CHOICES = [
('政策', '政策'),
('案例', '案例'),
('新闻', '新闻'),
('科研', '科研'),
]
title = models.CharField(max_length=300)
url = models.URLField(unique=True)
publish_date = models.DateField(null=True, blank=True)
title = models.CharField(max_length=255)
content = models.TextField()
category = models.CharField(max_length=100, choices=CATEGORY_CHOICES)
source = models.ForeignKey(SourceSite, on_delete=models.SET_NULL, null=True)
created_at = models.DateTimeField(auto_now_add=True)
is_verified = models.BooleanField(default=False)
crawled = models.BooleanField(default=False) # make sure this field exists
def __str__(self):
return self.title
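Since url stays unique and crawled is now a plain boolean, quick sanity checks from manage.py shell look like this (a sketch with a made-up URL):

    from green_classroom.apps.collector.models import Article

    Article.objects.filter(crawled=True).count()                       # articles fetched so far
    Article.objects.filter(url="https://www.news.cn/x.htm").exists()   # cheap dedup check (hypothetical URL)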

View File

@@ -1,9 +1,10 @@
from django.urls import path
from . import views
from green_classroom.apps.collector import views
app_name = 'collector'
app_name = 'collector' # make sure the namespace is correct
urlpatterns = [
path('', views.article_list, name='article_list'),
path('<int:pk>/', views.article_detail, name='article_detail'),
path('delete_all_articles/', views.delete_all_articles, name='delete_all_articles'),
path('article/<int:article_id>/', views.article_detail, name='article_detail'),
path('articles/', views.list_articles, name='article_list'), # add this line
]
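The run_crawler view and its run_crawler.html template appear later in this commit, but no route for them shows up in either urls.py. A possible entry for this urlpatterns list (path and name are assumptions, not part of the diff):

    path('run_crawler/', views.run_crawler, name='run_crawler'),  # hypothetical route, not in this commit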

View File

@@ -1,20 +1,51 @@
from django.shortcuts import render, get_object_or_404, redirect  # redirect is used by delete_all_articles below
from .models import Article
from django.db.models import Q
from django.core.management import call_command
from green_classroom.apps.collector.models import Article
import os
from django.conf import settings
from django.template import TemplateDoesNotExist
def article_list(request):
query = request.GET.get('q', '')
articles = Article.objects.filter(is_verified=True)
def list_articles(request):
"""
View that lists all articles.
"""
articles = Article.objects.all()
return render(request, 'collector/article_list.html', {'articles': articles})
if query:
articles = articles.filter(Q(title__icontains=query) | Q(content__icontains=query))
def article_detail(request, article_id):
"""
Show the full content of a single article.
"""
article = get_object_or_404(Article, id=article_id)
return render(request, 'collector/article_detail.html', {'article': article})
return render(request, 'collector/article_list.html', {
'articles': articles.order_by('-publish_date')[:100]
})
def run_crawler(request):
result = []
if request.method == 'POST':
# Run the crawler command and capture its output
from io import StringIO
output = StringIO()
call_command('crawl_xinhua', stdout=output)
result.append(output.getvalue())
def article_detail(request, pk):
article = get_object_or_404(Article, pk=pk, is_verified=True)
return render(request, 'collector/article_detail.html', {
'article': article
})
# Debug: print the template path
template_path = os.path.join(settings.BASE_DIR, 'templates', 'collector', 'run_crawler.html')
print(f"🔍 Looking for template file: {template_path}")
# Debug: check whether the template exists
try:
with open(template_path, 'r', encoding='utf-8') as f:
print("✅ Template file exists")
except FileNotFoundError:
print("❌ Template file not found; check the path")
return render(request, 'collector/run_crawler.html', {'output': result})
def delete_all_articles(request):
"""
View that deletes all articles.
"""
if request.method == 'POST':
Article.objects.all().delete()
return redirect('collector:article_list')
return render(request, 'collector/delete_all_articles.html')
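Since call_command('crawl_xinhua') in run_crawler runs synchronously, the HTTP request blocks until the whole crawl finishes. A minimal sketch of pushing the work to a background thread instead (a hypothetical variant, not part of this commit; it drops the captured output and only reports that the crawl was started):

    import threading
    from django.core.management import call_command
    from django.shortcuts import render

    def run_crawler_async(request):
        # Hypothetical alternative to run_crawler: start the crawl in a daemon
        # thread so the response returns immediately instead of blocking.
        if request.method == 'POST':
            threading.Thread(
                target=call_command,
                args=('crawl_xinhua',),
                daemon=True,
            ).start()
            return render(request, 'collector/run_crawler.html',
                          {'output': ['Crawl started in the background']})
        return render(request, 'collector/run_crawler.html', {'output': []})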

View File

@@ -11,6 +11,7 @@ https://docs.djangoproject.com/en/5.1/ref/settings/
"""
from pathlib import Path
import os
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
@@ -55,7 +56,7 @@ ROOT_URLCONF = 'green_classroom.urls'
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [],
'DIRS': [os.path.join(BASE_DIR, 'templates')], # ✅ make sure this line is present
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
@@ -122,3 +123,6 @@ STATIC_URL = 'static/'
# https://docs.djangoproject.com/en/5.1/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
# Raise the maximum number of form fields so the admin can display large data sets
DATA_UPLOAD_MAX_NUMBER_FIELDS = 10240

View File

@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html>
<head>
<title>{{ article.title }}</title>
</head>
<body>
<h1>{{ article.title }}</h1>
<p>{{ article.content }}</p>
<a href="{% url 'collector:article_list' %}">返回文章列表</a>
</body>
</html>

View File

@@ -0,0 +1,19 @@
<!DOCTYPE html>
<html>
<head>
<title>Article List</title>
</head>
<body>
<h1>All Articles</h1>
<ul>
{% for article in articles %}
<li>
<a href="{% url 'collector:article_detail' article_id=article.id %}">
<strong>{{ article.title }}</strong>
</a><br>
{{ article.content|truncatewords:50 }}
</li>
{% endfor %}
</ul>
</body>
</html>

View File

@@ -0,0 +1,15 @@
<!DOCTYPE html>
<html>
<head>
<title>Delete All Articles</title>
</head>
<body>
<h1>Confirm deleting all articles</h1>
<p>Are you sure you want to delete all articles? This action cannot be undone.</p>
<form method="post">
{% csrf_token %}
<button type="submit">Delete all articles</button>
</form>
<a href="{% url 'collector:article_list' %}">Cancel</a>
</body>
</html>

View File

@@ -0,0 +1,17 @@
<!DOCTYPE html>
<html>
<head>
<title>Run Crawler</title>
</head>
<body>
<h1>Run Crawler</h1>
<form method="post">
{% csrf_token %}
<button type="submit">Start crawling</button>
</form>
{% if output %}
<h2>Output:</h2>
<pre>{{ output }}</pre>
{% endif %}
</body>
</html>

View File

@@ -1,7 +1,9 @@
from django.contrib import admin
from django.urls import path, include
from django.contrib import admin # newly added import (note: duplicates the import above)
from green_classroom.apps.collector import views
urlpatterns = [
path('admin/', admin.site.urls),
path('collector/', include('green_classroom.apps.collector.urls')),
path('collector/', include('green_classroom.apps.collector.urls', namespace='collector')),
path('articles/', views.list_articles, name='article_list'),
path('admin/', admin.site.urls), # add this line to restore access to the admin site
]
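Given the 'collector/' prefix and the namespaced include above, the named routes reverse as follows (a quick reference, assuming only the URL patterns shown in this commit):

    from django.urls import reverse

    reverse('collector:article_list')                               # -> '/collector/articles/'
    reverse('collector:article_detail', kwargs={'article_id': 1})   # -> '/collector/article/1/'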