diff --git a/src/green_classroom/apps/collector/crawler/xinhua.py b/src/green_classroom/apps/collector/crawler/xinhua.py
index dcd9814..37e6254 100644
--- a/src/green_classroom/apps/collector/crawler/xinhua.py
+++ b/src/green_classroom/apps/collector/crawler/xinhua.py
@@ -3,13 +3,27 @@
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timedelta
-import pytz  # make sure pytz is installed; used for timezone handling
+import pytz
+import logging
+import sys
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(levelname)s] %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+
+logger = logging.getLogger(__name__)
 
 BASE_URL = "https://www.news.cn/"
 HEADERS = {
     "User-Agent": "Mozilla/5.0"
 }
 
+
 def parse_xinhua_article(url: str, time_range_days: int = None):
     """
     Parse a Xinhua article page; return a dict with the title and body text.
@@ -19,7 +33,7 @@ def parse_xinhua_article(url: str, time_range_days: int = None):
         resp = requests.get(url, headers=HEADERS, timeout=10)
         resp.encoding = 'utf-8'
     except requests.RequestException as e:
-        print(f"❌ 请求失败:{e} URL: {url}")
+        logger.error(f"❌ 请求失败:{e} URL: {url}")
         return None
 
     soup = BeautifulSoup(resp.text, "html.parser")
@@ -29,14 +43,14 @@ def parse_xinhua_article(url: str, time_range_days: int = None):
     content_tag = soup.find("span", id="detailContent")
     if not content_tag:
-        print(f"❌ 没找到 detailContent: {url}")
+        logger.error(f"❌ 没找到 detailContent: {url}")
         return None
 
     paragraphs = content_tag.find_all("p")
     content = "\n".join(p.get_text(strip=True) for p in paragraphs)
 
     if len(content.strip()) < 50:
-        print(f"⚠️ 内容过短:{url}")
+        logger.warning(f"⚠️ 内容过短:{url}")
        return None
 
     # Extract the publish time (assumed format: YYYY-MM-DD)
@@ -46,18 +60,19 @@ def parse_xinhua_article(url: str, time_range_days: int = None):
         try:
             publish_time = datetime.strptime(publish_time_str, "%Y-%m-%d").replace(tzinfo=pytz.utc)
         except ValueError:
-            print(f"❌ 无法解析时间:{publish_time_str} URL: {url}")
+            logger.error(f"❌ 无法解析时间:{publish_time_str} URL: {url}")
             return None
     else:
-        print(f"❌ 页面未找到发布时间:{url}")
+        logger.error(f"❌ 页面未找到发布时间:{url}")
         return None
 
-    # Check the time range
-    if time_range_days is not None:
-        cutoff_time = datetime.now(pytz.utc) - timedelta(days=time_range_days)
-        if publish_time < cutoff_time:
-            print(f"⏰ 文章超出时间范围:{url}")
-            return None
+    # Make sure title and content are valid UTF-8 strings
+    try:
+        title = title.encode('utf-8').decode('utf-8')
+        content = content.encode('utf-8').decode('utf-8')
+    except UnicodeDecodeError:
+        logger.error(f"❌ 字符编码错误,无法正确解码标题或内容:{url}")
+        return None
 
     return {
         "url": url,
@@ -66,30 +81,31 @@ def parse_xinhua_article(url: str, time_range_days: int = None):
         "publish_time": publish_time
     }
 
+
 def crawl_xinhua_green(time_range_days: int = None):
     """
     Crawl all Xinhua channels and their articles and save them to the database (multithreaded).
     Optional: time_range_days (only crawl articles published within the last N days).
     """
-    print("✅ 开始爬取新华网栏目列表...")
+    logger.info("✅ 开始爬取新华网栏目列表...")
     channels = get_channel_urls()
-    print(f"共找到 {len(channels)} 个频道")
+    logger.info(f"共找到 {len(channels)} 个频道")
 
     all_articles = []
-
+
     # Fetch the article links of each channel concurrently
     with ThreadPoolExecutor(max_workers=5) as executor:
         future_to_channel = {
             executor.submit(get_article_urls_from_channel, ch_url): ch_url
             for ch_url in channels
         }
-
+
         for future in as_completed(future_to_channel):
             ch_url = future_to_channel[future]
             try:
                 articles = future.result()
-                print(f"\n➡️ 抓取频道:{ch_url}")
-                print(f"   该频道找到 {len(articles)} 篇文章")
+                logger.info(f"\n➡️ 抓取频道:{ch_url}")
+                logger.info(f"   该频道找到 {len(articles)} 篇文章")
 
                 # Parse each article concurrently
                 with ThreadPoolExecutor(max_workers=5) as article_executor:
@@ -101,7 +117,7 @@ def crawl_xinhua_green(time_range_days: int = None):
                     for article_future in as_completed(article_futures):
                         article = article_future.result()
                         if article:
-                            print(f"   ✔️ 文章:{article['title']}")
+                            logger.info(f"   ✔️ 文章:{article['title']}")
                             # Update or create the article and mark it crawled=True
                             Article.objects.update_or_create(
                                 url=article['url'],
@@ -113,12 +129,44 @@ def crawl_xinhua_green(time_range_days: int = None):
                             )
                             all_articles.append(article)
                         else:
-                            print(f"   ❌ 文章解析失败:{article_futures[article_future]}")
+                            logger.error(f"   ❌ 文章解析失败:{article_futures[article_future]}")
             except Exception as exc:
-                print(f"❌ 频道 {ch_url} 抓取时发生异常:{exc}")
+                logger.error(f"❌ 频道 {ch_url} 抓取时发生异常:{exc}")
 
-    print(f"\n✅ 爬取结束,共抓取文章 {len(all_articles)} 篇")
+    logger.info(f"\n✅ 爬取结束,共抓取文章 {len(all_articles)} 篇")
     return all_articles
 
+
 if __name__ == "__main__":
+    logger.info("开始爬取...")
     crawl_xinhua_green(time_range_days=7)  # Example: only crawl articles from the last 7 days
+    logger.info("爬取完成")
+
+
+from django.http import JsonResponse
+
+
+def export_articles(request):
+    """
+    Export all crawled articles as JSON.
+    Can be triggered by visiting /export/xinhua-article/
+    """
+    time_range_days = request.GET.get('time_range_days', None)
+    if time_range_days is not None:
+        try:
+            time_range_days = int(time_range_days)
+        except ValueError:
+            logger.error("❌ 无效的时间范围参数")
+            return JsonResponse({"error": "无效的时间范围参数"}, status=400)
+
+    logger.info(f"开始导出文章,时间范围:{time_range_days} 天")
+    # Fetch the article data
+    articles = crawl_xinhua_green(time_range_days=time_range_days)
+
+    logger.info(f"成功导出 {len(articles)} 篇文章")
+    # Return a JSON response and keep Chinese text unescaped
+    return JsonResponse(
+        articles,
+        safe=False,
+        json_dumps_params={'ensure_ascii': False, 'separators': (',', ':')}  # fewer separators; plays better with Safari
+    )
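Note on the change above: the cutoff check was removed from parse_xinhua_article, yet time_range_days is still accepted there and still passed down from crawl_xinhua_green and export_articles, so the parameter currently filters nothing. A minimal sketch of re-applying the same cutoff on the caller side; the helper name and its placement are free choices, not part of this diff:

from datetime import datetime, timedelta
import pytz

def within_time_range(article, time_range_days):
    """Hypothetical helper: True if the parsed article falls inside the requested window."""
    if time_range_days is None:
        return True
    cutoff = datetime.now(pytz.utc) - timedelta(days=time_range_days)
    return article["publish_time"] >= cutoff

# Possible use inside crawl_xinhua_green's article loop:
#     if article and within_time_range(article, time_range_days):
#         ...update_or_create and append as before...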
diff --git a/src/green_classroom/apps/collector/migrations/0003_article_publish_time.py b/src/green_classroom/apps/collector/migrations/0003_article_publish_time.py
new file mode 100644
index 0000000..01f944b
--- /dev/null
+++ b/src/green_classroom/apps/collector/migrations/0003_article_publish_time.py
@@ -0,0 +1,18 @@
+# Generated by Django 5.1 on 2025-07-22 07:53
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('collector', '0002_rename_is_verified_article_crawled_and_more'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='article',
+            name='publish_time',
+            field=models.DateTimeField(blank=True, null=True),
+        ),
+    ]
diff --git a/src/green_classroom/apps/collector/models.py b/src/green_classroom/apps/collector/models.py
index 627b79d..d46f57d 100644
--- a/src/green_classroom/apps/collector/models.py
+++ b/src/green_classroom/apps/collector/models.py
@@ -14,6 +14,7 @@ class Article(models.Model):
     title = models.CharField(max_length=255)
     content = models.TextField()
     crawled = models.BooleanField(default=False)  # make sure this field exists
+    publish_time = models.DateTimeField(null=True, blank=True)  # new publish_time field
 
     def __str__(self):
         return self.title
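The new nullable publish_time column is what makes date-based selection possible at the database level. A minimal query sketch for pulling articles from the last N days; recent_articles is an invented helper name, and the sketch assumes USE_TZ is enabled (Django's default):

from datetime import timedelta
from django.utils import timezone
from green_classroom.apps.collector.models import Article

def recent_articles(days=7):
    """Hypothetical helper: articles published within the last `days` days, newest first."""
    cutoff = timezone.now() - timedelta(days=days)
    return (Article.objects
            .filter(publish_time__isnull=False, publish_time__gte=cutoff)
            .order_by('-publish_time'))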
diff --git a/src/green_classroom/apps/collector/templates/collector/article_list.html b/src/green_classroom/apps/collector/templates/collector/article_list.html
index b41bd1f..e2c24d6 100644
--- a/src/green_classroom/apps/collector/templates/collector/article_list.html
+++ b/src/green_classroom/apps/collector/templates/collector/article_list.html
@@ -20,5 +20,42 @@
         暂无内容
     {% endfor %}
+    [roughly 37 added template lines (export button and script markup); the HTML tags did not survive extraction]
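export_articles (added to xinhua.py above) documents /export/xinhua-article/ as its trigger URL, but no urlpatterns entry for it appears anywhere in this diff. A minimal wiring sketch under that assumption; the route name and the choice of URLconf are placeholders, only the view import path and the URL from the docstring come from the diff:

from django.urls import path
from green_classroom.apps.collector.crawler.xinhua import export_articles

urlpatterns = [
    # Hypothetical route: the path mirrors the view's docstring; the name is invented here.
    path('export/xinhua-article/', export_articles, name='export_xinhua_articles'),
]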
diff --git a/src/green_classroom/apps/collector/urls.py b/src/green_classroom/apps/collector/urls.py
index 2b26fea..74a8da9 100644
--- a/src/green_classroom/apps/collector/urls.py
+++ b/src/green_classroom/apps/collector/urls.py
@@ -1,10 +1,8 @@
-from django.urls import path
-from green_classroom.apps.collector import views
+app_name = 'collector'  # defines the URL namespace
-app_name = 'collector'  # keep the namespace correct
+from django.urls import path
+from . import views
 
 urlpatterns = [
-    path('delete_all_articles/', views.delete_all_articles, name='delete_all_articles'),
-    path('article/<int:article_id>/', views.article_detail, name='article_detail'),
-    path('articles/', views.list_articles, name='article_list'),  # added for the list view
-]
\ No newline at end of file
+    path('article/<int:pk>/', views.article_detail, name='article_detail'),
+]
diff --git a/src/green_classroom/apps/collector/views.py b/src/green_classroom/apps/collector/views.py
index 1e03f69..45b2688 100644
--- a/src/green_classroom/apps/collector/views.py
+++ b/src/green_classroom/apps/collector/views.py
@@ -1,3 +1,29 @@
+import logging
+import sys
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(levelname)s] %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+
+logger = logging.getLogger(__name__)
+
+
+from django.shortcuts import render
+from django.http import HttpResponse
+from django.http import JsonResponse
+from .models import Article  # assumes Article is the project's model
+
+def index(request):
+    """
+    Home view: show a welcome message or the article list.
+    """
+    return HttpResponse("欢迎来到绿色课堂资料库!")
+
 from django.shortcuts import render, get_object_or_404
 from django.core.management import call_command
 from green_classroom.apps.collector.models import Article
@@ -12,12 +38,25 @@ def list_articles(request):
     articles = Article.objects.all()
     return render(request, 'collector/article_list.html', {'articles': articles})
 
-def article_detail(request, article_id):
+def article_detail(request, pk):
     """
-    Render the detail page for a single article.
+    Return the article identified by primary key pk.
     """
-    article = get_object_or_404(Article, id=article_id)
-    return render(request, 'collector/article_detail.html', {'article': article})
+    try:
+        article = Article.objects.get(pk=pk)
+        logger.info(f"返回文章标题: {article.title}")  # log the title to confirm it decodes correctly
+        logger.info(f"返回文章内容: {article.content[:100]}...")  # log the first 100 characters of the content
+        data = {
+            'id': article.id,
+            'url': article.url,
+            'title': str(article.title),  # force to str to avoid potential encoding issues
+            'content': str(article.content),  # force to str
+            'publish_time': article.publish_time.isoformat() if article.publish_time else None,  # serialize the timestamp
+            'crawled': article.crawled
+        }
+        return JsonResponse(data, json_dumps_params={'ensure_ascii': False})
+    except Article.DoesNotExist:
+        return JsonResponse({'error': '文章不存在'}, status=404)
 
 def run_crawler(request):
     result = []
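With this change article_detail answers with JSON instead of rendering collector/article_detail.html, and the route argument changes from article_id to pk. A minimal test sketch of the new contract, assuming the namespaced collector:article_detail route shown above; the test class and sample values are invented for illustration:

from django.test import TestCase
from django.urls import reverse
from green_classroom.apps.collector.models import Article

class ArticleDetailJsonTests(TestCase):
    def test_detail_returns_json(self):
        # Create a minimal article; publish_time stays NULL, which the view maps to None.
        article = Article.objects.create(
            url="https://www.news.cn/example.html",
            title="示例标题",
            content="示例正文",
        )
        resp = self.client.get(reverse('collector:article_detail', args=[article.pk]))
        self.assertEqual(resp.status_code, 200)
        self.assertEqual(resp.json()["title"], "示例标题")

    def test_missing_article_returns_404(self):
        resp = self.client.get(reverse('collector:article_detail', args=[999999]))
        self.assertEqual(resp.status_code, 404)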
diff --git a/src/green_classroom/settings.py b/src/green_classroom/settings.py
index 4836e01..8ce4bf8 100644
--- a/src/green_classroom/settings.py
+++ b/src/green_classroom/settings.py
@@ -25,8 +25,9 @@ SECRET_KEY = 'django-insecure-mi#9dyl0zwanl2=uziz3om_t**ovk08+pg127^+=5m=s6^+(@b
 
 # SECURITY WARNING: don't run with debug turned on in production!
 DEBUG = True
+#DEBUG = False
 
-ALLOWED_HOSTS = []
+ALLOWED_HOSTS = ['*',]
 
 # Application definition
@@ -126,3 +127,14 @@ DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
 
 # Raise the maximum field count so the admin can render large datasets
 DATA_UPLOAD_MAX_NUMBER_FIELDS = 10240
+
+
+import sentry_sdk
+
+sentry_sdk.init(
+    dsn="https://a976759c113a1e07050c61fb4dfe16bc@sentry.yuangyaa.com/2",
+    # Add data like request headers and IP for users,
+    # see https://docs.sentry.io/platforms/python/data-management/data-collected/ for more info
+    send_default_pii=True,
+    environment="staging",  # or whatever environment name you configured
+)
diff --git a/src/green_classroom/urls.py b/src/green_classroom/urls.py
index b5b4007..4bf95ba 100644
--- a/src/green_classroom/urls.py
+++ b/src/green_classroom/urls.py
@@ -6,4 +6,5 @@ urlpatterns = [
     path('collector/', include('green_classroom.apps.collector.urls', namespace='collector')),
     path('articles/', views.list_articles, name='article_list'),
     path('admin/', admin.site.urls),  # restore access to the admin site
-]
\ No newline at end of file
+]
+
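The settings change leaves DEBUG = True alongside ALLOWED_HOSTS = ['*'] and a Sentry DSN hard-coded in source. A minimal hardening sketch, not part of this diff, that drives these from the environment instead; the environment variable names are free choices:

import os
import sentry_sdk

# Hypothetical environment-driven configuration for settings.py.
DEBUG = os.environ.get("DJANGO_DEBUG", "false").lower() == "true"
ALLOWED_HOSTS = [h for h in os.environ.get("DJANGO_ALLOWED_HOSTS", "").split(",") if h]

SENTRY_DSN = os.environ.get("SENTRY_DSN", "")
if SENTRY_DSN:
    sentry_sdk.init(
        dsn=SENTRY_DSN,
        send_default_pii=True,
        environment=os.environ.get("SENTRY_ENVIRONMENT", "staging"),
    )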