diff --git a/src/green_classroom/apps/collector/crawler/xinhua.py b/src/green_classroom/apps/collector/crawler/xinhua.py
index dcd9814..37e6254 100644
--- a/src/green_classroom/apps/collector/crawler/xinhua.py
+++ b/src/green_classroom/apps/collector/crawler/xinhua.py
@@ -3,13 +3,27 @@
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime, timedelta
-import pytz # 确保已安装 pytz 库,用于处理时区
+import pytz
+import logging
+import sys
+
+# 配置logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s [%(levelname)s] %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout)
+    ]
+)
+
+logger = logging.getLogger(__name__)
 
 BASE_URL = "https://www.news.cn/"
 HEADERS = {
     "User-Agent": "Mozilla/5.0"
 }
+
 def parse_xinhua_article(url: str, time_range_days: int = None):
     """
     解析新华网文章页,返回 dict 包含标题与正文。
@@ -19,7 +33,7 @@ def parse_xinhua_article(url: str, time_range_days: int = None):
         resp = requests.get(url, headers=HEADERS, timeout=10)
         resp.encoding = 'utf-8'
     except requests.RequestException as e:
-        print(f"❌ 请求失败:{e} URL: {url}")
+        logger.error(f"❌ 请求失败:{e} URL: {url}")
         return None
 
     soup = BeautifulSoup(resp.text, "html.parser")
@@ -29,14 +43,14 @@ def parse_xinhua_article(url: str, time_range_days: int = None):
     content_tag = soup.find("span", id="detailContent")
     if not content_tag:
-        print(f"❌ 没找到 detailContent: {url}")
+        logger.error(f"❌ 没找到 detailContent: {url}")
         return None
 
     paragraphs = content_tag.find_all("p")
     content = "\n".join(p.get_text(strip=True) for p in paragraphs)
 
     if len(content.strip()) < 50:
-        print(f"⚠️ 内容过短:{url}")
+        logger.warning(f"⚠️ 内容过短:{url}")
         return None
 
     # 提取发布时间(假设格式为 YYYY-MM-DD)
@@ -46,18 +60,19 @@ def parse_xinhua_article(url: str, time_range_days: int = None):
         try:
             publish_time = datetime.strptime(publish_time_str, "%Y-%m-%d").replace(tzinfo=pytz.utc)
         except ValueError:
-            print(f"❌ 无法解析时间:{publish_time_str} URL: {url}")
+            logger.error(f"❌ 无法解析时间:{publish_time_str} URL: {url}")
             return None
     else:
-        print(f"❌ 页面未找到发布时间:{url}")
+        logger.error(f"❌ 页面未找到发布时间:{url}")
        return None
 
-    # 检查时间范围
-    if time_range_days is not None:
-        cutoff_time = datetime.now(pytz.utc) - timedelta(days=time_range_days)
-        if publish_time < cutoff_time:
-            print(f"⏰ 文章超出时间范围:{url}")
-            return None
+    # 确保 title 和 content 为 UTF-8 编码的字符串
+    try:
+        title = title.encode('utf-8').decode('utf-8')
+        content = content.encode('utf-8').decode('utf-8')
+    except UnicodeError:
+        logger.error(f"❌ 字符编码错误,无法正确解码标题或内容:{url}")
+        return None
 
     return {
         "url": url,
@@ -66,30 +81,31 @@ def parse_xinhua_article(url: str, time_range_days: int = None):
         "publish_time": publish_time
     }
 
+
 def crawl_xinhua_green(time_range_days: int = None):
     """
     爬取新华网所有频道及其文章,并保存到数据库(支持多线程)。
     可选参数:time_range_days(仅爬取最近指定天数内的文章)
     """
-    print("✅ 开始爬取新华网栏目列表...")
+    logger.info("✅ 开始爬取新华网栏目列表...")
     channels = get_channel_urls()
-    print(f"共找到 {len(channels)} 个频道")
+    logger.info(f"共找到 {len(channels)} 个频道")
 
     all_articles = []
-    
+
     # 并发抓取每个频道的文章链接
     with ThreadPoolExecutor(max_workers=5) as executor:
         future_to_channel = {
             executor.submit(get_article_urls_from_channel, ch_url): ch_url for ch_url in channels
         }
-    
+
         for future in as_completed(future_to_channel):
             ch_url = future_to_channel[future]
             try:
                 articles = future.result()
-                print(f"\n➡️ 抓取频道:{ch_url}")
-                print(f" 该频道找到 {len(articles)} 篇文章")
+                logger.info(f"\n➡️ 抓取频道:{ch_url}")
+                logger.info(f" 该频道找到 {len(articles)} 篇文章")
 
                 # 并发解析每篇文章
                 with ThreadPoolExecutor(max_workers=5) as article_executor:
@@ -101,7 +117,7 @@ def crawl_xinhua_green(time_range_days: int = None):
                     for article_future in as_completed(article_futures):
                         article = article_future.result()
                         if article:
-                            print(f" ✔️ 文章:{article['title']}")
+                            logger.info(f" ✔️ 文章:{article['title']}")
                             # 更新或创建文章,并标记 crawled=True
                             Article.objects.update_or_create(
                                 url=article['url'],
@@ -113,12 +129,44 @@
                             )
                             all_articles.append(article)
                         else:
-                            print(f" ❌ 文章解析失败:{article_futures[article_future]}")
+                            logger.error(f" ❌ 文章解析失败:{article_futures[article_future]}")
             except Exception as exc:
-                print(f"❌ 频道 {ch_url} 抓取时发生异常:{exc}")
+                logger.error(f"❌ 频道 {ch_url} 抓取时发生异常:{exc}")
 
-    print(f"\n✅ 爬取结束,共抓取文章 {len(all_articles)} 篇")
+    logger.info(f"\n✅ 爬取结束,共抓取文章 {len(all_articles)} 篇")
     return all_articles
 
+
 if __name__ == "__main__":
+    logger.info("开始爬取...")
     crawl_xinhua_green(time_range_days=7) # 示例:仅爬取最近 7 天的文章
+    logger.info("爬取完成")
+
+
+from django.http import JsonResponse
+
+
+def export_articles(request):
+    """
+    导出所有爬取的文章为 JSON 格式。
+    可通过访问 /export/xinhua-article/ 触发
+    """
+    time_range_days = request.GET.get('time_range_days', None)
+    if time_range_days is not None:
+        try:
+            time_range_days = int(time_range_days)
+        except ValueError:
+            logger.error("❌ 无效的时间范围参数")
+            return JsonResponse({"error": "无效的时间范围参数"}, status=400)
+
+    logger.info(f"开始导出文章,时间范围:{time_range_days} 天")
+    # 获取文章数据
+    articles = crawl_xinhua_green(time_range_days=time_range_days)
+
+    logger.info(f"成功导出 {len(articles)} 篇文章")
+    # 返回 JSON 响应,并确保中文不被转义
+    return JsonResponse(
+        articles,
+        safe=False,
+        json_dumps_params={'ensure_ascii': False, 'separators': (',', ':')} # 减少空格,更兼容 Safari
+    )
diff --git a/src/green_classroom/apps/collector/migrations/0003_article_publish_time.py b/src/green_classroom/apps/collector/migrations/0003_article_publish_time.py
new file mode 100644
index 0000000..01f944b
--- /dev/null
+++ b/src/green_classroom/apps/collector/migrations/0003_article_publish_time.py
@@ -0,0 +1,18 @@
+# Generated by Django 5.1 on 2025-07-22 07:53
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('collector', '0002_rename_is_verified_article_crawled_and_more'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='article',
+            name='publish_time',
+            field=models.DateTimeField(blank=True, null=True),
+        ),
+    ]
diff --git a/src/green_classroom/apps/collector/models.py b/src/green_classroom/apps/collector/models.py
index 627b79d..d46f57d 100644
--- a/src/green_classroom/apps/collector/models.py
+++ b/src/green_classroom/apps/collector/models.py
@@ -14,6 +14,7 @@ class Article(models.Model):
     title = models.CharField(max_length=255)
     content = models.TextField()
     crawled = models.BooleanField(default=False) # 确保此字段存在
+    publish_time = models.DateTimeField(null=True, blank=True) # 添加 publish_time 字段
 
     def __str__(self):
         return self.title
diff --git a/src/green_classroom/apps/collector/templates/collector/article_list.html b/src/green_classroom/apps/collector/templates/collector/article_list.html
index b41bd1f..e2c24d6 100644
--- a/src/green_classroom/apps/collector/templates/collector/article_list.html
+++ b/src/green_classroom/apps/collector/templates/collector/article_list.html
@@ -20,5 +20,42 @@
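
Note: the new export_articles view documents a /export/xinhua-article/ endpoint, but no URLconf change is included in this patch. A minimal sketch of what the missing wiring might look like, assuming a collector/urls.py module; the import path, URL pattern, and route name below are illustrative assumptions, not part of the diff:

# collector/urls.py -- hypothetical wiring for the export endpoint;
# module location, pattern, and name are assumed, not taken from this patch.
from django.urls import path

from .crawler.xinhua import export_articles

urlpatterns = [
    # GET /export/xinhua-article/?time_range_days=7 would invoke the new view.
    path('export/xinhua-article/', export_articles, name='export-xinhua-article'),
]

Since crawl_xinhua_green performs network requests and database writes, such a route would run the whole crawl synchronously inside the request; that trade-off belongs to the view itself, and the sketch only shows the routing.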