support keyword crawl

2025-09-26 10:39:36 +08:00
parent 499454ff27
commit a36d730384
12 changed files with 2370 additions and 505 deletions


@@ -2274,24 +2274,204 @@ def full_site_crawler(start_url, website, max_pages=1000):
queue.append(href)
# Website crawl configuration
WEBSITE_CRAWL_CONFIGS = {
"新华网": {
"base_url": "http://www.xinhuanet.com",
"article_list_urls": [
"http://www.xinhuanet.com/politics/",
"http://www.xinhuanet.com/world/",
"http://www.xinhuanet.com/finance/",
"http://www.xinhuanet.com/tech/",
"http://www.xinhuanet.com/edu/",
"http://www.xinhuanet.com/health/"
],
"article_selector": "a[href*='/politics/'], a[href*='/world/'], a[href*='/finance/'], a[href*='/tech/'], a[href*='/edu/'], a[href*='/health/']",
"content_selector": ".box_con, .content, .article-content, .text",
"title_selector": "h1, .title, .article-title",
"pagination": True,
"pagination_selector": ".page a, .pagination a"
},
"人民日报": {
"base_url": "http://www.people.com.cn",
"article_list_urls": [
"http://politics.people.com.cn/",
"http://world.people.com.cn/",
"http://finance.people.com.cn/",
"http://tech.people.com.cn/",
"http://edu.people.com.cn/",
"http://health.people.com.cn/"
],
"article_selector": "a[href*='/politics/'], a[href*='/world/'], a[href*='/finance/'], a[href*='/tech/'], a[href*='/edu/'], a[href*='/health/']",
"content_selector": ".box_con, .content, .article-content, .text",
"title_selector": "h1, .title, .article-title",
"pagination": True,
"pagination_selector": ".page a, .pagination a"
},
"央视网": {
"base_url": "http://www.cctv.com",
"article_list_urls": [
"http://news.cctv.com/politics/",
"http://news.cctv.com/world/",
"http://news.cctv.com/finance/",
"http://news.cctv.com/tech/",
"http://news.cctv.com/edu/",
"http://news.cctv.com/health/"
],
"article_selector": "a[href*='/politics/'], a[href*='/world/'], a[href*='/finance/'], a[href*='/tech/'], a[href*='/edu/'], a[href*='/health/']",
"content_selector": ".content_area, .content, .article-content, .text",
"title_selector": "h1, .title, .article-title",
"pagination": True,
"pagination_selector": ".page a, .pagination a"
},
"光明日报": {
"base_url": "http://www.gmw.cn",
"article_list_urls": ["http://www.gmw.cn/"],
"article_selector": "a[href*='gmw.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"经济日报": {
"base_url": "http://www.ce.cn",
"article_list_urls": ["http://www.ce.cn/"],
"article_selector": "a[href*='ce.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"中国日报": {
"base_url": "http://www.chinadaily.com.cn",
"article_list_urls": ["http://www.chinadaily.com.cn/"],
"article_selector": "a[href*='chinadaily.com.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"工人日报": {
"base_url": "https://www.workercn.cn",
"article_list_urls": ["https://www.workercn.cn/"],
"article_selector": "a[href*='workercn.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"科技日报": {
"base_url": "http://www.stdaily.com",
"article_list_urls": ["http://www.stdaily.com/"],
"article_selector": "a[href*='stdaily.com']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"人民政协网": {
"base_url": "https://www.rmzxw.com.cn",
"article_list_urls": ["https://www.rmzxw.com.cn/"],
"article_selector": "a[href*='rmzxw.com.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"中国纪检监察报": {
"base_url": "http://www.jjjcb.cn",
"article_list_urls": ["http://www.jjjcb.cn/"],
"article_selector": "a[href*='jjjcb.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"中国新闻社": {
"base_url": "https://www.chinanews.com.cn",
"article_list_urls": ["https://www.chinanews.com.cn/"],
"article_selector": "a[href*='chinanews.com.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"学习时报": {
"base_url": "http://www.studytimes.cn",
"article_list_urls": ["http://www.studytimes.cn/"],
"article_selector": "a[href*='studytimes.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"中国青年报": {
"base_url": "http://www.cyol.com",
"article_list_urls": ["http://www.cyol.com/"],
"article_selector": "a[href*='cyol.com']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"中国妇女报": {
"base_url": "http://www.cnwomen.com.cn",
"article_list_urls": ["http://www.cnwomen.com.cn/"],
"article_selector": "a[href*='cnwomen.com.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"法治日报": {
"base_url": "http://www.legaldaily.com.cn",
"article_list_urls": ["http://www.legaldaily.com.cn/"],
"article_selector": "a[href*='legaldaily.com.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"农民日报": {
"base_url": "http://www.farmer.com.cn",
"article_list_urls": ["http://www.farmer.com.cn/"],
"article_selector": "a[href*='farmer.com.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"学习强国": {
"base_url": "https://www.xuexi.cn",
"article_list_urls": ["https://www.xuexi.cn/"],
"article_selector": "a[href*='xuexi.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"旗帜网": {
"base_url": "http://www.qizhiwang.org.cn",
"article_list_urls": ["http://www.qizhiwang.org.cn/"],
"article_selector": "a[href*='qizhiwang.org.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"中国网": {
"base_url": "http://www.china.com.cn",
"article_list_urls": ["http://www.china.com.cn/"],
"article_selector": "a[href*='china.com.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"中国政府网": {
"base_url": "http://www.gov.cn",
"article_list_urls": ["http://www.gov.cn/"],
"article_selector": "a[href*='gov.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"求是网": {
"base_url": "http://www.qstheory.cn",
"article_list_urls": ["http://www.qstheory.cn/"],
"article_selector": "a[href*='qstheory.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
},
"解放军报": {
"base_url": "http://www.81.cn",
"article_list_urls": ["http://www.81.cn/"],
"article_selector": "a[href*='81.cn']",
"content_selector": ".content, .article-content, .text",
"title_selector": "h1, .title, .article-title"
}
}
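# A minimal usage sketch (illustrative only — it assumes the fetch helpers
# defined later in this file and uses "新华网" as a sample config key):
#
#   cfg = WEBSITE_CRAWL_CONFIGS["新华网"]
#   for list_url in cfg["article_list_urls"]:
#       soup = get_page_with_requests(list_url, "新华网")
#       links = soup.select(cfg["article_selector"]) if soup else []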
# Website search configuration
WEBSITE_SEARCH_CONFIGS = {
"新华网": {
"search_url": "http://so.news.cn/getNews",
"search_url": "http://www.news.cn/search",
"search_params": {
"keyword": "{keyword}",
"curPage": "{page}",
"sortField": "0",
"sortType": "1"
"q": "{keyword}"
},
"method": "post",
"headers": {
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}
"method": "get",
"note": "新华网有较强反爬虫机制可能需要使用Selenium等工具处理"
},
"人民日报": {
"search_url": "http://search.people.com.cn/search",
"search_url": "http://search.people.cn",
"search_params": {
"keyword": "{keyword}",
"st": "0",
@@ -2464,6 +2644,175 @@ WEBSITE_SEARCH_CONFIGS = {
}
}
# New crawl strategy: fetch article list pages directly, then filter by keyword
# WEBSITE_CRAWL_CONFIGS has been moved into WEBSITE_SEARCH_CONFIGS
def check_keyword_in_content(content, keyword):
"""
检查内容是否包含关键字
Args:
content: 文章内容
keyword: 关键字
Returns:
bool: 是否包含关键字
"""
if not content or not keyword:
return False
# 转换为小写进行比较
content_lower = content.lower()
keyword_lower = keyword.lower()
# 检查是否包含关键字
return keyword_lower in content_lower
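# Examples of the matching rule (hypothetical values): the check is a plain
# case-insensitive substring test, so
#   check_keyword_in_content("AI与人工智能发展报道", "人工智能")  -> True
#   check_keyword_in_content("Big Data news", "big data")          -> True
#   check_keyword_in_content(None, "数据")                         -> False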
def get_page_with_requests(url, website_name=""):
"""使用requests获取页面内容"""
try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1"
}
        # Site-specific request headers
        if "新华网" in website_name:
            headers.update({
                "Referer": "http://www.news.cn/",
            })
        # The People's Daily config key is "人民日报", so accept either name
        elif website_name in ("人民日报", "人民网"):
            headers.update({
                "Referer": "http://www.people.com.cn/",
            })
resp = requests.get(url, headers=headers, timeout=15)
resp.raise_for_status()
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, "html.parser")
return soup
except Exception as e:
print(f"使用requests获取页面失败: {url}, 错误: {e}")
return None
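# Fallback sketch: both fetch helpers return a parsed BeautifulSoup document
# or None, so call sites can chain them (get_page_with_selenium is defined
# elsewhere in this file):
#
#   soup = (get_page_with_selenium(url, website_name)
#           or get_page_with_requests(url, website_name))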
def crawl_articles_with_keyword_filter(website_name, keyword, max_pages=10, start_date=None, end_date=None):
"""
爬取文章并进行关键字过滤
Args:
website_name: 网站名称
keyword: 关键字
max_pages: 最大页数
start_date: 开始日期
end_date: 结束日期
Returns:
list: 符合条件的文章URL列表
"""
if website_name not in WEBSITE_SEARCH_CONFIGS:
print(f"网站 {website_name} 不支持爬取功能")
return []
config = WEBSITE_SEARCH_CONFIGS[website_name]
article_urls = []
print(f"开始爬取 {website_name} 并过滤关键字 '{keyword}'...")
    # Fetch the article list pages; for sites such as Xinhua the search URL
    # doubles as the base URL when no list URLs are configured.
    # Copy the config list so pagination links can be appended safely below.
    base_urls = list(config.get("article_list_urls", [config.get("search_url", "")]))
    for list_url in base_urls:
try:
print(f"爬取列表页面: {list_url}")
            # Try Selenium first and fall back to requests on failure
soup = get_page_with_selenium(list_url, website_name)
if not soup:
print("尝试使用requests获取页面...")
soup = get_page_with_requests(list_url, website_name)
if not soup:
continue
            # Collect candidate article links
links = soup.select(config.get("article_selector", "a"))
page_urls = []
for link in links:
href = link.get("href")
if not href:
continue
                # Resolve relative URLs
if not href.startswith("http"):
href = urljoin(list_url, href)
                # Check that the link is a valid article URL on the same host
parsed_list_url = urlparse(list_url)
base_netloc = parsed_list_url.netloc
if is_valid_url(href, base_netloc) and href not in page_urls:
page_urls.append(href)
print(f"找到 {len(page_urls)} 个文章链接")
            # Visit each article and test it for the keyword
for article_url in page_urls:
try:
print(f"检查文章: {article_url}")
                    # Try Selenium first and fall back to requests on failure
article_soup = get_page_with_selenium(article_url, website_name)
if not article_soup:
print("尝试使用requests获取文章页面...")
article_soup = get_page_with_requests(article_url, website_name)
if not article_soup:
continue
                    # Extract the article title
title_element = article_soup.select_one(config.get("title_selector", "h1"))
title = title_element.get_text().strip() if title_element else ""
                    # Extract the article body
content_element = article_soup.select_one(config.get("content_selector", ".content"))
content = content_element.get_text().strip() if content_element else ""
                    # Does the title or the body contain the keyword?
if check_keyword_in_content(title, keyword) or check_keyword_in_content(content, keyword):
print(f"✓ 文章包含关键字: {title[:50]}...")
article_urls.append(article_url)
else:
print(f"✗ 文章不包含关键字: {title[:50]}...")
except Exception as e:
print(f"检查文章时出错: {e}")
continue
            # If pagination is configured, queue further list pages
            if config.get("pagination", False) and len(article_urls) < max_pages * 10:
                pagination_links = soup.select(config.get("pagination_selector", ".page a"))
                for page_link in pagination_links[:max_pages - 1]:
                    page_href = page_link.get("href")
                    if page_href and not page_href.startswith("http"):
                        page_href = urljoin(list_url, page_href)
                    # Append unseen pagination pages so the outer loop visits them
                    if page_href and page_href not in base_urls:
                        base_urls.append(page_href)
except Exception as e:
print(f"爬取列表页面时出错: {e}")
continue
print(f"{website_name} 关键字过滤完成,找到 {len(article_urls)} 篇相关文章")
return article_urls
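# Usage sketch (assumes a configured Django environment; "两会" is only a
# sample keyword):
#
#   urls = crawl_articles_with_keyword_filter("人民日报", "两会", max_pages=2)
#   print(f"matched {len(urls)} articles")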
def search_articles_by_keyword(website_name, keyword, max_pages=10, start_date=None, end_date=None):
"""
@@ -2479,76 +2828,8 @@ def search_articles_by_keyword(website_name, keyword, max_pages=10, start_date=N
Returns:
list: 搜索到的文章URL列表
"""
    # Use the new keyword-filter crawl strategy
return crawl_articles_with_keyword_filter(website_name, keyword, max_pages, start_date, end_date)
def extract_search_results(soup, website_name):
@@ -2738,8 +3019,12 @@ def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None,
Returns:
dict: 爬取结果统计
"""
    # Import task_executor to check whether the task has been cancelled
from core.task_executor import task_executor
task_id = getattr(crawl_by_keyword, 'task_id', None)
if website_names is None:
        website_names = list(WEBSITE_CRAWL_CONFIGS.keys())
results = {
"keyword": keyword,
@@ -2753,6 +3038,11 @@ def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None,
print(f"目标网站: {', '.join(website_names)}")
for website_name in website_names:
        # Stop if the task has been cancelled
if task_id and task_executor.is_task_cancelled(task_id):
print(f"任务 {task_id} 已被取消,停止爬取")
break
print(f"\n开始爬取 {website_name}...")
try:
@@ -2761,7 +3051,7 @@ def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None,
website, created = Website.objects.get_or_create(
name=website_name,
defaults={
                'base_url': WEBSITE_SEARCH_CONFIGS.get(website_name, {}).get("search_url", ""),
'enabled': True
}
)
@@ -2791,6 +3081,11 @@ def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None,
website_failed = 0
for i, url in enumerate(article_urls, 1):
                # Stop if the task has been cancelled
if task_id and task_executor.is_task_cancelled(task_id):
print(f"任务 {task_id} 已被取消,停止处理文章")
break
try:
print(f"处理第 {i}/{len(article_urls)} 篇: {url}")
process_article(url, website)
@@ -2805,6 +3100,10 @@ def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None,
website_failed += 1
results["failed_count"] += 1
            # Break out if the task was cancelled
if task_id and task_executor.is_task_cancelled(task_id):
break
results["website_results"][website_name] = {
"found_urls": len(article_urls),
"processed": len(article_urls),
@@ -2815,6 +3114,11 @@ def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None,
print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}")
except Exception as e:
            # Stop if the task has been cancelled
if task_id and task_executor.is_task_cancelled(task_id):
print(f"任务 {task_id} 已被取消,停止爬取")
break
print(f"爬取 {website_name} 时出错: {e}")
results["website_results"][website_name] = {
"found_urls": 0,
@@ -2849,13 +3153,17 @@ def crawl_historical_articles(website_names=None, start_date=None, end_date=None
Returns:
dict: 爬取结果统计
"""
    # Import task_executor to check whether the task has been cancelled
from core.task_executor import task_executor
task_id = getattr(crawl_historical_articles, 'task_id', None)
if not start_date:
start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
if not end_date:
end_date = datetime.now().strftime("%Y-%m-%d")
if website_names is None:
        website_names = list(WEBSITE_CRAWL_CONFIGS.keys())
results = {
"start_date": start_date,
@@ -2874,6 +3182,11 @@ def crawl_historical_articles(website_names=None, start_date=None, end_date=None
common_keywords = ["新闻", "报道", "文章", "资讯", "动态"]
for website_name in website_names:
        # Stop if the task has been cancelled
if task_id and task_executor.is_task_cancelled(task_id):
print(f"任务 {task_id} 已被取消,停止爬取")
break
print(f"\n开始爬取 {website_name} 历史文章...")
try:
@@ -2881,7 +3194,7 @@ def crawl_historical_articles(website_names=None, start_date=None, end_date=None
website, created = Website.objects.get_or_create(
name=website_name,
defaults={
                'base_url': WEBSITE_SEARCH_CONFIGS.get(website_name, {}).get("search_url", ""),
'enabled': True
}
)
@@ -2892,6 +3205,11 @@ def crawl_historical_articles(website_names=None, start_date=None, end_date=None
            # Search with several generic keywords
for keyword in common_keywords:
                # Stop if the task has been cancelled
if task_id and task_executor.is_task_cancelled(task_id):
print(f"任务 {task_id} 已被取消,停止搜索")
break
try:
article_urls = search_articles_by_keyword(
website_name, keyword, max_pages=5,
@@ -2906,6 +3224,10 @@ def crawl_historical_articles(website_names=None, start_date=None, end_date=None
print(f"搜索关键词 '{keyword}' 时出错: {e}")
continue
            # Break out if the task was cancelled
if task_id and task_executor.is_task_cancelled(task_id):
break
            # Cap the number of articles per site
article_urls = list(all_urls)[:max_articles_per_site]
@@ -2922,6 +3244,11 @@ def crawl_historical_articles(website_names=None, start_date=None, end_date=None
print(f"{website_name} 找到 {len(article_urls)} 篇历史文章,开始处理...")
for i, url in enumerate(article_urls, 1):
                # Stop if the task has been cancelled
if task_id and task_executor.is_task_cancelled(task_id):
print(f"任务 {task_id} 已被取消,停止处理文章")
break
try:
print(f"处理第 {i}/{len(article_urls)} 篇: {url}")
process_article(url, website)
@@ -2935,6 +3262,10 @@ def crawl_historical_articles(website_names=None, start_date=None, end_date=None
website_failed += 1
results["failed_count"] += 1
            # Break out if the task was cancelled
if task_id and task_executor.is_task_cancelled(task_id):
break
results["website_results"][website_name] = {
"found_urls": len(article_urls),
"processed": len(article_urls),
@@ -2945,6 +3276,11 @@ def crawl_historical_articles(website_names=None, start_date=None, end_date=None
print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}")
except Exception as e:
            # Stop if the task has been cancelled
if task_id and task_executor.is_task_cancelled(task_id):
print(f"任务 {task_id} 已被取消,停止爬取")
break
print(f"爬取 {website_name} 历史文章时出错: {e}")
results["website_results"][website_name] = {
"found_urls": 0,