Support keyword search
core/utils.py (696 changed lines)
@@ -1,7 +1,7 @@
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from urllib.parse import urljoin, urlparse, quote
from collections import deque
from django.utils import timezone
from django.conf import settings
@@ -15,6 +15,8 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
import json


def get_selenium_driver():
@@ -2270,3 +2272,695 @@ def full_site_crawler(start_url, website, max_pages=1000):
                queue.append(href)
            elif href not in visited and is_valid_url(href, base_netloc):
                queue.append(href)


# Website search configurations
WEBSITE_SEARCH_CONFIGS = {
    "新华网": {
        "search_url": "http://so.news.cn/getNews",
        "search_params": {
            "keyword": "{keyword}",
            "curPage": "{page}",
            "sortField": "0",
            "sortType": "1"
        },
        "method": "post",
        "headers": {
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
    },
    "人民日报": {
        "search_url": "http://search.people.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "st": "0",
            "startDate": "{start_date}",
            "endDate": "{end_date}",
            "page": "{page}"
        },
        "method": "get"
    },
    "央视网": {
        "search_url": "https://search.cctv.com/search.php",
        "search_params": {
            "qtext": "{keyword}",
            "type": "web",
            "page": "{page}"
        },
        "method": "get"
    },
    "光明日报": {
        "search_url": "http://search.gmw.cn/search",
        "search_params": {
            "q": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "经济日报": {
        "search_url": "http://www.ce.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国日报": {
        "search_url": "http://www.chinadaily.com.cn/search",
        "search_params": {
            "q": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "工人日报": {
        "search_url": "https://www.workercn.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "科技日报": {
        "search_url": "http://www.stdaily.com/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "人民政协网": {
        "search_url": "https://www.rmzxw.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国纪检监察报": {
        "search_url": "http://www.jjjcb.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国新闻社": {
        "search_url": "https://www.chinanews.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "学习时报": {
        "search_url": "https://www.studytimes.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国青年报": {
        "search_url": "http://news.cyol.com/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国妇女报": {
        "search_url": "https://www.cnwomen.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "法治日报": {
        "search_url": "http://www.legaldaily.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "农民日报": {
        "search_url": "https://www.farmer.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "学习强国": {
        "search_url": "https://www.xuexi.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "旗帜网": {
        "search_url": "http://www.qizhiwang.org.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国网": {
        "search_url": "http://www.china.com.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "中国政府网": {
        "search_url": "https://www.gov.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "求是网": {
        "search_url": "http://www.qstheory.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    },
    "解放军报": {
        "search_url": "http://www.81.cn/search",
        "search_params": {
            "keyword": "{keyword}",
            "page": "{page}"
        },
        "method": "get"
    }
}

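Each entry above is a search URL plus placeholder parameters that search_articles_by_keyword() (below) fills in with str.format. A minimal sketch of how one entry expands into concrete request parameters, using an illustrative keyword that is not part of this commit:

# Illustrative only: expanding the 央视网 entry for one keyword and page.
config = WEBSITE_SEARCH_CONFIGS["央视网"]
params = {
    key: value.format(keyword=quote("乡村振兴"), page=1,
                      start_date="2024-01-01", end_date="2024-12-31")
    for key, value in config["search_params"].items()
}
# params == {"qtext": "%E4%B9%A1%E6%9D%91%E6%8C%AF%E5%85%B4", "type": "web", "page": "1"}
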
def search_articles_by_keyword(website_name, keyword, max_pages=10, start_date=None, end_date=None):
    """
    Search a website for articles matching a keyword.

    Args:
        website_name: website name
        keyword: search keyword
        max_pages: maximum number of result pages to fetch
        start_date: start date (YYYY-MM-DD)
        end_date: end date (YYYY-MM-DD)

    Returns:
        list: URLs of the articles found
    """
    if website_name not in WEBSITE_SEARCH_CONFIGS:
        print(f"Website {website_name} does not support keyword search")
        return []

    config = WEBSITE_SEARCH_CONFIGS[website_name]
    article_urls = []

    # Set the default date range
    if not start_date:
        start_date = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
    if not end_date:
        end_date = datetime.now().strftime("%Y-%m-%d")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    headers.update(config.get("headers", {}))

    for page in range(1, max_pages + 1):
        try:
            # Build the search parameters
            search_params = {}
            for key, value in config["search_params"].items():
                search_params[key] = value.format(
                    keyword=quote(keyword),
                    page=page,
                    start_date=start_date,
                    end_date=end_date
                )

            print(f"Searching {website_name}, page {page}: {keyword}")

            if config["method"] == "post":
                response = requests.post(
                    config["search_url"],
                    data=search_params,
                    headers=headers,
                    timeout=15
                )
            else:
                response = requests.get(
                    config["search_url"],
                    params=search_params,
                    headers=headers,
                    timeout=15
                )

            response.raise_for_status()
            response.encoding = 'utf-8'

            # Parse the search results
            soup = BeautifulSoup(response.text, "html.parser")
            page_urls = extract_search_results(soup, website_name)

            if not page_urls:
                print(f"No more results on page {page}")
                break

            article_urls.extend(page_urls)
            print(f"Found {len(page_urls)} articles on page {page}")

            # Avoid sending requests too quickly
            time.sleep(1)

        except Exception as e:
            print(f"Error while searching page {page}: {e}")
            continue

    print(f"Found {len(article_urls)} articles in total")
    return article_urls

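A usage sketch for the function above; the website name must be a key of WEBSITE_SEARCH_CONFIGS, and the keyword and dates here are only examples:

# Example (illustrative): search 人民日报 over a date range.
urls = search_articles_by_keyword(
    "人民日报",
    keyword="乡村振兴",          # example keyword
    max_pages=3,
    start_date="2024-01-01",
    end_date="2024-06-30",
)
print(len(urls), "candidate article URLs")
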
def extract_search_results(soup, website_name):
    """
    Extract article links from a search results page.

    Args:
        soup: BeautifulSoup object
        website_name: website name

    Returns:
        list: article URLs
    """
    urls = []

    # Extract links according to each website's search result structure
    if website_name == "新华网":
        # 新华网 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/news/" in href or "/article/" in href:
                urls.append(href)

    elif website_name == "人民日报":
        # 人民日报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/n1/" in href or "/article/" in href:
                urls.append(href)

    elif website_name == "央视网":
        # 央视网 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/news/" in href or "ARTI" in href:
                urls.append(href)

    elif website_name == "光明日报":
        # 光明日报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "经济日报":
        # 经济日报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "中国日报":
        # 中国日报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "工人日报":
        # 工人日报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/c/" in href or "/article/" in href:
                urls.append(href)

    elif website_name == "科技日报":
        # 科技日报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "人民政协网":
        # 人民政协网 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "中国纪检监察报":
        # 中国纪检监察报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "中国新闻社":
        # 中国新闻社 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "学习时报":
        # 学习时报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "中国青年报":
        # 中国青年报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "中国妇女报":
        # 中国妇女报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "法治日报":
        # 法治日报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/content/" in href and "content_" in href:
                urls.append(href)

    elif website_name == "农民日报":
        # 农民日报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "学习强国":
        # 学习强国 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "旗帜网":
        # 旗帜网 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/n1/" in href or "/article/" in href:
                urls.append(href)

    elif website_name == "中国网":
        # 中国网 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/opinion/" in href:
                urls.append(href)

    elif website_name == "中国政府网":
        # 中国政府网 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/zhengce/" in href or "/xinwen/" in href:
                urls.append(href)

    elif website_name == "求是网":
        # 求是网 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/article/" in href or "/content/" in href:
                urls.append(href)

    elif website_name == "解放军报":
        # 解放军报 search result structure
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if "/zt/" in href or "/article/" in href:
                urls.append(href)

    # Deduplicate and return
    return list(set(urls))

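The branches above differ only in which URL substrings they accept, so a table-driven variant is one possible follow-up. The sketch below is hypothetical (SEARCH_RESULT_PATTERNS and extract_search_results_generic are not part of this commit), and compound rules such as the 法治日报 case would still need their own check:

# Hypothetical refactor sketch: map each site to the URL substrings it accepts.
SEARCH_RESULT_PATTERNS = {
    "新华网": ("/news/", "/article/"),
    "人民日报": ("/n1/", "/article/"),
    "央视网": ("/news/", "ARTI"),
    # ... remaining sites follow the same shape
}

def extract_search_results_generic(soup, website_name):
    patterns = SEARCH_RESULT_PATTERNS.get(website_name, ("/article/", "/content/"))
    urls = {
        link["href"]
        for link in soup.find_all("a", href=True)
        if any(p in link["href"] for p in patterns)
    }
    return list(urls)
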
def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None, end_date=None, max_articles=100):
    """
    Crawl articles from multiple websites for one keyword.

    Args:
        keyword: search keyword
        website_names: list of website names; if None, all supported websites are crawled
        max_pages: maximum number of search pages per website
        start_date: start date (YYYY-MM-DD)
        end_date: end date (YYYY-MM-DD)
        max_articles: maximum number of articles to process per website

    Returns:
        dict: crawl statistics
    """
    if website_names is None:
        website_names = list(WEBSITE_SEARCH_CONFIGS.keys())

    results = {
        "keyword": keyword,
        "total_articles": 0,
        "success_count": 0,
        "failed_count": 0,
        "website_results": {}
    }

    print(f"Starting keyword crawl for '{keyword}'...")
    print(f"Target websites: {', '.join(website_names)}")

    for website_name in website_names:
        print(f"\nStarting crawl of {website_name}...")

        try:
            # Get or create the Website object
            from core.models import Website
            website, created = Website.objects.get_or_create(
                name=website_name,
                defaults={
                    'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                    'enabled': True
                }
            )

            # Search for article URLs
            article_urls = search_articles_by_keyword(
                website_name, keyword, max_pages, start_date, end_date
            )

            if not article_urls:
                print(f"No matching articles found on {website_name}")
                results["website_results"][website_name] = {
                    "found_urls": 0,
                    "processed": 0,
                    "success": 0,
                    "failed": 0
                }
                continue

            # Limit the number of articles
            if len(article_urls) > max_articles:
                article_urls = article_urls[:max_articles]

            print(f"{website_name}: found {len(article_urls)} articles, processing...")

            website_success = 0
            website_failed = 0

            for i, url in enumerate(article_urls, 1):
                try:
                    print(f"Processing article {i}/{len(article_urls)}: {url}")
                    process_article(url, website)
                    website_success += 1
                    results["success_count"] += 1

                    # Avoid sending requests too quickly
                    time.sleep(0.5)

                except Exception as e:
                    print(f"Failed to process article: {url}, error: {e}")
                    website_failed += 1
                    results["failed_count"] += 1

            results["website_results"][website_name] = {
                "found_urls": len(article_urls),
                "processed": len(article_urls),
                "success": website_success,
                "failed": website_failed
            }

            print(f"{website_name} done: {website_success} succeeded, {website_failed} failed")

        except Exception as e:
            print(f"Error while crawling {website_name}: {e}")
            results["website_results"][website_name] = {
                "found_urls": 0,
                "processed": 0,
                "success": 0,
                "failed": 1,
                "error": str(e)
            }
            results["failed_count"] += 1

    results["total_articles"] = results["success_count"] + results["failed_count"]

    print("\nCrawl finished!")
    print(f"Keyword: {keyword}")
    print(f"Total articles: {results['total_articles']}")
    print(f"Succeeded: {results['success_count']}")
    print(f"Failed: {results['failed_count']}")

    return results

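A usage sketch for crawl_by_keyword, e.g. from a Django shell or a management command; the site list, keyword, and limits are illustrative:

# Example: crawl two sites for one keyword.
results = crawl_by_keyword(
    "乡村振兴",
    website_names=["新华网", "人民日报"],
    max_pages=5,
    max_articles=50,
)
for site, stats in results["website_results"].items():
    print(site, stats["success"], "ok /", stats["failed"], "failed")
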
def crawl_historical_articles(website_names=None, start_date=None, end_date=None, max_articles_per_site=50):
    """
    Crawl historical articles.

    Args:
        website_names: list of website names
        start_date: start date (YYYY-MM-DD)
        end_date: end date (YYYY-MM-DD)
        max_articles_per_site: maximum number of articles per website

    Returns:
        dict: crawl statistics
    """
    if not start_date:
        start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
    if not end_date:
        end_date = datetime.now().strftime("%Y-%m-%d")

    if website_names is None:
        website_names = list(WEBSITE_SEARCH_CONFIGS.keys())

    results = {
        "start_date": start_date,
        "end_date": end_date,
        "total_articles": 0,
        "success_count": 0,
        "failed_count": 0,
        "website_results": {}
    }

    print("Starting historical article crawl...")
    print(f"Date range: {start_date} to {end_date}")
    print(f"Target websites: {', '.join(website_names)}")

    # Search for historical articles with generic keywords
    # (roughly: news, report, article, information, updates)
    common_keywords = ["新闻", "报道", "文章", "资讯", "动态"]

    for website_name in website_names:
        print(f"\nStarting historical crawl of {website_name}...")

        try:
            from core.models import Website
            website, created = Website.objects.get_or_create(
                name=website_name,
                defaults={
                    'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
                    'enabled': True
                }
            )

            website_success = 0
            website_failed = 0
            all_urls = set()

            # Search with several keywords
            for keyword in common_keywords:
                try:
                    article_urls = search_articles_by_keyword(
                        website_name, keyword, max_pages=5,
                        start_date=start_date, end_date=end_date
                    )
                    all_urls.update(article_urls)

                    if len(all_urls) >= max_articles_per_site:
                        break

                except Exception as e:
                    print(f"Error while searching keyword '{keyword}': {e}")
                    continue

            # Limit the number of articles
            article_urls = list(all_urls)[:max_articles_per_site]

            if not article_urls:
                print(f"No historical articles found on {website_name}")
                results["website_results"][website_name] = {
                    "found_urls": 0,
                    "processed": 0,
                    "success": 0,
                    "failed": 0
                }
                continue

            print(f"{website_name}: found {len(article_urls)} historical articles, processing...")

            for i, url in enumerate(article_urls, 1):
                try:
                    print(f"Processing article {i}/{len(article_urls)}: {url}")
                    process_article(url, website)
                    website_success += 1
                    results["success_count"] += 1

                    time.sleep(0.5)

                except Exception as e:
                    print(f"Failed to process article: {url}, error: {e}")
                    website_failed += 1
                    results["failed_count"] += 1

            results["website_results"][website_name] = {
                "found_urls": len(article_urls),
                "processed": len(article_urls),
                "success": website_success,
                "failed": website_failed
            }

            print(f"{website_name} done: {website_success} succeeded, {website_failed} failed")

        except Exception as e:
            print(f"Error while crawling historical articles from {website_name}: {e}")
            results["website_results"][website_name] = {
                "found_urls": 0,
                "processed": 0,
                "success": 0,
                "failed": 1,
                "error": str(e)
            }
            results["failed_count"] += 1

    results["total_articles"] = results["success_count"] + results["failed_count"]

    print("\nHistorical article crawl finished!")
    print(f"Date range: {start_date} to {end_date}")
    print(f"Total articles: {results['total_articles']}")
    print(f"Succeeded: {results['success_count']}")
    print(f"Failed: {results['failed_count']}")

    return results
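And a corresponding sketch for the historical backfill; the site list, dates, and limit are illustrative:

# Example: backfill roughly one quarter for two sites.
report = crawl_historical_articles(
    website_names=["求是网", "中国政府网"],
    start_date="2024-01-01",
    end_date="2024-03-31",
    max_articles_per_site=30,
)
print(report["total_articles"], "articles attempted,",
      report["success_count"], "stored")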