Support keyword

2025-09-24 03:38:32 +08:00
parent a4891b1c30
commit 8592833d74
16 changed files with 2888 additions and 2 deletions


@@ -1,7 +1,7 @@
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from urllib.parse import urljoin, urlparse, quote
from collections import deque
from django.utils import timezone
from django.conf import settings
@@ -15,6 +15,8 @@ from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta
import json
def get_selenium_driver():
@@ -2270,3 +2272,695 @@ def full_site_crawler(start_url, website, max_pages=1000):
queue.append(href)
elif href not in visited and is_valid_url(href, base_netloc):
queue.append(href)
# Website search configuration
WEBSITE_SEARCH_CONFIGS = {
"新华网": {
"search_url": "http://so.news.cn/getNews",
"search_params": {
"keyword": "{keyword}",
"curPage": "{page}",
"sortField": "0",
"sortType": "1"
},
"method": "post",
"headers": {
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}
},
"人民日报": {
"search_url": "http://search.people.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"st": "0",
"startDate": "{start_date}",
"endDate": "{end_date}",
"page": "{page}"
},
"method": "get"
},
"央视网": {
"search_url": "https://search.cctv.com/search.php",
"search_params": {
"qtext": "{keyword}",
"type": "web",
"page": "{page}"
},
"method": "get"
},
"光明日报": {
"search_url": "http://search.gmw.cn/search",
"search_params": {
"q": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"经济日报": {
"search_url": "http://www.ce.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国日报": {
"search_url": "http://www.chinadaily.com.cn/search",
"search_params": {
"q": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"工人日报": {
"search_url": "https://www.workercn.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"科技日报": {
"search_url": "http://www.stdaily.com/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"人民政协网": {
"search_url": "https://www.rmzxw.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国纪检监察报": {
"search_url": "http://www.jjjcb.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国新闻社": {
"search_url": "https://www.chinanews.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"学习时报": {
"search_url": "https://www.studytimes.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国青年报": {
"search_url": "http://news.cyol.com/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国妇女报": {
"search_url": "https://www.cnwomen.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"法治日报": {
"search_url": "http://www.legaldaily.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"农民日报": {
"search_url": "https://www.farmer.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"学习强国": {
"search_url": "https://www.xuexi.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"旗帜网": {
"search_url": "http://www.qizhiwang.org.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国网": {
"search_url": "http://www.china.com.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"中国政府网": {
"search_url": "https://www.gov.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"求是网": {
"search_url": "http://www.qstheory.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
},
"解放军报": {
"search_url": "http://www.81.cn/search",
"search_params": {
"keyword": "{keyword}",
"page": "{page}"
},
"method": "get"
}
}
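# A minimal sketch (comments only, not executed) of how one entry above is expected
# to expand into a request; the keyword and dates below are hypothetical placeholders:
#   config = WEBSITE_SEARCH_CONFIGS["央视网"]
#   params = {k: v.format(keyword=quote("示例"), page=1,
#                         start_date="2025-01-01", end_date="2025-09-24")
#             for k, v in config["search_params"].items()}
#   requests.get(config["search_url"], params=params, timeout=15)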
def search_articles_by_keyword(website_name, keyword, max_pages=10, start_date=None, end_date=None):
"""
根据关键词搜索文章
Args:
website_name: 网站名称
keyword: 搜索关键词
max_pages: 最大搜索页数
start_date: 开始日期 (YYYY-MM-DD)
end_date: 结束日期 (YYYY-MM-DD)
Returns:
list: 搜索到的文章URL列表
"""
if website_name not in WEBSITE_SEARCH_CONFIGS:
print(f"网站 {website_name} 不支持搜索功能")
return []
config = WEBSITE_SEARCH_CONFIGS[website_name]
article_urls = []
# Set the default date range
if not start_date:
start_date = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
if not end_date:
end_date = datetime.now().strftime("%Y-%m-%d")
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
headers.update(config.get("headers", {}))
for page in range(1, max_pages + 1):
try:
# Build search parameters (note: quote() pre-encodes the keyword; requests encodes params again, so some sites may receive a double-encoded value)
search_params = {}
for key, value in config["search_params"].items():
search_params[key] = value.format(
keyword=quote(keyword),
page=page,
start_date=start_date,
end_date=end_date
)
print(f"搜索 {website_name}{page} 页: {keyword}")
if config["method"] == "post":
response = requests.post(
config["search_url"],
data=search_params,
headers=headers,
timeout=15
)
else:
response = requests.get(
config["search_url"],
params=search_params,
headers=headers,
timeout=15
)
response.raise_for_status()
response.encoding = 'utf-8'
# Parse the search results page
soup = BeautifulSoup(response.text, "html.parser")
page_urls = extract_search_results(soup, website_name)
if not page_urls:
print(f"{page} 页没有找到更多结果")
break
article_urls.extend(page_urls)
print(f"{page} 页找到 {len(page_urls)} 篇文章")
# Avoid sending requests too quickly
time.sleep(1)
except Exception as e:
print(f"搜索第 {page} 页时出错: {e}")
continue
print(f"总共找到 {len(article_urls)} 篇文章")
return article_urls
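# Example call (hypothetical keyword and dates), returning a flat list of article URLs:
#   urls = search_articles_by_keyword("新华网", "数字经济", max_pages=3,
#                                     start_date="2025-09-01", end_date="2025-09-24")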
def extract_search_results(soup, website_name):
"""
从搜索结果页面提取文章链接
Args:
soup: BeautifulSoup对象
website_name: 网站名称
Returns:
list: 文章URL列表
"""
urls = []
# Extract links according to each site's search result page structure
if website_name == "新华网":
# 新华网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/news/" in href or "/article/" in href:
urls.append(href)
elif website_name == "人民日报":
# 人民日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/n1/" in href or "/article/" in href:
urls.append(href)
elif website_name == "央视网":
# 央视网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/news/" in href or "ARTI" in href:
urls.append(href)
elif website_name == "光明日报":
# 光明日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "经济日报":
# 经济日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "中国日报":
# 中国日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "工人日报":
# 工人日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/c/" in href or "/article/" in href:
urls.append(href)
elif website_name == "科技日报":
# 科技日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "人民政协网":
# 人民政协网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "中国纪检监察报":
# 中国纪检监察报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "中国新闻社":
# 中国新闻社 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "学习时报":
# 学习时报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "中国青年报":
# 中国青年报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "中国妇女报":
# 中国妇女报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "法治日报":
# 法治日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/content/" in href and "content_" in href:
urls.append(href)
elif website_name == "农民日报":
# 农民日报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "学习强国":
# 学习强国 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "旗帜网":
# 旗帜网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/n1/" in href or "/article/" in href:
urls.append(href)
elif website_name == "中国网":
# 中国网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/opinion/" in href:
urls.append(href)
elif website_name == "中国政府网":
# 中国政府网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/zhengce/" in href or "/xinwen/" in href:
urls.append(href)
elif website_name == "求是网":
# 求是网 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/article/" in href or "/content/" in href:
urls.append(href)
elif website_name == "解放军报":
# 解放军报 search result structure
for link in soup.find_all("a", href=True):
href = link["href"]
if "/zt/" in href or "/article/" in href:
urls.append(href)
# Deduplicate and return
return list(set(urls))
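# Note: the set-based deduplication above does not preserve result order, and some
# sites return relative hrefs, so callers may need to resolve them (e.g. with urljoin)
# before fetching.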
def crawl_by_keyword(keyword, website_names=None, max_pages=10, start_date=None, end_date=None, max_articles=100):
"""
根据关键词爬取多个网站的文章
Args:
keyword: 搜索关键词
website_names: 网站名称列表如果为None则爬取所有支持的网站
max_pages: 每个网站最大搜索页数
start_date: 开始日期 (YYYY-MM-DD)
end_date: 结束日期 (YYYY-MM-DD)
max_articles: 最大文章数量
Returns:
dict: 爬取结果统计
"""
if website_names is None:
website_names = list(WEBSITE_SEARCH_CONFIGS.keys())
results = {
"keyword": keyword,
"total_articles": 0,
"success_count": 0,
"failed_count": 0,
"website_results": {}
}
print(f"开始根据关键词 '{keyword}' 爬取文章...")
print(f"目标网站: {', '.join(website_names)}")
for website_name in website_names:
print(f"\n开始爬取 {website_name}...")
try:
# Get or create the Website object
from core.models import Website
website, created = Website.objects.get_or_create(
name=website_name,
defaults={
'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
'enabled': True
}
)
# Search for article URLs
article_urls = search_articles_by_keyword(
website_name, keyword, max_pages, start_date, end_date
)
if not article_urls:
print(f"{website_name} 没有找到相关文章")
results["website_results"][website_name] = {
"found_urls": 0,
"processed": 0,
"success": 0,
"failed": 0
}
continue
# Cap the number of articles
if len(article_urls) > max_articles:
article_urls = article_urls[:max_articles]
print(f"{website_name} 找到 {len(article_urls)} 篇文章,开始处理...")
website_success = 0
website_failed = 0
for i, url in enumerate(article_urls, 1):
try:
print(f"处理第 {i}/{len(article_urls)} 篇: {url}")
process_article(url, website)
website_success += 1
results["success_count"] += 1
# Avoid sending requests too quickly
time.sleep(0.5)
except Exception as e:
print(f"处理文章失败: {url}, 错误: {e}")
website_failed += 1
results["failed_count"] += 1
results["website_results"][website_name] = {
"found_urls": len(article_urls),
"processed": len(article_urls),
"success": website_success,
"failed": website_failed
}
print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}")
except Exception as e:
print(f"爬取 {website_name} 时出错: {e}")
results["website_results"][website_name] = {
"found_urls": 0,
"processed": 0,
"success": 0,
"failed": 1,
"error": str(e)
}
results["failed_count"] += 1
results["total_articles"] = results["success_count"] + results["failed_count"]
print(f"\n爬取完成!")
print(f"关键词: {keyword}")
print(f"总文章数: {results['total_articles']}")
print(f"成功: {results['success_count']}")
print(f"失败: {results['failed_count']}")
return results
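# Example usage (hypothetical arguments): crawl two configured sites for one keyword
# and inspect the per-site statistics.
#   stats = crawl_by_keyword("数字经济", website_names=["新华网", "央视网"],
#                            max_pages=2, max_articles=20)
#   print(stats["website_results"])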
def crawl_historical_articles(website_names=None, start_date=None, end_date=None, max_articles_per_site=50):
"""
爬取历史文章
Args:
website_names: 网站名称列表
start_date: 开始日期 (YYYY-MM-DD)
end_date: 结束日期 (YYYY-MM-DD)
max_articles_per_site: 每个网站最大文章数
Returns:
dict: 爬取结果统计
"""
if not start_date:
start_date = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
if not end_date:
end_date = datetime.now().strftime("%Y-%m-%d")
if website_names is None:
website_names = list(WEBSITE_SEARCH_CONFIGS.keys())
results = {
"start_date": start_date,
"end_date": end_date,
"total_articles": 0,
"success_count": 0,
"failed_count": 0,
"website_results": {}
}
print(f"开始爬取历史文章...")
print(f"日期范围: {start_date}{end_date}")
print(f"目标网站: {', '.join(website_names)}")
# 使用通用关键词搜索历史文章
common_keywords = ["新闻", "报道", "文章", "资讯", "动态"]
for website_name in website_names:
print(f"\n开始爬取 {website_name} 历史文章...")
try:
from core.models import Website
website, created = Website.objects.get_or_create(
name=website_name,
defaults={
'base_url': WEBSITE_SEARCH_CONFIGS[website_name]["search_url"],
'enabled': True
}
)
website_success = 0
website_failed = 0
all_urls = set()
# Search with several keywords
for keyword in common_keywords:
try:
article_urls = search_articles_by_keyword(
website_name, keyword, max_pages=5,
start_date=start_date, end_date=end_date
)
all_urls.update(article_urls)
if len(all_urls) >= max_articles_per_site:
break
except Exception as e:
print(f"搜索关键词 '{keyword}' 时出错: {e}")
continue
# Cap the number of articles
article_urls = list(all_urls)[:max_articles_per_site]
if not article_urls:
print(f"{website_name} 没有找到历史文章")
results["website_results"][website_name] = {
"found_urls": 0,
"processed": 0,
"success": 0,
"failed": 0
}
continue
print(f"{website_name} 找到 {len(article_urls)} 篇历史文章,开始处理...")
for i, url in enumerate(article_urls, 1):
try:
print(f"处理第 {i}/{len(article_urls)} 篇: {url}")
process_article(url, website)
website_success += 1
results["success_count"] += 1
time.sleep(0.5)
except Exception as e:
print(f"处理文章失败: {url}, 错误: {e}")
website_failed += 1
results["failed_count"] += 1
results["website_results"][website_name] = {
"found_urls": len(article_urls),
"processed": len(article_urls),
"success": website_success,
"failed": website_failed
}
print(f"{website_name} 完成: 成功 {website_success}, 失败 {website_failed}")
except Exception as e:
print(f"爬取 {website_name} 历史文章时出错: {e}")
results["website_results"][website_name] = {
"found_urls": 0,
"processed": 0,
"success": 0,
"failed": 1,
"error": str(e)
}
results["failed_count"] += 1
results["total_articles"] = results["success_count"] + results["failed_count"]
print(f"\n历史文章爬取完成!")
print(f"日期范围: {start_date}{end_date}")
print(f"总文章数: {results['total_articles']}")
print(f"成功: {results['success_count']}")
print(f"失败: {results['failed_count']}")
return results
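# Example usage (hypothetical date range): backfill roughly one week of articles
# from every configured site.
#   crawl_historical_articles(start_date="2025-09-17", end_date="2025-09-24",
#                             max_articles_per_site=20)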