Add support *

This commit is contained in:
2025-09-26 15:55:48 +08:00
parent f15b730dca
commit 181a78ad44
5 changed files with 1004 additions and 100 deletions

View File

@@ -148,15 +148,29 @@ class WebsiteCrawler:
return links
def check_keyword_match(self, text, title):
    """Return the configured keywords that occur in ``text`` or ``title``.

    Matching is case-insensitive. A ``*`` inside a keyword acts as a
    wildcard: it is translated to the regular expression ``.*`` and therefore
    stands for any run of characters that does not cross a line break. Every
    other character of the keyword is matched literally.

    Args:
        text: Body text to search. ``None`` is treated as an empty string.
        title: Title to search. ``None`` is treated as an empty string.

    Returns:
        list: One entry for every keyword in ``self.keywords`` that matched,
        in the order the keywords are configured.
    """
    # Imported once per call instead of once per keyword; kept local because
    # the module-level imports of this file are not guaranteed to include it.
    import re

    text = text or ''
    title = title or ''
    matched_keywords = []

    for keyword in self.keywords:
        # Escape every regex metacharacter first, then turn the escaped
        # ``\*`` back into the wildcard ``.*``.
        regex_pattern = re.escape(keyword).replace(r'\*', '.*')
        try:
            pattern = re.compile(regex_pattern, re.IGNORECASE)
        except re.error:
            # Defensive fallback: if the pattern cannot be compiled, use a
            # plain case-insensitive substring test instead.
            keyword_lower = keyword.lower()
            found = (keyword_lower in text.lower()
                     or keyword_lower in title.lower())
        else:
            found = bool(pattern.search(text) or pattern.search(title))

        # A keyword is recorded exactly once, however many places it matched.
        if found:
            matched_keywords.append(keyword)

    return matched_keywords
@@ -200,23 +214,131 @@ class WebsiteCrawler:
'.publish-date',
'time[datetime]',
'.article-time',
'.news-time'
'.news-time',
'.post-time',
'.create-time',
'.update-time',
'.time span',
'.date span',
'.info span', # 一些网站使用.info类包含发布信息
'.meta span',
'.meta-info',
'.article-info span',
'.news-info span',
'.content-info span',
'.a-shijian', # 上海纪检监察网站的发布时间类
'.l-time' # 天津纪检监察网站的发布时间类
]
for selector in date_selectors:
element = soup.select_one(selector)
if element:
elements = soup.select(selector)
for element in elements:
date_text = element.get_text().strip()
if element.get('datetime'):
date_text = element.get('datetime')
# 如果文本太短或为空,跳过
if not date_text or len(date_text) < 4:
continue
# 尝试解析日期
try:
from datetime import datetime
# 这里可以添加更复杂的日期解析逻辑
# 暂时返回当前时间
return timezone.now()
except:
import re
# 清理日期文本,移除常见的无关字符
date_text = re.sub(r'发布(时间|日期)[:]?', '', date_text).strip()
date_text = re.sub(r'时间[:]?', '', date_text).strip()
date_text = re.sub(r'日期[:]?', '', date_text).strip()
date_text = re.sub(r'发表于[:]?', '', date_text).strip()
date_text = re.sub(r'更新[:]?', '', date_text).strip()
date_text = re.sub(r'\s+', ' ', date_text).strip() # 替换多个空白字符为单个空格
# 如果有 datetime 属性且是标准格式,直接使用
if element.get('datetime'):
datetime_attr = element.get('datetime')
# 尝试解析常见的日期时间格式
for fmt in [
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%dT%H:%M:%S',
'%Y-%m-%dT%H:%M:%S%z',
'%Y-%m-%d %H:%M',
'%Y-%m-%d',
'%Y/%m/%d %H:%M:%S',
'%Y/%m/%d %H:%M',
'%Y/%m/%d',
'%Y年%m月%d%H:%M:%S',
'%Y年%m月%d%H:%M',
'%Y年%m月%d',
'%m/%d/%Y %H:%M:%S',
'%m/%d/%Y %H:%M',
'%m/%d/%Y',
'%d/%m/%Y %H:%M:%S',
'%d/%m/%Y %H:%M',
'%d/%m/%Y',
'%d.%m.%Y %H:%M:%S',
'%d.%m.%Y %H:%M',
'%d.%m.%Y'
]:
try:
if '%z' in fmt and '+' not in datetime_attr and datetime_attr.endswith('Z'):
datetime_attr = datetime_attr[:-1] + '+0000'
parsed_date = datetime.strptime(datetime_attr, fmt)
if not timezone.is_aware(parsed_date):
parsed_date = timezone.make_aware(parsed_date)
return parsed_date
except ValueError:
continue
# 尝试解析从文本中提取的日期
# 尝试解析各种常见的中文日期格式
for fmt in [
'%Y年%m月%d%H:%M:%S',
'%Y年%m月%d%H:%M',
'%Y年%m月%d',
'%Y-%m-%d %H:%M:%S',
'%Y-%m-%d %H:%M',
'%Y-%m-%d',
'%Y/%m/%d %H:%M:%S',
'%Y/%m/%d %H:%M',
'%Y/%m/%d',
'%m月%d%H:%M',
'%m月%d',
'%m/%d/%Y %H:%M:%S',
'%m/%d/%Y %H:%M',
'%m/%d/%Y',
'%d/%m/%Y %H:%M:%S',
'%d/%m/%Y %H:%M',
'%d/%m/%Y',
'%d.%m.%Y %H:%M:%S',
'%d.%m.%Y %H:%M',
'%d.%m.%Y'
]:
try:
parsed_date = datetime.strptime(date_text, fmt)
# 如果没有年份,使用当前年份
if '%Y' not in fmt:
parsed_date = parsed_date.replace(year=datetime.now().year)
if not timezone.is_aware(parsed_date):
parsed_date = timezone.make_aware(parsed_date)
return parsed_date
except ValueError:
continue
# 如果以上格式都不匹配,尝试使用 dateutil 解析
try:
from dateutil import parser
# 过滤掉明显不是日期的文本
if len(date_text) > 5 and not date_text.isdigit():
parsed_date = parser.parse(date_text)
if not timezone.is_aware(parsed_date):
parsed_date = timezone.make_aware(parsed_date)
return parsed_date
except:
pass
except Exception as e:
self.log('debug', f'解析日期失败: {date_text}, 错误: {str(e)}')
continue
return None

View File

@@ -84,67 +84,58 @@
<div class="col-12 mb-3">
<div class="card">
<div class="card-body">
<div class="row">
<div class="col-md-6">
<form method="get" class="d-flex">
<select name="website" class="form-select me-2" onchange="this.form.submit()">
<option value="">所有网站</option>
{% for website in stats.websites %}
<option value="{{ website.id }}" {% if website.id == stats.selected_website_id %}selected{% endif %}>
{{ website.name }} ({{ website.region }})
</option>
{% endfor %}
</select>
<select name="page_size" class="form-select me-2" onchange="this.form.submit()">
<option value="10" {% if stats.page_size == 10 %}selected{% endif %}>10条/页</option>
<option value="20" {% if stats.page_size == 20 %}selected{% endif %}>20条/页</option>
<option value="50" {% if stats.page_size == 50 %}selected{% endif %}>50条/页</option>
<option value="100" {% if stats.page_size == 100 %}selected{% endif %}>100条/页</option>
</select>
<noscript>
<button type="submit" class="btn btn-primary">应用</button>
</noscript>
</form>
<form method="get" class="row g-3">
<div class="col-md-4">
<label for="website" class="form-label">网站筛选</label>
<select name="website" id="website" class="form-select" onchange="this.form.submit()">
<option value="">所有网站</option>
{% for website in stats.websites %}
<option value="{{ website.id }}" {% if website.id == stats.selected_website_id %}selected{% endif %}>
{{ website.name }} ({{ website.region }})
</option>
{% endfor %}
</select>
</div>
<div class="col-md-6">
<!-- 分页导航 -->
{% if stats.page_obj.has_other_pages %}
<nav aria-label="页面导航">
<ul class="pagination justify-content-end mb-0">
{% if stats.page_obj.has_previous %}
<li class="page-item">
<a class="page-link" href="?page={{ stats.page_obj.previous_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="上一页">
<span aria-hidden="true">&laquo;</span>
</a>
</li>
{% endif %}
{% for num in stats.page_obj.paginator.page_range %}
{% if stats.page_obj.number == num %}
<li class="page-item active">
<span class="page-link">{{ num }}</span>
</li>
{% elif num > stats.page_obj.number|add:'-3' and num < stats.page_obj.number|add:'3' %}
<li class="page-item">
<a class="page-link" href="?page={{ num }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}">{{ num }}</a>
</li>
{% endif %}
{% endfor %}
{% if stats.page_obj.has_next %}
<li class="page-item">
<a class="page-link" href="?page={{ stats.page_obj.next_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="下一页">
<span aria-hidden="true">&raquo;</span>
</a>
</li>
{% endif %}
</ul>
</nav>
{% endif %}
<div class="col-md-2">
<label for="page_size" class="form-label">每页条数</label>
<select name="page_size" id="page_size" class="form-select" onchange="this.form.submit()">
<option value="10" {% if stats.page_size == 10 %}selected{% endif %}>10条/页</option>
<option value="20" {% if stats.page_size == 20 %}selected{% endif %}>20条/页</option>
<option value="50" {% if stats.page_size == 50 %}selected{% endif %}>50条/页</option>
<option value="100" {% if stats.page_size == 100 %}selected{% endif %}>100条/页</option>
</select>
</div>
</div>
<div class="col-md-2">
<label for="start_date" class="form-label">开始日期</label>
<input type="date"
class="form-control"
id="start_date"
name="start_date"
value="{{ stats.start_date }}">
</div>
<div class="col-md-2">
<label for="end_date" class="form-label">结束日期</label>
<input type="date"
class="form-control"
id="end_date"
name="end_date"
value="{{ stats.end_date }}">
</div>
<div class="col-md-2 d-flex align-items-end">
<div class="btn-group" role="group">
<button type="submit" class="btn btn-primary">
<i class="bi bi-funnel"></i> 筛选
</button>
<a href="/" class="btn btn-outline-secondary">
<i class="bi bi-x-circle"></i> 清除
</a>
</div>
</div>
</form>
</div>
</div>
</div>
@@ -240,7 +231,7 @@
<ul class="pagination mb-0">
{% if stats.page_obj.has_previous %}
<li class="page-item">
<a class="page-link" href="?page={{ stats.page_obj.previous_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="上一页">
<a class="page-link" href="?page={{ stats.page_obj.previous_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}{% if stats.start_date %}&start_date={{ stats.start_date }}{% endif %}{% if stats.end_date %}&end_date={{ stats.end_date }}{% endif %}" aria-label="上一页">
<span aria-hidden="true">&laquo;</span>
</a>
</li>
@@ -253,14 +244,14 @@
</li>
{% elif num > stats.page_obj.number|add:'-3' and num < stats.page_obj.number|add:'3' %}
<li class="page-item">
<a class="page-link" href="?page={{ num }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}">{{ num }}</a>
<a class="page-link" href="?page={{ num }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}{% if stats.start_date %}&start_date={{ stats.start_date }}{% endif %}{% if stats.end_date %}&end_date={{ stats.end_date }}{% endif %}">{{ num }}</a>
</li>
{% endif %}
{% endfor %}
{% if stats.page_obj.has_next %}
<li class="page-item">
<a class="page-link" href="?page={{ stats.page_obj.next_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}" aria-label="下一页">
<a class="page-link" href="?page={{ stats.page_obj.next_page_number }}{% if stats.selected_website_id %}&website={{ stats.selected_website_id }}{% endif %}{% if stats.page_size %}&page_size={{ stats.page_size }}{% endif %}{% if stats.start_date %}&start_date={{ stats.start_date }}{% endif %}{% if stats.end_date %}&end_date={{ stats.end_date }}{% endif %}" aria-label="下一页">
<span aria-hidden="true">&raquo;</span>
</a>
</li>

View File

@@ -18,16 +18,41 @@
<div class="card">
<div class="card-body">
<form method="get" action="{% url 'search' %}">
<div class="input-group input-group-lg">
<input type="text"
class="form-control"
name="q"
value="{{ keyword }}"
placeholder="输入关键字搜索内容..."
required>
<button class="btn btn-primary" type="submit">
<i class="bi bi-search"></i> 搜索
</button>
<div class="row g-3">
<div class="col-md-6">
<label for="keyword" class="form-label">搜索关键字</label>
<div class="input-group">
<input type="text"
class="form-control"
id="keyword"
name="q"
value="{{ keyword }}"
placeholder="输入关键字搜索内容,支持 * 通配符..."
required>
<button class="btn btn-primary" type="submit">
<i class="bi bi-search"></i> 搜索
</button>
</div>
<div class="form-text">
支持通配符:使用 * 进行模糊匹配,例如 "反腐*" 可以匹配 "反腐倡廉"、"反腐斗争" 等
</div>
</div>
<div class="col-md-3">
<label for="start_date" class="form-label">开始日期</label>
<input type="date"
class="form-control"
id="start_date"
name="start_date"
value="{{ start_date }}">
</div>
<div class="col-md-3">
<label for="end_date" class="form-label">结束日期</label>
<input type="date"
class="form-control"
id="end_date"
name="end_date"
value="{{ end_date }}">
</div>
</div>
</form>
</div>
@@ -36,7 +61,7 @@
</div>
<!-- 搜索结果 -->
{% if keyword %}
{% if keyword or start_date or end_date %}
<div class="row">
<div class="col-12">
<div class="card">
@@ -93,7 +118,13 @@
{% else %}
<div class="text-center py-5">
<i class="bi bi-search fs-1 text-muted"></i>
<p class="text-muted mt-3">没有找到包含 "{{ keyword }}" 的内容</p>
<p class="text-muted mt-3">
{% if keyword %}
没有找到包含 "{{ keyword }}" 的内容
{% else %}
没有找到符合时间范围的内容
{% endif %}
</p>
<p class="text-muted">请尝试其他关键字或检查爬取任务是否正常运行</p>
</div>
{% endif %}
@@ -119,6 +150,11 @@
<span class="badge bg-light text-dark">廉政</span>
<span class="badge bg-light text-dark">违纪</span>
</div>
<div class="mt-3">
<small class="text-muted">
<i class="bi bi-info-circle"></i> 支持通配符:使用 * 进行模糊匹配,例如 "反腐*" 可以匹配 "反腐倡廉"、"反腐斗争" 等
</small>
</div>
</div>
</div>
</div>

View File

@@ -42,6 +42,10 @@ def dashboard(request):
page_number = request.GET.get('page', 1)
page_size = request.GET.get('page_size', 20) # 默认每页20篇文章
# 获取时间范围参数
start_date = request.GET.get('start_date', '')
end_date = request.GET.get('end_date', '')
# 尝试转换page_size为整数
try:
page_size = int(page_size)
@@ -61,6 +65,20 @@ def dashboard(request):
except (ValueError, TypeError):
pass
# 添加时间范围筛选
if start_date:
all_contents = all_contents.filter(created_at__gte=start_date)
if end_date:
# 将结束日期设置为当天的结束时间
from django.utils import timezone
from datetime import datetime
try:
end_datetime = datetime.strptime(end_date, '%Y-%m-%d')
end_datetime = timezone.make_aware(end_datetime.replace(hour=23, minute=59, second=59))
all_contents = all_contents.filter(created_at__lte=end_datetime)
except ValueError:
pass # 如果日期格式不正确,忽略时间筛选
# 分页处理
paginator = Paginator(all_contents, page_size)
page_obj = paginator.get_page(page_number)
@@ -93,6 +111,8 @@ def dashboard(request):
'page_size': page_size,
'recent_tasks': recent_tasks,
'total_media_files': total_media_files,
'start_date': start_date,
'end_date': end_date,
}
return render(request, 'crawler/dashboard.html', {'stats': stats})
@@ -101,6 +121,8 @@ def dashboard(request):
def search_page(request):
"""搜索页面视图"""
keyword = request.GET.get('q', '').strip()
start_date = request.GET.get('start_date', '')
end_date = request.GET.get('end_date', '')
contents = []
if keyword:
@@ -110,15 +132,38 @@ def search_page(request):
defaults={'last_used': timezone.now()}
)
# 搜索内容
contents = CrawledContent.objects.filter(
Q(title__icontains=keyword) |
Q(content__icontains=keyword) |
Q(keywords_matched__icontains=keyword)
).order_by('-created_at')[:50]
# 构建模糊搜索查询
import re
# 处理通配符:将用户输入的 * 转换为数据库正则表达式的 .*
# 同时转义特殊字符以防止正则表达式错误
escaped_keyword = re.escape(keyword)
regex_pattern = escaped_keyword.replace(r'\*', '.*')
# 搜索内容 - 使用正则表达式进行模糊匹配
contents_query = CrawledContent.objects.filter(
Q(title__iregex=regex_pattern) |
Q(content__iregex=regex_pattern) |
Q(keywords_matched__iregex=regex_pattern)
)
# 添加时间范围筛选
if start_date:
contents_query = contents_query.filter(created_at__gte=start_date)
if end_date:
# 将结束日期设置为当天的结束时间
from django.utils import timezone
from datetime import datetime
end_datetime = datetime.strptime(end_date, '%Y-%m-%d')
end_datetime = timezone.make_aware(end_datetime.replace(hour=23, minute=59, second=59))
contents_query = contents_query.filter(created_at__lte=end_datetime)
contents = contents_query.order_by('-created_at')[:50]
return render(request, 'crawler/search.html', {
'keyword': keyword,
'start_date': start_date,
'end_date': end_date,
'contents': contents
})
@@ -281,10 +326,10 @@ def preview_crawled_content(request, content_id):
<div class="meta">
<p><strong>来源网站:</strong> {content.website.name} ({content.website.region})</p>
<p><strong>原始链接:</strong> <a href="{content.url}" target="_blank">{content.url}</a></p>
<p><strong>发布时间:</strong> {content.publish_date or '未知'}</p>
<p><strong>发布时间:</strong> {content.publish_date.strftime('%Y-%m-%d %H:%M:%S') if content.publish_date else '未知'}</p>
<p><strong>作者:</strong> {content.author or '未知'}</p>
<p><strong>匹配关键字:</strong> {content.keywords_matched}</p>
<p><strong>爬取时间:</strong> {content.created_at}</p>
<p><strong>爬取时间:</strong> {content.created_at.strftime('%Y-%m-%d %H:%M:%S')}</p>
<p><strong>媒体文件数量:</strong> {len(media_files)}</p>
</div>
@@ -315,11 +360,11 @@ def download_crawled_content(request, content_id):
# 添加元数据
doc.add_paragraph(f'来源网站: {content.website.name} ({content.website.region})')
doc.add_paragraph(f'原始链接: {content.url}')
doc.add_paragraph(f'发布时间: {content.publish_date or "未知"}')
doc.add_paragraph(f'发布时间: {content.publish_date.strftime("%Y-%m-%d %H:%M:%S") if content.publish_date else "未知"}')
doc.add_paragraph(f'作者: {content.author or "未知"}')
doc.add_paragraph(f'匹配关键字: {content.keywords_matched}')
doc.add_paragraph(f'爬取时间: {content.created_at}')
doc.add_paragraph(f'爬取时间: {content.created_at.strftime("%Y-%m-%d %H:%M:%S")}')
# 添加内容
doc.add_heading('正文', level=1)
for paragraph in content.content.split('\n\n'):
@@ -391,11 +436,11 @@ def download_selected_contents(request):
# 添加元数据
doc.add_paragraph(f'来源网站: {content.website.name} ({content.website.region})')
doc.add_paragraph(f'原始链接: {content.url}')
doc.add_paragraph(f'发布时间: {content.publish_date or "未知"}')
doc.add_paragraph(f'发布时间: {content.publish_date.strftime("%Y-%m-%d %H:%M:%S") if content.publish_date else "未知"}')
doc.add_paragraph(f'作者: {content.author or "未知"}')
doc.add_paragraph(f'匹配关键字: {content.keywords_matched}')
doc.add_paragraph(f'爬取时间: {content.created_at}')
doc.add_paragraph(f'爬取时间: {content.created_at.strftime("%Y-%m-%d %H:%M:%S")}')
# 添加内容
doc.add_heading('正文', level=1)
for paragraph in content.content.split('\n\n'):