Add log
@@ -16,14 +16,18 @@ def crawl_website(self, website_id, node_id=None, batch_id=None):
     try:
         website = Website.objects.get(id=website_id)
         logger.info(f"Starting crawl of website: {website.name} (node: {node_id}, batch: {batch_id})")
+        logger.info(f"Website URL: {website.base_url}")
 
         # Record task start
         if node_id and batch_id:
             from .distributed_crawler import distributed_crawler
             distributed_crawler.heartbeat(node_id, 1)
+            logger.info(f"Distributed crawler heartbeat sent - node: {node_id}, status: 1")
 
         # Invoke the crawler
+        logger.info(f"Calling full_site_crawler for website: {website.name}")
         full_site_crawler(website.base_url, website, max_pages=100)
+        logger.info(f"Finished full_site_crawler for website: {website.name}")
 
         # Collect statistics
         article_count = website.article_set.count()
@@ -32,8 +36,9 @@ def crawl_website(self, website_id, node_id=None, batch_id=None):
         # Record task completion
         if node_id and batch_id:
             distributed_crawler.heartbeat(node_id, 0)
+            logger.info(f"Distributed crawler heartbeat sent - node: {node_id}, status: 0")
 
-        return {
+        result = {
             'website_id': website_id,
             'website_name': website.name,
             'article_count': article_count,
@@ -41,13 +46,18 @@ def crawl_website(self, website_id, node_id=None, batch_id=None):
             'node_id': node_id,
             'batch_id': batch_id
         }
+        logger.info(f"Task finished, returning result: {result}")
+        return result
 
     except Website.DoesNotExist:
-        logger.error(f"Website does not exist: {website_id}")
+        error_msg = f"Website does not exist: {website_id}"
+        logger.error(error_msg)
         raise
     except Exception as exc:
-        logger.error(f"Failed to crawl website {website_id}: {exc}")
+        error_msg = f"Failed to crawl website {website_id}: {exc}"
+        logger.error(error_msg)
         # Retry the task
+        logger.info("Preparing to retry task; will retry in 5 minutes")
         raise self.retry(exc=exc, countdown=60 * 5)  # retry after 5 minutes
 
 
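Note: the self parameter in the hunk headers and the raise self.retry(exc=exc, countdown=60 * 5) call above imply that crawl_website is a bound Celery task. A minimal sketch of the assumed setup, for orientation only (the decorator, its arguments, and the logger wiring are not shown in this diff and are assumptions):

import logging
from celery import shared_task

logger = logging.getLogger(__name__)

@shared_task(bind=True, max_retries=3)  # bind=True exposes the task instance as self (assumed settings)
def crawl_website(self, website_id, node_id=None, batch_id=None):
    try:
        ...  # crawl logic as in the diff above
    except Exception as exc:
        # self.retry raises a Retry exception; Celery re-queues the task after countdown seconds
        raise self.retry(exc=exc, countdown=60 * 5)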
@@ -62,19 +72,23 @@ def crawl_all_websites(self):
         # Get all enabled websites
         websites = Website.objects.filter(enabled=True)
         total_websites = websites.count()
+        logger.info(f"Found {total_websites} enabled websites")
 
         results = []
         for website in websites:
             try:
+                logger.info(f"Launching crawl task for website {website.name}")
                 # Dispatch the single-website crawl task
                 result = crawl_website.delay(website.id)
+                logger.info(f"Crawl task for website {website.name} launched, task ID: {result.id}")
                 results.append({
                     'website_id': website.id,
                     'website_name': website.name,
                     'task_id': result.id
                 })
             except Exception as e:
-                logger.error(f"Failed to launch crawl task for website {website.name}: {e}")
+                error_msg = f"Failed to launch crawl task for website {website.name}: {e}"
+                logger.error(error_msg)
                 results.append({
                     'website_id': website.id,
                     'website_name': website.name,
@@ -90,7 +104,8 @@ def crawl_all_websites(self):
         }
 
     except Exception as exc:
-        logger.error(f"Batch crawl task failed: {exc}")
+        error_msg = f"Batch crawl task failed: {exc}"
+        logger.error(error_msg)
         raise self.retry(exc=exc, countdown=60 * 10)  # retry after 10 minutes
 
 
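Note: crawl_website.delay(website.id) in crawl_all_websites above enqueues one Celery task per website and returns an AsyncResult whose id is what gets logged and stored. A minimal sketch of that fan-out pattern (fan_out is a hypothetical helper named only for illustration; the real code lives inside the task body shown in the diff):

def fan_out(websites):
    results = []
    for website in websites:
        async_result = crawl_website.delay(website.id)  # enqueue a crawl task for this website
        results.append({
            'website_id': website.id,
            'website_name': website.name,
            'task_id': async_result.id,  # broker-assigned task id, useful for later status checks
        })
    return results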
@@ -103,7 +118,9 @@ def crawl_specific_media(media_list):
         logger.info(f"Starting crawl of specified media: {media_list}")
 
         # Invoke the management command
+        logger.info("Calling the crawl_all_media management command")
         call_command('crawl_all_media', media=','.join(media_list))
+        logger.info("crawl_all_media management command finished")
 
         return {
             'media_list': media_list,
@@ -111,7 +128,8 @@ def crawl_specific_media(media_list):
         }
 
     except Exception as e:
-        logger.error(f"Failed to crawl specified media: {e}")
+        error_msg = f"Failed to crawl specified media: {e}"
+        logger.error(error_msg)
         raise
 
 
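Note: crawl_specific_media drives an existing management command through Django's call_command, passing the media names as one comma-separated option value. A minimal sketch of that call (the media names below are hypothetical; the option keyword follows the argument used in the diff):

from django.core.management import call_command

media_list = ['media_a', 'media_b']  # hypothetical media identifiers
call_command('crawl_all_media', media=','.join(media_list))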
@@ -125,10 +143,13 @@ def cleanup_old_articles(days=30):
         from datetime import timedelta
 
         cutoff_date = timezone.now() - timedelta(days=days)
+        logger.info(f"Looking for articles older than {days} days, cutoff date: {cutoff_date}")
         old_articles = Article.objects.filter(created_at__lt=cutoff_date)
         count = old_articles.count()
+        logger.info(f"Found {count} old articles")
 
         old_articles.delete()
+        logger.info(f"Deleted {count} old articles")
 
         logger.info(f"Cleaned up {count} old articles (older than {days} days)")
 
@@ -139,7 +160,8 @@ def cleanup_old_articles(days=30):
         }
 
     except Exception as e:
-        logger.error(f"Failed to clean up old articles: {e}")
+        error_msg = f"Failed to clean up old articles: {e}"
+        logger.error(error_msg)
         raise
 
 
@@ -152,7 +174,9 @@ def export_articles():
         logger.info("Starting article export")
 
         # Invoke the export command
+        logger.info("Calling the export_articles management command")
         call_command('export_articles')
+        logger.info("export_articles management command finished")
 
         return {
             'status': 'success',
@@ -160,7 +184,8 @@ def export_articles():
         }
 
     except Exception as e:
-        logger.error(f"Failed to export articles: {e}")
+        error_msg = f"Failed to export articles: {e}"
+        logger.error(error_msg)
         raise
 
 
@@ -170,26 +195,33 @@ def health_check():
     Health check task
     """
     try:
+        logger.info("Starting health check")
         # Check the database connection
         website_count = Website.objects.count()
         article_count = Article.objects.count()
+        logger.info(f"Database OK - websites: {website_count}, articles: {article_count}")
 
         # Check the Redis connection
         from django.core.cache import cache
+        logger.info("Checking Redis connection")
         cache.set('health_check', 'ok', 60)
         cache_result = cache.get('health_check')
+        logger.info(f"Redis connection status: {'ok' if cache_result == 'ok' else 'error'}")
 
-        return {
+        result = {
             'database': 'ok',
             'redis': 'ok' if cache_result == 'ok' else 'error',
             'website_count': website_count,
             'article_count': article_count,
             'status': 'healthy'
         }
+        logger.info(f"Health check completed, result: {result}")
+        return result
 
     except Exception as e:
-        logger.error(f"Health check failed: {e}")
+        error_msg = f"Health check failed: {e}"
+        logger.error(error_msg)
         return {
             'status': 'unhealthy',
             'error': str(e)
         }
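Note: the health check verifies Redis by writing a sentinel value through Django's cache framework and reading it back. A minimal sketch of that round trip (assumes the default cache backend is configured against Redis, which this diff implies but does not show):

from django.core.cache import cache

cache.set('health_check', 'ok', 60)           # store the sentinel for 60 seconds
redis_ok = cache.get('health_check') == 'ok'  # read it back; None or a mismatch means the cache is unreachable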