本文详解如何用Python开发Telegram机器人,实现服务器监控和自动告警。
前言
服务器出问题了,怎么第一时间知道?
- 邮件通知?可能漏看
- 短信通知?要钱
- Telegram Bot?免费、实时、还能远程控制
今天用Python开发一个服务器监控机器人。
一、Telegram Bot简介
1.1 为什么选择Telegram
优点:
- 完全免费
- API简单易用
- 消息实时推送
- 支持群组/频道
- 可发送文件、图片
- 支持命令和按钮交互
适用场景:
- 服务器监控告警
- CI/CD通知
- 定时任务提醒
- 远程执行命令
- 日志推送
1.2 创建Bot
1. 打开Telegram,搜索 @BotFather
2. 发送 /newbot
3. 输入Bot名称,如:MyServerBot
4. 输入Bot用户名,如:my_server_monitor_bot
5. 获得Token:123456789:ABCdefGHIjklMNOpqrsTUVwxyz
6. 保存好Token!
1.3 获取Chat ID
方法1:使用@userinfobot
- 搜索@userinfobot
- 发送任意消息
- 返回你的Chat ID
方法2:API获取
- 给你的Bot发送消息
- 访问:https://api.telegram.org/bot<TOKEN>/getUpdates
- 在返回的JSON中找到chat.id
二、环境准备
2.1 安装依赖
pip install python-telegram-bot
pip install psutil # 系统监控
pip install aiohttp # 异步HTTP
pip install apscheduler # 定时任务
2.2 项目结构
telegram_bot/
├── bot.py # 主程序
├── config.py # 配置
├── monitors/ # 监控模块
│ ├── __init__.py
│ ├── cpu.py
│ ├── memory.py
│ ├── disk.py
│ └── network.py
├── handlers/ # 命令处理
│ ├── __init__.py
│ └── commands.py
└── requirements.txt
三、基础Bot开发
3.1 Hello World
# bot.py
from telegram import Update
from telegram.ext import Application, CommandHandler, ContextTypes# 配置
TOKEN = "你的Bot Token"# 命令处理器
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply to /start with a short greeting."""
    # CommandHandler also receives edited messages by default; for those
    # update.message is None, so reply through effective_message instead.
    await update.effective_message.reply_text("你好!我是服务器监控机器人。")


async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply to /help with the list of commands."""
    # NOTE: only /start and /help are registered in main() below; the other
    # commands listed here have no handler in this snippet.
    help_text = """
可用命令:
/start - 开始
/help - 帮助
/status - 服务器状态
/cpu - CPU使用率
/memory - 内存使用
/disk - 磁盘使用"""
    await update.effective_message.reply_text(help_text)


def main():
    """Build the application, register the command handlers and start polling."""
    app = Application.builder().token(TOKEN).build()

    # Register commands
    app.add_handler(CommandHandler("start", start))
    app.add_handler(CommandHandler("help", help_command))

    # run_polling() blocks until the process is stopped
    print("Bot启动中...")
    app.run_polling()


if __name__ == "__main__":
    main()
3.2 运行测试
python bot.py

# In Telegram:
# 1. Search for your bot
# 2. Send /start
# 3. Send /help
四、系统监控功能
4.1 CPU监控
# monitors/cpu.py
import psutil


def _usage_bar(percent):
    """Render *percent* as a 10-cell text bar, one filled cell per full 10%."""
    filled = max(0, min(10, int(percent / 10)))
    return "█" * filled + "░" * (10 - filled)


def get_cpu_info():
    """Collect CPU usage, core count and current frequency.

    Returns:
        dict with keys ``percent`` (overall usage), ``count`` (value of
        ``psutil.cpu_count()``), ``freq`` (current frequency, 0 when
        ``psutil.cpu_freq()`` returns nothing) and ``per_cpu`` (list of
        per-core usage percentages).
    """
    # Sample once (blocks for 1 second) and derive the overall figure from the
    # per-core values, instead of blocking twice for two separate samples.
    # The mean of the per-core percentages approximates the system-wide value.
    per_cpu = psutil.cpu_percent(interval=1, percpu=True)
    cpu_percent = round(sum(per_cpu) / len(per_cpu), 1) if per_cpu else 0.0
    cpu_freq = psutil.cpu_freq()

    return {
        "percent": cpu_percent,
        "count": psutil.cpu_count(),
        "freq": cpu_freq.current if cpu_freq else 0,
        "per_cpu": per_cpu,
    }


def format_cpu_info():
    """Return the CPU information as a multi-line text message."""
    info = get_cpu_info()

    header = f"""
🖥️ CPU状态
━━━━━━━━━━━━━━━
总使用率: {info['percent']}%
核心数量: {info['count']}
当前频率: {info['freq']:.0f} MHz

各核心使用率:
"""
    rows = [
        f" 核心{i}: [{_usage_bar(p)}] {p}%\n"
        for i, p in enumerate(info['per_cpu'])
    ]
    return header + "".join(rows)
4.2 内存监控
# monitors/memory.py
import psutil

_GIB = 1024 ** 3


def get_memory_info():
    """Collect physical-memory and swap usage.

    Returns:
        dict with ``total``/``used``/``available`` and
        ``swap_total``/``swap_used`` converted from bytes by dividing by
        1024**3, plus ``percent`` and ``swap_percent`` as reported by psutil.
    """
    mem = psutil.virtual_memory()
    swap = psutil.swap_memory()

    return {
        "total": mem.total / _GIB,
        "used": mem.used / _GIB,
        "available": mem.available / _GIB,
        "percent": mem.percent,
        "swap_total": swap.total / _GIB,
        "swap_used": swap.used / _GIB,
        "swap_percent": swap.percent,
    }


def format_memory_info():
    """Return the memory information as a multi-line text message."""
    info = get_memory_info()

    # 10-cell bar, one filled cell per full 10% of memory in use
    filled = int(info['percent'] / 10)
    bar = "█" * filled + "░" * (10 - filled)

    # Each field sits on its own line; the section headings are separated by
    # blank lines so the message stays readable in the chat.
    return f"""
💾 内存状态
━━━━━━━━━━━━━━━
使用率: [{bar}] {info['percent']}%

物理内存:
  总量: {info['total']:.1f} GB
  已用: {info['used']:.1f} GB
  可用: {info['available']:.1f} GB

交换分区:
  总量: {info['swap_total']:.1f} GB
  已用: {info['swap_used']:.1f} GB ({info['swap_percent']}%)
"""
4.3 磁盘监控
# monitors/disk.py
import psutil

_GIB = 1024 ** 3


def get_disk_info():
    """Collect usage figures for every partition psutil reports.

    Returns:
        list of dicts with ``device``, ``mountpoint``, ``total``/``used``/
        ``free`` (bytes divided by 1024**3) and ``percent``. Partitions whose
        usage cannot be read are left out.
    """
    partitions = []
    for part in psutil.disk_partitions():
        try:
            usage = psutil.disk_usage(part.mountpoint)
        except OSError:
            # Best effort: skip mount points that cannot be queried
            # (PermissionError is a subclass of OSError) instead of
            # swallowing every exception with a bare except.
            continue
        partitions.append({
            "device": part.device,
            "mountpoint": part.mountpoint,
            "total": usage.total / _GIB,
            "used": usage.used / _GIB,
            "free": usage.free / _GIB,
            "percent": usage.percent,
        })
    return partitions


def format_disk_info():
    """Return the disk information as a multi-line text message."""
    sections = ["💿 磁盘状态\n━━━━━━━━━━━━━━━\n"]

    for p in get_disk_info():
        # 10-cell bar, one filled cell per full 10% of the partition in use
        filled = int(p['percent'] / 10)
        bar = "█" * filled + "░" * (10 - filled)
        sections.append(f"""
{p['mountpoint']}
[{bar}] {p['percent']}%
已用: {p['used']:.1f} GB / {p['total']:.1f} GB
剩余: {p['free']:.1f} GB
""")

    return "".join(sections)
4.4 网络监控
# monitors/network.py
import socket

import psutil

_MIB = 1024 ** 2


def get_network_info():
    """Collect host name, IP address, traffic counters and connection count.

    Returns:
        dict with ``hostname``, ``ip`` ("未知" when the host name cannot be
        resolved), ``bytes_sent``/``bytes_recv`` (bytes divided by 1024**2),
        ``packets_sent``/``packets_recv`` and ``connections`` (``None`` when
        psutil denies access to the connection table).
    """
    # Network I/O counters
    net_io = psutil.net_io_counters()

    # IP address resolved from the host name
    hostname = socket.gethostname()
    try:
        ip = socket.gethostbyname(hostname)
    except OSError:
        # Resolution failures (socket.gaierror etc.) are OSError subclasses
        ip = "未知"

    # Number of network connections; psutil documents that this call can
    # raise AccessDenied without elevated privileges on some platforms.
    try:
        connections = len(psutil.net_connections())
    except psutil.AccessDenied:
        connections = None

    return {
        "hostname": hostname,
        "ip": ip,
        "bytes_sent": net_io.bytes_sent / _MIB,
        "bytes_recv": net_io.bytes_recv / _MIB,
        "packets_sent": net_io.packets_sent,
        "packets_recv": net_io.packets_recv,
        "connections": connections,
    }


def format_network_info():
    """Return the network information as a multi-line text message."""
    info = get_network_info()
    connections = "未知" if info['connections'] is None else info['connections']

    return f"""
🌐 网络状态
━━━━━━━━━━━━━━━
主机名: {info['hostname']}
IP地址: {info['ip']}
连接数: {connections}

流量统计:
  发送: {info['bytes_sent']:.1f} MB
  接收: {info['bytes_recv']:.1f} MB
  发送包: {info['packets_sent']}
  接收包: {info['packets_recv']}
"""
五、完整Bot实现
5.1 主程序
# bot.py
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
from telegram.ext import (Application, CommandHandler, CallbackQueryHandler, ContextTypes
)
from apscheduler.schedulers.asyncio import AsyncIOScheduler
import psutil
from datetime import datetime

# --- Configuration ---
TOKEN = "你的Bot Token"  # bot token issued by @BotFather
ADMIN_CHAT_ID = 123456789  # your chat ID; alerts and reports are sent here

# Alert thresholds: a metric is flagged when its usage percentage exceeds
# the value given here.
THRESHOLDS = dict(cpu=80, memory=85, disk=90)

# Monitoring functions (simplified version of the modules shown earlier)
def get_system_status():
    """Take a snapshot of CPU, memory and root-disk usage plus uptime.

    Returns:
        dict with ``cpu``, ``memory`` and ``disk`` usage percentages and
        ``uptime`` as a ``timedelta`` since ``psutil.boot_time()``.
    """
    cpu = psutil.cpu_percent(interval=1)  # blocks for one second while sampling
    mem = psutil.virtual_memory()
    disk = psutil.disk_usage('/')

    return {
        "cpu": cpu,
        "memory": mem.percent,
        "disk": disk.percent,
        "uptime": datetime.now() - datetime.fromtimestamp(psutil.boot_time()),
    }


def format_status():
    """Return the status overview as a multi-line text message.

    Each metric is prefixed with a red icon when it exceeds its entry in
    THRESHOLDS and a green icon otherwise.
    """
    s = get_system_status()

    # Status icons
    cpu_icon = "🔴" if s['cpu'] > THRESHOLDS['cpu'] else "🟢"
    mem_icon = "🔴" if s['memory'] > THRESHOLDS['memory'] else "🟢"
    disk_icon = "🔴" if s['disk'] > THRESHOLDS['disk'] else "🟢"

    # str(timedelta) may end in ".microseconds"; split('.') drops that part.
    uptime = str(s['uptime']).split('.')[0]

    text = f"""
📊 服务器状态概览
━━━━━━━━━━━━━━━━━━
{cpu_icon} CPU: {s['cpu']}%
{mem_icon} 内存: {s['memory']}%
{disk_icon} 磁盘: {s['disk']}%

⏱️ 运行时间: {uptime}
🕐 更新时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
"""
    return text


# Command handlers
def _main_menu_markup():
    """Build the inline keyboard attached to the menu message."""
    return InlineKeyboardMarkup([
        [
            InlineKeyboardButton("📊 状态", callback_data="status"),
            InlineKeyboardButton("🖥️ CPU", callback_data="cpu"),
        ],
        [
            InlineKeyboardButton("💾 内存", callback_data="memory"),
            InlineKeyboardButton("💿 磁盘", callback_data="disk"),
        ],
        [
            InlineKeyboardButton("🌐 网络", callback_data="network"),
            InlineKeyboardButton("🔄 刷新", callback_data="refresh"),
        ],
    ])


async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply to /start with the menu and its inline keyboard."""
    # effective_message also covers edited messages, where update.message
    # is None.
    await update.effective_message.reply_text(
        "🤖 服务器监控机器人\n选择要查看的信息:",
        reply_markup=_main_menu_markup(),
    )


async def status(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply to /status with the status overview."""
    await update.effective_message.reply_text(format_status())


async def button_callback(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handle a press on one of the inline keyboard buttons.

    The menu message is edited in place to show the requested information.
    """
    from telegram.error import BadRequest

    query = update.callback_query
    await query.answer()

    # NOTE: format_cpu_info / format_memory_info / format_disk_info /
    # format_network_info are not defined or imported in this listing; they
    # must be brought into scope (e.g. from the monitors modules shown
    # earlier) before these buttons can work.
    if query.data in ("status", "refresh"):
        text = format_status()
    elif query.data == "cpu":
        text = format_cpu_info()
    elif query.data == "memory":
        text = format_memory_info()
    elif query.data == "disk":
        text = format_disk_info()
    elif query.data == "network":
        text = format_network_info()
    else:
        text = "未知命令"

    try:
        # Re-send the keyboard: editing a message without reply_markup
        # removes its inline keyboard, which would leave the user with no
        # buttons after the first press.
        await query.edit_message_text(text=text, reply_markup=_main_menu_markup())
    except BadRequest as exc:
        # Telegram rejects an edit whose content is identical to the current
        # message; that is harmless here, anything else is re-raised.
        if "not modified" not in str(exc).lower():
            raise


# Alert check
async def check_alerts(context: ContextTypes.DEFAULT_TYPE):
    """Send an alert to ADMIN_CHAT_ID when any metric exceeds its threshold.

    Nothing is sent while every metric is at or below its THRESHOLDS entry.
    """
    snapshot = get_system_status()

    # (metric key, alert line) pairs, in the order they appear in the message
    candidates = (
        ("cpu", f"🔴 CPU使用率过高: {snapshot['cpu']}%"),
        ("memory", f"🔴 内存使用率过高: {snapshot['memory']}%"),
        ("disk", f"🔴 磁盘使用率过高: {snapshot['disk']}%"),
    )
    triggered = [line for key, line in candidates if snapshot[key] > THRESHOLDS[key]]

    if not triggered:
        return

    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    body = "⚠️ 服务器告警\n━━━━━━━━━━━━━━━\n" + "\n".join(triggered)
    body += f"\n\n⏰ {stamp}"
    await context.bot.send_message(chat_id=ADMIN_CHAT_ID, text=body)


# Scheduled report
async def daily_report(context: ContextTypes.DEFAULT_TYPE):
    """Send the status overview to ADMIN_CHAT_ID as the daily report."""
    text = "📈 每日服务器报告\n" + format_status()
    await context.bot.send_message(chat_id=ADMIN_CHAT_ID, text=text)


def main():
    """Build the application, register handlers and jobs, then start polling."""
    scheduler = AsyncIOScheduler()

    async def _start_scheduler(application):
        """post_init hook: register the jobs and start the scheduler."""
        # The jobs receive the application object as their only argument;
        # the callbacks only use its .bot attribute.
        # Check alerts every minute
        scheduler.add_job(check_alerts, 'interval', minutes=1, args=[application])
        # Send the report every day at 9 o'clock
        scheduler.add_job(daily_report, 'cron', hour=9, args=[application])
        scheduler.start()

    # NOTE: the scheduler is started from post_init, i.e. inside the event
    # loop that run_polling() drives. Starting an AsyncIOScheduler before any
    # loop is running relies on implicit event-loop creation, which newer
    # Python versions deprecate.
    app = (
        Application.builder()
        .token(TOKEN)
        .post_init(_start_scheduler)
        .build()
    )

    # Register commands
    app.add_handler(CommandHandler("start", start))
    app.add_handler(CommandHandler("status", status))
    app.add_handler(CallbackQueryHandler(button_callback))

    # run_polling() blocks until the process is stopped
    print("Bot启动中...")
    app.run_polling()


if __name__ == "__main__":
    main()
六、高级功能
6.1 远程执行命令
import asyncio
import html
import subprocess


def _render_command_output(stdout, stderr, limit=4000):
    """Combine stdout and stderr into one HTML <pre> block for Telegram.

    The raw text is cut to *limit* characters before escaping so the message
    stays below Telegram's size limit.
    """
    parts = [part.rstrip("\n") for part in (stdout, stderr) if part]
    output = "\n".join(parts) or "(无输出)"

    # Limit the output length
    if len(output) > limit:
        output = output[:limit] + "\n...(输出过长已截断)"

    # Escaping keeps arbitrary command output (backticks, <, >, &) from
    # breaking the message markup.
    return f"<pre>{html.escape(output, quote=False)}</pre>"


async def exec_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Run a shell command and reply with its output (dangerous, admin only).

    Only the user whose id equals ADMIN_CHAT_ID may use it. The command is
    the space-joined ``context.args`` and is aborted after 30 seconds.
    """
    message = update.effective_message

    # Permission check; effective_user can be None for some update types
    user = update.effective_user
    if user is None or user.id != ADMIN_CHAT_ID:
        await message.reply_text("❌ 无权限")
        return

    if not context.args:
        await message.reply_text("用法: /exec <命令>")
        return

    cmd = " ".join(context.args)

    try:
        # SECURITY: shell=True on the admin's text is the purpose of this
        # command; the permission check above is the only safeguard.
        # The call runs in a worker thread so a slow command does not block
        # the event loop (and with it every other handler) for up to 30 s.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            lambda: subprocess.run(
                cmd, shell=True, capture_output=True, text=True, timeout=30
            ),
        )
        await message.reply_text(
            _render_command_output(result.stdout, result.stderr),
            parse_mode='HTML',
        )
    except subprocess.TimeoutExpired:
        await message.reply_text("❌ 命令执行超时")
    except Exception as e:
        # Top-level boundary: report any other failure back to the admin
        await message.reply_text(f"❌ 执行失败: {e}")
6.2 进程管理
async def processes(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply with the ten processes that use the most CPU."""
    # process_iter() pre-fetches the requested attributes into p.info; an
    # attribute that could not be read is stored as None (psutil's default
    # ad_value), so reading p.info itself needs no exception handling.
    procs = [
        p.info
        for p in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent'])
    ]

    # Sort by CPU usage, treating unreadable values as 0
    procs.sort(key=lambda x: x['cpu_percent'] or 0, reverse=True)

    rows = []
    for p in procs[:10]:
        # Apply the same None fallback when formatting: ":>5.1f" and slicing
        # would both raise TypeError on None.
        cpu = p['cpu_percent'] or 0.0
        name = (p['name'] or "?")[:20]
        rows.append(f"{p['pid']:>6} | {cpu:>5.1f}% | {name}\n")

    text = "📋 进程列表 (Top 10 CPU)\n━━━━━━━━━━━━━━━━━━\n" + "".join(rows)
    await update.effective_message.reply_text(f"```\n{text}\n```", parse_mode='Markdown')
6.3 Docker容器监控
import docker


async def docker_status(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply with the name and status of every Docker container."""
    message = update.effective_message
    client = None
    try:
        client = docker.from_env()
        containers = client.containers.list(all=True)  # include stopped ones

        lines = ["🐳 Docker容器状态\n━━━━━━━━━━━━━━━━━━\n"]
        for c in containers:
            status_icon = "🟢" if c.status == "running" else "🔴"
            lines.append(f"{status_icon} {c.name[:20]}: {c.status}\n")

        if not containers:
            lines.append("没有容器")

        await message.reply_text("".join(lines))
    except Exception as e:
        # Top-level boundary: report the failure back to the chat
        await message.reply_text(f"❌ Docker连接失败: {e}")
    finally:
        # Release the client's connections; a new client is created on
        # every invocation and would otherwise never be closed.
        if client is not None:
            client.close()
七、跨网络访问
7.1 问题
场景:
- Bot运行在内网服务器
- 想在外面通过Telegram控制服务器
- 服务器没有公网IP
问题:
- Telegram API可以正常调用(服务器能访问外网)
- 但无法直接SSH到服务器进行维护
7.2 解决方案
Bot本身可以正常工作,但如果需要直接访问内网服务器:
使用组网软件(如星空组网):
1. 内网服务器安装组网客户端
2. 你的手机/电脑安装组网客户端
3. 组建虚拟局域网
4. 通过虚拟IP直接SSH到服务器
优势:
- Bot负责告警和简单查询
- 组网负责需要时的直接访问
- 互相补充,完美配合
八、部署运维
8.1 后台运行
# Option 1: nohup (output goes to bot.log)
nohup python bot.py > bot.log 2>&1 &

# Option 2: screen
screen -S bot
python bot.py
# Press Ctrl+A, then D to detach

# Option 3: systemd (recommended)
8.2 Systemd服务
# /etc/systemd/system/telegram-bot.service
[Unit]
Description=Telegram Server Monitor Bot
After=network.target

[Service]
Type=simple
# NOTE: the bot runs as root here; anything it executes inherits root rights.
User=root
WorkingDirectory=/opt/telegram_bot
ExecStart=/usr/bin/python3 /opt/telegram_bot/bot.py
# Restart the bot 10 seconds after it exits, whatever the reason
Restart=always
RestartSec=10

[Install]
WantedBy=multi-user.target
sudo systemctl daemon-reload
sudo systemctl enable telegram-bot
sudo systemctl start telegram-bot
sudo systemctl status telegram-bot
九、总结
Telegram Bot监控要点:
| 功能 | 实现 |
|---|---|
| 状态查询 | 命令 + 按钮 |
| 定时检查 | APScheduler |
| 告警推送 | 阈值触发 |
| 远程控制 | 命令执行 |
Bot能做的:
- 实时查看服务器状态
- 自动告警通知
- 远程执行命令
- 定时报告
参考资料
- python-telegram-bot文档:https://docs.python-telegram-bot.org/
- psutil文档:https://psutil.readthedocs.io/
- Telegram Bot API:https://core.telegram.org/bots/api
💡 Telegram Bot是服务器监控的好帮手,配合组网软件可以实现更完整的远程运维方案。