InfluxDB实战教程4:配合Grafana/IPMI做硬件监控
# InfluxDB实战教程4:配合Grafana/IPMI做硬件监控
本教程将通过一个实际项目,展示如何使用 InfluxDB 配合 Grafana 和 IPMI 构建完整的硬件监控系统。我们将实现服务器硬件状态的实时采集、存储和可视化。
## 一、项目架构
### 1.1 系统组件
```
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ 服务器 │ ───> │ InfluxDB │ <──── │ Grafana │
│ (IPMI采集) │ │ (数据存储) │ │ (可视化) │
└─────────────┘ └─────────────┘ └─────────────┘
│ │
└──────────────────────────────────────┘
(查询展示)
```
### 1.2 技术栈
- **InfluxDB**:存储时序监控数据
- **Grafana**:数据可视化和告警
- **IPMI**:硬件状态采集
- **Python**:数据采集脚本
## 二、环境准备
### 2.1 使用 Docker Compose 部署
创建 `docker-compose.yml`:
version: '3.8'

services:
  influxdb:
    image: influxdb:2.7
    container_name: influxdb
    ports:
      - "8086:8086"
    environment:
      # First-start bootstrap: creates the admin user, org, bucket and token
      - DOCKER_INFLUXDB_INIT_MODE=setup
      - DOCKER_INFLUXDB_INIT_USERNAME=admin
      - DOCKER_INFLUXDB_INIT_PASSWORD=StrongPassword123!
      - DOCKER_INFLUXDB_INIT_ORG=monitoring
      - DOCKER_INFLUXDB_INIT_BUCKET=hardware
      - DOCKER_INFLUXDB_INIT_ADMIN_TOKEN=YourAuthTokenHereReplaceWithSecureToken
    volumes:
      - influxdb-data:/var/lib/influxdb2
      - influxdb-config:/etc/influxdb2
    restart: unless-stopped
    networks:
      - monitoring

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=GrafanaPassword123!
      # No GF_INSTALL_PLUGINS entry: Flux is supported by Grafana's built-in
      # InfluxDB data source, and the separate
      # grafana-influxdb-flux-datasource plugin is deprecated.
    volumes:
      - grafana-data:/var/lib/grafana
      - grafana-provisioning:/etc/grafana/provisioning
    depends_on:
      - influxdb
    restart: unless-stopped
    networks:
      - monitoring

volumes:
  influxdb-data:
  influxdb-config:
  grafana-data:
  grafana-provisioning:

networks:
  monitoring:
    driver: bridge
启动服务:
# 启动服务
docker-compose up -d
# 查看日志
docker-compose logs -f
# 检查服务状态
docker-compose ps
### 2.2 创建 InfluxDB Bucket
# Open a shell inside the InfluxDB container; the commands below run in it.
# (Running `influx` with no subcommand only prints help in the 2.x CLI.)
docker exec -it influxdb bash
# The `hardware` bucket already exists: it is created on first start by
# DOCKER_INFLUXDB_INIT_BUCKET=hardware, so creating it again would fail with
# a name conflict. Look up its ID and set the 30-day retention on it instead.
influx bucket list \
  --name hardware \
  --org monitoring \
  --token YourAuthTokenHereReplaceWithSecureToken
influx bucket update \
  --id <hardware-bucket-id> \
  --retention 30d \
  --token YourAuthTokenHereReplaceWithSecureToken
# Create the bucket used for alerts
influx bucket create \
  --name alerts \
  --org monitoring \
  --token YourAuthTokenHereReplaceWithSecureToken \
  --retention 7d
# List all buckets
influx bucket list --org monitoring
## 三、IPMI 数据采集
### 3.1 安装 IPMI 工具
# Ubuntu/Debian
sudo apt-get update
sudo apt-get install -y ipmitool freeipmi-tools
# CentOS/RHEL
sudo yum install -y ipmitool freeipmi
# Verify the installation (the FreeIPMI binary is named `ipmi-sensors`)
ipmitool -V
ipmi-sensors --version
### 3.2 Python 数据采集脚本
创建采集脚本 `ipmi_collector.py`:
#!/usr/bin/env python3
"""
IPMI Hardware Monitoring Collector
采集服务器硬件状态并写入 InfluxDB
"""
import json
import os
import re
import subprocess
from datetime import datetime, timezone

from influxdb_client import InfluxDBClient, Point, WritePrecision
from influxdb_client.client.write_api import SYNCHRONOUS
# Configuration. Every connection setting can be overridden through an
# environment variable of the same name, so real credentials do not have to be
# stored in the script; the defaults are the tutorial's placeholder values.
INFLUX_URL = os.environ.get("INFLUX_URL", "http://localhost:8086")
INFLUX_TOKEN = os.environ.get("INFLUX_TOKEN", "YourAuthTokenHereReplaceWithSecureToken")
INFLUX_ORG = os.environ.get("INFLUX_ORG", "monitoring")
INFLUX_BUCKET = os.environ.get("INFLUX_BUCKET", "hardware")
# Name of the machine running the collector; written as the "host" tag
HOSTNAME = subprocess.check_output(['hostname']).decode().strip()
IPMI_HOST = os.environ.get("IPMI_HOST", "192.168.1.100")  # BMC IP address
IPMI_USER = os.environ.get("IPMI_USER", "admin")
IPMI_PASS = os.environ.get("IPMI_PASS", "password")

# Initialise the InfluxDB client; SYNCHRONOUS makes each write() call block
# until the server has answered.
client = InfluxDBClient(url=INFLUX_URL, token=INFLUX_TOKEN, org=INFLUX_ORG)
write_api = client.write_api(write_options=SYNCHRONOUS)
def _parse_sensor_line(line):
    """Turn one pipe-delimited ipmitool row into a sensor dict, or None to skip it."""
    if '|' not in line or 'Sensor Reading' in line:
        return None
    columns = [col.strip() for col in line.split('|')]
    if len(columns) < 3:
        return None
    if len(columns) >= 4:
        # `ipmitool sensor` layout: name | reading | unit | status | thresholds...
        reading, unit, status = columns[1], columns[2], columns[3]
    else:
        # `ipmitool sdr` layout: name | "<reading> <unit>" | status
        reading, _, unit = columns[1].partition(' ')
        unit = unit.strip()
        status = columns[2]
    # Skip rows without a numeric reading ("na", "no reading", discrete "0x0").
    if not re.fullmatch(r'[-+]?\d+(?:\.\d+)?', reading):
        return None
    return {
        'name': columns[0],
        'value': float(reading),
        'unit': unit or None,
        'status': status,
    }


def get_ipmi_sensors(raw_output=None):
    """Collect the numeric IPMI sensor readings.

    Args:
        raw_output: Optional text already captured from ``ipmitool sensor``
            (or ``ipmitool sdr``). When omitted, ``ipmitool sensor`` is run
            against the configured BMC and its standard output is parsed.

    Returns:
        A list of dicts with the keys ``name``, ``value`` (float), ``unit``
        and ``status``. Rows without a numeric reading are skipped. An empty
        list is returned when ipmitool cannot be run or times out.
    """
    if raw_output is None:
        # NOTE(review): -P exposes the password in the process list; ipmitool's
        # -E option reads it from the IPMI_PASSWORD environment variable.
        cmd = ['ipmitool', '-H', IPMI_HOST, '-U', IPMI_USER, '-P', IPMI_PASS, 'sensor']
        try:
            # The timeout keeps an unreachable BMC from hanging the collector.
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        except (OSError, subprocess.SubprocessError) as e:
            print(f"Error getting IPMI sensors: {e}")
            return []
        if result.returncode != 0:
            # Report the failure but still parse whatever was printed.
            print(f"ipmitool exited with status {result.returncode}: {result.stderr.strip()}")
        raw_output = result.stdout

    parsed = (_parse_sensor_line(line) for line in raw_output.splitlines())
    return [sensor for sensor in parsed if sensor is not None]
def parse_sensor_name(name):
    """Classify a sensor by keywords found in its name.

    The match is case-insensitive. Returns one of 'temperature', 'fan',
    'voltage', 'current' or 'power', or 'other' when no keyword is present.
    """
    lowered = name.lower()
    # Checked in order, so a name containing several keywords takes the first
    # category listed here.
    keyword_categories = (
        ('temp', 'temperature'),
        ('fan', 'fan'),
        ('volt', 'voltage'),
        ('current', 'current'),
        ('power', 'power'),
    )
    for keyword, category in keyword_categories:
        if keyword in lowered:
            return category
    return 'other'
def write_to_influxdb(sensors):
    """Write sensor readings to InfluxDB as ``ipmi_sensor`` points.

    Args:
        sensors: Iterable of dicts with the keys ``name``, ``value``, ``unit``
            and ``status``, as returned by ``get_ipmi_sensors``.

    All points of one call share a single timestamp. Nothing is written when
    ``sensors`` is empty. Write errors are printed, not raised.
    """
    # Timezone-aware UTC; datetime.utcnow() is deprecated and returns a naive
    # value.
    timestamp = datetime.now(timezone.utc)
    points = [
        Point("ipmi_sensor")
        .tag("host", HOSTNAME)
        .tag("sensor_name", sensor['name'])
        .tag("sensor_type", parse_sensor_name(sensor['name']))
        .tag("unit", sensor['unit'])
        .tag("status", sensor['status'])
        .field("value", sensor['value'])
        .time(timestamp, WritePrecision.NS)
        for sensor in sensors
    ]
    if not points:
        return
    try:
        write_api.write(bucket=INFLUX_BUCKET, record=points)
        print(f"Successfully wrote {len(points)} sensor readings to InfluxDB")
    except Exception as e:
        print(f"Error writing to InfluxDB: {e}")
def get_system_info():
    """Read the chassis power state over IPMI and write it to InfluxDB.

    Runs ``ipmitool chassis status`` and, for each "System Power" line, writes
    an ``ipmi_system`` point whose ``power_status`` field is 1 when the
    reported state is "on" and 0 otherwise. Errors are printed, not raised.
    """
    points = []
    # Stamp the point on the client, as write_to_influxdb does for sensors.
    timestamp = datetime.now(timezone.utc)
    try:
        cmd = ['ipmitool', '-H', IPMI_HOST, '-U', IPMI_USER, '-P', IPMI_PASS, 'chassis', 'status']
        # The timeout keeps an unreachable BMC from hanging the collector.
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        for line in result.stdout.splitlines():
            label, separator, state = line.partition(':')
            if not separator or 'System Power' not in label:
                continue
            # Compare the value after the colon exactly, rather than searching
            # the whole line for the substring "on".
            power_status = 1 if state.strip().lower() == 'on' else 0
            point = Point("ipmi_system") \
                .tag("host", HOSTNAME) \
                .field("power_status", power_status) \
                .time(timestamp, WritePrecision.NS)
            points.append(point)
    except Exception as e:
        print(f"Error getting system info: {e}")
    if points:
        try:
            write_api.write(bucket=INFLUX_BUCKET, record=points)
        except Exception as e:
            print(f"Error writing system info: {e}")
def main():
    """Run one collection cycle: sensor readings first, then chassis status."""
    print(f"Starting IPMI data collection for {HOSTNAME}")
    try:
        # Collect sensor data
        sensors = get_ipmi_sensors()
        if sensors:
            print(f"Collected {len(sensors)} sensor readings")
            write_to_influxdb(sensors)
        else:
            print("No sensor data collected")
        # Collect system information
        get_system_info()
    finally:
        # Close the client even when a collection step raises.
        client.close()
    print("Data collection completed")
if __name__ == "__main__":
main()
安装依赖:
# 安装 Python 依赖
pip3 install influxdb-client
# 设置执行权限
chmod +x ipmi_collector.py
# 测试运行
python3 ipmi_collector.py
### 3.3 配置定时采集
创建 systemd 服务 `/etc/systemd/system/ipmi-collector.service`:
[Unit]
Description=IPMI Hardware Monitoring Collector
After=network.target
[Service]
Type=oneshot
User=root
WorkingDirectory=/opt/ipmi-monitoring
ExecStart=/usr/bin/python3 /opt/ipmi-monitoring/ipmi_collector.py
[Install]
WantedBy=multi-user.target
创建定时器 `/etc/systemd/system/ipmi-collector.timer`:
[Unit]
Description=Run IPMI collector every minute

[Timer]
# Start ipmi-collector.service at the top of every minute
OnCalendar=minutely
# The default accuracy of 1min would let runs drift by up to a minute
AccuracySec=1s
Unit=ipmi-collector.service

[Install]
# Required so that `systemctl enable ipmi-collector.timer` has a target
WantedBy=timers.target
启动服务:
# 创建工作目录
sudo mkdir -p /opt/ipmi-monitoring
sudo cp ipmi_collector.py /opt/ipmi-monitoring/
# 重新加载 systemd 配置
sudo systemctl daemon-reload
# 启动定时器
sudo systemctl enable ipmi-collector.timer
sudo systemctl start ipmi-collector.timer
# 查看状态
sudo systemctl status ipmi-collector.timer
sudo journalctl -u ipmi-collector.service -f
## 四、Grafana 配置
### 4.1 添加 InfluxDB 数据源
访问 `http://localhost:3000`,使用 admin/GrafanaPassword123! 登录。
1. 进入 **Configuration > Data Sources**(Grafana 10 及以上版本中该入口位于 **Connections > Data sources**)
2. 点击 **Add data source**
3. 选择 **InfluxDB**
4. 配置如下:
Name: InfluxDB Hardware Monitoring
Query Language: Flux
URL: http://influxdb:8086
Organization: monitoring
Token: YourAuthTokenHereReplaceWithSecureToken
Default Bucket: hardware
5. 点击 **Save & Test**
### 4.2 创建仪表盘
#### 温度监控面板
Flux 查询:
from(bucket: "hardware")
|> range(start: v.timeRangeStart, stop: v.timeRangeStop)
|> filter(fn: (r) => r["_measurement"] == "ipmi_sensor")
|> filter(fn: (r) => r["sensor_type"] == "temperature")
|> filter(fn: (r) => r["_field"] == "value")
|> aggregateWindow(every: 1m, fn: mean, createEmpty: false)
|> yield(name: "mean")
#### 风扇转速监控面板
from(bucket: "hardware")
|> range(start: v.timeRangeStart, stop: v.timeRangeStop)
|> filter(fn: (r) => r["_measurement"] == "ipmi_sensor")
|> filter(fn: (r) => r["sensor_type"] == "fan")
|> filter(fn: (r) => r["_field"] == "value")
|> aggregateWindow(every: 1m, fn: mean, createEmpty: false)
|> yield(name: "mean")
#### 电压监控面板
from(bucket: "hardware")
|> range(start: v.timeRangeStart, stop: v.timeRangeStop)
|> filter(fn: (r) => r["_measurement"] == "ipmi_sensor")
|> filter(fn: (r) => r["sensor_type"] == "voltage")
|> filter(fn: (r) => r["_field"] == "value")
|> aggregateWindow(every: 1m, fn: mean, createEmpty: false)
|> yield(name: "mean")
#### 系统状态概览
from(bucket: "hardware")
|> range(start: v.timeRangeStart, stop: v.timeRangeStop)
|> filter(fn: (r) => r["_measurement"] == "ipmi_system")
|> filter(fn: (r) => r["_field"] == "power_status")
|> last()
### 4.3 导入仪表盘 JSON
创建 `hardware_dashboard.json`:
{
"dashboard": {
"title": "Hardware Monitoring Dashboard",
"panels": [
{
"title": "Temperature",
"type": "timeseries",
"targets": [
{
"query": "from(bucket: \"hardware\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r[\"_measurement\"] == \"ipmi_sensor\")\n |> filter(fn: (r) => r[\"sensor_type\"] == \"temperature\")\n |> filter(fn: (r) => r[\"_field\"] == \"value\")\n |> aggregateWindow(every: 1m, fn: mean, createEmpty: false)",
"refId": "A"
}
]
},
{
"title": "Fan Speed",
"type": "timeseries",
"targets": [
{
"query": "from(bucket: \"hardware\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r[\"_measurement\"] == \"ipmi_sensor\")\n |> filter(fn: (r) => r[\"sensor_type\"] == \"fan\")\n |> filter(fn: (r) => r[\"_field\"] == \"value\")\n |> aggregateWindow(every: 1m, fn: mean, createEmpty: false)",
"refId": "B"
}
]
},
{
"title": "Voltage",
"type": "timeseries",
"targets": [
{
"query": "from(bucket: \"hardware\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r[\"_measurement\"] == \"ipmi_sensor\")\n |> filter(fn: (r) => r[\"sensor_type\"] == \"voltage\")\n |> filter(fn: (r) => r[\"_field\"] == \"value\")\n |> aggregateWindow(every: 1m, fn: mean, createEmpty: false)",
"refId": "C"
}
]
},
{
"title": "System Status",
"type": "stat",
"targets": [
{
"query": "from(bucket: \"hardware\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r[\"_measurement\"] == \"ipmi_system\")\n |> filter(fn: (r) => r[\"_field\"] == \"power_status\")\n |> last()",
"refId": "D"
}
]
}
],
"time": {
"from": "now-1h",
"to": "now"
},
"refresh": "10s"
}
}
导入步骤:
1. 进入 **Dashboards > Import**
2. 粘贴 JSON 内容
3. 点击 **Load**
## 五、告警配置
### 5.1 InfluxDB 告警规则
创建告警规则:
// High-temperature check: every minute, average the last 5 minutes of each
// temperature sensor and classify the result as ok / warn / crit.
option task = {
    name: "Hardware Alert Task",
    every: 1m,
    offset: 0s
}

// mean() drops the _time column, which to() requires, so each result row is
// stamped with the task's run time.
from(bucket: "hardware")
    |> range(start: -5m)
    |> filter(fn: (r) => r["_measurement"] == "ipmi_sensor")
    |> filter(fn: (r) => r["sensor_type"] == "temperature")
    |> filter(fn: (r) => r["_field"] == "value")
    |> mean(column: "_value")
    |> map(fn: (r) => ({
        r with
        _time: now(),
        _level: if r._value > 80.0 then "crit"
                else if r._value > 70.0 then "warn"
                else "ok"
    }))
    // Persist the result in the alerts bucket; output that is only passed to
    // yield() is not stored by a task.
    |> to(bucket: "alerts", org: "monitoring")
### 5.2 Grafana 告警
在 Grafana 中创建告警规则:
1. 编辑面板
2. 进入 **Alert** 标签
3. 设置告警条件
示例告警规则:
告警名称: High Temperature Alert
条件: avg() > 75
持续时间: 5分钟
通知方式: Email/Webhook
## 六、高级功能
### 6.1 多服务器监控
修改采集脚本支持多服务器:
# 服务器配置
SERVERS = [
{
'hostname': 'server01',
'ipmi_host': '192.168.1.100',
'ipmi_user': 'admin',
'ipmi_pass': 'password'
},
{
'hostname': 'server02',
'ipmi_host': '192.168.1.101',
'ipmi_user': 'admin',
'ipmi_pass': 'password'
}
]
### 6.2 历史数据归档
创建归档脚本:
#!/bin/bash
# Archive old hardware monitoring data
set -euo pipefail

BACKUP_DIR="/data/backups/hardware"
DATE=$(date +%Y%m%d)

mkdir -p "${BACKUP_DIR}"

# Export the data. `--raw` makes the CLI print annotated CSV; Flux itself has
# no csv() output function.
# NOTE: this range only contains data if the bucket's retention period is
# longer than 30 days.
influx query 'from(bucket:"hardware") |> range(start: -90d, stop: -30d)' \
  --raw \
  --org monitoring \
  --token YourAuthTokenHereReplaceWithSecureToken \
  > "${BACKUP_DIR}/hardware_${DATE}.csv"

# Compress (-f overwrites an archive left by an earlier run on the same day)
gzip -f "${BACKUP_DIR}/hardware_${DATE}.csv"

# Delete the oldest day. Flux drop() only removes columns from a query result;
# stored points are removed with `influx delete`, which takes RFC 3339
# timestamps (`date -d` below requires GNU date).
START=$(date -u -d '90 days ago' +%Y-%m-%dT%H:%M:%SZ)
STOP=$(date -u -d '89 days ago' +%Y-%m-%dT%H:%M:%SZ)
influx delete \
  --bucket hardware \
  --org monitoring \
  --token YourAuthTokenHereReplaceWithSecureToken \
  --start "${START}" \
  --stop "${STOP}"
### 6.3 集成 Prometheus
作为补充,可以添加 Prometheus Exporter:
# 使用 ipmi_exporter
docker run -d \
--name ipmi_exporter \
-p 9290:9290 \
prometheuscommunity/ipmi-exporter
## 七、性能优化
### 7.1 数据采集优化
# 调整采集频率
# 根据实际需求设置,一般每 1-5 分钟一次
# 批量写入
# 收集多个传感器的数据后批量写入 InfluxDB
# 异步写入
# 使用异步 API 提高性能
### 7.2 查询优化
// 使用合适的聚合窗口
// 温度:5分钟聚合
// 风扇:1分钟聚合
// 电压:1分钟聚合(聚合窗口不应小于采集间隔,本教程每分钟采集一次)
// 合理设置时间范围
// 不要查询过长时间范围的数据
## 八、总结
本实战教程完成了:
- InfluxDB 和 Grafana 的 Docker 部署
- IPMI 硬件数据采集脚本开发
- Grafana 仪表盘创建和配置
- 告警规则设置
- 多服务器监控方案
通过这个项目,你可以:
1. 实时监控服务器硬件状态
2. 快速发现硬件异常
3. 生成历史趋势报告
4. 及时响应硬件故障
这个监控系统可以扩展到更多场景:
- 网络设备监控
- 机房环境监控
- 容器资源监控
- 应用性能监控
InfluxDB + Grafana + IPMI 组合提供了一个强大而灵活的硬件监控解决方案,适合各种规模的 IT 基础设施。






