A Complete Guide to Kubernetes Troubleshooting and Performance Tuning
categories: - Kubernetes Operations tags: - Kubernetes - Troubleshooting - Performance Tuning - Debugging - Problem Solving
Troubleshooting Workflow
1. Observe the symptoms
   ├── kubectl get pods -A
   ├── kubectl get events -A --sort-by='.lastTimestamp'
   └── kubectl describe pod <pod-name>
2. Analyze the cause
   ├── kubectl logs <pod-name> -c <container-name>
   ├── kubectl exec -it <pod-name> -- /bin/sh
   └── journalctl -u kubelet -f
3. Fix the problem
   ├── kubectl apply -f <fixed-manifest>.yaml
   ├── kubectl delete pod <pod-name>
   └── kubectl rollout restart deployment <deployment-name>
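This flow condenses into a small first-pass triage script. A minimal sketch; the namespace argument and the 20-event cutoff are arbitrary choices:
#!/usr/bin/env bash
# First-pass triage: unhealthy Pods plus the latest warning events
set -euo pipefail
NS="${1:-default}"   # namespace to inspect

# Pods not in Running/Completed state
kubectl get pods -n "$NS" --no-headers | awk '$3 != "Running" && $3 != "Completed"'

# Most recent warning events, newest last
kubectl get events -n "$NS" --field-selector type=Warning --sort-by='.lastTimestamp' | tail -n 20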
Troubleshooting Common Issues
Abnormal Pod Status
# Inspect a Pod's detailed status
kubectl describe pod <pod-name> -n <namespace>
# View the events of a specific Pod
kubectl get events -n <namespace> --field-selector involvedObject.name=<pod-name>
# View recent events cluster-wide
kubectl get events -A --sort-by='.metadata.creationTimestamp'
# Common states (see the sketch below for finding Pods stuck in them):
# Pending          - scheduling failed, often due to insufficient resources
# CrashLoopBackOff - the container keeps restarting
# ImagePullBackOff - the image pull failed
# Error            - the container exited with an error
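Field selectors and jsonpath locate every Pod stuck in one of these states. A sketch; the tab-separated output format is just one convenient choice:
# All Pending Pods cluster-wide
kubectl get pods -A --field-selector=status.phase=Pending

# Per-container waiting reason (CrashLoopBackOff, ImagePullBackOff, ...)
kubectl get pods -A -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\t"}{.status.containerStatuses[*].state.waiting.reason}{"\n"}{end}' \
  | awk -F'\t' '$3 != ""'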
Pod Fails to Start
# 1. Check the image name
kubectl get pod <pod-name> -o jsonpath='{.spec.containers[*].image}'
# 2. Check the image registry credentials
kubectl get secret <pull-secret-name> -n <namespace>
# 3. Check whether node resources are sufficient
kubectl describe nodes | grep -A 5 "Allocated resources"
# 4. Check PVC binding and mounts
kubectl describe pvc <pvc-name>
# 5. Check the SecurityContext
kubectl describe pod <pod-name> | grep -A 10 "Security Context"
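When the container did start but died immediately, the previous termination state usually explains why. A sketch:
# Reason and exit code of the last terminated container
kubectl get pod <pod-name> -n <namespace> -o jsonpath='{range .status.containerStatuses[*]}{.name}{": "}{.lastState.terminated.reason}{" (exit "}{.lastState.terminated.exitCode}{")"}{"\n"}{end}'

# Logs of the previous (crashed) container instance
kubectl logs <pod-name> -n <namespace> --previous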
Service Unavailable
# 1. Check the Service configuration
kubectl describe service <service-name>
# 2. Check the Endpoints
kubectl get endpoints <service-name>
# 3. Check that the selector matches the Pod labels
kubectl get pods -l <key>=<value> --show-labels
# 4. Test connectivity from inside the cluster
kubectl exec -it <pod-name> -- curl http://<service-name>:<port>
# 5. Check NetworkPolicies
kubectl get networkpolicy -n <namespace>
kubectl describe networkpolicy <policy-name>
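An empty Endpoints object is the most common culprit: the Service selector matches no ready Pod. A quick check, sketched:
# Prints nothing when no ready Pod backs the Service
kubectl get endpoints <service-name> -o jsonpath='{.subsets[*].addresses[*].ip}'

# Compare the Service selector ...
kubectl get svc <service-name> -o jsonpath='{.spec.selector}'
# ... with the labels on the Pods it is supposed to target
kubectl get pods --show-labels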
Network Issues
# 1. Check node network status
kubectl get nodes -o wide
# 2. Check the CNI plugin Pods (the label depends on your CNI; kube-router shown here)
kubectl get pods -n kube-system -l k8s-app=kube-router
# 3. Test connectivity between Pods and nodes
kubectl exec -it <pod-name> -- ping <target-ip>
# 4. Review network policies
kubectl describe networkpolicy <policy-name>
# 5. Check DNS
kubectl exec -it <pod-name> -- nslookup kubernetes.default
kubectl exec -it <pod-name> -- cat /etc/resolv.conf
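If the target image ships without nslookup, a throwaway Pod does the job, and CoreDNS itself is the next suspect. A sketch:
# One-shot DNS check from a scratch Pod (removed on exit)
kubectl run dns-test --image=busybox:1.36 -it --rm --restart=Never -- nslookup kubernetes.default.svc.cluster.local

# If that fails, inspect CoreDNS
kubectl get pods -n kube-system -l k8s-app=kube-dns
kubectl logs -n kube-system -l k8s-app=kube-dns --tail=50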
Performance Troubleshooting
# 1. Check resource usage (requires metrics-server)
kubectl top pods -A --sort-by=memory
kubectl top pods -A --sort-by=cpu
# 2. Check node resources
kubectl describe node <node-name>
# 3. Find scheduling failures
kubectl get events --field-selector reason=FailedScheduling
# 4. Check for OOMKilled containers (recorded in the Pod status, not the app log)
kubectl describe pod <pod-name> | grep -i "oomkilled"
# 5. Check CPU throttling (recorded in cgroup stats; see the sketch below)
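OOM kills and CPU throttling are reported by the kernel and cgroups rather than by application logs, so grepping kubectl logs for them rarely works. Where to look instead, sketched:
# Last termination reason per container (OOMKilled appears here)
kubectl get pod <pod-name> -o jsonpath='{.status.containerStatuses[*].lastState.terminated.reason}'

# CPU throttling counters from inside the container
kubectl exec <pod-name> -- cat /sys/fs/cgroup/cpu/cpu.stat   # cgroup v1
kubectl exec <pod-name> -- cat /sys/fs/cgroup/cpu.stat       # cgroup v2
# A growing nr_throttled / throttled_time means the CPU limit is being hit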
Log Analysis
Collecting Logs
# Application logs
kubectl logs <pod-name> -n <namespace>
kubectl logs <pod-name> -n <namespace> --tail=100
kubectl logs <pod-name> -n <namespace> -f --since=1h
# Kubernetes component logs
journalctl -u kubelet -f
# On kubeadm clusters the control-plane components run as static Pods,
# so use kubectl logs -n kube-system for these instead of journalctl:
journalctl -u kube-apiserver -f
journalctl -u kube-scheduler -f
# Container runtime logs
# Docker
docker logs <container-id>
# Containerd
crictl logs <container-id>
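A label selector pulls logs from every replica at once. A sketch assuming the app=myapp label:
# Tail every Pod carrying the label, with a per-Pod prefix
kubectl logs -l app=myapp -n <namespace> --tail=50 --prefix

# Or loop explicitly when per-Pod files are needed
for p in $(kubectl get pods -l app=myapp -n <namespace> -o name); do
  kubectl logs "$p" -n <namespace> --tail=200 > "$(basename "$p").log"
done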
Aggregated Log Queries (Loki)
# Error logs
{namespace="production",app="myapp"} |= "ERROR"
# LogQL has no in-query time filter; the window comes from the query range
# (e.g. --since=1h in logcli, or the time picker in Grafana)
# Count errors over the last hour
count_over_time({namespace="production"} |= "ERROR" [1h])
# Logs of a specific Pod
{namespace="production",pod="myapp-pod-xxx"}
Performance Tuning
Node-Level Tuning
# Tune kernel parameters
cat >> /etc/sysctl.d/95-k8s.conf << EOF
# Network tuning
net.core.somaxconn = 65535
net.ipv4.ip_local_port_range = 1024 65535
net.core.netdev_max_backlog = 65535
# File descriptors
fs.file-max = 2097152
fs.inotify.max_user_watches = 524288
# Memory management
vm.swappiness = 10
vm.vfs_cache_pressure = 50
EOF
sysctl --system
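After reloading, confirm the values actually took effect:
# Spot-check the applied values
sysctl net.core.somaxconn vm.swappiness fs.inotify.max_user_watches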
Container Runtime Tuning
# Containerd configuration: /etc/containerd/config.toml
[plugins."io.containerd.grpc.v1.cri"]
  sandbox_image = "registry.k8s.io/pause:3.9"
[plugins."io.containerd.grpc.v1.cri".containerd]
  default_runtime_name = "runc"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
  runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
  SystemdCgroup = true
# Note: per-Pod PID limits are configured on the kubelet (podPidsLimit in
# the kubelet configuration), not in the containerd runtime options.
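Containerd must be restarted for the file to load; crictl can confirm the running configuration:
# Apply and verify the runtime configuration
sudo systemctl restart containerd
crictl info | grep -i systemdcgroup   # should report true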
Pod Performance Optimization
apiVersion: apps/v1
kind: Deployment
metadata:
  name: optimized-app
spec:
  selector:
    matchLabels:
      app: optimized-app
  template:
    metadata:
      labels:
        app: optimized-app
    spec:
      # Graceful shutdown (a Pod-level field)
      terminationGracePeriodSeconds: 30
      containers:
      - name: app
        image: myapp:v1
        # Resource tuning
        resources:
          requests:
            cpu: 500m
            memory: 512Mi
          limits:
            cpu: 2000m
            memory: 2Gi
        # Startup: allow slow starters up to 30 x 10s before liveness kicks in
        startupProbe:
          httpGet:
            path: /health
            port: 8080
          failureThreshold: 30
          periodSeconds: 10
        # Readiness: gate traffic until the app can serve
        readinessProbe:
          httpGet:
            path: /ready
            port: 8080
          initialDelaySeconds: 5
          periodSeconds: 10
        # Liveness: restart the container if it hangs
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8080
          initialDelaySeconds: 15
          periodSeconds: 20
        # Drop privileges
        securityContext:
          allowPrivilegeEscalation: false
          readOnlyRootFilesystem: true
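Applying and verifying the rollout, sketched (the filename is an assumption):
kubectl apply -f optimized-app.yaml
kubectl rollout status deployment/optimized-app
# Confirm the live resource settings
kubectl get pod -l app=optimized-app -o jsonpath='{range .items[*]}{.metadata.name}{": "}{.spec.containers[0].resources}{"\n"}{end}'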
kube-scheduler Tuning
# Scheduler configuration: /etc/kubernetes/scheduler-config.yaml
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
leaderElection:
  leaderElect: true
  # the "endpoints" lock was removed; current releases use leases
  resourceLock: leases
  resourceNamespace: kube-system
  resourceName: kube-scheduler
profiles:
- schedulerName: default-scheduler
  # score only half of the feasible nodes on large clusters
  percentageOfNodesToScore: 50
  plugins:
    score:
      enabled:
      - name: ImageLocality
        weight: 1
      # NodeResourcesFit supersedes the old LeastRequested score plugin
      - name: NodeResourcesFit
        weight: 1
      disabled:
      - name: "*"
# Note: bindTimeoutSeconds now belongs to the VolumeBinding plugin's
# pluginConfig (VolumeBindingArgs), not at the profile level.
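The configuration takes effect via the --config flag; on a kubeadm control plane that means editing the static Pod manifest, sketched below:
# kubeadm: edit the static Pod manifest; the kubelet restarts the scheduler automatically
sudo vi /etc/kubernetes/manifests/kube-scheduler.yaml
# under "command:", add:
#   - --config=/etc/kubernetes/scheduler-config.yaml
# and mount /etc/kubernetes/scheduler-config.yaml into the Pod via a hostPath volume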
Diagnostic Tools
kubectl debug
# kubectl debug ships with recent kubectl (ephemeral containers); no plugin install needed
kubectl debug -it <pod-name> --image=busybox --target=<container-name>
# Debug a node with a diagnostic image (the node filesystem is mounted at /host)
kubectl debug node/node1 -it --image=busybox
kube-bench
# Run the CIS benchmark
kube-bench run --targets=node
kube-bench run --targets=master
kube-bench run --targets=etcd
kubectl-tree
# Show ownership relationships between resources
kubectl tree deployment myapp -n <namespace>
Performance Benchmarking
Network Benchmarks
# iperf3 throughput test (needs a running iperf3 server; see the sketch below)
kubectl run iperf3-client --image=networkstatic/iperf3 -it --rm --restart=Never -- iperf3 -c <server-ip>
# Network latency test
kubectl exec -it <pod-name> -- ping <target-ip>
# Simple CNI connectivity test
kubectl run --rm -it --restart=Never --image=busybox:1.36 netperf -- sh -c 'for i in $(seq 1 100); do nc -zv <host> <port>; done'
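The client needs a server to talk to; a sketch that runs one in-cluster and feeds its Pod IP to the client:
# Start an iperf3 server Pod and wait until it is ready
kubectl run iperf3-server --image=networkstatic/iperf3 --command -- iperf3 -s
kubectl wait --for=condition=Ready pod/iperf3-server

# Point the client at the server's Pod IP
SERVER_IP=$(kubectl get pod iperf3-server -o jsonpath='{.status.podIP}')
kubectl run iperf3-client --image=networkstatic/iperf3 -it --rm --restart=Never --command -- iperf3 -c "$SERVER_IP"

# Clean up
kubectl delete pod iperf3-server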
Storage Benchmarks
# FIO test (the Job assumes a PVC named test-pvc; see the sketch below)
kubectl apply -f - << EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: fio-test
spec:
  template:
    spec:
      containers:
      - name: fio
        image: hrishikeshsuresh/fio:latest
        command: ["sh", "-c", "fio --name=iodemo --filename=/data/file --size=1G --runtime=60 --ioengine=libaio --direct=1 --bs=4k --rw=randwrite --iodepth=32"]
        volumeMounts:
        - name: data
          mountPath: /data
      volumes:
      - name: data
        persistentVolumeClaim:
          claimName: test-pvc
      restartPolicy: Never
EOF
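Creating the PVC the Job mounts and reading the result, sketched (size and access mode are assumptions; storageClassName is cluster-specific and omitted here):
# Create the PVC the Job mounts
kubectl apply -f - << EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: test-pvc
spec:
  accessModes: ["ReadWriteOnce"]
  resources:
    requests:
      storage: 2Gi
EOF

# Wait for completion and read the FIO report
kubectl wait --for=condition=complete job/fio-test --timeout=300s
kubectl logs job/fio-test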
Quick-Reference Troubleshooting Commands
# Cluster health
kubectl get componentstatuses   # deprecated since v1.19; prefer the raw health endpoints
kubectl get --raw='/healthz'
# Node problems
kubectl describe node <node-name>
kubectl top node
# Pod problems
kubectl get pods -o wide
kubectl logs <pod-name> -p   # logs of the previous container instance
# Network problems
kubectl get svc -A
kubectl get endpoints -A
# Resource problems
kubectl describe quota -A
kubectl describe limitranges -A
# Event inspection
kubectl get events --sort-by='.metadata.creationTimestamp'
kubectl get events -w
# Throwaway debug Pod
kubectl run debug --image=busybox:1.36 -it --rm --restart=Never -- sh
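For incident response these condense into a single snapshot script; a minimal sketch:
#!/usr/bin/env bash
# Capture a cluster snapshot into a timestamped directory for later analysis
set -euo pipefail
OUT="snapshot-$(date +%Y%m%d-%H%M%S)"
mkdir -p "$OUT"

kubectl get nodes -o wide > "$OUT/nodes.txt"
kubectl get pods -A -o wide > "$OUT/pods.txt"
kubectl get events -A --sort-by='.metadata.creationTimestamp' > "$OUT/events.txt"
kubectl get svc,endpoints -A > "$OUT/network.txt"
kubectl top nodes > "$OUT/top-nodes.txt" 2>/dev/null || true   # requires metrics-server

echo "snapshot written to $OUT/"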