Kubernetes故障排查与性能调优完全指南


categories:
  - Kubernetes运维
tags:
  - Kubernetes
  - 故障排查
  - 性能调优
  - 调试
  - 问题解决


故障排查流程


┌─────────────────────────────────────────────────────────────────┐
│                      故障排查流程                               │
│                                                                  │
│  1. 查看症状                                                    │
│     ├── kubectl get pods -A                                     │
│     ├── kubectl get events -A --sort-by='.lastTimestamp'       │
│     └── kubectl describe <resource> <name>                      │
│                                                                  │
│  2. 分析原因                                                    │
│     ├── kubectl logs <pod> -c <container>                       │
│     ├── kubectl exec -it <pod> -- /bin/sh                       │
│     └── journalctl -u kubelet -f                               │
│                                                                  │
│  3. 解决问题                                                    │
│     ├── kubectl apply -f <manifest>.yaml                        │
│     ├── kubectl delete pod <pod>                                │
│     └── kubectl rollout restart deployment <name>               │
└─────────────────────────────────────────────────────────────────┘

常见问题排查

Pod状态异常


# 查看Pod详细状态
kubectl describe pod <pod-name> -n <namespace>

# 查看Pod事件
kubectl get events -n <namespace> --field-selector involvedObject.name=<pod-name>

# 查看最近事件
kubectl get events -A --sort-by='.metadata.creationTimestamp'

# 常见状态:
# Pending    - 调度失败,可能是资源不足
# CrashLoopBackOff - 容器反复重启
# ImagePullBackOff - 镜像拉取失败
# Error      - 容器运行出错

Pod无法启动


# 1. 检查镜像名称
kubectl get pod <pod-name> -o jsonpath='{.spec.containers[*].image}'

# 2. 检查镜像仓库凭证
kubectl get secret -n <namespace>

# 3. 检查资源是否足够
kubectl describe nodes | grep -A 5 "Allocated resources"

# 4. 检查PVC挂载
kubectl describe pvc <pvc-name>

# 5. 检查SecurityContext
kubectl get pod <pod-name> -o yaml | grep -A 10 "securityContext"

Service不可用


# 1. 检查Service配置
kubectl describe service <service-name>

# 2. 检查Endpoint
kubectl get endpoints <service-name>

# 3. 检查Selector匹配
kubectl get pods -l <label-selector> --show-labels

# 4. 测试服务连通性
kubectl exec -it <pod-name> -- curl http://<service-name>:<port>

# 5. 检查NetworkPolicy
kubectl get networkpolicy -n <namespace>
kubectl describe networkpolicy <policy-name>

网络问题


# 1. 查看节点网络状态
kubectl get nodes -o wide

# 2. 检查CNI插件(以 kube-router 为例;其他CNI请替换标签,如 Calico 使用 k8s-app=calico-node)
kubectl get pods -n kube-system -l k8s-app=kube-router

# 3. 测试节点间连通性
kubectl exec -it <pod-name> -- ping <target-ip>

# 4. 查看网络策略
kubectl describe networkpolicy

# 5. 检查DNS
kubectl exec -it <pod-name> -- nslookup kubernetes.default
kubectl exec -it <pod-name> -- cat /etc/resolv.conf

性能问题排查


# 1. 查看资源使用
kubectl top pods -A --sort-by=memory
kubectl top pods -A --sort-by=cpu

# 2. 查看节点资源
kubectl describe node <node-name>

# 3. 查看调度失败原因
kubectl get events --field-selector reason=FailedScheduling

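# 4. 检查OOMKilled(容器上一次终止原因记录在Pod状态中,而不是应用日志里)
kubectl get pod <pod-name> -o jsonpath='{.status.containerStatuses[*].lastState.terminated.reason}'
kubectl describe pod <pod-name> | grep -i -A 5 "last state"

# 5. 检查CPU Throttling(查看容器cgroup统计中的 nr_throttled 与被限流时间)
kubectl exec <pod-name> -- cat /sys/fs/cgroup/cpu.stat        # cgroup v2
kubectl exec <pod-name> -- cat /sys/fs/cgroup/cpu/cpu.stat    # cgroup v1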

日志分析

获取各类日志


# 应用日志
kubectl logs <pod-name> -n <namespace>
kubectl logs <pod-name> -n <namespace> --tail=100
kubectl logs <pod-name> -n <namespace> -f --since=1h

# Kubernetes组件日志
journalctl -u kubelet -f
# 以下两条仅适用于以 systemd 服务方式部署的控制面;
# kubeadm 集群中控制面以静态Pod运行,请改用 kubectl logs -n kube-system <组件Pod名>
journalctl -u kube-apiserver -f
journalctl -u kube-scheduler -f

# 容器运行时日志
# Docker
docker logs <container-id>

# Containerd
crictl logs <container-id>

日志聚合查询(Loki)


# 查看错误日志
{namespace="production",app="myapp"} |= "ERROR"

# 查看最近1小时日志(LogQL的时间范围由查询区间决定,不能写在管道过滤器中;例如使用 logcli 的 --since 参数)
logcli query --since=1h '{namespace="production",app="myapp"} | json'

# 统计错误数量
count_over_time({namespace="production"} |= "ERROR"[1h])

# 查看特定Pod日志
{namespace="production",pod="myapp-pod-xxx"}

性能调优

节点级别调优


# 优化内核参数
cat >> /etc/sysctl.d/95-k8s.conf << EOF
# 网络优化
net.core.somaxconn = 65535
net.ipv4.ip_local_port_range = 1024 65535
net.core.netdev_max_backlog = 65535

# 文件描述符
fs.file-max = 2097152
fs.inotify.max_user_watches = 524288

# 内存管理
vm.swappiness = 10
vm.vfs_cache_pressure = 50
EOF

sysctl --system

容器运行时调优


# Containerd配置 /etc/containerd/config.toml
[plugins."io.containerd.grpc.v1.cri"]
  sandbox_image = "registry.k8s.io/pause:3.9"

[plugins."io.containerd.grpc.v1.cri".containerd]
  default_runtime_name = "runc"
  [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
    runtime_type = "io.containerd.runc.v2"
    [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
      SystemdCgroup = true
      # 容器IO管道(stdio)所属的GID/UID
      IoGid = 1000
      IoUid = 1000
      # PIDs限制:runc options 中没有 PidsLimit 选项,
      # Pod级PID上限请在kubelet配置中设置 podPidsLimit(例如 podPidsLimit: 4096)

Pod性能优化


apiVersion: apps/v1
kind: Deployment
metadata:
  name: optimized-app
spec:
  # apps/v1 要求显式声明 selector,且必须与 template 的 labels 匹配
  selector:
    matchLabels:
      app: optimized-app
  template:
    metadata:
      labels:
        app: optimized-app
    spec:
      # 优雅关闭(Pod级字段,不能写在容器下)
      terminationGracePeriodSeconds: 30
      containers:
      - name: app
        image: myapp:v1
        # 资源优化
        resources:
          requests:
            cpu: 500m
            memory: 512Mi
          limits:
            cpu: 2000m
            memory: 2Gi
        # 启动优化
        startupProbe:
          httpGet:
            path: /health
            port: 8080
          failureThreshold: 30
          periodSeconds: 10
        # 就绪探测
        readinessProbe:
          httpGet:
            path: /ready
            port: 8080
          initialDelaySeconds: 5
          periodSeconds: 10
        # 存活探测
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8080
          initialDelaySeconds: 15
          periodSeconds: 20
        # 禁用特权
        securityContext:
          allowPrivilegeEscalation: false
          readOnlyRootFilesystem: true

KubeScheduler调优


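# Scheduler配置 /etc/kubernetes/scheduler-config.yaml
apiVersion: kubescheduler.config.k8s.io/v1
kind: KubeSchedulerConfiguration
leaderElection:
  leaderElect: true
  # v1 中仅支持 leases 锁,endpoints 锁已被移除
  resourceLock: leases
  resourceNamespace: kube-system
  resourceName: kube-scheduler
profiles:
- schedulerName: default-scheduler
  percentageOfNodesToScore: 50
  plugins:
    score:
      # 先禁用全部默认打分插件,再只启用需要的插件;"*" 必须加引号,否则不是合法YAML
      disabled:
      - name: "*"
      enabled:
      - name: ImageLocality
        weight: 1
      # LeastRequested 已不存在,对应 NodeResourcesFit 的 LeastAllocated 打分策略
      - name: NodeResourcesFit
        weight: 1
  pluginConfig:
  - name: NodeResourcesFit
    args:
      scoringStrategy:
        type: LeastAllocated
  # bindTimeoutSeconds 是 VolumeBinding 插件的参数,而不是 profile 的字段
  - name: VolumeBinding
    args:
      bindTimeoutSeconds: 600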

诊断工具

kubectl-debug


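# kubectl debug 已内置于 kubectl(v1.20+),无需额外安装插件
# 向运行中的Pod注入临时调试容器
kubectl debug -it <pod-name> --image=busybox --target=<container-name>

# 调试节点(在节点上启动调试Pod,节点根文件系统挂载于 /host)
kubectl debug node/node1 -it --image=busybox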

kube-bench


# 运行CIS基准测试
kube-bench run --targets=node
kube-bench run --targets=master
kube-bench run --targets=etcd

kubectl-tree


# 查看资源关系
kubectl tree deployment myapp -n namespace

性能基准测试

网络基准测试


# iPerf3测试
# 该镜像的 ENTRYPOINT 已是 iperf3,"--" 之后只需传递参数
kubectl run iperf3 --image=networkstatic/iperf3 -it --rm -- -c <iperf3-server-ip>

# 网络延迟测试
kubectl exec -it <pod-name> -- ping <target-ip>

# CNI性能测试
kubectl run --rm -it --restart=Never --image=busybox:1.36 netperf -- sh -c 'for i in $(seq 1 100); do nc -zv <target-ip> <port>; done'

存储基准测试


# FIO测试
kubectl apply -f - << EOF
apiVersion: batch/v1
kind: Job
metadata:
  name: fio-test
spec:
  template:
    spec:
      containers:
      - name: fio
        image: hrishikeshsuresh/fio:latest
        command: ["sh", "-c", "fio --name=iodemo --filename=/data/file --size=1G --runtime=60 --ioengine=libaio --direct=1 --bs=4k --rw=randwrite --iodepth=32"]
        volumeMounts:
        - name: data
          mountPath: /data
      volumes:
      - name: data
        persistentVolumeClaim:
          claimName: test-pvc
      restartPolicy: Never
EOF

常用故障排查命令速查


# 集群健康(componentstatuses 自 v1.19 起已弃用,推荐使用 /readyz、/livez)
kubectl get componentstatuses
kubectl get --raw='/healthz'
kubectl get --raw='/readyz?verbose'

# 节点问题
kubectl describe node <node-name>
kubectl top node

# Pod问题
kubectl get pods -o wide
kubectl logs <pod-name> -p

# 网络问题
kubectl get svc -A
kubectl get endpoints -A

# 资源问题
kubectl describe quota -A
kubectl describe limitranges -A

# 事件排查
kubectl get events --sort-by='.metadata.creationTimestamp'
kubectl get events -w

# 临时调试Pod
kubectl run debug --image=busybox:1.36 -it --rm --restart=Never -- sh

发表回复

后才能评论