The single-master cluster I threw together a few days ago as a quick experiment died today. Things moved a little too fast that night: one poweroff to shut it down, and now it's gone... (っ °Д °;)っ
Troubleshooting
Symptoms:
- Cannot connect to the API server; the k8s_kube-apiserver container keeps restarting
- kubelet itself is in the running state
- The k8s_etcd container keeps restarting as well
[root@node1 ~]# kubectl get nodes
The connection to the server 192.168.174.132:6443 was refused - did you specify the right host or port?
[root@node1 ~]# netstat -tunlp | grep 6443
[root@node1 ~]# docker ps -a
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
610937b240ee 303ce5db0e90 "etcd --advertise-cl…" 36 seconds ago Exited (2) 35 seconds ago k8s_etcd_etcd-node1_kube-system_70f631342af8c34c78512d527c75a699_20
83126c1ff8f7 e9585e7d0849 "kube-apiserver --ad…" About a minute ago Exited (2) About a minute ago k8s_kube-apiserver_kube-apiserver-node1_kube-system_8a86cf7abb8aa60e9d536609c8d67ee8_18
8d7f7564bb6f c53af2e3d068 "kube-scheduler --au…" 36 minutes ago Up 36 minutes k8s_kube-scheduler_kube-scheduler-node1_kube-system_119b9a7e2b8d8fc97be3274e089bc788_10
df206b057512 dacf3f247065 "kube-controller-man…" 36 minutes ago Up 36 minutes k8s_kube-controller-manager_kube-controller-manager-node1_kube-system_356740264271957bc1f4e25572eff010_11
d7c013b87427 registry.aliyuncs.com/k8sxio/pause:3.2 "/pause" 36 minutes ago Up 36 minutes k8s_POD_kube-scheduler-node1_kube-system_119b9a7e2b8d8fc97be3274e089bc788_9
cd293244ff77 registry.aliyuncs.com/k8sxio/pause:3.2 "/pause" 36 minutes ago Up 36 minutes k8s_POD_kube-controller-manager-node1_kube-system_356740264271957bc1f4e25572eff010_9
ccf976a0254d registry.aliyuncs.com/k8sxio/pause:3.2 "/pause" 36 minutes ago Up 36 minutes k8s_POD_kube-apiserver-node1_kube-system_8a86cf7abb8aa60e9d536609c8d67ee8_9
7e1d0711d339 registry.aliyuncs.com/k8sxio/pause:3.2 "/pause" 36 minutes ago Up 36 minutes k8s_POD_etcd-node1_kube-system_70f631342af8c34c78512d527c75a699_9
c27503bf4524 dacf3f247065 "kube-controller-man…" 46 minutes ago Exited (2) 37 minutes ago k8s_kube-controller-manager_kube-controller-manager-node1_kube-system_356740264271957bc1f4e25572eff010_10
2b94d34f3502 c53af2e3d068 "kube-scheduler --au…" 46 minutes ago Exited (2) 37 minutes ago k8s_kube-scheduler_kube-scheduler-node1_kube-system_119b9a7e2b8d8fc97be3274e089bc788_9
e3c04d75b70a registry.aliyuncs.com/k8sxio/pause:3.2 "/pause" 46 minutes ago Exited (0) 37 minutes ago k8s_POD_kube-controller-manager-node1_kube-system_356740264271957bc1f4e25572eff010_8
c42054b6f010 registry.aliyuncs.com/k8sxio/pause:3.2 "/pause" 46 minutes ago Exited (0) 37 minutes ago k8s_POD_kube-scheduler-node1_kube-system_119b9a7e2b8d8fc97be3274e089bc788_8
Checking the logs
kubelet logs
# kubelet reports that node1 cannot be reached, yet node1 pings fine... keep digging
[root@node1 ~]# systemctl status kubelet
● kubelet.service - kubelet: The Kubernetes Node Agent
Loaded: loaded (/usr/lib/systemd/system/kubelet.service; enabled; vendor preset: disabled)
Drop-In: /usr/lib/systemd/system/kubelet.service.d
└─10-kubeadm.conf
Active: active (running) since 四 2021-03-05 22:19:08 CST; 38min ago
Docs: https://kubernetes.io/docs/
Main PID: 9563 (kubelet)
Tasks: 27
Memory: 161.2M
CGroup: /system.slice/kubelet.service
└─9563 /usr/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf --config=/var/lib/kubelet/config.yaml --cgroup-driver=systemd --network-plugin=cni --pod-infra-container-image=registry.aliyuncs.com/k8sxio/pause:3.2...
3月 05 22:57:18 node1 kubelet[9563]: E0505 22:57:18.700685 9563 controller.go:136] failed to ensure node lease exists, will retry in 7s, error: Get https://192.168.174.132:6443/apis/coordination.k8s.io/v1/namespaces/kube-node-lease/leases/node1?timeout=10s: dia...ect: connection refused
3月 05 22:57:18 node1 kubelet[9563]: E0505 22:57:18.725161 9563 kubelet.go:2270] node "node1" not found
3月 05 22:57:18 node1 kubelet[9563]: E0505 22:57:18.829260 9563 kubelet.go:2270] node "node1" not found
3月 05 22:57:18 node1 kubelet[9563]: E0505 22:57:18.930948 9563 kubelet.go:2270] node "node1" not found
3月 05 22:57:19 node1 kubelet[9563]: E0505 22:57:19.034322 9563 kubelet.go:2270] node "node1" not found
3月 05 22:57:19 node1 kubelet[9563]: E0505 22:57:19.137015 9563 kubelet.go:2270] node "node1" not found
3月 05 22:57:19 node1 kubelet[9563]: E0505 22:57:19.239107 9563 kubelet.go:2270] node "node1" not found
3月 05 22:57:19 node1 kubelet[9563]: E0505 22:57:19.339725 9563 kubelet.go:2270] node "node1" not found
3月 05 22:57:19 node1 kubelet[9563]: E0505 22:57:19.441224 9563 kubelet.go:2270] node "node1" not found
3月 05 22:57:19 node1 kubelet[9563]: E0505 22:57:19.541954 9563 kubelet.go:2270] node "node1" not found
Hint: Some lines were ellipsized, use -l to show in full.
[root@node1 ~]# ping node1
PING node1 (192.168.174.132) 56(84) bytes of data.
64 bytes from node1 (192.168.174.132): icmp_seq=1 ttl=64 time=0.058 ms
64 bytes from node1 (192.168.174.132): icmp_seq=2 ttl=64 time=0.085 ms
^C
--- node1 ping statistics ---
2 packets transmitted, 2 received, 0% packet loss, time 1001ms
rtt min/avg/max/mdev = 0.058/0.071/0.085/0.015 ms
Checking the kube-apiserver logs
# It turns out the apiserver cannot connect to etcd
[root@node1 ~]# docker ps -a | grep apiserver
83126c1ff8f7 e9585e7d0849 "kube-apiserver --ad…" 4 minutes ago Exited (2) 4 minutes ago k8s_kube-apiserver_kube-apiserver-node1_kube-system_8a86cf7abb8aa60e9d536609c8d67ee8_18
ccf976a0254d registry.aliyuncs.com/k8sxio/pause:3.2 "/pause" 39 minutes ago Up 39 minutes k8s_POD_kube-apiserver-node1_kube-system_8a86cf7abb8aa60e9d536609c8d67ee8_9
[root@node1 ~]#
[root@node1 ~]# docker logs 83126c1ff8f7
Flag --insecure-port has been deprecated, This flag will be removed in a future version.
I0505 14:54:08.197472 1 server.go:618] external host was not specified, using 192.168.174.132
I0505 14:54:08.197938 1 server.go:148] Version: v1.18.9
I0505 14:54:09.179551 1 plugins.go:158] Loaded 12 mutating admission controller(s) successfully in the following order: NamespaceLifecycle,LimitRanger,ServiceAccount,NodeRestriction,TaintNodesByCondition,Priority,DefaultTolerationSeconds,DefaultStorageClass,StorageObjectInUseProtection,RuntimeClass,DefaultIngressClass,MutatingAdmissionWebhook.
I0505 14:54:09.179591 1 plugins.go:161] Loaded 10 validating admission controller(s) successfully in the following order: LimitRanger,ServiceAccount,Priority,PersistentVolumeClaimResize,RuntimeClass,CertificateApproval,CertificateSigning,CertificateSubjectRestriction,ValidatingAdmissionWebhook,ResourceQuota.
I0505 14:54:09.180508 1 plugins.go:158] Loaded 12 mutating admission controller(s) successfully in the following order: NamespaceLifecycle,LimitRanger,ServiceAccount,NodeRestriction,TaintNodesByCondition,Priority,DefaultTolerationSeconds,DefaultStorageClass,StorageObjectInUseProtection,RuntimeClass,DefaultIngressClass,MutatingAdmissionWebhook.
I0505 14:54:09.180548 1 plugins.go:161] Loaded 10 validating admission controller(s) successfully in the following order: LimitRanger,ServiceAccount,Priority,PersistentVolumeClaimResize,RuntimeClass,CertificateApproval,CertificateSigning,CertificateSubjectRestriction,ValidatingAdmissionWebhook,ResourceQuota.
I0505 14:54:09.182885 1 client.go:361] parsed scheme: "endpoint"
I0505 14:54:09.182951 1 endpoint.go:68] ccResolverWrapper: sending new addresses to cc: [{https://127.0.0.1:2379 <nil> 0 <nil>}]
W0505 14:54:09.183371 1 clientconn.go:1208] grpc: addrConn.createTransport failed to connect to {https://127.0.0.1:2379 <nil> 0 <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
I0505 14:54:10.179592 1 client.go:361] parsed scheme: "endpoint"
I0505 14:54:10.179714 1 endpoint.go:68] ccResolverWrapper: sending new addresses to cc: [{https://127.0.0.1:2379 <nil> 0 <nil>}]
W0505 14:54:10.180511 1 clientconn.go:1208] grpc: addrConn.createTransport failed to connect to {https://127.0.0.1:2379 <nil> 0 <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0505 14:54:10.184406 1 clientconn.go:1208] grpc: addrConn.createTransport failed to connect to {https://127.0.0.1:2379 <nil> 0 <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0505 14:54:11.181411 1 clientconn.go:1208] grpc: addrConn.createTransport failed to connect to {https://127.0.0.1:2379 <nil> 0 <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0505 14:54:12.018639 1 clientconn.go:1208] grpc: addrConn.createTransport failed to connect to {https://127.0.0.1:2379 <nil> 0 <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0505 14:54:13.016328 1 clientconn.go:1208] grpc: addrConn.createTransport failed to connect to {https://127.0.0.1:2379 <nil> 0 <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0505 14:54:14.842734 1 clientconn.go:1208] grpc: addrConn.createTransport failed to connect to {https://127.0.0.1:2379 <nil> 0 <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0505 14:54:15.491639 1 clientconn.go:1208] grpc: addrConn.createTransport failed to connect to {https://127.0.0.1:2379 <nil> 0 <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0505 14:54:19.219028 1 clientconn.go:1208] grpc: addrConn.createTransport failed to connect to {https://127.0.0.1:2379 <nil> 0 <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0505 14:54:19.724433 1 clientconn.go:1208] grpc: addrConn.createTransport failed to connect to {https://127.0.0.1:2379 <nil> 0 <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0505 14:54:25.463756 1 clientconn.go:1208] grpc: addrConn.createTransport failed to connect to {https://127.0.0.1:2379 <nil> 0 <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
W0505 14:54:25.624701 1 clientconn.go:1208] grpc: addrConn.createTransport failed to connect to {https://127.0.0.1:2379 <nil> 0 <nil>}. Err :connection error: desc = "transport: Error while dialing dial tcp 127.0.0.1:2379: connect: connection refused". Reconnecting...
panic: context deadline exceeded
goroutine 1 [running]:
k8s.io/kubernetes/vendor/k8s.io/apiextensions-apiserver/pkg/registry/customresourcedefinition.NewREST(0xc0008ae150, 0x50ebdc0, 0xc0000f87e0, 0xc0000f8a08)
/workspace/anago-v1.18.9-rc.0.79+8147d851af540a/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/vendor/k8s.io/apiextensions-apiserver/pkg/registry/customresourcedefinition/etcd.go:56 +0x3e7
k8s.io/kubernetes/vendor/k8s.io/apiextensions-apiserver/pkg/apiserver.completedConfig.New(0xc000a02800, 0xc000434b48, 0x51aa780, 0x774b858, 0x10, 0x0, 0x0)
/workspace/anago-v1.18.9-rc.0.79+8147d851af540a/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/vendor/k8s.io/apiextensions-apiserver/pkg/apiserver/apiserver.go:145 +0x14ef
k8s.io/kubernetes/cmd/kube-apiserver/app.createAPIExtensionsServer(0xc000434b40, 0x51aa780, 0x774b858, 0x0, 0x50eb980, 0xc0004a1cc0)
/workspace/anago-v1.18.9-rc.0.79+8147d851af540a/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/cmd/kube-apiserver/app/apiextensions.go:102 +0x59
k8s.io/kubernetes/cmd/kube-apiserver/app.CreateServerChain(0xc000355b80, 0xc0002d8420, 0x455f9f4, 0xc, 0xc0009d7c48)
/workspace/anago-v1.18.9-rc.0.79+8147d851af540a/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/cmd/kube-apiserver/app/server.go:181 +0x2b8
k8s.io/kubernetes/cmd/kube-apiserver/app.Run(0xc000355b80, 0xc0002d8420, 0x0, 0x0)
/workspace/anago-v1.18.9-rc.0.79+8147d851af540a/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/cmd/kube-apiserver/app/server.go:150 +0x101
k8s.io/kubernetes/cmd/kube-apiserver/app.NewAPIServerCommand.func1(0xc000155400, 0xc0008476c0, 0x0, 0x1a, 0x0, 0x0)
/workspace/anago-v1.18.9-rc.0.79+8147d851af540a/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/cmd/kube-apiserver/app/server.go:117 +0x104
k8s.io/kubernetes/vendor/github.com/spf13/cobra.(*Command).execute(0xc000155400, 0xc0000e0010, 0x1a, 0x1b, 0xc000155400, 0xc0000e0010)
/workspace/anago-v1.18.9-rc.0.79+8147d851af540a/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/vendor/github.com/spf13/cobra/command.go:826 +0x460
k8s.io/kubernetes/vendor/github.com/spf13/cobra.(*Command).ExecuteC(0xc000155400, 0x16ec3d31fa156e96, 0x772d680, 0xc000076750)
/workspace/anago-v1.18.9-rc.0.79+8147d851af540a/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/vendor/github.com/spf13/cobra/command.go:914 +0x2fb
k8s.io/kubernetes/vendor/github.com/spf13/cobra.(*Command).Execute(...)
/workspace/anago-v1.18.9-rc.0.79+8147d851af540a/src/k8s.io/kubernetes/_output/dockerized/go/src/k8s.io/kubernetes/vendor/github.com/spf13/cobra/command.go:864
main.main()
_output/dockerized/go/src/k8s.io/kubernetes/cmd/kube-apiserver/apiserver.go:43 +0xcd
Checking the etcd logs
# The etcd data is corrupted...
[root@node1 ~]# docker ps -a | grep etcd
610937b240ee 303ce5db0e90 "etcd --advertise-cl…" 4 minutes ago Exited (2) 4 minutes ago k8s_etcd_etcd-node1_kube-system_70f631342af8c34c78512d527c75a699_20
7e1d0711d339 registry.aliyuncs.com/k8sxio/pause:3.2 "/pause" 41 minutes ago Up 41 minutes k8s_POD_etcd-node1_kube-system_70f631342af8c34c78512d527c75a699_9
[root@node1 ~]# docker logs 610937b240ee
[WARNING] Deprecated '--logger=capnslog' flag is set; use '--logger=zap' flag instead
2021-03-05 14:55:23.076848 I | etcdmain: etcd Version: 3.4.3
2021-03-05 14:55:23.076892 I | etcdmain: Git SHA: 3cf2f69b5
2021-03-05 14:55:23.076897 I | etcdmain: Go Version: go1.12.12
2021-03-05 14:55:23.076900 I | etcdmain: Go OS/Arch: linux/amd64
2021-03-05 14:55:23.076904 I | etcdmain: setting maximum number of CPUs to 8, total number of available CPUs is 8
2021-03-05 14:55:23.076988 N | etcdmain: the server is already initialized as member before, starting as etcd member...
[WARNING] Deprecated '--logger=capnslog' flag is set; use '--logger=zap' flag instead
2021-03-05 14:55:23.077098 I | embed: peerTLS: cert = /etc/kubernetes/pki/etcd/peer.crt, key = /etc/kubernetes/pki/etcd/peer.key, trusted-ca = /etc/kubernetes/pki/etcd/ca.crt, client-cert-auth = true, crl-file =
2021-03-05 14:55:23.078053 I | embed: name = node1
2021-03-05 14:55:23.078114 I | embed: data dir = /var/lib/etcd
2021-03-05 14:55:23.078123 I | embed: member dir = /var/lib/etcd/member
2021-03-05 14:55:23.078127 I | embed: heartbeat = 100ms
2021-03-05 14:55:23.078130 I | embed: election = 1000ms
2021-03-05 14:55:23.078134 I | embed: snapshot count = 10000
2021-03-05 14:55:23.078205 I | embed: advertise client URLs = https://192.168.174.132:2379
2021-03-05 14:55:23.078242 I | embed: initial advertise peer URLs = https://192.168.174.132:2380
2021-03-05 14:55:23.078251 I | embed: initial cluster =
2021-03-05 14:55:23.082775 I | etcdserver: recovered store from snapshot at index 170018
2021-03-05 14:55:23.094694 C | etcdserver: recovering backend from snapshot error: failed to find database snapshot file (snap: snapshot file doesn't exist)
panic: recovering backend from snapshot error: failed to find database snapshot file (snap: snapshot file doesn't exist)
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x20 pc=0xc2cc4e]
goroutine 1 [running]:
go.etcd.io/etcd/etcdserver.NewServer.func1(0xc0002aaf50, 0xc0002a8f48)
/tmp/etcd-release-3.4.3/etcd/release/etcd/etcdserver/server.go:335 +0x3e
panic(0xed6960, 0xc00007acb0)
/usr/local/go/src/runtime/panic.go:522 +0x1b5
github.com/coreos/pkg/capnslog.(*PackageLogger).Panicf(0xc0001b88e0, 0x10aeaf5, 0x2a, 0xc0002a9018, 0x1, 0x1)
/home/ec2-user/go/pkg/mod/github.com/coreos/pkg@v0.0.0-20160727233714-3ac0863d7acf/capnslog/pkg_logger.go:75 +0x135
go.etcd.io/etcd/etcdserver.NewServer(0x7ffc01b57e80, 0x5, 0x0, 0x0, 0x0, 0x0, 0xc000230f00, 0x1, 0x1, 0xc000231080, ...)
/tmp/etcd-release-3.4.3/etcd/release/etcd/etcdserver/server.go:456 +0x42f7
go.etcd.io/etcd/embed.StartEtcd(0xc00026c000, 0xc00026c580, 0x0, 0x0)
/tmp/etcd-release-3.4.3/etcd/release/etcd/embed/etcd.go:211 +0x9d0
go.etcd.io/etcd/etcdmain.startEtcd(0xc00026c000, 0x108423e, 0x6, 0x1, 0xc0001d51d0)
/tmp/etcd-release-3.4.3/etcd/release/etcd/etcdmain/etcd.go:302 +0x40
go.etcd.io/etcd/etcdmain.startEtcdOrProxyV2()
/tmp/etcd-release-3.4.3/etcd/release/etcd/etcdmain/etcd.go:144 +0x2f71
go.etcd.io/etcd/etcdmain.Main()
/tmp/etcd-release-3.4.3/etcd/release/etcd/etcdmain/main.go:46 +0x38
main.main()
/tmp/etcd-release-3.4.3/etcd/release/etcd/main.go:28 +0x20
How do I get the cluster back into a usable state?
- etcd is a single member with no backups, and I could not find a way to repair the data...
- With no backup, the only option left is to reset and rebuild... (a rough sketch of the reset steps follows below)
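For reference, a minimal sketch of what the pre-rebuild cleanup looks like on a kubeadm node (not a verbatim record of what I ran; the CNI and iptables cleanup in particular varies by environment):
# Tear down the broken control plane (for a kubeadm-managed local etcd this also wipes /var/lib/etcd).
kubeadm reset -f
# kubeadm reset does not clean CNI config, kubeconfig files, or iptables rules; do that by hand.
rm -rf /etc/cni/net.d $HOME/.kube/config
iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X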
Improvements
- Run a multi-member etcd cluster so the control plane can survive a node failure
- Add a scheduled etcd snapshot backup job (local or remote); with a snapshot on hand, a restore looks roughly like the sketch below
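A minimal restore sketch for a single-member, kubeadm-managed etcd (assumptions: an etcdctl binary on the host, the standard /etc/kubernetes/manifests and /var/lib/etcd paths from this setup, and a placeholder snapshot filename):
# Move the static pod manifest away so kubelet stops the etcd container.
mv /etc/kubernetes/manifests/etcd.yaml /tmp/
# Rebuild a data directory from the snapshot file.
ETCDCTL_API=3 etcdctl snapshot restore /path/to/snapshot.db \
  --name node1 \
  --initial-cluster node1=https://192.168.174.132:2380 \
  --initial-advertise-peer-urls https://192.168.174.132:2380 \
  --data-dir /var/lib/etcd-restored
# Swap the restored data in and let kubelet recreate the static pod.
mv /var/lib/etcd /var/lib/etcd.broken
mv /var/lib/etcd-restored /var/lib/etcd
mv /tmp/etcd.yaml /etc/kubernetes/manifests/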
Rebuilding the k8s cluster
[root@node1 ~]# export MASTER_IP=192.168.174.132
[root@node1 ~]# export APISERVER_NAME=apiserver.demo
[root@node1 ~]# export POD_SUBNET=10.100.0.1/16
[root@node1 ~]# curl -sSL https://kuboard.cn/install-script/v1.18.x/init_master.sh | sh -s 1.18.9
W0505 23:22:25.953211 61909 configset.go:202] WARNING: kubeadm cannot validate component configs for API groups [kubelet.config.k8s.io kubeproxy.config.k8s.io]
[init] Using Kubernetes version: v1.18.9
[preflight] Running pre-flight checks
[preflight] Pulling images required for setting up a Kubernetes cluster
[preflight] This might take a minute or two, depending on the speed of your internet connection
[preflight] You can also perform this action in beforehand using 'kubeadm config images pull'
[kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env"
[kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml"
[kubelet-start] Starting the kubelet
[certs] Using certificateDir folder "/etc/kubernetes/pki"
[certs] Generating "ca" certificate and key
[certs] Generating "apiserver" certificate and key
[certs] apiserver serving cert is signed for DNS names [node1 kubernetes kubernetes.default kubernetes.default.svc kubernetes.default.svc.cluster.local apiserver.demo] and IPs [10.96.0.1 192.168.174.132]
[certs] Generating "apiserver-kubelet-client" certificate and key
[certs] Generating "front-proxy-ca" certificate and key
[certs] Generating "front-proxy-client" certificate and key
[certs] Generating "etcd/ca" certificate and key
[certs] Generating "etcd/server" certificate and key
[certs] etcd/server serving cert is signed for DNS names [node1 localhost] and IPs [192.168.174.132 127.0.0.1 ::1]
[certs] Generating "etcd/peer" certificate and key
[certs] etcd/peer serving cert is signed for DNS names [node1 localhost] and IPs [192.168.174.132 127.0.0.1 ::1]
[certs] Generating "etcd/healthcheck-client" certificate and key
[certs] Generating "apiserver-etcd-client" certificate and key
[certs] Generating "sa" key and public key
[kubeconfig] Using kubeconfig folder "/etc/kubernetes"
[kubeconfig] Writing "admin.conf" kubeconfig file
[kubeconfig] Writing "kubelet.conf" kubeconfig file
[kubeconfig] Writing "controller-manager.conf" kubeconfig file
[kubeconfig] Writing "scheduler.conf" kubeconfig file
[control-plane] Using manifest folder "/etc/kubernetes/manifests"
[control-plane] Creating static Pod manifest for "kube-apiserver"
[control-plane] Creating static Pod manifest for "kube-controller-manager"
W0505 23:22:32.767864 61909 manifests.go:225] the default kube-apiserver authorization-mode is "Node,RBAC"; using "Node,RBAC"
[control-plane] Creating static Pod manifest for "kube-scheduler"
W0505 23:22:32.769963 61909 manifests.go:225] the default kube-apiserver authorization-mode is "Node,RBAC"; using "Node,RBAC"
[etcd] Creating static Pod manifest for local etcd in "/etc/kubernetes/manifests"
[wait-control-plane] Waiting for the kubelet to boot up the control plane as static Pods from directory "/etc/kubernetes/manifests". This can take up to 4m0s
[apiclient] All control plane components are healthy after 14.004460 seconds
[upload-config] Storing the configuration used in ConfigMap "kubeadm-config" in the "kube-system" Namespace
[kubelet] Creating a ConfigMap "kubelet-config-1.18" in namespace kube-system with the configuration for the kubelets in the cluster
[upload-certs] Storing the certificates in Secret "kubeadm-certs" in the "kube-system" Namespace
[upload-certs] Using certificate key:
b5bd6ca1af3cca6cdc137cae27298f0f53a4de005dbd44b0f9e38be5ca60d5c0
[mark-control-plane] Marking the node node1 as control-plane by adding the label "node-role.kubernetes.io/master=''"
[mark-control-plane] Marking the node node1 as control-plane by adding the taints [node-role.kubernetes.io/master:NoSchedule]
[bootstrap-token] Using token: 0j23ia.d03aoazakrhu0615
[bootstrap-token] Configuring bootstrap tokens, cluster-info ConfigMap, RBAC Roles
[bootstrap-token] configured RBAC rules to allow Node Bootstrap tokens to get nodes
[bootstrap-token] configured RBAC rules to allow Node Bootstrap tokens to post CSRs in order for nodes to get long term certificate credentials
[bootstrap-token] configured RBAC rules to allow the csrapprover controller automatically approve CSRs from a Node Bootstrap Token
[bootstrap-token] configured RBAC rules to allow certificate rotation for all node client certificates in the cluster
[bootstrap-token] Creating the "cluster-info" ConfigMap in the "kube-public" namespace
[kubelet-finalize] Updating "/etc/kubernetes/kubelet.conf" to point to a rotatable kubelet client certificate and key
[addons] Applied essential addon: CoreDNS
[addons] Applied essential addon: kube-proxy
Your Kubernetes control-plane has initialized successfully!
To start using your cluster, you need to run the following as a regular user:
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
You should now deploy a pod network to the cluster.
Run "kubectl apply -f [podnetwork].yaml" with one of the options listed at:
https://kubernetes.io/docs/concepts/cluster-administration/addons/
You can now join any number of the control-plane node running the following command on each as root:
kubeadm join apiserver.demo:6443 --token 0j23ia.d03aoazakrhu0615 \
--discovery-token-ca-cert-hash sha256:05beff7ea5ce48c6d5feb27559b17c1c3c99f6aac1e0766a9587583be492fb4a \
--control-plane --certificate-key b5bd6ca1af3cca6cdc137cae27298f0f53a4de005dbd44b0f9e38be5ca60d5c0
Please note that the certificate-key gives access to cluster sensitive data, keep it secret!
As a safeguard, uploaded-certs will be deleted in two hours; If necessary, you can use
"kubeadm init phase upload-certs --upload-certs" to reload certs afterward.
Then you can join any number of worker nodes by running the following on each as root:
kubeadm join apiserver.demo:6443 --token 0j23ia.d03aoazakrhu0615 \
--discovery-token-ca-cert-hash sha256:05beff7ea5ce48c6d5feb27559b17c1c3c99f6aac1e0766a9587583be492fb4a
安装calico-3.13.1
--2021-05-05 23:22:48-- https://kuboard.cn/install-script/calico/calico-3.13.1.yaml
正在解析主机 kuboard.cn (kuboard.cn)... 119.3.92.138, 122.112.240.69
正在连接 kuboard.cn (kuboard.cn)|119.3.92.138|:443... 已连接。
已发出 HTTP 请求,正在等待回应... 200 OK
长度:21077 (21K) [application/octet-stream]
正在保存至: “calico-3.13.1.yaml”
100%[=========================================================================================================================================================================================================================================================>] 21,077 --.-K/s 用时 0s
2021-05-05 23:22:49 (252 MB/s) - 已保存 “calico-3.13.1.yaml” [21077/21077])
configmap/calico-config created
customresourcedefinition.apiextensions.k8s.io/bgpconfigurations.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/bgppeers.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/blockaffinities.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/clusterinformations.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/felixconfigurations.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/globalnetworkpolicies.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/globalnetworksets.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/hostendpoints.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/ipamblocks.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/ipamconfigs.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/ipamhandles.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/ippools.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/networkpolicies.crd.projectcalico.org created
customresourcedefinition.apiextensions.k8s.io/networksets.crd.projectcalico.org created
clusterrole.rbac.authorization.k8s.io/calico-kube-controllers created
clusterrolebinding.rbac.authorization.k8s.io/calico-kube-controllers created
clusterrole.rbac.authorization.k8s.io/calico-node created
clusterrolebinding.rbac.authorization.k8s.io/calico-node created
daemonset.apps/calico-node created
serviceaccount/calico-node created
deployment.apps/calico-kube-controllers created
serviceaccount/calico-kube-controllers created
[root@node1 ~]# kubectl get nodes
NAME STATUS ROLES AGE VERSION
node1 Ready master 87s v1.18.9
[root@node1 ~]# kubectl get pods -A
NAMESPACE NAME READY STATUS RESTARTS AGE
kube-system calico-kube-controllers-5b8b769fcd-wvgnb 1/1 Running 0 90s
kube-system calico-node-mjlzh 1/1 Running 0 90s
kube-system coredns-66db54ff7f-4p496 1/1 Running 0 90s
kube-system coredns-66db54ff7f-f27tw 1/1 Running 0 90s
kube-system etcd-node1 1/1 Running 0 105s
kube-system kube-apiserver-node1 1/1 Running 0 105s
kube-system kube-controller-manager-node1 1/1 Running 0 105s
kube-system kube-proxy-27jzw 1/1 Running 0 90s
kube-system kube-scheduler-node1 1/1 Running 0 105s
etcd snapshot backup job
Check the etcd pod status
[root@node1 ~]# kubectl describe pods -n kube-system etcd-node1
Name: etcd-node1
Namespace: kube-system
Priority: 2000000000
Priority Class Name: system-cluster-critical
Node: node1/192.168.174.132
Start Time: Thu, 03 May 2021 23:22:49 +0800
Labels: component=etcd
tier=control-plane
Annotations: kubeadm.kubernetes.io/etcd.advertise-client-urls: https://192.168.174.132:2379
kubernetes.io/config.hash: 70f631342af8c34c78512d527c75a699
kubernetes.io/config.mirror: 70f631342af8c34c78512d527c75a699
kubernetes.io/config.seen: 2022-05-05T23:22:48.609562827+08:00
kubernetes.io/config.source: file
Status: Running
IP: 192.168.174.132
IPs:
IP: 192.168.174.132
Controlled By: Node/node1
Containers:
etcd:
Container ID: docker://f79ad76afed1a8a5a2af2dcaaf503882b466d263ad2088dfc3a0ad1fbb91920b
Image: registry.aliyuncs.com/k8sxio/etcd:3.4.3-0
Image ID: docker-pullable://registry.aliyuncs.com/k8sxio/etcd@sha256:4afb99b4690b418ffc2ceb67e1a17376457e441c1f09ab55447f0aaf992fa646
Port: <none>
Host Port: <none>
Command:
etcd
--advertise-client-urls=https://192.168.174.132:2379
--cert-file=/etc/kubernetes/pki/etcd/server.crt
--client-cert-auth=true
--data-dir=/var/lib/etcd
--initial-advertise-peer-urls=https://192.168.174.132:2380
--initial-cluster=node1=https://192.168.174.132:2380
--key-file=/etc/kubernetes/pki/etcd/server.key
--listen-client-urls=https://127.0.0.1:2379,https://192.168.174.132:2379
--listen-metrics-urls=http://127.0.0.1:2381
--listen-peer-urls=https://192.168.174.132:2380
--name=node1
--peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt
--peer-client-cert-auth=true
--peer-key-file=/etc/kubernetes/pki/etcd/peer.key
--peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
--snapshot-count=10000
--trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
State: Running
Started: Thu, 03 May 2021 23:22:40 +0800
Ready: True
Restart Count: 0
Liveness: http-get http://127.0.0.1:2381/health delay=15s timeout=15s period=10s #success=1 #failure=8
Environment: <none>
Mounts:
/etc/kubernetes/pki/etcd from etcd-certs (rw)
/var/lib/etcd from etcd-data (rw)
Conditions:
Type Status
Initialized True
Ready True
ContainersReady True
PodScheduled True
Volumes:
etcd-certs:
Type: HostPath (bare host directory volume)
Path: /etc/kubernetes/pki/etcd
HostPathType: DirectoryOrCreate
etcd-data:
Type: HostPath (bare host directory volume)
Path: /var/lib/etcd
HostPathType: DirectoryOrCreate
QoS Class: BestEffort
Node-Selectors: <none>
Tolerations: :NoExecute
Events: <none>
Check the etcd member status
[root@node1 ~]# kubectl exec -n kube-system etcd-node1 -- etcdctl --endpoints="https://127.0.0.1:2379" --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key member list -w table
+------------------+---------+-------+------------------------------+------------------------------+------------+
| ID | STATUS | NAME | PEER ADDRS | CLIENT ADDRS | IS LEARNER |
+------------------+---------+-------+------------------------------+------------------------------+------------+
| d8ad19a694bd9821 | started | node1 | https://192.168.174.132:2380 | https://192.168.174.132:2379 | false |
+------------------+---------+-------+------------------------------+------------------------------+------------+
Create an etcdctl alias
# Add this to your user's .bash_profile yourself to make it permanent; omitted here
[root@node1 ~]# alias etcdctl='kubectl exec -n kube-system etcd-node1 -- etcdctl --endpoints="https://127.0.0.1:2379" --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key'
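With the alias in place, a quick health check is also handy (an extra sanity check, not part of the original steps; output omitted):
# Confirm the single member is healthy and inspect its status.
etcdctl endpoint health
etcdctl endpoint status -w table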
Inspect the keys
[root@node1 ~]# etcdctl get / --prefix=true --keys-only | head -10
/registry/apiextensions.k8s.io/customresourcedefinitions/bgpconfigurations.crd.projectcalico.org
/registry/apiextensions.k8s.io/customresourcedefinitions/bgppeers.crd.projectcalico.org
/registry/apiextensions.k8s.io/customresourcedefinitions/blockaffinities.crd.projectcalico.org
/registry/apiextensions.k8s.io/customresourcedefinitions/clusterinformations.crd.projectcalico.org
/registry/apiextensions.k8s.io/customresourcedefinitions/felixconfigurations.crd.projectcalico.org
[root@node1 ~]# etcdctl get /registry/pods/kube-system --prefix=true --keys-only | head -10
/registry/pods/kube-system/calico-kube-controllers-5b8b769fcd-wvgnb
/registry/pods/kube-system/calico-node-mjlzh
/registry/pods/kube-system/coredns-66db54ff7f-4p496
/registry/pods/kube-system/coredns-66db54ff7f-f27tw
/registry/pods/kube-system/etcd-node1
Create an etcd snapshot backup
# Create the backup directory
[root@node1 ~]# mkdir /var/lib/etcd/snapshot
[root@node1 ~]# chmod 700 /var/lib/etcd/snapshot
[root@node1 ~]# ll /var/lib/etcd/
总用量 0
drwx------ 4 root root 29 3月 5 23:22 member
drwx------ 2 root root 6 3月 5 23:45 snapshot
# Take a backup manually
[root@node1 ~]# etcdctl snapshot save /var/lib/etcd/snapshot/$(hostname)_$(date +%Y_%m_%d-%H_%M_%S).db
{"level":"info","ts":1651766209.5003688,"caller":"snapshot/v3_snapshot.go:110","msg":"created temporary db file","path":"/var/lib/etcd/snapshot/node1_2021_03_05-23_56_49.db.part"}
{"level":"warn","ts":"2021-03-05T15:56:49.510Z","caller":"clientv3/retry_interceptor.go:116","msg":"retry stream intercept"}
{"level":"info","ts":1651766209.5106444,"caller":"snapshot/v3_snapshot.go:121","msg":"fetching snapshot","endpoint":"https://127.0.0.1:2379"}
Snapshot saved at /var/lib/etcd/snapshot/node1_2021_03_05-23_56_49.db
{"level":"info","ts":1651766209.5522013,"caller":"snapshot/v3_snapshot.go:134","msg":"fetched snapshot","endpoint":"https://127.0.0.1:2379","took":0.05168735}
{"level":"info","ts":1651766209.5523207,"caller":"snapshot/v3_snapshot.go:143","msg":"saved","path":"/var/lib/etcd/snapshot/node1_2021_03_05-23_56_49.db"}
Create the scheduled backup script
[root@node1 ~]# cat >/usr/local/scripts/etcd_backup.sh<<'EOF'
#!/bin/bash
source ~/.bashrc
etcdbkdir=/var/lib/etcd/snapshot
kubectl exec -n kube-system etcd-node1 -- etcdctl --endpoints="https://127.0.0.1:2379" --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key snapshot save /var/lib/etcd/snapshot/$(hostname)_$(date +%Y_%m_%d-%H_%M_%S).db
# Keep only the 5 newest snapshots: ls -lt prints a "total" header line, so NR>6 skips that line plus the 5 most recent files.
for i in `ls -lt ${etcdbkdir} | awk 'NR>6 {print $9}'`;do rm -rf ${etcdbkdir}/$i;done
EOF
[root@node1 ~]# chmod +x /usr/local/scripts/etcd_backup.sh
[root@node1 ~]# crontab -l
# Set the interval to whatever suits you; I'm only tinkering temporarily and shut the VM down before long, so my backup interval is very short
*/5 * * * * /bin/bash /usr/local/scripts/etcd_backup.sh
[root@node1 ~]# ll /var/lib/etcd/snapshot/
总用量 12880
-rw------- 1 root root 2633760 3月 6 00:20 node1_2021_03_06-00_20_01.db
-rw------- 1 root root 2633760 3月 6 00:25 node1_2021_03_06-00_25_01.db
-rw------- 1 root root 2633760 3月 6 00:30 node1_2021_03_06-00_30_01.db
-rw------- 1 root root 2633760 3月 6 00:35 node1_2021_03_06-00_35_01.db
-rw------- 1 root root 2633760 3月 6 00:40 node1_2021_03_06-00_40_02.db
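To cover the "remote" half of the backup idea, the newest snapshot can also be shipped off the node, so losing the host does not lose the backups with it. A minimal sketch (the backup user, host, and target directory are hypothetical):
# Copy the most recent snapshot to another machine.
latest=$(ls -t /var/lib/etcd/snapshot/*.db | head -1)
scp "${latest}" backup@192.168.174.200:/data/etcd-backups/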