
k8s

k8s installation

  1. Configure system parameters

    # Disable swap
    swapoff -a
    sed -i 's/.*swap/#&/' /etc/fstab

    # Kernel parameters
    cat <<EOF > /etc/sysctl.d/k8s.conf
    net.bridge.bridge-nf-call-ip6tables = 1
    net.bridge.bridge-nf-call-iptables = 1
    net.ipv4.ip_nonlocal_bind = 1
    net.ipv4.ip_forward = 1
    vm.swappiness = 0
    vm.max_map_count = 262144
    net.netfilter.nf_conntrack_max = 1000000
    EOF

    modprobe br_netfilter
    sysctl -p /etc/sysctl.d/k8s.conf
    echo "* soft nofile 65536" >> /etc/security/limits.conf
    echo "* hard nofile 65536" >> /etc/security/limits.conf
  2. Install Docker

    • Script install

      curl -fsSL https://get.docker.com/ | sh -s -- --mirror Aliyun
    • Install via yum

      yum -y install yum-utils
      yum-config-manager --add-repo https://mirrors.aliyun.com/docker-ce/linux/centos/docker-ce.repo
      yum -y install docker-ce
    # Set the default Docker registry mirror
    cat <<EOF > /etc/docker/daemon.json
    {
    "registry-mirrors": ["https://fl791z1h.mirror.aliyuncs.com"]
    }
    EOF
  3. Add the Aliyun Kubernetes yum repository

    cat <<EOF > /etc/yum.repos.d/kubernetes.repo
    [kubernetes]
    name=Kubernetes
    baseurl=https://mirrors.aliyun.com/kubernetes/yum/repos/kubernetes-el7-x86_64/
    enabled=1
    gpgcheck=1
    repo_gpgcheck=1
    gpgkey=https://mirrors.aliyun.com/kubernetes/yum/doc/yum-key.gpg https://mirrors.aliyun.com/kubernetes/yum/doc/rpm-package-key.gpg
    EOF
  4. Install k8s

    • Install the tools

      yum install -y kubectl kubelet kubeadm
    • Initialize the cluster

      --service-cidr: Service IP range

      --pod-network-cidr: Pod IP range

      kubeadm init --kubernetes-version=1.18.0  --apiserver-advertise-address=172.16.7.14 --image-repository registry.aliyuncs.com/google_containers --service-cidr=10.10.0.0/16 --pod-network-cidr=10.244.0.0/16

      Once init completes, see the kubeconfig setup sketch after this list.
  5. Install a network plugin

    • Install the flannel network plugin

      kubectl apply -f https://raw.githubusercontent.com/coreos/flannel/master/Documentation/kube-flannel.yml
    • Install the calico network plugin

      Download the calico.yaml that uses etcd as the datastore

      curl https://docs.projectcalico.org/manifests/calico-etcd.yaml -o calico.yaml

      Modify the data section of the kind: Secret

      etcd-key: null    -->  etcd-key: # replace with the output of: cat /etc/kubernetes/pki/etcd/server.key | base64 -w 0
      etcd-cert: null   -->  etcd-cert: # replace with the output of: cat /etc/kubernetes/pki/etcd/server.crt | base64 -w 0
      etcd-ca: null     -->  etcd-ca: # replace with the output of: cat /etc/kubernetes/pki/etcd/ca.crt | base64 -w 0

      Modify the data section of the kind: ConfigMap

      etcd_endpoints: "http://<ETCD_IP>:<ETCD_PORT>"  -->  etcd_endpoints: "https://172.16.7.14:2379"
      etcd_ca: "" # "/calico-secrets/etcd-ca" --> etcd_ca: "/calico-secrets/etcd-ca"
      etcd_cert: "" # "/calico-secrets/etcd-cert" --> etcd_cert: "/calico-secrets/etcd-cert"
      etcd_key: "" # "/calico-secrets/etcd-key" --> etcd_key: "/calico-secrets/etcd-key"

      Configure IP autodetection for the Calico nodes so that the correct IP address is used for routing; set interface= to the actual physical NIC.

      Without this setting, the master's calico-node can stay in a NotReady state and log errors such as "Connect Socket: Connection reset by peer" and "bird: BGP: Unexpected connect from unknown address".

      kubectl set env daemonset/calico-node -n kube-system IP_AUTODETECTION_METHOD=interface=eth.*
  6. Install the dashboard

    Install

    kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.0.0/aio/deploy/recommended.yaml

    Edit the Service: find type, change ClusterIP to NodePort, and set a nodePort

    kubectl edit svc kubernetes-dashboard -n kubernetes-dashboard

    Grant permissions

    kubectl create clusterrolebinding serviceaccount-cluster-admin --clusterrole=cluster-admin --user=system:serviceaccount:kubernetes-dashboard:kubernetes-dashboard

    Get the token for logging in to the web UI

    kubectl describe secrets -n kubernetes-dashboard $(kubectl -n kubernetes-dashboard get secret|grep kubernetes-dashboard-token|awk '{print $1}')| grep token | awk 'NR==3{print $2}'
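
Kubeconfig setup after kubeadm init (referenced from step 4): a minimal sketch using the default admin.conf path that kubeadm prints at the end of init.

mkdir -p $HOME/.kube
cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
chown $(id -u):$(id -g) $HOME/.kube/config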

Enable IPVS mode for the cluster:

# Edit config.conf in the kube-system/kube-proxy ConfigMap and set mode: "ipvs"
kubectl edit cm kube-proxy -n kube-system

# Restart the kube-proxy pods on every node
kubectl get pod -n kube-system | grep kube-proxy | awk '{system("kubectl delete pod "$1" -n kube-system")}'
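
A quick way to confirm IPVS is active (a sketch; assumes ipvsadm is installed on the node and that the kube-proxy pods carry the usual k8s-app=kube-proxy label):

# kube-proxy logs should mention the ipvs proxier after the restart
kubectl logs -n kube-system -l k8s-app=kube-proxy --tail=20 | grep -i ipvs
# IPVS virtual servers should now be visible on the node
ipvsadm -Ln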

Enable a VIP (with multiple masters, run this on each one):

# Define the VIP; it must be an unused IP on the host network
export VIP=172.16.7.18
# Define the network interface (the host's current NIC)
export INTERFACE=eth0
ctr image pull ghcr.io/kube-vip/kube-vip:v0.4.0
ctr run --rm --net-host ghcr.io/kube-vip/kube-vip:v0.4.0 vip /kube-vip manifest pod \
--interface $INTERFACE \
--vip $VIP \
--controlplane \
--services \
--arp \
--leaderElection | tee /etc/kubernetes/manifests/kube-vip.yaml
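
To check that the static pod came up and the VIP is bound (a sketch; the kube-vip pod name and the kube-system namespace follow the defaults of the generated static-pod manifest):

kubectl get pods -n kube-system | grep kube-vip
ip addr show $INTERFACE | grep $VIP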

Install NFS as a StorageClass:

git clone https://github.com/kubernetes-sigs/nfs-subdir-external-provisioner
# Edit deploy/deployment.yaml: set the server address and path to the actual NFS server and export directory
cd nfs-subdir-external-provisioner && kubectl apply -f deploy/deployment.yaml -f deploy/rbac.yaml -f deploy/class.yaml

# Make the NFS StorageClass the default
kubectl patch storageclass managed-nfs-storage -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
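
To verify the default StorageClass, a throwaway PVC can be created without naming a storageClassName; it should get bound by the provisioner. A sketch (the PVC name nfs-test-pvc is arbitrary):

cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: nfs-test-pvc
spec:
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 1Gi
EOF
kubectl get pvc nfs-test-pvc   # should reach Bound via managed-nfs-storage
kubectl delete pvc nfs-test-pvc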

Change a node's ROLES:

kubectl label --overwrite nodes nodename kubernetes.io/role=node1
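
The ROLES column is also driven by node-role.kubernetes.io/* labels; a sketch of setting and removing one that way (the role name worker is just an example):

kubectl label nodes nodename node-role.kubernetes.io/worker=""
# a trailing dash removes the label again
kubectl label nodes nodename node-role.kubernetes.io/worker-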

Bump the revision annotation to trigger a rolling update:

revision=$(kubectl get deploy test01-app1 -o json | jq -r '.metadata.annotations."deployment.kubernetes.io/revision" | tonumber + 1')

kubectl patch deployment test01-app1 -p "{\"spec\":{\"template\":{\"metadata\":{\"annotations\":{\"deployment.kubernetes.io/revision\":\"$revision\"}}}}}"
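
To watch the rollout the patch triggers (assuming the same deployment name as above):

kubectl rollout status deployment/test01-app1
kubectl rollout history deployment/test01-app1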

Rolling restart (kubectl 1.15+):

kubectl  rollout restart deploy myapp-deploy -n ops

If ingress-nginx forwards the URL path prefix to the backend, add the following annotation to the Ingress:

annotations:
  nginx.ingress.kubernetes.io/rewrite-target: /
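
For context, a minimal Ingress sketch with that annotation (name, host, and backend are made up; the networking.k8s.io/v1beta1 API matches the 1.18-era cluster used here, newer clusters use networking.k8s.io/v1):

cat <<EOF | kubectl apply -f -
apiVersion: networking.k8s.io/v1beta1
kind: Ingress
metadata:
  name: myapp-ingress
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /
spec:
  rules:
  - host: myapp.example.com
    http:
      paths:
      - path: /myapp
        backend:
          serviceName: myapp-svc
          servicePort: 80
EOF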

Join a new node to the cluster:

kubeadm token create --print-join-command

Export the current cluster configuration:

kubeadm config view > k8s.yaml

Renew the certificates:

kubeadm alpha certs renew all --config=k8s.yaml
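
To see which certificates actually need renewing (on kubeadm of this era the subcommand lives under alpha; from 1.20 it is kubeadm certs check-expiration):

kubeadm alpha certs check-expiration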

Delete a namespace stuck in Terminating:

# Remove the "- finalizers.kubesphere.io/namespaces" entry
kubectl edit namespace myns

# Or patch it directly
kubectl patch ns/myns -p '{"metadata":{"finalizers":[]}}' --type=merge
kubectl patch crd/helmcategories.application.kubesphere.io -p '{"metadata":{"finalizers":[]}}' --type=merge
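
If the namespace still will not go away, another commonly used option is to clear spec.finalizers through the namespace finalize subresource (a sketch; assumes jq is installed and reuses the myns example):

kubectl get ns myns -o json | jq '.spec.finalizers=[]' \
  | kubectl replace --raw "/api/v1/namespaces/myns/finalize" -f -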

Regenerate all files under /etc/kubernetes:

kubeadm init phase certs all --config k8s.yml
kubeadm init phase kubeconfig all --config k8s.yml
kubeadm init phase control-plane all --config k8s.yml
kubeadm init phase etcd local --config k8s.yml

# Update the cluster-info configuration
kubeadm init phase bootstrap-token
# Restart the control plane components
docker ps |grep -E 'k8s_kube-apiserver|k8s_kube-controller-manager|k8s_kube-scheduler|k8s_etcd_etcd' | awk '{print $1}'|xargs docker restart
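
On nodes that run containerd instead of Docker, a rough equivalent of the restart above using crictl (a sketch; the kubelet recreates the static-pod containers after they are stopped):

crictl ps --name 'kube-apiserver|kube-controller-manager|kube-scheduler|etcd' -q | xargs -r crictl stop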

Repair the kubelet configuration:

systemctl stop kubelet
rm -rf /var/lib/kubelet/pki/ /etc/kubernetes/kubelet.conf
kubeadm init phase kubeconfig kubelet --config k8s.yml
kubeadm init phase kubelet-start --config k8s.yml

etcd fails to start (after a node crash):

vim /etc/kubernetes/manifests/etcd.yaml
# Add the flag below to override the old cluster membership; remove it once etcd starts normally
--force-new-cluster

Check the etcd cluster status

etcdctl --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key endpoint status --cluster -w table

KubeSphere installation:

# Install the Helm client
wget https://get.helm.sh/helm-v2.14.3-linux-amd64.tar.gz
tar -zxvf helm-v2.14.3-linux-amd64.tar.gz
mv linux-amd64/helm /usr/local/bin/


# Install Tiller (the server side)
helm init --upgrade --tiller-image registry.cn-hangzhou.aliyuncs.com/google_containers/tiller:v2.14.3 --stable-repo-url https://kubernetes.oss-cn-hangzhou.aliyuncs.com/charts
kubectl create serviceaccount --namespace kube-system tiller
kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
kubectl patch deploy --namespace kube-system tiller-deploy -p '{"spec":{"template":{"spec":{"serviceAccount":"tiller"}}}}'
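
A quick check that Tiller is up and reachable (a sketch; the label selector matches the labels Helm 2 puts on the tiller-deploy pod):

kubectl get pods -n kube-system -l app=helm,name=tiller
helm version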

etcd

Configure an etcdctl alias:

alias etcdctl='etcdctl --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key'
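
Older etcdctl builds default to the v2 API; if the commands below complain about unknown flags, force v3 first (etcd 3.4+ already defaults to v3):

export ETCDCTL_API=3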

Check cluster status:

etcdctl endpoint status --cluster -w table

List all keys:

etcdctl get / --prefix --keys-only

Create a snapshot backup:

etcdctl snapshot save snapshot.db
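
A slightly more practical variant with a timestamped file and an integrity check (a sketch; the /root path just mirrors the restore examples below):

SNAP=/root/snapshot-$(date +%Y%m%d-%H%M%S).db
etcdctl snapshot save $SNAP
etcdctl snapshot status $SNAP -w table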

Restore from a backup:

  • Single master

    etcdctl snapshot restore /root/snapshot.db --data-dir=/var/lib/etcd
  • Multiple masters

    Copy the backup file snapshot.db to every master.

    Run the restore on each master: --name is that master's hostname and --initial-advertise-peer-urls is that master's etcd peer URL. A pre-restore sketch follows this list.

    #master1
    etcdctl snapshot restore /root/snapshot.db --name master1 --initial-cluster master1=https://192.168.200.201:2380,master2=https://192.168.200.202:2380,master3=https://192.168.200.203:2380 --initial-advertise-peer-urls https://192.168.200.201:2380 --data-dir /var/lib/etcd


    #master2
    etcdctl snapshot restore /root/snapshot.db --name master2 --initial-cluster master1=https://192.168.200.201:2380,master2=https://192.168.200.202:2380,master3=https://192.168.200.203:2380 --initial-advertise-peer-urls https://192.168.200.202:2380 --data-dir /var/lib/etcd


    #master3
    etcdctl snapshot restore snapshot.db --name master3 --initial-cluster master1=https://192.168.200.201:2380,master2=https://192.168.200.202:2380,master3=https://192.168.200.203:2380 --initial-advertise-peer-urls https://192.168.200.203:2380 --data-dir /var/lib/etcd
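
Pre-restore sketch (referenced above): on a kubeadm cluster, stop the etcd static pod and move the old data dir aside before restoring; this assumes the default manifest and data paths.

# stop the etcd static pod by moving its manifest away
mv /etc/kubernetes/manifests/etcd.yaml /tmp/
# keep the old data dir as a fallback
mv /var/lib/etcd /var/lib/etcd.bak
# ...run the etcdctl snapshot restore command for this master...
# put the manifest back so kubelet starts etcd on the restored data
mv /tmp/etcd.yaml /etc/kubernetes/manifests/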

k8s cluster repair

On a master node, this directory (/etc/kubernetes) contains:

  • a set of certificates and the CA for etcd (under /etc/kubernetes/pki/etcd)
  • a set of Kubernetes certificates and the CA (under /etc/kubernetes/pki)
  • the kubeconfig files used by kube-controller-manager, kube-scheduler, cluster-admin, and the kubelet
  • the static Pod manifests for etcd, kube-apiserver, kube-scheduler, and kube-controller-manager (under /etc/kubernetes/manifests)
  1. Export the current cluster configuration

    If the cluster is no longer accessible and the configuration cannot be exported, then when you later run the kubeadm commands that generate certificates and configs, pass --apiserver-advertise-address, --service-cidr, and the other parameters matching the current cluster by hand, or write k8s.yml manually.

    kubeadm config view > k8s.yml

    apiServer:
      extraArgs:
        authorization-mode: Node,RBAC
      timeoutForControlPlane: 4m0s
    apiVersion: kubeadm.k8s.io/v1beta2
    certificatesDir: /etc/kubernetes/pki
    clusterName: kubernetes
    controllerManager: {}
    dns:
      type: CoreDNS
    etcd:
      local:
        dataDir: /var/lib/etcd
    imageRepository: registry.aliyuncs.com/google_containers
    kind: ClusterConfiguration
    kubernetesVersion: v1.18.12
    networking:
      dnsDomain: cluster.local
      podSubnet: 10.244.0.0/16
      serviceSubnet: 10.10.0.0/16
    scheduler: {}
  2. Delete all old configuration files and certificates on the master

    rm -rf /etc/kubernetes
  3. Repair the control plane components

    Run all of the following on one of the master nodes.

    Generate all of the Kubernetes SSL certificates, plus the static Pod manifests and kubeconfig files for the Kubernetes services.

    kubeadm init phase certs all --config k8s.yml
    kubeadm init phase kubeconfig all --config k8s.yml
    kubeadm init phase control-plane all --config k8s.yml
    kubeadm init phase etcd local --config k8s.yml

    cp -f /etc/kubernetes/admin.conf ~/.kube/config

    Restart all of the old control plane components.

    docker ps |grep -E 'k8s_kube-apiserver|k8s_kube-controller-manager|k8s_kube-scheduler|k8s_etcd_etcd' | awk  '{print $1}'|xargs docker restart

    If you join kubelets with kubeadm, you also need to update the cluster-info configuration in the kube-public namespace, since it still contains the hash of your old CA.

    kubeadm init phase bootstrap-token

    Since the certificates on the other master nodes must be signed by the same single CA, copy the files below to the other masters manually and repeat the commands above on each of them.

    scp /etc/kubernetes/pki/{ca,front-proxy-ca}.{key,crt} master2
    scp /etc/kubernetes/pki/sa.{key,pub} master2

    As an alternative to copying the certificates by hand, you can also use the Kubernetes API, for example with the commands below:

    kubeadm init phase upload-certs --upload-certs

    kubeadm token create --print-join-command

    These commands encrypt the certificates and upload them to Kubernetes for 2 hours; you can then register master nodes on the other masters as follows:

    # Refresh the certificates on an existing master node
    kubeadm join phase control-plane-prepare all kubernetes-apiserver:6443 --control-plane --token cs0etm.ua7fbmwuf1jz946l --discovery-token-ca-cert-hash sha256:555f6ececd4721fed0269d27a5c7f1c6d7ef4614157a18e56ed9a1fd031a3ab8 --certificate-key 385655ee0ab98d2441ba8038b4e8d03184df1806733eac131511891d1096be73


    # Join a new master node
    kubeadm join kubernetes-apiserver:6443 --control-plane --token cs0etm.ua7fbmwuf1jz946l --discovery-token-ca-cert-hash sha256:555f6ececd4721fed0269d27a5c7f1c6d7ef4614157a18e56ed9a1fd031a3ab8 --certificate-key 385655ee0ab98d2441ba8038b4e8d03184df1806733eac131511891d1096be73

    Note that the Kubernetes API also has a configuration that holds the CA certificate for the front-proxy client, used to verify requests from the apiserver to webhooks and aggregation-layer services; kube-apiserver updates it automatically, though.

  4. Repair the worker nodes

    Now we can list all of the cluster's nodes with the following command (master):

    kubectl get node

    At this point all the nodes are of course NotReady, because they are still using the old certificates. To fix this, we use kubeadm to rejoin the nodes to the cluster (master).

    systemctl stop kubelet
    rm -rf /var/lib/kubelet/pki/ /etc/kubernetes/kubelet.conf
    kubeadm init phase kubeconfig kubelet --config k8s.yml
    kubeadm init phase kubelet-start --config k8s.yml

    To join the worker nodes, however, we first have to generate a new token (master).

    kubeadm token create --print-join-command

    Then run the following commands on each worker node (node):

    systemctl stop kubelet
    rm -rf /var/lib/kubelet/pki/ /etc/kubernetes/pki/ /etc/kubernetes/kubelet.conf
    kubeadm join phase kubelet-start kubernetes-apiserver:6443 --token cs0etm.ua7fbmwuf1jz946l --discovery-token-ca-cert-hash sha256:555f6ececd4721fed0269d27a5c7f1c6d7ef4614157a18e56ed9a1fd031a3ab8

    You do not need to delete the /etc/kubernetes/pki directory on the master node, because it already contains all of the required certificates.

    The steps above rejoin all of your kubelets to the cluster without affecting any containers already running on them. However, if the cluster has several nodes and you do not rejoin them at the same time, kube-controller-manager may start recreating the containers from the NotReady nodes and rescheduling them onto the active nodes.

    To prevent this, you can temporarily stop controller-manager on the master nodes (optional).

    rm /etc/kubernetes/manifests/kube-controller-manager.yaml
    crictl rmp $(crictl ps --name kube-controller-manager -q)

    Once every node has been joined, you can regenerate a static manifest for controller-manager by running the following on all master nodes (optional).

    kubeadm init phase control-plane controller-manager --config k8s.yml

    If the kubelets are configured to request certificates signed by your CA (option serverTLSBootstrap: true), you also need to approve the CSRs from the kubelets (optional):

    kubectl get csr
    kubectl certificate approve <csr>
  5. Update the etcd certificates in calico (when calico uses etcd as its datastore)

    # Source: calico/templates/calico-etcd-secrets.yaml
    # The following contains k8s Secrets for use with a TLS enabled etcd cluster.
    # For information on populating Secrets, see http://kubernetes.io/docs/user-guide/secrets/
    apiVersion: v1
    kind: Secret
    type: Opaque
    metadata:
      name: calico-etcd-secrets
      namespace: kube-system
    data:
      # Populate the following with etcd TLS configuration if desired, but leave blank if
      # not using TLS for etcd.
      # The keys below should be uncommented and the values populated with the base64
      # encoded contents of each file that would be associated with the TLS data.
      # Example command for encoding a file contents: cat <file> | base64 -w 0
      etcd-key: null    -->  etcd-key: # replace with the output of: cat /etc/kubernetes/pki/etcd/server.key | base64 -w 0
      etcd-cert: null   -->  etcd-cert: # replace with the output of: cat /etc/kubernetes/pki/etcd/server.crt | base64 -w 0
      etcd-ca: null     -->  etcd-ca: # replace with the output of: cat /etc/kubernetes/pki/etcd/ca.crt | base64 -w 0
    # kubectl edit secret calico-etcd-secrets -n kube-system

    kubectl replace -f calico-etcd-secrets.yml
  6. Repair the ServiceAccounts

    Because we lost /etc/kubernetes/pki/sa.key, the key used to sign the JWT tokens of every ServiceAccount in the cluster, we have to recreate the tokens for each of them.

    This can be done by deleting the token field from every Secret of type kubernetes.io/service-account-token.

    kubectl get secret --all-namespaces | awk '/kubernetes.io\/service-account-token/ { print "kubectl patch secret -n " $1 " " $2 " -p {\\\"data\\\":{\\\"token\\\":null}}"}' | sh -x

    After they are deleted, kube-controller-manager automatically generates new tokens signed with the new key. Note, however, that not all microservices pick up new tokens on the fly, so you will most likely need to restart the containers that use them manually (restart the affected pods as the situation requires).

    kubectl get pod --field-selector 'spec.serviceAccountName!=default' --no-headers -n kube-system | awk '{print "kubectl delete pod -n kube-system " $1 " --wait=false --grace-period=0"}'

    For example, this command produces a list of delete commands for every Pod that uses a non-default ServiceAccount. I recommend starting with the kube-system namespace, since kube-proxy and the CNI plugin are installed there and they are essential for the communication between your microservices (see the sketch after this list for running it across all namespaces).

    At this point the cluster is fully recovered.
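
Referenced from step 6 above: once the generated list has been reviewed, the same idea can be run across every namespace by switching to -A (so $1 is the namespace and $2 the pod name) and piping straight to the shell; a sketch:

kubectl get pod --field-selector 'spec.serviceAccountName!=default' --no-headers -A \
  | awk '{print "kubectl delete pod -n " $1 " " $2 " --wait=false --grace-period=0"}' | sh -x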


k8s custom hosts

coredns

kubectl edit cm -n kube-system coredns


# Add a hosts block (fallthrough must be present)

apiVersion: v1
data:
  Corefile: |
    .:53 {
        errors
        health {
           lameduck 5s
        }
        ready
        kubernetes cluster.local in-addr.arpa ip6.arpa {
           pods insecure
           fallthrough in-addr.arpa ip6.arpa
           ttl 30
        }
        hosts {
           10.100.1.71 node1 hadoop01
           10.100.1.72 node2 hadoop02
           10.100.1.73 node3 hadoop03

           fallthrough
        }
        prometheus :9153
        forward . /etc/resolv.conf
        cache 30
        loop
        reload
        loadbalance
    }
kind: ConfigMap
metadata:
  creationTimestamp: "2020-12-03T09:28:23Z"
  managedFields:
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:data: {}
    manager: kubeadm
    operation: Update
    time: "2020-12-03T09:28:23Z"
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:data:
        f:Corefile: {}
    manager: kubectl
    operation: Update
    time: "2021-10-12T02:40:57Z"
  name: coredns
  namespace: kube-system
  resourceVersion: "111183417"
  selfLink: /api/v1/namespaces/kube-system/configmaps/coredns
  uid: 337c1a28-cfe5-454d-8cbc-b6e15e64470d

hostAliases

apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-deployment
  labels:
    app: nginx
spec:
  replicas: 3
  selector:
    matchLabels:
      app: nginx
  template:
    metadata:
      labels:
        app: nginx
    spec:
      hostAliases:
      - ip: 127.0.0.1
        hostnames:
        - foo.local
        - bar.local
      - ip: 10.1.2.3
        hostnames:
        - foo.remote
        - bar.remote
      containers:
      - name: nginx
        image: nginx:1.14.2
        ports:
        - containerPort: 80

Set the timezone in k8s

Via an environment variable:

apiVersion: v1
kind: Pod
metadata:
  name: pod-env-tz
spec:
  containers:
  - name: ngx
    image: nginx:latest
    imagePullPolicy: IfNotPresent
    env:
    - name: TZ
      value: Asia/Shanghai

By mounting the host's timezone file:

apiVersion: v1
kind: Pod
metadata:
  name: pod-vol-tz
spec:
  containers:
  - name: ngx
    image: nginx:latest
    imagePullPolicy: IfNotPresent
    volumeMounts:
    - name: tz-config
      mountPath: /etc/localtime
      readOnly: true
  volumes:
  - name: tz-config
    hostPath:
      path: /etc/localtime

Reference: https://developer.aliyun.com/article/637809#slide-0

Clean up errored pods in k8s

kubectl get pods -A | awk '/Evicted/{print $1,$2}' | xargs -r -n2 kubectl delete pod -n
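
Evicted pods sit in phase Failed, so a field selector can clean them up in one shot; note this removes every Failed pod, not only evicted ones (a sketch):

kubectl delete pods -A --field-selector=status.phase=Failed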