From 039205560a5a38dac7e180ec4c46ce4edd404d39 Mon Sep 17 00:00:00 2001 From: Cristian Calin <6627509+cristicalin@users.noreply.github.com> Date: Tue, 9 Nov 2021 19:57:47 +0200 Subject: [PATCH] nodelocaldns: allow a secondary pod for nodelocaldns for local-HA (#8100) * nodelocaldns: allow a secondary pod for nodelocaldns for local-HA * CI: add job to test nodelocaldns secondary --- .gitlab-ci/packet.yml | 5 + docs/dns-stack.md | 16 +++ .../group_vars/k8s_cluster/k8s-cluster.yml | 3 + roles/download/defaults/main.yml | 2 +- .../kubernetes-apps/ansible/defaults/main.yml | 2 + roles/kubernetes-apps/ansible/tasks/main.yml | 1 + .../ansible/tasks/nodelocaldns.yml | 28 +++++ .../templates/nodelocaldns-config.yml.j2 | 88 ++++++++++++++- .../templates/nodelocaldns-daemonset.yml.j2 | 32 ++++-- .../nodelocaldns-second-daemonset.yml.j2 | 103 ++++++++++++++++++ roles/kubespray-defaults/defaults/main.yaml | 3 + ..._centos8-calico-nodelocaldns-secondary.yml | 15 +++ 12 files changed, 281 insertions(+), 17 deletions(-) create mode 100644 roles/kubernetes-apps/ansible/templates/nodelocaldns-second-daemonset.yml.j2 create mode 100644 tests/files/packet_centos8-calico-nodelocaldns-secondary.yml diff --git a/.gitlab-ci/packet.yml b/.gitlab-ci/packet.yml index 9b432a19a..6e72a4cd8 100644 --- a/.gitlab-ci/packet.yml +++ b/.gitlab-ci/packet.yml @@ -194,6 +194,11 @@ packet_amazon-linux-2-aio: extends: .packet_pr when: manual +packet_centos8-calico-nodelocaldns-secondary: + stage: deploy-part2 + extends: .packet_pr + when: manual + packet_fedora34-kube-ovn-containerd: stage: deploy-part2 extends: .packet_periodic diff --git a/docs/dns-stack.md b/docs/dns-stack.md index 7771c26bb..b6d2064a6 100644 --- a/docs/dns-stack.md +++ b/docs/dns-stack.md @@ -212,6 +212,22 @@ nodelocaldns_external_zones: See [dns_etchosts](#dns_etchosts-coredns) above. 
+### Nodelocal DNS HA + +Under some circumstances the single POD nodelocaldns implementation may not be able to be replaced soon enough and a cluster upgrade or a nodelocaldns upgrade can cause DNS requests to time out for short intervals. If for any reason your applications cannot tolerate this behavior you can enable a redundant nodelocal DNS pod on each node: + +```yaml +enable_nodelocaldns_secondary: true +``` + +**Note:** when the nodelocaldns secondary is enabled, the primary is instructed to no longer tear down the iptables rules it sets up to direct traffic to itself. In case both daemonsets have failing pods on the same node, this can cause a DNS blackout with traffic no longer being forwarded to the coredns central service as a fallback. Please ensure you account for this also if you decide to disable the nodelocaldns cache. + +There is a time delta (in seconds) allowed for the secondary nodelocaldns to survive in case both primary and secondary daemonsets are updated at the same time. It is advised to tune this variable after you have performed some tests in your own environment. 
+ +```yaml +nodelocaldns_secondary_skew_seconds: 5 +``` + ## Limitations * Kubespray has yet ways to configure Kubedns addon to forward requests SkyDns can diff --git a/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml b/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml index 4248832eb..dbd66d3dd 100644 --- a/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml +++ b/inventory/sample/group_vars/k8s_cluster/k8s-cluster.yml @@ -166,9 +166,12 @@ dns_mode: coredns # manual_dns_server: 10.x.x.x # Enable nodelocal dns cache enable_nodelocaldns: true +enable_nodelocaldns_secondary: false nodelocaldns_ip: 169.254.25.10 nodelocaldns_health_port: 9254 +nodelocaldns_second_health_port: 9256 nodelocaldns_bind_metrics_host_ip: false +nodelocaldns_secondary_skew_seconds: 5 # nodelocaldns_external_zones: # - zones: # - example.com diff --git a/roles/download/defaults/main.yml b/roles/download/defaults/main.yml index 8e858bb3a..6d19e2324 100644 --- a/roles/download/defaults/main.yml +++ b/roles/download/defaults/main.yml @@ -610,7 +610,7 @@ coredns_image_is_namespaced: "{{ (kube_version is version('v1.21.0','>=')) or (c coredns_image_repo: "{{ kube_image_repo }}{{'/coredns/coredns' if (coredns_image_is_namespaced | bool) else '/coredns' }}" coredns_image_tag: "{{ coredns_version if (coredns_image_is_namespaced | bool) else (coredns_version | regex_replace('^v', '')) }}" -nodelocaldns_version: "1.17.1" +nodelocaldns_version: "1.21.1" nodelocaldns_image_repo: "{{ kube_image_repo }}/dns/k8s-dns-node-cache" nodelocaldns_image_tag: "{{ nodelocaldns_version }}" diff --git a/roles/kubernetes-apps/ansible/defaults/main.yml b/roles/kubernetes-apps/ansible/defaults/main.yml index 411260551..fa06b2e0d 100644 --- a/roles/kubernetes-apps/ansible/defaults/main.yml +++ b/roles/kubernetes-apps/ansible/defaults/main.yml @@ -17,6 +17,8 @@ nodelocaldns_cpu_requests: 100m nodelocaldns_memory_limit: 170Mi nodelocaldns_memory_requests: 70Mi nodelocaldns_ds_nodeselector: 
"kubernetes.io/os: linux" +nodelocaldns_prometheus_port: 9253 +nodelocaldns_secondary_prometheus_port: 9255 # Limits for dns-autoscaler dns_autoscaler_cpu_requests: 20m diff --git a/roles/kubernetes-apps/ansible/tasks/main.yml b/roles/kubernetes-apps/ansible/tasks/main.yml index 75ee477b0..d59f0e0b6 100644 --- a/roles/kubernetes-apps/ansible/tasks/main.yml +++ b/roles/kubernetes-apps/ansible/tasks/main.yml @@ -48,6 +48,7 @@ - "{{ coredns_manifests.results | default({}) }}" - "{{ coredns_secondary_manifests.results | default({}) }}" - "{{ nodelocaldns_manifests.results | default({}) }}" + - "{{ nodelocaldns_second_manifests.results | default({}) }}" when: - dns_mode != 'none' - inventory_hostname == groups['kube_control_plane'][0] diff --git a/roles/kubernetes-apps/ansible/tasks/nodelocaldns.yml b/roles/kubernetes-apps/ansible/tasks/nodelocaldns.yml index ce79ceed4..4809aa9b8 100644 --- a/roles/kubernetes-apps/ansible/tasks/nodelocaldns.yml +++ b/roles/kubernetes-apps/ansible/tasks/nodelocaldns.yml @@ -43,3 +43,31 @@ tags: - nodelocaldns - coredns + +- name: Kubernetes Apps | Lay Down nodelocaldns-secondary Template + template: + src: "{{ item.file }}.j2" + dest: "{{ kube_config_dir }}/{{ item.file }}" + with_items: + - { name: nodelocaldns, file: nodelocaldns-second-daemonset.yml, type: daemonset } + register: nodelocaldns_second_manifests + vars: + forwardTarget: >- + {%- if secondaryclusterIP is defined and dns_mode == 'coredns_dual' -%} + {{ primaryClusterIP }} {{ secondaryclusterIP }} + {%- else -%} + {{ primaryClusterIP }} + {%- endif -%} + upstreamForwardTarget: >- + {%- if resolvconf_mode == 'host_resolvconf' and upstream_dns_servers is defined and upstream_dns_servers|length > 0 -%} + {{ upstream_dns_servers|join(' ') }} + {%- else -%} + /etc/resolv.conf + {%- endif -%} + when: + - enable_nodelocaldns + - enable_nodelocaldns_secondary + - inventory_hostname == groups['kube_control_plane'] | first + tags: + - nodelocaldns + - coredns diff --git 
a/roles/kubernetes-apps/ansible/templates/nodelocaldns-config.yml.j2 b/roles/kubernetes-apps/ansible/templates/nodelocaldns-config.yml.j2 index 18abf8ea3..0244c04a4 100644 --- a/roles/kubernetes-apps/ansible/templates/nodelocaldns-config.yml.j2 +++ b/roles/kubernetes-apps/ansible/templates/nodelocaldns-config.yml.j2 @@ -17,7 +17,7 @@ data: loop bind {{ nodelocaldns_ip }} forward . {{ block['nameservers'] | join(' ') }} - prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253 + prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }} log {% if dns_etchosts | default(None) %} hosts /etc/coredns/hosts { @@ -39,7 +39,7 @@ data: forward . {{ forwardTarget }} { force_tcp } - prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253 + prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }} health {{ nodelocaldns_ip }}:{{ nodelocaldns_health_port }} {% if dns_etchosts | default(None) %} hosts /etc/coredns/hosts { @@ -56,7 +56,7 @@ data: forward . {{ forwardTarget }} { force_tcp } - prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253 + prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }} } ip6.arpa:53 { errors @@ -67,7 +67,7 @@ data: forward . {{ forwardTarget }} { force_tcp } - prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253 + prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }} } .:53 { errors @@ -76,13 +76,91 @@ data: loop bind {{ nodelocaldns_ip }} forward . 
{{ upstreamForwardTarget }} - prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:9253 + prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_prometheus_port }} {% if dns_etchosts | default(None) %} hosts /etc/coredns/hosts { fallthrough } {% endif %} } +{% if enable_nodelocaldns_secondary %} + Corefile-second: | +{% if nodelocaldns_external_zones is defined and nodelocaldns_external_zones|length > 0 %} +{% for block in nodelocaldns_external_zones %} + {{ block['zones'] | join(' ') }} { + errors + cache {{ block['cache'] | default(30) }} + reload + loop + bind {{ nodelocaldns_ip }} + forward . {{ block['nameservers'] | join(' ') }} + prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }} + log +{% if dns_etchosts | default(None) %} + hosts /etc/coredns/hosts { + fallthrough + } +{% endif %} + } +{% endfor %} +{% endif %} + {{ dns_domain }}:53 { + errors + cache { + success 9984 30 + denial 9984 5 + } + reload + loop + bind {{ nodelocaldns_ip }} + forward . {{ forwardTarget }} { + force_tcp + } + prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }} + health {{ nodelocaldns_ip }}:{{ nodelocaldns_second_health_port }} +{% if dns_etchosts | default(None) %} + hosts /etc/coredns/hosts { + fallthrough + } +{% endif %} + } + in-addr.arpa:53 { + errors + cache 30 + reload + loop + bind {{ nodelocaldns_ip }} + forward . {{ forwardTarget }} { + force_tcp + } + prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }} + } + ip6.arpa:53 { + errors + cache 30 + reload + loop + bind {{ nodelocaldns_ip }} + forward . 
{{ forwardTarget }} { + force_tcp + } + prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }} + } + .:53 { + errors + cache 30 + reload + loop + bind {{ nodelocaldns_ip }} + forward . {{ upstreamForwardTarget }} + prometheus {% if nodelocaldns_bind_metrics_host_ip %}{$MY_HOST_IP}{% endif %}:{{ nodelocaldns_secondary_prometheus_port }} +{% if dns_etchosts | default(None) %} + hosts /etc/coredns/hosts { + fallthrough + } +{% endif %} + } +{% endif %} {% if dns_etchosts | default(None) %} hosts: | {{ dns_etchosts | indent(width=4, indentfirst=None) }} diff --git a/roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2 b/roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2 index 7abd28ffa..7c63e28fa 100644 --- a/roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2 +++ b/roles/kubernetes-apps/ansible/templates/nodelocaldns-daemonset.yml.j2 @@ -16,7 +16,7 @@ spec: k8s-app: nodelocaldns annotations: prometheus.io/scrape: 'true' - prometheus.io/port: '9253' + prometheus.io/port: '{{ nodelocaldns_prometheus_port }}' spec: nodeSelector: {{ nodelocaldns_ds_nodeselector }} @@ -38,16 +38,16 @@ spec: requests: cpu: {{ nodelocaldns_cpu_requests }} memory: {{ nodelocaldns_memory_requests }} - args: [ "-localip", "{{ nodelocaldns_ip }}", "-conf", "/etc/coredns/Corefile", "-upstreamsvc", "coredns" ] - securityContext: - privileged: true -{% if nodelocaldns_bind_metrics_host_ip %} - env: - - name: MY_HOST_IP - valueFrom: - fieldRef: - fieldPath: status.hostIP -{% endif %} + args: + - -localip + - {{ nodelocaldns_ip }} + - -conf + - /etc/coredns/Corefile + - -upstreamsvc + - coredns +{% if enable_nodelocaldns_secondary %} + - -skipteardown +{% else %} ports: - containerPort: 53 name: dns @@ -58,6 +58,16 @@ spec: - containerPort: 9253 name: metrics protocol: TCP +{% endif %} + securityContext: + privileged: true +{% if nodelocaldns_bind_metrics_host_ip %} + env: + - 
name: MY_HOST_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP +{% endif %} livenessProbe: httpGet: host: {{ nodelocaldns_ip }} diff --git a/roles/kubernetes-apps/ansible/templates/nodelocaldns-second-daemonset.yml.j2 b/roles/kubernetes-apps/ansible/templates/nodelocaldns-second-daemonset.yml.j2 new file mode 100644 index 000000000..037bf446e --- /dev/null +++ b/roles/kubernetes-apps/ansible/templates/nodelocaldns-second-daemonset.yml.j2 @@ -0,0 +1,103 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nodelocaldns-second + namespace: kube-system + labels: + k8s-app: kube-dns + addonmanager.kubernetes.io/mode: Reconcile +spec: + selector: + matchLabels: + k8s-app: nodelocaldns-second + template: + metadata: + labels: + k8s-app: nodelocaldns-second + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '{{ nodelocaldns_secondary_prometheus_port }}' + spec: + nodeSelector: + {{ nodelocaldns_ds_nodeselector }} + priorityClassName: system-cluster-critical + serviceAccountName: nodelocaldns + hostNetwork: true + dnsPolicy: Default # Don't use cluster DNS. 
+ tolerations: + - effect: NoSchedule + operator: "Exists" + - effect: NoExecute + operator: "Exists" + containers: + - name: node-cache + image: "{{ nodelocaldns_image_repo }}:{{ nodelocaldns_image_tag }}" + resources: + limits: + memory: {{ nodelocaldns_memory_limit }} + requests: + cpu: {{ nodelocaldns_cpu_requests }} + memory: {{ nodelocaldns_memory_requests }} + args: [ "-localip", "{{ nodelocaldns_ip }}", "-conf", "/etc/coredns/Corefile", "-upstreamsvc", "coredns", "-skipteardown" ] + securityContext: + privileged: true +{% if nodelocaldns_bind_metrics_host_ip %} + env: + - name: MY_HOST_IP + valueFrom: + fieldRef: + fieldPath: status.hostIP +{% endif %} + livenessProbe: + httpGet: + host: {{ nodelocaldns_ip }} + path: /health + port: {{ nodelocaldns_second_health_port }} # health endpoint served by Corefile-second, not the primary's + scheme: HTTP + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 10 + readinessProbe: + httpGet: + host: {{ nodelocaldns_ip }} + path: /health + port: {{ nodelocaldns_second_health_port }} # health endpoint served by Corefile-second, not the primary's + scheme: HTTP + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 10 + volumeMounts: + - name: config-volume + mountPath: /etc/coredns + - name: xtables-lock + mountPath: /run/xtables.lock + lifecycle: + preStop: + exec: + command: + - sh + - -c + - sleep {{ nodelocaldns_secondary_skew_seconds }} && kill -9 1 + volumes: + - name: config-volume + configMap: + name: nodelocaldns + items: + - key: Corefile-second + path: Corefile +{% if dns_etchosts | default(None) %} + - key: hosts + path: hosts +{% endif %} + - name: xtables-lock + hostPath: + path: /run/xtables.lock + type: FileOrCreate + # Implement a time skew between the main nodelocaldns and this secondary. 
+ # Since the two nodelocaldns instances share the :53 port, we want to keep + # at least one running at any time even if the manifests are replaced simultaneously + terminationGracePeriodSeconds: {{ nodelocaldns_secondary_skew_seconds }} + updateStrategy: + rollingUpdate: + maxUnavailable: {{ serial | default('20%') }} + type: RollingUpdate diff --git a/roles/kubespray-defaults/defaults/main.yaml b/roles/kubespray-defaults/defaults/main.yaml index 99aec470e..488e1ae5b 100644 --- a/roles/kubespray-defaults/defaults/main.yaml +++ b/roles/kubespray-defaults/defaults/main.yaml @@ -93,9 +93,12 @@ dns_mode: coredns # Enable nodelocal dns cache enable_nodelocaldns: true +enable_nodelocaldns_secondary: false nodelocaldns_ip: 169.254.25.10 nodelocaldns_health_port: 9254 +nodelocaldns_second_health_port: 9256 nodelocaldns_bind_metrics_host_ip: false +nodelocaldns_secondary_skew_seconds: 5 # Should be set to a cluster IP if using a custom cluster DNS manual_dns_server: "" diff --git a/tests/files/packet_centos8-calico-nodelocaldns-secondary.yml b/tests/files/packet_centos8-calico-nodelocaldns-secondary.yml new file mode 100644 index 000000000..600ce6017 --- /dev/null +++ b/tests/files/packet_centos8-calico-nodelocaldns-secondary.yml @@ -0,0 +1,15 @@ +--- +# Instance settings +cloud_image: centos-8 +mode: default +vm_memory: 3072Mi + +# Kubespray settings +kube_network_plugin: calico +deploy_netchecker: true +dns_min_replicas: 1 +enable_nodelocaldns_secondary: true +loadbalancer_apiserver_type: haproxy + +# required +calico_iptables_backend: "Auto"