From b7692fad09e0855c0c02619c6b47bab9ffda1b08 Mon Sep 17 00:00:00 2001 From: Bogdan Dobrelya Date: Fri, 30 Sep 2016 17:23:47 +0200 Subject: [PATCH] Add advanced net check for DNS K8s app * Add an option to deploy K8s app to test e2e network connectivity and cluster DNS resolve via Kubedns for nethost/simple pods (defaults to false). * Parametrize existing k8s apps templates with kube_namespace and kube_config_dir instead of hardcode. * For CoreOS, ensure nameservers from inventory to be put in the first place to allow hostnet pods connectivity via short names or FQDN and hostnet agents to pass as well, if netchecker deployed. Signed-off-by: Bogdan Dobrelya --- docs/netcheck.md | 41 +++++++++++++++++++ inventory/group_vars/all.yml | 2 + roles/dnsmasq/tasks/resolvconf.yml | 17 +++++++- roles/download/defaults/main.yml | 21 ++++++++++ .../kubernetes-apps/ansible/defaults/main.yml | 12 ++++++ .../tasks/calico-policy-controller.yml | 6 +-- roles/kubernetes-apps/ansible/tasks/main.yaml | 10 +++-- .../ansible/tasks/netchecker.yml | 20 +++++++++ .../templates/calico-policy-controller.yml.j2 | 2 +- .../ansible/templates/kubedns-rc.yml | 2 +- .../ansible/templates/kubedns-svc.yml | 2 +- .../ansible/templates/netchecker-agent-ds.yml | 25 +++++++++++ .../templates/netchecker-agent-hostnet-ds.yml | 26 ++++++++++++ .../templates/netchecker-server-pod.yml | 21 ++++++++++ .../templates/netchecker-server-svc.yml | 15 +++++++ roles/kubernetes/node/meta/main.yml | 9 ++++ 16 files changed, 220 insertions(+), 11 deletions(-) create mode 100644 docs/netcheck.md create mode 100644 roles/kubernetes-apps/ansible/tasks/netchecker.yml create mode 100644 roles/kubernetes-apps/ansible/templates/netchecker-agent-ds.yml create mode 100644 roles/kubernetes-apps/ansible/templates/netchecker-agent-hostnet-ds.yml create mode 100644 roles/kubernetes-apps/ansible/templates/netchecker-server-pod.yml create mode 100644 roles/kubernetes-apps/ansible/templates/netchecker-server-svc.yml diff --git 
a/docs/netcheck.md b/docs/netcheck.md new file mode 100644 index 000000000..408b0fd8c --- /dev/null +++ b/docs/netcheck.md @@ -0,0 +1,41 @@ +Network Checker Application +=========================== + +With the ``deploy_netchecker`` var enabled (defaults to false), Kargo deploys a +Network Checker Application from the third-party `l23network/mcp-netchecker` docker +images. It consists of the server and agents trying to reach the server by the +usual network connectivity means for Kubernetes applications. Therefore, this +automagically verifies pod-to-pod connectivity via the cluster IP and checks +if DNS resolution is functioning as well. + +The checks are run by agents on a periodic basis and cover standard and host network +pods as well. The history of performed checks may be found in the agents' application +logs. + +To get the most recent and cluster-wide network connectivity report, run from +any of the cluster nodes: +``` +curl http://localhost:31081/api/v1/connectivity_check +``` +Note that Kargo does not invoke the check but only deploys the application, if +requested. + +There are related application specific variables: +``` +netchecker_port: 31081 +agent_report_interval: 15 +netcheck_namespace: default +agent_img: "quay.io/l23network/mcp-netchecker-agent:v0.1" +server_img: "quay.io/l23network/mcp-netchecker-server:v0.1" +``` + +Note that the application verifies DNS resolution only for FQDNs comprising the +combination of the ``netcheck_namespace.dns_domain`` vars, for example +``netchecker-service.default.cluster.local``. If you want to deploy the application +to a non-default namespace, make sure to also adjust the ``searchdomains`` var +so that the resulting search domain records contain that namespace, like: + +``` +search: foospace.cluster.local default.cluster.local ... +nameserver: ...
+``` diff --git a/inventory/group_vars/all.yml b/inventory/group_vars/all.yml index 49abb1d03..daf641335 100644 --- a/inventory/group_vars/all.yml +++ b/inventory/group_vars/all.yml @@ -35,6 +35,8 @@ kube_users: cluster_name: cluster.local # Subdomains of DNS domain to be resolved via /etc/resolv.conf ndots: 5 +# Deploy netchecker app to verify DNS resolve as an HTTP service +deploy_netchecker: false # For some environments, each node has a pubilcally accessible # address and an address it should bind services to. These are diff --git a/roles/dnsmasq/tasks/resolvconf.yml b/roles/dnsmasq/tasks/resolvconf.yml index 9be70c7a5..ba367ac48 100644 --- a/roles/dnsmasq/tasks/resolvconf.yml +++ b/roles/dnsmasq/tasks/resolvconf.yml @@ -48,7 +48,20 @@ when: resolvconf.rc == 0 notify: Dnsmasq | update resolvconf -- name: Add search domains to resolv.conf +- name: Remove search and nameserver options from resolvconf cloud init temporary file + lineinfile: + dest: "{{resolvconffile}}" + state: absent + regexp: "^{{ item }}.*$" + backup: yes + follow: yes + with_items: + - search + - nameserver + when: ansible_os_family == "CoreOS" + notify: Dnsmasq | update resolvconf for CoreOS + +- name: Add search domains to resolvconf file lineinfile: line: "search {{searchentries}}" dest: "{{resolvconffile}}" @@ -66,7 +79,7 @@ nameserver {{ item }} {% endfor %} state: present - insertafter: "^search.*$" + insertafter: "^search default.svc.*$" create: yes backup: yes follow: yes diff --git a/roles/download/defaults/main.yml b/roles/download/defaults/main.yml index c66433c6d..966eee709 100644 --- a/roles/download/defaults/main.yml +++ b/roles/download/defaults/main.yml @@ -58,6 +58,12 @@ hyperkube_image_repo: "quay.io/coreos/hyperkube" hyperkube_image_tag: "{{ kube_version }}_coreos.0" pod_infra_image_repo: "gcr.io/google_containers/pause-amd64" pod_infra_image_tag: "{{ pod_infra_version }}" +netcheck_tag: v0.1 +netcheck_kubectl_tag: v0.18.0-120-gaeb4ac55ad12b1-dirty +netcheck_agent_img_repo: 
"quay.io/l23network/mcp-netchecker-agent" +netcheck_server_img_repo: "quay.io/l23network/mcp-netchecker-server" +netcheck_kubectl_img_repo: "gcr.io/google_containers/kubectl" + nginx_image_repo: nginx nginx_image_tag: 1.11.4-alpine dnsmasq_version: 2.72 @@ -73,6 +79,21 @@ test_image_repo: busybox test_image_tag: latest downloads: + netcheck_server: + container: true + repo: "{{ netcheck_server_img_repo }}" + tag: "{{ netcheck_tag }}" + enabled: "{{ deploy_netchecker|bool }}" + netcheck_agent: + container: true + repo: "{{ netcheck_agent_img_repo }}" + tag: "{{ netcheck_tag }}" + enabled: "{{ deploy_netchecker|bool }}" + netcheck_kubectl: + container: true + repo: "{{ netcheck_kubectl_img_repo }}" + tag: "{{ netcheck_kubectl_tag }}" + enabled: "{{ deploy_netchecker|bool }}" weave: dest: weave/bin/weave version: "{{weave_version}}" diff --git a/roles/kubernetes-apps/ansible/defaults/main.yml b/roles/kubernetes-apps/ansible/defaults/main.yml index e064984c6..02ca7b29d 100644 --- a/roles/kubernetes-apps/ansible/defaults/main.yml +++ b/roles/kubernetes-apps/ansible/defaults/main.yml @@ -1,3 +1,6 @@ +kube_config_dir: /etc/kubernetes +kube_namespace: kube-system + # Versions kubedns_version: 1.7 kubednsmasq_version: 1.3 @@ -13,5 +16,14 @@ exechealthz_image_tag: "{{ exechealthz_version }}" calico_policy_image_repo: "calico/kube-policy-controller" calico_policy_image_tag: latest +# Netchecker +deploy_netchecker: false +netchecker_port: 31081 +agent_report_interval: 15 +netcheck_namespace: default +agent_img: "quay.io/l23network/mcp-netchecker-agent:v0.1" +server_img: "quay.io/l23network/mcp-netchecker-server:v0.1" +kubectl_image: "gcr.io/google_containers/kubectl:v0.18.0-120-gaeb4ac55ad12b1-dirty" + # SSL etcd_cert_dir: "/etc/ssl/etcd/ssl" diff --git a/roles/kubernetes-apps/ansible/tasks/calico-policy-controller.yml b/roles/kubernetes-apps/ansible/tasks/calico-policy-controller.yml index 6ad8dd220..02a49f211 100644 --- 
a/roles/kubernetes-apps/ansible/tasks/calico-policy-controller.yml +++ b/roles/kubernetes-apps/ansible/tasks/calico-policy-controller.yml @@ -1,5 +1,5 @@ - name: Write calico-policy-controller yaml - template: src=calico-policy-controller.yml.j2 dest=/etc/kubernetes/calico-policy-controller.yml + template: src=calico-policy-controller.yml.j2 dest={{kube_config_dir}}/calico-policy-controller.yml when: inventory_hostname == groups['kube-master'][0] @@ -7,7 +7,7 @@ kube: name: "calico-policy-controller" kubectl: "{{bin_dir}}/kubectl" - filename: "/etc/kubernetes/calico-policy-controller.yml" - namespace: "kube-system" + filename: "{{kube_config_dir}}/calico-policy-controller.yml" + namespace: "{{kube_namespace}}" resource: "rs" when: inventory_hostname == groups['kube-master'][0] diff --git a/roles/kubernetes-apps/ansible/tasks/main.yaml b/roles/kubernetes-apps/ansible/tasks/main.yaml index 130a17a6f..a65b6b527 100644 --- a/roles/kubernetes-apps/ansible/tasks/main.yaml +++ b/roles/kubernetes-apps/ansible/tasks/main.yaml @@ -1,6 +1,6 @@ --- - name: Kubernetes Apps | Lay Down KubeDNS Template - template: src={{item.file}} dest=/etc/kubernetes/{{item.file}} + template: src={{item.file}} dest={{kube_config_dir}}/{{item.file}} with_items: - {file: kubedns-rc.yml, type: rc} - {file: kubedns-svc.yml, type: svc} @@ -10,10 +10,10 @@ - name: Kubernetes Apps | Start Resources kube: name: kubedns - namespace: kube-system + namespace: "{{ kube_namespace }}" kubectl: "{{bin_dir}}/kubectl" resource: "{{item.item.type}}" - filename: /etc/kubernetes/{{item.item.file}} + filename: "{{kube_config_dir}}/{{item.item.file}}" state: "{{item.changed | ternary('latest','present') }}" with_items: "{{ manifests.results }}" when: inventory_hostname == groups['kube-master'][0] @@ -21,3 +21,7 @@ - include: tasks/calico-policy-controller.yml when: ( enable_network_policy is defined and enable_network_policy == True ) or ( kube_network_plugin == 'canal' ) + +- name: Kubernetes Apps | Netchecker + 
include: tasks/netchecker.yml + when: deploy_netchecker diff --git a/roles/kubernetes-apps/ansible/tasks/netchecker.yml b/roles/kubernetes-apps/ansible/tasks/netchecker.yml new file mode 100644 index 000000000..c28d921b6 --- /dev/null +++ b/roles/kubernetes-apps/ansible/tasks/netchecker.yml @@ -0,0 +1,20 @@ +- name: Kubernetes Apps | Lay Down Netchecker Template + template: src={{item.file}} dest={{kube_config_dir}}/{{item.file}} + with_items: + - {file: netchecker-agent-ds.yml, type: ds, name: netchecker-agent} + - {file: netchecker-agent-hostnet-ds.yml, type: ds, name: netchecker-agent-hostnet} + - {file: netchecker-server-pod.yml, type: po, name: netchecker-server} + - {file: netchecker-server-svc.yml, type: svc, name: netchecker-service} + register: manifests + when: inventory_hostname == groups['kube-master'][0] + +- name: Kubernetes Apps | Start Netchecker Resources + kube: + name: "{{item.item.name}}" + namespace: "{{netcheck_namespace}}" + kubectl: "{{bin_dir}}/kubectl" + resource: "{{item.item.type}}" + filename: "{{kube_config_dir}}/{{item.item.file}}" + state: "{{item.changed | ternary('latest','present') }}" + with_items: "{{ manifests.results }}" + when: inventory_hostname == groups['kube-master'][0] diff --git a/roles/kubernetes-apps/ansible/templates/calico-policy-controller.yml.j2 b/roles/kubernetes-apps/ansible/templates/calico-policy-controller.yml.j2 index 469060278..a522c80ad 100644 --- a/roles/kubernetes-apps/ansible/templates/calico-policy-controller.yml.j2 +++ b/roles/kubernetes-apps/ansible/templates/calico-policy-controller.yml.j2 @@ -2,7 +2,7 @@ apiVersion: extensions/v1beta1 kind: ReplicaSet metadata: name: calico-policy-controller - namespace: kube-system + namespace: {{ kube_namespace }} labels: k8s-app: calico-policy kubernetes.io/cluster-service: "true" diff --git a/roles/kubernetes-apps/ansible/templates/kubedns-rc.yml b/roles/kubernetes-apps/ansible/templates/kubedns-rc.yml index fc29a0942..84e725cbc 100644 --- 
a/roles/kubernetes-apps/ansible/templates/kubedns-rc.yml +++ b/roles/kubernetes-apps/ansible/templates/kubedns-rc.yml @@ -2,7 +2,7 @@ apiVersion: v1 kind: ReplicationController metadata: name: kubedns - namespace: kube-system + namespace: {{ kube_namespace }} labels: k8s-app: kubedns version: v19 diff --git a/roles/kubernetes-apps/ansible/templates/kubedns-svc.yml b/roles/kubernetes-apps/ansible/templates/kubedns-svc.yml index 2e21bc9e6..7f88d0666 100644 --- a/roles/kubernetes-apps/ansible/templates/kubedns-svc.yml +++ b/roles/kubernetes-apps/ansible/templates/kubedns-svc.yml @@ -2,7 +2,7 @@ apiVersion: v1 kind: Service metadata: name: kubedns - namespace: kube-system + namespace: {{ kube_namespace }} labels: k8s-app: kubedns kubernetes.io/cluster-service: "true" diff --git a/roles/kubernetes-apps/ansible/templates/netchecker-agent-ds.yml b/roles/kubernetes-apps/ansible/templates/netchecker-agent-ds.yml new file mode 100644 index 000000000..a52329e50 --- /dev/null +++ b/roles/kubernetes-apps/ansible/templates/netchecker-agent-ds.yml @@ -0,0 +1,25 @@ +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + labels: + app: netchecker-agent + name: netchecker-agent + namespace: {{ netcheck_namespace }} +spec: + template: + metadata: + name: netchecker-agent + labels: + app: netchecker-agent + spec: + containers: + - name: netchecker-agent + image: "{{ agent_img }}" + env: + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: REPORT_INTERVAL + value: '{{ agent_report_interval }}' + imagePullPolicy: {{ k8s_image_pull_policy }} diff --git a/roles/kubernetes-apps/ansible/templates/netchecker-agent-hostnet-ds.yml b/roles/kubernetes-apps/ansible/templates/netchecker-agent-hostnet-ds.yml new file mode 100644 index 000000000..4fd03e80a --- /dev/null +++ b/roles/kubernetes-apps/ansible/templates/netchecker-agent-hostnet-ds.yml @@ -0,0 +1,26 @@ +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + labels: + app: netchecker-agent-hostnet 
+ name: netchecker-agent-hostnet + namespace: {{ netcheck_namespace }} +spec: + template: + metadata: + name: netchecker-agent-hostnet + labels: + app: netchecker-agent-hostnet + spec: + hostNetwork: True + containers: + - name: netchecker-agent + image: "{{ agent_img }}" + env: + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: REPORT_INTERVAL + value: '{{ agent_report_interval }}' + imagePullPolicy: {{ k8s_image_pull_policy }} diff --git a/roles/kubernetes-apps/ansible/templates/netchecker-server-pod.yml b/roles/kubernetes-apps/ansible/templates/netchecker-server-pod.yml new file mode 100644 index 000000000..6f242bc51 --- /dev/null +++ b/roles/kubernetes-apps/ansible/templates/netchecker-server-pod.yml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Pod +metadata: + name: netchecker-server + labels: + app: netchecker-server + namespace: {{ netcheck_namespace }} +spec: + containers: + - name: netchecker-server + image: "{{ server_img }}" + env: + imagePullPolicy: {{ k8s_image_pull_policy }} + ports: + - containerPort: 8081 + hostPort: 8081 + - name: kubectl-proxy + image: "{{ kubectl_image }}" + imagePullPolicy: {{ k8s_image_pull_policy }} + args: + - proxy diff --git a/roles/kubernetes-apps/ansible/templates/netchecker-server-svc.yml b/roles/kubernetes-apps/ansible/templates/netchecker-server-svc.yml new file mode 100644 index 000000000..dc3894676 --- /dev/null +++ b/roles/kubernetes-apps/ansible/templates/netchecker-server-svc.yml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: netchecker-service + namespace: {{ netcheck_namespace }} +spec: + selector: + app: netchecker-server + ports: + - + protocol: TCP + port: 8081 + targetPort: 8081 + nodePort: {{ netchecker_port }} + type: NodePort diff --git a/roles/kubernetes/node/meta/main.yml b/roles/kubernetes/node/meta/main.yml index 3e1dd5b3e..a65501113 100644 --- a/roles/kubernetes/node/meta/main.yml +++ b/roles/kubernetes/node/meta/main.yml @@ -9,6 +9,15 @@ dependencies: 
file: "{{ downloads.nginx }}" - role: download file: "{{ downloads.testbox }}" + - role: download + file: "{{ downloads.netcheck_server }}" + when: deploy_netchecker + - role: download + file: "{{ downloads.netcheck_agent }}" + when: deploy_netchecker + - role: download + file: "{{ downloads.netcheck_kubectl }}" + when: deploy_netchecker - role: download file: "{{ downloads.kubednsmasq }}" when: not skip_dnsmasq_k8s|default(false)