Fix random failure in debug: var=result.content|from_json (#4094)

* Fix random failure in debug: var=result.content|from_json

* netchecker agents are deployed on all k8s-cluster group members

* reducing limits/requests is not enough; switching to n1-standard-2

* gce_centos7 needs more CPU
This commit is contained in:
Erwan Miran 2019-01-25 17:14:22 +01:00 committed by Kubernetes Prow Robot
parent 3e52f1a4e9
commit 61d88b8db2
4 changed files with 57 additions and 8 deletions

View file

@ -1,7 +1,7 @@
# Instance settings
cloud_image_family: centos-7
cloud_region: us-central1-c
cloud_machine_type: "n1-standard-1"
cloud_machine_type: "n1-standard-2"
mode: ha
# Deployment settings

View file

@ -1,7 +1,7 @@
# Instance settings
cloud_image_family: centos-7
cloud_region: us-central1-c
cloud_machine_type: "n1-standard-1"
cloud_machine_type: "n1-standard-2"
mode: ha
# Deployment settings
@ -15,7 +15,7 @@ deploy_netchecker: true
dns_min_replicas: 1
cloud_provider: gce
kube_encrypt_secret_data: true
ingress_nginx_enabled: true
#ingress_nginx_enabled: true
cert_manager_enabled: true
metrics_server_enabled: true
kube_token_auth: true

View file

@ -1,7 +1,7 @@
# Instance settings
cloud_image_family: centos-7
cloud_region: us-central1-c
cloud_machine_type: "n1-standard-1"
cloud_machine_type: "n1-standard-2"
mode: default
# Deployment settings

View file

@ -37,9 +37,23 @@
run_once: true
delegate_to: "{{groups['kube-master'][0]}}"
register: nca_pod
until: nca_pod.stdout_lines|length >= groups['kube-node']|intersect(play_hosts)|length * 2
until: nca_pod.stdout_lines|length >= groups['k8s-cluster']|intersect(play_hosts)|length * 2
retries: 3
delay: 10
failed_when: false
# Diagnostic step: run `kubectl describe pod` once per app label listed under
# with_items, in the {{netcheck_namespace}} namespace.
# Only runs when the result registered as nca_pod is not a success.
- command: "{{ bin_dir }}/kubectl -n {{netcheck_namespace}} describe pod -l app={{ item }}"
run_once: true
# Executed on the first host of the kube-master group.
delegate_to: "{{groups['kube-master'][0]}}"
# Explicitly keep the command output visible in the play log.
no_log: false
with_items:
- netchecker-agent
- netchecker-agent-hostnet
when: not nca_pod is success
# Print the output lines captured in nca_pod.
# The task is marked as failed when nca_pod is not a success, so the failure
# is reported here, after the output has been printed.
- debug: var=nca_pod.stdout_lines
failed_when: not nca_pod is success
run_once: true
- name: Get netchecker agents
uri: url=http://{{ ansible_default_ipv4.address }}:{{netchecker_port}}/api/v1/agents/ return_content=yes
@ -50,7 +64,7 @@
delay: "{{ agent_report_interval }}"
until: agents.content|length > 0 and
agents.content[0] == '{' and
agents.content|from_json|length >= groups['kube-node']|intersect(play_hosts)|length * 2
agents.content|from_json|length >= groups['k8s-cluster']|intersect(play_hosts)|length * 2
failed_when: false
no_log: true
@ -65,16 +79,51 @@
register: result
retries: 3
delay: "{{ agent_report_interval }}"
until: result.content|length > 0 and
result.content[0] == '{'
no_log: true
failed_when: false
when:
- agents.content != '{}'
# Diagnostic step: dump the ncs_pod variable when the request registered as
# `result` is not a success.
# NOTE(review): ncs_pod is registered outside this excerpt - confirm it is
# always defined by the time this task runs.
- debug: var=ncs_pod
run_once: true
when: not result is success
# Diagnostic step: fetch logs of the pods labelled k8s-app=kube-proxy in the
# kube-system namespace when `result` is not a success.
- command: "{{ bin_dir }}/kubectl -n kube-system logs -l k8s-app=kube-proxy"
run_once: true
when: not result is success
# Executed on the first host of the kube-master group.
delegate_to: "{{groups['kube-master'][0]}}"
# Explicitly keep the command output visible in the play log.
no_log: false
# Diagnostic step: when `result` is not a success, fetch logs (all containers)
# from kube-system pods, once per k8s-app label listed under with_items.
- command: "{{ bin_dir }}/kubectl -n kube-system logs -l k8s-app={{item}} --all-containers"
run_once: true
when: not result is success
# Executed on the first host of the kube-master group.
delegate_to: "{{groups['kube-master'][0]}}"
# Explicitly keep the command output visible in the play log.
no_log: false
# NOTE(review): presumably only the labels of the deployed network plugin
# match any pods - confirm the other iterations are harmless.
with_items:
- kube-router
- flannel
- contiv-ovs
- contiv-netplugin
- contiv-netmaster
- canal-node
- calico-node
- cilium
# Print result.content parsed with the from_json filter.
# The task is marked as failed when `result` is not a success.
- debug: var=result.content|from_json
failed_when: not result is success
run_once: true
# NOTE(review): two `when` keys appear in this task (the scalar form just
# below and the list form further down). This looks like the old and new
# versions of the condition shown together - confirm only one is present in
# the real file.
when: not agents.content == '{}'
delegate_to: "{{groups['kube-master'][0]}}"
# The list form adds a guard that result.content starts with '{' before
# from_json is applied to it.
when:
- not agents.content == '{}'
- result.content[0] == '{'
# Print the whole `result` variable, unparsed, when agents.content is not '{}'
# and result.content does not start with '{'.
# The task is marked as failed when `result` is not a success.
- debug: var=result
failed_when: not result is success
run_once: true
when:
- not agents.content == '{}'
- result.content[0] != '{'
- debug: msg="Cannot get reports from agents, consider as PASSING"
run_once: true