diff --git a/roles/download/defaults/main.yml b/roles/download/defaults/main.yml
index 1f036d689..191b6bb16 100644
--- a/roles/download/defaults/main.yml
+++ b/roles/download/defaults/main.yml
@@ -133,8 +133,8 @@ test_image_tag: latest
elasticsearch_version: "v5.6.4"
elasticsearch_image_repo: "k8s.gcr.io/elasticsearch"
elasticsearch_image_tag: "{{ elasticsearch_version }}"
-fluentd_version: "1.22"
-fluentd_image_repo: "gcr.io/google_containers/fluentd-elasticsearch"
+fluentd_version: "v2.0.4"
+fluentd_image_repo: "k8s.gcr.io/fluentd-elasticsearch"
fluentd_image_tag: "{{ fluentd_version }}"
kibana_version: "5.6.4"
kibana_image_repo: "docker.elastic.co/kibana/kibana"
diff --git a/roles/kubernetes-apps/efk/fluentd/defaults/main.yml b/roles/kubernetes-apps/efk/fluentd/defaults/main.yml
index e8d93732c..0305a5f7a 100644
--- a/roles/kubernetes-apps/efk/fluentd/defaults/main.yml
+++ b/roles/kubernetes-apps/efk/fluentd/defaults/main.yml
@@ -1,7 +1,7 @@
---
fluentd_cpu_limit: 0m
-fluentd_mem_limit: 200Mi
+fluentd_mem_limit: 500Mi
fluentd_cpu_requests: 100m
fluentd_mem_requests: 200Mi
-fluentd_config_dir: /etc/kubernetes/fluentd
-fluentd_config_file: fluentd.conf
+fluentd_config_dir: /etc/fluent/config.d
+# fluentd_config_file: fluentd.conf
diff --git a/roles/kubernetes-apps/efk/fluentd/templates/fluentd-config.yml.j2 b/roles/kubernetes-apps/efk/fluentd/templates/fluentd-config.yml.j2
index b7de44dc0..19b43a130 100644
--- a/roles/kubernetes-apps/efk/fluentd/templates/fluentd-config.yml.j2
+++ b/roles/kubernetes-apps/efk/fluentd/templates/fluentd-config.yml.j2
@@ -1,10 +1,19 @@
+---
+# https://raw.githubusercontent.com/kubernetes/kubernetes/release-1.10/cluster/addons/fluentd-elasticsearch/fluentd-es-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: fluentd-config
namespace: "kube-system"
+ labels:
+ addonmanager.kubernetes.io/mode: Reconcile
data:
- {{ fluentd_config_file }}: |
+ system.conf: |-
+    <system>
+      root_dir /tmp/fluentd-buffers/
+    </system>
+
+ containers.input.conf: |-
# This configuration file for Fluentd / td-agent is used
# to watch changes to Docker log files. The kubelet creates symlinks that
# capture the pod name, namespace, container name & Docker container ID
@@ -18,7 +27,6 @@ data:
# See https://github.com/uken/fluent-plugin-elasticsearch &
# https://github.com/fabric8io/fluent-plugin-kubernetes_metadata_filter for
# more information about the plugins.
- # Maintainer: Jimmi Dyson
#
# Example
# =======
@@ -99,63 +107,87 @@ data:
# This makes it easier for users to search for logs by pod name or by
# the name of the Kubernetes container regardless of how many times the
# Kubernetes pod has been restarted (resulting in a several Docker container IDs).
- #
- # TODO: Propagate the labels associated with a container along with its logs
- # so users can query logs using labels as well as or instead of the pod name
- # and container name. This is simply done via configuration of the Kubernetes
- # fluentd plugin but requires secrets to be enabled in the fluent pod. This is a
- # problem yet to be solved as secrets are not usable in static pods which the fluentd
- # pod must be until a per-node controller is available in Kubernetes.
- # Prevent fluentd from handling records containing its own logs. Otherwise
- # it can lead to an infinite loop, when error in sending one message generates
- # another message which also fails to be sent and so on.
-
- type null
-
- # Example:
+
+ # Json Log Example:
# {"log":"[info:2016-02-16T16:04:05.930-08:00] Some log text here\n","stream":"stdout","time":"2016-02-17T00:04:05.931087621Z"}
+ # CRI Log Example:
+ # 2016-02-17T00:04:05.931087621Z stdout F [info:2016-02-16T16:04:05.930-08:00] Some log text here
     <source>
-      type tail
+      @id fluentd-containers.log
+      @type tail
       path /var/log/containers/*.log
       pos_file /var/log/es-containers.log.pos
       time_format %Y-%m-%dT%H:%M:%S.%NZ
-      tag kubernetes.*
-      format json
+      tag raw.kubernetes.*
       read_from_head true
+      <parse>
+        @type multi_format
+        <pattern>
+          format json
+          time_key time
+          time_format %Y-%m-%dT%H:%M:%S.%NZ
+        </pattern>
+        <pattern>
+          format /^(?<time>.+) (?<stream>stdout|stderr) [^ ]* (?<log>.*)$/
+          time_format %Y-%m-%dT%H:%M:%S.%NZ
+        </pattern>
+      </parse>
+    </source>
+    # Detect exceptions in the log output and forward them as one log entry.
+    <match raw.kubernetes.**>
+      @id raw.kubernetes
+      @type detect_exceptions
+      remove_tag_prefix raw
+      message log
+      stream stream
+      multiline_flush_interval 5
+      max_bytes 500000
+      max_lines 1000
+    </match>
+
+ system.input.conf: |-
# Example:
# 2015-12-21 23:17:22,066 [salt.state ][INFO ] Completed state [net.ipv4.ip_forward] at time 23:17:22.066081
     <source>
-      type tail
+      @id minion
+      @type tail
       format /^(?<time>[^ ]* [^ ,]*)[^\[]*\[[^\]]*\]\[(?<severity>[^ \]]*) *\] (?<message>.*)$/
       time_format %Y-%m-%d %H:%M:%S
       path /var/log/salt/minion
       pos_file /var/log/salt.pos
       tag salt
     </source>
+
# Example:
# Dec 21 23:17:22 gke-foo-1-1-4b5cbd14-node-4eoj startupscript: Finished running startup script /var/run/google.startup.script
     <source>
-      type tail
+      @id startupscript.log
+      @type tail
       format syslog
       path /var/log/startupscript.log
       pos_file /var/log/es-startupscript.log.pos
       tag startupscript
     </source>
+
# Examples:
# time="2016-02-04T06:51:03.053580605Z" level=info msg="GET /containers/json"
# time="2016-02-04T07:53:57.505612354Z" level=error msg="HTTP Error" err="No such image: -f" statusCode=404
+ # TODO(random-liu): Remove this after cri container runtime rolls out.
     <source>
-      type tail
+      @id docker.log
+      @type tail
       format /^time="(?<time>[^)]*)" level=(?<severity>[^ ]*) msg="(?<message>[^"]*)"( err="(?<error>[^"]*)")?( statusCode=($<status_code>\d+))?/
       path /var/log/docker.log
       pos_file /var/log/es-docker.log.pos
       tag docker
     </source>
+
# Example:
# 2016/02/04 06:52:38 filePurge: successfully removed file /var/etcd/data/member/wal/00000000000006d0-00000000010a23d1.wal
     <source>
-      type tail
+      @id etcd.log
+      @type tail
# Not parsing this, because it doesn't have anything particularly useful to
# parse out of it (like severities).
format none
@@ -163,13 +195,16 @@ data:
pos_file /var/log/es-etcd.log.pos
tag etcd
     </source>
+
# Multi-line parsing is required for all the kube logs because very large log
# statements, such as those that include entire object bodies, get split into
# multiple lines by glog.
+
# Example:
# I0204 07:32:30.020537 3368 server.go:1048] POST /stats/container/: (13.972191ms) 200 [[Go-http-client/1.1] 10.244.1.3:40537]
     <source>
-      type tail
+      @id kubelet.log
+      @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -179,10 +214,12 @@ data:
pos_file /var/log/es-kubelet.log.pos
tag kubelet
     </source>
+
# Example:
# I1118 21:26:53.975789 6 proxier.go:1096] Port "nodePort for kube-system/default-http-backend:http" (:31429/tcp) was open before and is still needed
     <source>
-      type tail
+      @id kube-proxy.log
+      @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -192,10 +229,12 @@ data:
pos_file /var/log/es-kube-proxy.log.pos
tag kube-proxy
     </source>
+
# Example:
# I0204 07:00:19.604280 5 handlers.go:131] GET /api/v1/nodes: (1.624207ms) 200 [[kube-controller-manager/v1.1.3 (linux/amd64) kubernetes/6a81b50] 127.0.0.1:38266]
     <source>
-      type tail
+      @id kube-apiserver.log
+      @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -205,10 +244,12 @@ data:
pos_file /var/log/es-kube-apiserver.log.pos
tag kube-apiserver
     </source>
+
# Example:
# I0204 06:55:31.872680 5 servicecontroller.go:277] LB already exists and doesn't need update for service kube-system/kube-ui
     <source>
-      type tail
+      @id kube-controller-manager.log
+      @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -218,10 +259,12 @@ data:
pos_file /var/log/es-kube-controller-manager.log.pos
tag kube-controller-manager
     </source>
+
# Example:
# W0204 06:49:18.239674 7 reflector.go:245] pkg/scheduler/factory/factory.go:193: watch of *api.Service ended with: 401: The event in requested index is outdated and cleared (the requested history has been cleared [2578313/2577886]) [2579312]
     <source>
-      type tail
+      @id kube-scheduler.log
+      @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -231,10 +274,12 @@ data:
pos_file /var/log/es-kube-scheduler.log.pos
tag kube-scheduler
     </source>
+
# Example:
# I1104 10:36:20.242766 5 rescheduler.go:73] Running Rescheduler
     <source>
-      type tail
+      @id rescheduler.log
+      @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -244,10 +289,12 @@ data:
pos_file /var/log/es-rescheduler.log.pos
tag rescheduler
     </source>
+
# Example:
# I0603 15:31:05.793605 6 cluster_manager.go:230] Reading config from path /etc/gce.conf
     <source>
-      type tail
+      @id glbc.log
+      @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -257,10 +304,12 @@ data:
pos_file /var/log/es-glbc.log.pos
tag glbc
     </source>
+
# Example:
# I0603 15:31:05.793605 6 cluster_manager.go:230] Reading config from path /etc/gce.conf
     <source>
-      type tail
+      @id cluster-autoscaler.log
+      @type tail
format multiline
multiline_flush_interval 5s
format_firstline /^\w\d{4}/
@@ -270,59 +319,123 @@ data:
pos_file /var/log/es-cluster-autoscaler.log.pos
tag cluster-autoscaler
     </source>
+
+ # Logs from systemd-journal for interesting services.
+ # TODO(random-liu): Remove this after cri container runtime rolls out.
+    <source>
+      @id journald-docker
+      @type systemd
+      filters [{ "_SYSTEMD_UNIT": "docker.service" }]
+      <storage>
+        @type local
+        persistent true
+      </storage>
+      read_from_head true
+      tag docker
+    </source>
+
+    <source>
+      @id journald-container-runtime
+      @type systemd
+      filters [{ "_SYSTEMD_UNIT": "{{ container_runtime }}.service" }]
+      <storage>
+        @type local
+        persistent true
+      </storage>
+      read_from_head true
+      tag container-runtime
+    </source>
+
+    <source>
+      @id journald-kubelet
+      @type systemd
+      filters [{ "_SYSTEMD_UNIT": "kubelet.service" }]
+      <storage>
+        @type local
+        persistent true
+      </storage>
+      read_from_head true
+      tag kubelet
+    </source>
+
+    <source>
+      @id journald-node-problem-detector
+      @type systemd
+      filters [{ "_SYSTEMD_UNIT": "node-problem-detector.service" }]
+      <storage>
+        @type local
+        persistent true
+      </storage>
+      read_from_head true
+      tag node-problem-detector
+    </source>
+
+ forward.input.conf: |-
+ # Takes the messages sent over TCP
+    <source>
+      @type forward
+    </source>
+
+ monitoring.conf: |-
+ # Prometheus Exporter Plugin
+ # input plugin that exports metrics
+    <source>
+      @type prometheus
+    </source>
+
+    <source>
+      @type monitor_agent
+    </source>
+
+    # input plugin that collects metrics from MonitorAgent
+    <source>
+      @type prometheus_monitor
+      <labels>
+        host ${hostname}
+      </labels>
+    </source>
+
+    # input plugin that collects metrics for output plugin
+    <source>
+      @type prometheus_output_monitor
+      <labels>
+        host ${hostname}
+      </labels>
+    </source>
+
+    # input plugin that collects metrics for in_tail plugin
+    <source>
+      @type prometheus_tail_monitor
+      <labels>
+        host ${hostname}
+      </labels>
+    </source>
+
+ output.conf: |-
+ # Enriches records with Kubernetes metadata
     <filter kubernetes.**>
-      type kubernetes_metadata
+      @type kubernetes_metadata
     </filter>
- ## Prometheus Exporter Plugin
- ## input plugin that exports metrics
- #
- # type prometheus
- #
- #
- # type monitor_agent
- #
- #
- # type forward
- #
- ## input plugin that collects metrics from MonitorAgent
- #
- # @type prometheus_monitor
- #
- # host ${hostname}
- #
- #
- ## input plugin that collects metrics for output plugin
- #
- # @type prometheus_output_monitor
- #
- # host ${hostname}
- #
- #
- ## input plugin that collects metrics for in_tail plugin
- #
- # @type prometheus_tail_monitor
- #
- # host ${hostname}
- #
- #
+
     <match **>
-      type elasticsearch
- user "#{ENV['FLUENT_ELASTICSEARCH_USER']}"
- password "#{ENV['FLUENT_ELASTICSEARCH_PASSWORD']}"
- log_level info
- include_tag_key true
- host elasticsearch-logging
- port 9200
- logstash_format true
- # Set the chunk limit the same as for fluentd-gcp.
- buffer_chunk_limit 2M
- # Cap buffer memory usage to 2MiB/chunk * 32 chunks = 64 MiB
- buffer_queue_limit 32
- flush_interval 5s
- # Never wait longer than 5 minutes between retries.
- max_retry_wait 30
- # Disable the limit on the number of retries (retry forever).
- disable_retry_limit
- # Use multiple threads for processing.
- num_threads 8
-
+ @id elasticsearch
+ @type elasticsearch
+ @log_level info
+ include_tag_key true
+ host elasticsearch-logging
+ port 9200
+ logstash_format true
+      <buffer>
+ @type file
+ path /var/log/fluentd-buffers/kubernetes.system.buffer
+ flush_mode interval
+ retry_type exponential_backoff
+ flush_thread_count 2
+ flush_interval 5s
+ retry_forever
+ retry_max_interval 30
+ chunk_limit_size 2M
+ queue_limit_length 8
+ overflow_action block
+      </buffer>
+    </match>
\ No newline at end of file
diff --git a/roles/kubernetes-apps/efk/fluentd/templates/fluentd-ds.yml.j2 b/roles/kubernetes-apps/efk/fluentd/templates/fluentd-ds.yml.j2
index f23a8851c..e0506026f 100644
--- a/roles/kubernetes-apps/efk/fluentd/templates/fluentd-ds.yml.j2
+++ b/roles/kubernetes-apps/efk/fluentd/templates/fluentd-ds.yml.j2
@@ -1,21 +1,22 @@
---
-# https://raw.githubusercontent.com/kubernetes/kubernetes/v1.5.2/cluster/addons/fluentd-elasticsearch/es-controller.yaml
-apiVersion: extensions/v1beta1
+# https://raw.githubusercontent.com/kubernetes/kubernetes/release-1.10/cluster/addons/fluentd-elasticsearch/fluentd-es-ds.yaml
+apiVersion: apps/v1
kind: DaemonSet
metadata:
- name: "fluentd-es-v{{ fluentd_version }}"
+ name: "fluentd-es-{{ fluentd_version }}"
namespace: "kube-system"
labels:
k8s-app: fluentd-es
kubernetes.io/cluster-service: "true"
- version: "v{{ fluentd_version }}"
+ version: "{{ fluentd_version }}"
+ addonmanager.kubernetes.io/mode: Reconcile
 spec:
+  selector:
+    matchLabels:
+      k8s-app: fluentd-es
   template:
metadata:
labels:
k8s-app: fluentd-es
kubernetes.io/cluster-service: "true"
- version: "v{{ fluentd_version }}"
+ version: "{{ fluentd_version }}"
spec:
tolerations:
- effect: NoSchedule
@@ -23,10 +24,9 @@ spec:
containers:
- name: fluentd-es
image: "{{ fluentd_image_repo }}:{{ fluentd_image_tag }}"
- command:
- - '/bin/sh'
- - '-c'
- - '/usr/sbin/td-agent -c {{ fluentd_config_dir }}/{{ fluentd_config_file}} 2>&1 >> /var/log/fluentd.log'
+ env:
+ - name: FLUENTD_ARGS
+ value: "--no-supervisor -q"
resources:
limits:
{% if fluentd_cpu_limit is defined and fluentd_cpu_limit != "0m" %}
@@ -42,8 +42,10 @@ spec:
- name: dockercontainers
mountPath: "{{ docker_daemon_graph }}/containers"
readOnly: true
- - name: config
+ - name: config-volume
mountPath: "{{ fluentd_config_dir }}"
+ nodeSelector:
+ beta.kubernetes.io/fluentd-ds-ready: "true"
terminationGracePeriodSeconds: 30
volumes:
- name: varlog
@@ -52,7 +54,7 @@ spec:
- name: dockercontainers
hostPath:
path: {{ docker_daemon_graph }}/containers
- - name: config
+ - name: config-volume
configMap:
name: fluentd-config
{% if rbac_enabled %}