initial commit

Signed-off-by: Jordan Moore <crikket.007@gmail.com>
This commit is contained in:
Jordan Moore 2020-04-22 13:24:43 -05:00
parent d3c5f28243
commit 10b3bc378f
No known key found for this signature in database
GPG Key ID: AB5FB2D420741ED9
3 changed files with 293 additions and 0 deletions

View File

@ -0,0 +1,6 @@
ultimate-instrumentation
===
TODO
Refer to the comments in [`docker-compose.yaml`](docker-compose.yaml).

View File

@ -0,0 +1,134 @@
## Ultimate Instrumentation Stack
## Provides
## - log collection and forwarding (logspout) (localhost:8000/logs)
## - http/tcp/udp proxying (traefik) (localhost:8080/dashboard/)
## - service discovery + dns + health-checks (consul) (localhost:8500/ui) (depends on registrator)
## - time-series DB storage (influxdb) (localhost:8086)
## - metric & event visualization (grafana) (localhost:3000)
## http routing provided by traefik on `localhost/<route>` (see SERVICE_TAGS vars for routes)
# Compose file format version used by this stack.
version: '3'
# Container definitions; each key below is one service of the stack.
services:
# logging - Logspout is meant for log collection and forwarding, not storage.
# Could pair this with syslog/GELF into FluentD/Logstash/Kafka with indexing by Graylog/Elasticsearch/Solr/Splunk
logspout:
  image: gliderlabs/logspout:master
  # Read-only access to the Docker socket so container logs can be attached to.
  volumes: ['/var/run/docker.sock:/tmp/docker.sock:ro']
  networks: ['backend', 'frontend']
  ports: ['8000:80']  # logs avail at http://localhost:8000/logs
  environment:
    # Name of the container label that makes logspout skip a container.
    EXCLUDE_LABEL: logspout.exclude
    # registrator: these tags are what traefik reads to build its routes.
    # The router name must be unique across services; a name shared with
    # another service (previously `router0`) makes traefik reject the router.
    # NOTE(review): `traefik.port` looks like Traefik v1 syntax; with traefik:2.2
    # the port tag is `traefik.http.services.<name>.loadbalancer.server.port`.
    # Confirm which port traefik should target before changing it.
    SERVICE_TAGS: 'traefik.enable=true,traefik.port=8000,traefik.docker.network=frontend,traefik.http.routers.logspout.rule=PathPrefix(`/logs`)'
# Service discovery: registrator watches the Docker socket and registers
# containers with the consul service defined below.
registrator:
  image: gliderlabs/registrator:master
  depends_on:
    - consul
  networks:
    - backend
  volumes:
    - /var/run/docker.sock:/tmp/docker.sock:ro
  command:
    - '-internal=true'
    - 'consul://consul:8500'
# Consul: service catalog used by registrator (writes) and traefik (reads).
consul:
  image: 'consul:1.6.4'
  restart: always
  networks: ['backend', 'frontend']
  ports:
    - '8500:8500'  # web ui
    # - '8300:8300'  # server rpc
    # - '8301:8301'  # lan serf tcp
    # - '8301:8301/udp'  # lan serf udp
    # - '8600:8600'  # dns tcp
    # - '8600:8600/udp'  # dns udp
  environment:
    # registrator
    SERVICE_TAGS: "traefik.enable=true,traefik.docker.network=frontend"
  labels:
    # Must match the label name in logspout's EXCLUDE_LABEL and carry the
    # value `true`, otherwise logspout keeps collecting this container's logs.
    - logspout.exclude=true
# load balancing + routing
# Routes are discovered from the Consul catalog; only services tagged
# `traefik.enable=true` are exposed (exposedByDefault=false).
traefik:
  image: traefik:2.2
  depends_on: ['registrator', 'consul']
  networks: ['backend', 'frontend']
  ports:
    - '8080:8080'  # web ui
    - '80:80'
  command:
    # - "--log.level=DEBUG"
    # The insecure API serves the dashboard on :8080 without authentication.
    - "--api.insecure=true"
    - "--api.dashboard=true"
    - "--providers.consulcatalog=true"
    - "--providers.consulcatalog.endpoint.address=http://consul:8500"
    - "--providers.consulcatalog.endpoint.datacenter=dc1"
    - "--providers.consulcatalog.cache=true"
    - "--providers.consulcatalog.exposedByDefault=false"
  labels:
    # Must match the label name in logspout's EXCLUDE_LABEL and carry the
    # value `true`, otherwise logspout keeps collecting this container's logs.
    - logspout.exclude=true
# monitoring
# TODO: Tracing - Zipkin/Jaeger
## TIG Stack - Telegraf + Influx + Grafana
## This is functionally equivalent to just Prometheus + Grafana, but allows metric push & pull
telegraf:
  image: telegraf:1.14-alpine
  restart: unless-stopped
  networks: ['backend', 'monitor']
  depends_on: ['influxdb']
  labels:
    # Must match the label name in logspout's EXCLUDE_LABEL and carry the
    # value `true`, otherwise logspout keeps collecting this container's logs.
    - logspout.exclude=true
  volumes:
    # Agent configuration, mounted read-only from this directory.
    - ./telegraf.conf:/etc/telegraf/telegraf.conf:ro
    # For docker stats
    - /var/run/docker.sock:/var/run/docker.sock:ro
# Time-series storage for the metrics written by telegraf.
influxdb:
  image: influxdb:1.7-alpine
  restart: always
  ports: ['8086:8086']
  networks: ['monitor']
  labels:
    # Must match the label name in logspout's EXCLUDE_LABEL and carry the
    # value `true`, otherwise logspout keeps collecting this container's logs.
    - logspout.exclude=true
  environment:
    # Database and user created for telegraf; the credentials in
    # telegraf.conf's [[outputs.influxdb]] section must agree with these.
    INFLUXDB_DB: telegraf
    INFLUXDB_USER: telegraf
    INFLUXDB_USER_PASSWORD: 'compose'
    INFLUXDB_REPORTING_DISABLED: 'true'
  # Persistence is disabled while the mount below stays commented out.
  # volumes:
  #   - influxdb-volume:/var/lib/influxdb
# Metric & event visualization, reachable directly on localhost:3000.
grafana:
  image: grafana/grafana:master
  depends_on: ['influxdb']
  ports: ["3000:3000"]
  networks: ['monitor', 'frontend']
  user: "0"
  labels:
    # Must match the label name in logspout's EXCLUDE_LABEL and carry the
    # value `true`, otherwise logspout keeps collecting this container's logs.
    - logspout.exclude=true
  environment:
    GF_SECURITY_ADMIN_PASSWORD: 'compose'
    GF_USERS_ALLOW_SIGN_UP: 'false'
    # registrator: these tags are what traefik reads to build its routes.
    # The router name must be unique across services; a name shared with
    # another service (previously `router0`) makes traefik reject the router.
    # NOTE(review): serving Grafana under the `/grafana` prefix may also need
    # GF_SERVER_ROOT_URL / GF_SERVER_SERVE_FROM_SUB_PATH - confirm.
    SERVICE_TAGS: 'traefik.enable=true,traefik.docker.network=frontend,traefik.http.routers.grafana.rule=PathPrefix(`/grafana`)'
## TODO - get loki working (in grafana)
# loki:
# image: grafana/loki:latest
# depends_on: ['registrator', 'consul']
# ports: ["3100:3100"]
# command: ["-config.file=/etc/loki/local-config.yaml"]
# networks: ['backend']
# environment:
# # LOGSPOUT: ignore
# SERVICE_TAGS: "traefik.enable=true,traefik.http.routers.router0.rule=PathPrefix(`/loki`)"
# Networks shared by the infrastructure components (all use default settings)
networks:
  backend: {}
  monitor: {}
  frontend: {}
# Named volumes for local persistence (declared with default settings)
volumes:
  grafana-volume: {}
  influxdb-volume: {}

View File

@ -0,0 +1,153 @@
# Telegraf agent settings
[agent]
  ## Identity: no hostname override is set.
  hostname = ""

  ## Collection schedule
  interval = "10s"
  round_interval = true
  collection_jitter = "3s"

  ## Batching and flushing to outputs
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  flush_interval = "10s"
  flush_jitter = "5s"

  ## Agent logging
  debug = false
  quiet = false
  logfile = "/var/log/telegraf/telegraf.log"
  logfile_rotation_interval = "0d"
  logfile_rotation_max_size = "1MB"
  logfile_rotation_max_archives = 5
###############################################################################
# OUTPUT PLUGINS #
###############################################################################
# Configuration for sending metrics to InfluxDB
[[outputs.influxdb]]
  urls = ["http://influxdb:8086"] # required
  database = "telegraf" # required
  ## Credentials must agree with INFLUXDB_USER / INFLUXDB_USER_PASSWORD on the
  ## influxdb service in docker-compose.yaml.
  username = "telegraf"
  password = "compose"
  ## If true, the database tag will not be added to the metric.
  exclude_database_tag = false
  retention_policy = ""
  write_consistency = "any"
  timeout = "5s"
  ## If true, no CREATE DATABASE queries will be sent. Set to true when using
  ## Telegraf with a user without permissions to create databases or when the
  ## database already exists.
  skip_database_creation = false
###############################################################################
# INPUT PLUGINS #
###############################################################################
# CPU usage metrics
[[inputs.cpu]]
  ## Report aggregate CPU stats for the whole system
  totalcpu = true
  ## Also report stats for each individual CPU
  percpu = true
  ## Raw CPU time metrics are not collected
  collect_cpu_time = false
  report_active = false
# Disk usage, reported per mount point
[[inputs.disk]]
  ## Filesystem types whose mount points are skipped, e.g. (dev)tmpfs, which is
  ## usually found on /run, /var/run, /dev/shm or /dev.
  ignore_fs = [
    "tmpfs",
    "devtmpfs",
    "devfs",
    "iso9660",
    "overlay",
    "aufs",
    "squashfs"
  ]

# Disk IO, reported per device
[[inputs.diskio]]
  ## Uncomment and list devices to restrict the stats to those devices.
  # devices = ["sda", "sdb"]
# Kernel statistics from /proc/stat (plugin takes no configuration)
[[inputs.kernel]]

# Memory usage (plugin takes no configuration)
[[inputs.mem]]

# Process counts grouped by status (plugin takes no configuration)
[[inputs.processes]]

# Swap memory usage (plugin takes no configuration)
[[inputs.swap]]

# System load & uptime
[[inputs.system]]
  ## Uncomment to remove deprecated metrics.
  # fielddrop = ["uptime_format"]

# Telegraf's own internal metrics
[[inputs.internal]]
  collect_memstats = true
# Network interface usage, limited to interfaces matching the glob below
[[inputs.net]]
  interfaces = ["eth*"]

# Docker container metrics, read through the Docker socket
[[inputs.docker]]
  endpoint = "unix:///var/run/docker.sock"
  timeout = "5s"
# StatsD server
[[inputs.statsd]]
  ## UDP listener address and port
  service_address = ":8125"
  ## Number of UDP messages allowed to queue up; once the queue is full the
  ## statsd server starts dropping packets
  allowed_pending_messages = 10000

  ## Reset gauges every interval (default=false)
  delete_gauges = true
  ## Reset counters every interval (default=false)
  delete_counters = true

  ## Percentiles calculated for timing & histogram stats
  percentiles = [50.0, 75.0, 99.0, 99.9]
  ## Number of timing/histogram values tracked per measurement when calculating
  ## percentiles. A higher limit gives more accurate percentiles at the cost
  ## of more memory and cpu time.
  percentile_limit = 1000

  ## Separator placed between the elements of a statsd metric
  metric_separator = "_"
  ## Measurement-name conversion ("." to "_" and "-" to "__")
  convert_names = false
  ## Template used to parse the StatsD variable name for InfluxDB, so that
  ## counters/gauges are grouped by measurement
  templates = [
    "* measurement.field"
  ]

  ## Parse extensions to statsd in the datadog statsd format;
  ## currently supports metrics and datadog tags.
  ## http://docs.datadoghq.com/guides/dogstatsd/
  datadog_extensions = true
# [[inputs.docker_log]]
# ## To use TCP, set endpoint = "tcp://[ip]:[port]"
# ## To use environment variables (ie, docker-machine), set endpoint = "ENV"
# endpoint = "unix:///var/run/docker.sock"
# ## When true, container logs are read from the beginning; otherwise
# ## reading begins at the end of the log.
# # from_beginning = false
# ## docker labels to include. Globs accepted.
# ## Note that an empty array for both will include all labels as tags
# docker_label_include = ["logcapture"]
# ## Set the source tag for the metrics to the container ID hostname, eg first 12 chars
# source_tag = true