initial commit

Signed-off-by: Jordan Moore <crikket.007@gmail.com>
2020-04-22 13:24:43 -05:00
parent d3c5f28243
commit 10b3bc378f
3 changed files with 293 additions and 0 deletions
--- a/ultimate-instrumentation/README.md
+++ b/ultimate-instrumentation/README.md
@@ -0,0 +1,6 @@
+ultimate-instrumentation
+===
+
+TODO
+
+Refer to the comments in [`docker-compose.yaml`](docker-compose.yaml).
--- a/ultimate-instrumentation/docker-compose.yaml
+++ b/ultimate-instrumentation/docker-compose.yaml
@@ -0,0 +1,134 @@
+## Ultimate Instrumentation Stack
+## Provides
+##   - log collection and forwarding (logspout) (localhost:8000/logs)
+##   - http/tcp/udp proxying (traefik) (localhost:8080/dashboard/)
+##   - service discovery + dns + health-checks (consul) (localhost:8500/ui) (depends on registrator)
+##   - time-series DB storage (influxdb) (localhost:8086)
+##   - metric & event visualization (grafana) (localhost:3000) 
+## http routing provided by traefik on `localhost/<route>` (see SERVICE_TAGS vars for routes)
+
+version: '3'
+services:
+  # logging - Logspout is meant for log collection and forwarding, not storage.
+  # Could pair this with syslog/GELF into FluentD/Logstash/Kafka with indexing by Graylog/Elasticsearch/Solr/Splunk
+  logspout:
+    image: gliderlabs/logspout:master
+    volumes: ['/var/run/docker.sock:/tmp/docker.sock:ro']
+    networks: ['backend','frontend']
+    ports: ['8000:80']  # logs avail at http://localhost:8000/logs
+    environment:
+      EXCLUDE_LABEL: logspout.exclude  # containers labelled logspout.exclude=true are skipped
+      # registrator tags; the router name must be unique per service or traefik drops the colliding routers
+      SERVICE_TAGS: 'traefik.enable=true,traefik.port=8000,traefik.docker.network=frontend,traefik.http.routers.logspout.rule=PathPrefix(`/logs`)'
+
+  # Service discovery: registrator watches the Docker socket and registers containers in Consul
+  registrator:
+    image: gliderlabs/registrator:master
+    command: ['-internal=true', 'consul://consul:8500']  # -internal: register exposed (container-internal) ports
+    volumes: ['/var/run/docker.sock:/tmp/docker.sock:ro']
+    networks: ['backend']
+    depends_on: ['consul']
+  consul:
+    image: 'consul:1.6.4'
+    restart: always
+    networks: ['backend','frontend']
+    ports:
+      - '8500:8500' # web ui
+      # - '8300:8300' # server rpc
+      # - '8301:8301' # lan serf tcp
+      # - '8301:8301/udp' # lan serf udp
+      # - '8600:8600' # dns tcp
+      # - '8600:8600/udp' # dns udp
+    environment:
+      # registrator tags
+      SERVICE_TAGS: "traefik.enable=true,traefik.docker.network=frontend"
+    labels:
+      - logspout.exclude=true  # must match logspout's EXCLUDE_LABEL; the value has to be 'true'
+
+  # load balancing + routing (routes are discovered from the Consul catalog)
+  traefik:
+    image: traefik:2.2
+    depends_on: ['registrator', 'consul']
+    networks: ['backend','frontend']
+    ports:
+      - '8080:8080' # web ui
+      - '80:80'
+    command:
+      # - "--log.level=DEBUG"
+      - "--api.insecure=true"  # serves the API/dashboard on :8080 without auth; local use only
+      - "--api.dashboard=true"
+      - "--providers.consulcatalog=true"
+      - "--providers.consulcatalog.endpoint.address=http://consul:8500"
+      - "--providers.consulcatalog.endpoint.datacenter=dc1"
+      - "--providers.consulcatalog.cache=true"
+      - "--providers.consulcatalog.exposedByDefault=false"  # only services tagged traefik.enable=true are routed
+    labels:
+      - logspout.exclude=true  # must match logspout's EXCLUDE_LABEL; the value has to be 'true'
+
+  #monitoring
+  # TODO: Tracing - Zipkin/Jaeger
+
+  ## TIG Stack - Telegraf + Influx + Grafana
+  ## This is functionally equivalent to just Prometheus + Grafana, but allows metric push & pull
+  telegraf:
+    image: telegraf:1.14-alpine
+    restart: unless-stopped
+    networks: ['backend', 'monitor']
+    depends_on: ['influxdb']
+    labels:
+      - logspout.exclude=true  # must match logspout's EXCLUDE_LABEL; the value has to be 'true'
+    volumes:
+      - ./telegraf.conf:/etc/telegraf/telegraf.conf:ro
+      # For docker stats (read by the inputs.docker plugin)
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+  influxdb:
+    image: influxdb:1.7-alpine
+    restart: always
+    ports: ['8086:8086']
+    networks: ['monitor']
+    labels:
+      - logspout.exclude=true  # must match logspout's EXCLUDE_LABEL; the value has to be 'true'
+    environment:
+      INFLUXDB_DB: telegraf
+      INFLUXDB_USER: telegraf
+      INFLUXDB_USER_PASSWORD: 'compose'  # keep in sync with [[outputs.influxdb]] in telegraf.conf
+      INFLUXDB_REPORTING_DISABLED: 'true'
+
+    # volumes:
+    #   - influxdb-volume:/var/lib/influxdb
+  grafana:
+    image: grafana/grafana:master
+    depends_on: ['influxdb']
+    ports: ["3000:3000"]
+    networks: ['monitor', 'frontend']
+    user: "0"  # runs the container as root
+    labels:
+      - logspout.exclude=true  # must match logspout's EXCLUDE_LABEL; the value has to be 'true'
+    environment:
+      GF_SECURITY_ADMIN_PASSWORD: 'compose'
+      GF_USERS_ALLOW_SIGN_UP: 'false'
+      # registrator tags; unique router name. NOTE(review): serving under /grafana probably also needs GF_SERVER_ROOT_URL / serve_from_sub_path - confirm
+      SERVICE_TAGS: 'traefik.enable=true,traefik.docker.network=frontend,traefik.http.routers.grafana.rule=PathPrefix(`/grafana`)'
+
+  ## TODO - get loki working (in grafana)
+  # loki:
+  #   image: grafana/loki:latest
+  #   depends_on: ['registrator', 'consul']
+  #   ports: ["3100:3100"]
+  #   command: ["-config.file=/etc/loki/local-config.yaml"]
+  #   networks: ['backend']
+  #   environment:
+  #     # LOGSPOUT: ignore
+  #     SERVICE_TAGS: "traefik.enable=true,traefik.http.routers.router0.rule=PathPrefix(`/loki`)"
+
+
+# Networks shared by the infrastructure components (all with default settings)
+networks:
+  backend: {}
+  monitor: {}
+  frontend: {}
+
+# Named local volumes for persistent data (currently not mounted by any active service)
+volumes:
+  grafana-volume: {}
+  influxdb-volume: {}
--- a/ultimate-instrumentation/telegraf.conf
+++ b/ultimate-instrumentation/telegraf.conf
@@ -0,0 +1,153 @@
+# Global settings for the telegraf agent
+[agent]
+  ## Collection cadence: gather every 10s, aligned to the interval, with up to 3s jitter
+  interval = "10s"
+  round_interval = true
+  collection_jitter = "3s"
+  ## Output batching, buffering and flushing
+  metric_batch_size = 1000
+  metric_buffer_limit = 10000
+  flush_interval = "10s"
+  flush_jitter = "5s"
+  ## Logging: written to a file, rotated by size only ("0d" disables time-based rotation)
+  debug = false
+  quiet = false
+  logfile = "/var/log/telegraf/telegraf.log"
+  logfile_rotation_interval = "0d"
+  logfile_rotation_max_size = "1MB"
+  logfile_rotation_max_archives = 5
+  hostname = ""  # empty: telegraf falls back to the OS hostname
+
+###############################################################################
+#                            OUTPUT PLUGINS                                   #
+###############################################################################
+
+# Configuration for sending metrics to InfluxDB
+[[outputs.influxdb]]
+  urls = ["http://influxdb:8086"] # required
+  database = "telegraf" # required
+  ## Credentials match INFLUXDB_USER / INFLUXDB_USER_PASSWORD in docker-compose.yaml
+  username = "telegraf"
+  password = "compose"
+  ## If true, the database tag will not be added to the metric.
+  exclude_database_tag = false
+  retention_policy = ""
+  write_consistency = "any"
+  timeout = "5s"
+  ## If true, no CREATE DATABASE queries will be sent. Set to true when the
+  ## telegraf user cannot create databases or the database already exists.
+  skip_database_creation = false
+
+###############################################################################
+#                            INPUT PLUGINS                                    #
+###############################################################################
+
+# Read metrics about cpu usage
+[[inputs.cpu]]
+  ## Whether to report per-cpu stats or not
+  percpu = true
+  ## Whether to report total system cpu stats or not
+  totalcpu = true
+  ## If true, collect raw CPU time metrics.
+  collect_cpu_time = false
+  report_active = false  # if true, also report the sum of all non-idle CPU states
+
+# Read metrics about disk usage by mount point
+[[inputs.disk]]
+  ## Ignore some mountpoints by filesystem type. For example (dev)tmpfs (usually
+  ## present on /run, /var/run, /dev/shm or /dev).
+  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
+
+# Read metrics about disk IO by device
+[[inputs.diskio]]
+  ## Setting devices will restrict the stats to the specified devices.
+  # devices = ["sda", "sdb"]
+
+# Get kernel statistics from /proc/stat
+[[inputs.kernel]]
+  # no configuration
+
+# Read metrics about memory usage
+[[inputs.mem]]
+  # no configuration
+
+# Get the number of processes and group them by status
+[[inputs.processes]]
+  # no configuration
+
+# Read metrics about swap memory usage
+[[inputs.swap]]
+  # no configuration
+
+# Read metrics about system load & uptime
+[[inputs.system]]
+  ## Uncomment to remove deprecated metrics.
+  # fielddrop = ["uptime_format"]
+
+[[inputs.internal]]  # Collect metrics about the telegraf agent itself
+  collect_memstats = true
+
+# Read metrics about network interface usage (only interfaces matching eth*)
+[[inputs.net]]
+  interfaces = ["eth*"]
+
+# Read metrics about docker containers via the socket mounted in docker-compose.yaml
+[[inputs.docker]]
+  endpoint = "unix:///var/run/docker.sock"
+  timeout = "5s"
+
+# StatsD server: accepts metrics pushed by applications
+# (port 8125 is not published in docker-compose.yaml, so only containers on
+# telegraf's networks can reach it)
+[[inputs.statsd]]
+  ## Address and port the UDP listener binds to
+  service_address = ":8125"
+
+  ## Reset gauges after every interval (default=false)
+  delete_gauges = true
+  ## Reset counters after every interval (default=false)
+  delete_counters = true
+
+  ## Percentiles to calculate for timing & histogram stats
+  percentiles = [50.0, 75.0, 99.0, 99.9]
+
+  ## Number of timing/histogram values tracked per measurement when
+  ## calculating percentiles. A higher limit gives more accurate
+  ## percentiles but costs more memory and CPU time.
+  percentile_limit = 1000
+
+  ## Separator placed between the elements of a statsd metric
+  metric_separator = "_"
+
+  ## When true, "." in measurement names becomes "_" and "-" becomes "__"
+  convert_names = false
+
+  ## Template used to parse StatsD names for InfluxDB, so that
+  ## counters/gauges are grouped by measurement
+  templates = ["* measurement.field"]
+
+  ## Parse the Datadog extensions to the statsd format
+  ## (currently metrics and Datadog tags).
+  ## http://docs.datadoghq.com/guides/dogstatsd/
+  datadog_extensions = true
+
+  ## Number of UDP messages allowed to queue up; once the queue is full
+  ## the statsd server starts dropping packets
+  allowed_pending_messages = 10000
+
+
+# [[inputs.docker_log]]  
+#   ##   To use TCP, set endpoint = "tcp://[ip]:[port]"
+#   ##   To use environment variables (ie, docker-machine), set endpoint = "ENV"
+#   endpoint = "unix:///var/run/docker.sock"
+
+#   ## When true, container logs are read from the beginning; otherwise
+#   ## reading begins at the end of the log.
+#   # from_beginning = false
+
+#   ## docker labels to include.  Globs accepted.
+#   ## Note that an empty array for both will include all labels as tags
+#   docker_label_include = ["logcapture"]
+
+#   ## Set the source tag for the metrics to the container ID hostname, eg first 12 chars
+#   source_tag = true