initial commit

Signed-off-by: Jordan Moore <crikket.007@gmail.com>
2020-04-22 13:24:43 -05:00 · 2020-04-22 13:24:43 -05:00 · 10b3bc378f
commit 10b3bc378f
parent d3c5f28243
3 changed files with 293 additions and 0 deletions
--- a/ultimate-instrumentation/README.md
+++ b/ultimate-instrumentation/README.md
@ -0,0 +1,6 @@
 ultimate-instrumentation
 ===
 TODO
 Refer to the comments in [`docker-compose.yaml`](docker-compose.yaml).
--- a/ultimate-instrumentation/docker-compose.yaml
+++ b/ultimate-instrumentation/docker-compose.yaml
@ -0,0 +1,134 @@
 ## Ultimate Instrumentation Stack
 ## Provides
 ##   - log collection and forwarding (logspout) (localhost:8000/logs)
 ##   - http/tcp/udp proxying (traefik) (localhost:8080/dashboard/)
 ##   - service discovery + dns + health-checks (consul) (localhost:8500/ui) (depends on registrator)
 ##   - time-series DB storage (influxdb) (localhost:8086)
 ##   - metric & event visualization (grafana) (localhost:3000) 
 ## http routing provided by traefik on `localhost/<route>` (see SERVICE_TAGS vars for routes)
 version: '3'
 services:
  # logging - Logspout is meant for log collection, not storage.
  # Could pair this with syslog/GELF into FluentD/Logstash/Kafka with indexing by Graylog/Elasticsearch/Solr/Splunk
  logspout:
    image: gliderlabs/logspout:master
    # Read-only Docker socket so logspout can discover containers and read their logs.
    volumes: ['/var/run/docker.sock:/tmp/docker.sock:ro']
    networks: ['backend','frontend']
    ports: ['8000:80']  # logs avail at http://localhost:8000/logs
    environment:
      # Containers labelled `logspout.exclude=true` are skipped by logspout.
      EXCLUDE_LABEL: logspout.exclude
      # registrator copies these tags onto the Consul service, where traefik reads them.
      # The router name must be unique per service: traefik discards a router that is
      # defined more than once with different rules, so a shared `router0` breaks every
      # service that uses it.
      # NOTE(review): `traefik.port` is traefik v1 tag syntax - confirm it is still
      # needed with traefik 2.2.
      SERVICE_TAGS: 'traefik.enable=true,traefik.port=8000,traefik.docker.network=frontend,traefik.http.routers.logspout.rule=PathPrefix(`/logs`)'
  # service discovery (Consul + registrator)
  registrator:
    image: 'gliderlabs/registrator:master'
    # Started after consul, which is the registry backend named in `command`.
    depends_on:
      - 'consul'
    networks:
      - 'backend'
    # Read-only Docker socket: registrator watches container start/stop events on it.
    volumes:
      - '/var/run/docker.sock:/tmp/docker.sock:ro'
    # First argument is a registrator flag, second is the registry URI.
    # NOTE(review): `-internal=true` presumably registers container-internal ports
    # instead of host-published ones - confirm against the registrator run reference.
    command:
      - '-internal=true'
      - 'consul://consul:8500'
  consul:
    image: 'consul:1.6.4'
    restart: always
    networks: ['backend','frontend']
    # Only the web UI / HTTP API port is published; the remaining Consul ports are
    # kept here, commented out, for reference.
    ports:
      - '8500:8500' # web ui
      # - '8300:8300' # server rpc
      # - '8301:8301' # lan serf tcp
      # - '8301:8301/udp' # lan serf udp
      # - '8600:8600' # dns tcp
      # - '8600:8600/udp' # dns udp
    environment:
      # registrator
      SERVICE_TAGS: "traefik.enable=true,traefik.docker.network=frontend"
    labels:
      # Must be spelled exactly like logspout's EXCLUDE_LABEL (`logspout.exclude`)
      # and carry the value `true`, otherwise logspout still collects these logs.
      - logspout.exclude=true
  # load balancing + routing
  traefik:
    image: traefik:2.2
    depends_on: ['registrator', 'consul']
    networks: ['backend','frontend']
    ports:
      - '8080:8080' # web ui
      - '80:80'
    # Static configuration: routes are discovered from the Consul catalog, and only
    # services tagged `traefik.enable=true` are exposed (exposedByDefault=false).
    # NOTE(review): `--api.insecure=true` serves the API/dashboard without
    # authentication on the published port 8080 - keep this to local development.
    command:
      # - "--log.level=DEBUG"
      - "--api.insecure=true"
      - "--api.dashboard=true"
      - "--providers.consulcatalog=true"
      - "--providers.consulcatalog.endpoint.address=http://consul:8500"
      - "--providers.consulcatalog.endpoint.datacenter=dc1"
      - "--providers.consulcatalog.cache=true"
      - "--providers.consulcatalog.exposedByDefault=false"
    labels:
      # Must be spelled exactly like logspout's EXCLUDE_LABEL (`logspout.exclude`)
      # and carry the value `true`, otherwise logspout still collects these logs.
      - logspout.exclude=true
  #monitoring
  # TODO: Tracing - Zipkin/Jaeger
  ## TIG Stack - Telegraf + Influx + Grafana
  ## This is functionally equivalent to just Prometheus + Grafana, but allows metric push & pull
  telegraf:
    image: telegraf:1.14-alpine
    restart: unless-stopped
    # `backend` to reach the instrumented services, `monitor` to reach influxdb.
    networks: ['backend', 'monitor']
    depends_on: ['influxdb']
    labels:
      # Must be spelled exactly like logspout's EXCLUDE_LABEL (`logspout.exclude`)
      # and carry the value `true`, otherwise logspout still collects these logs.
      - logspout.exclude=true
    volumes:
      # Agent configuration, mounted read-only from the file next to this compose file.
      - ./telegraf.conf:/etc/telegraf/telegraf.conf:ro
      # For docker stats
      - /var/run/docker.sock:/var/run/docker.sock:ro
  influxdb:
    image: influxdb:1.7-alpine
    restart: always
    ports: ['8086:8086']
    networks: ['monitor']
    labels:
      # Must be spelled exactly like logspout's EXCLUDE_LABEL (`logspout.exclude`)
      # and carry the value `true`, otherwise logspout still collects these logs.
      - logspout.exclude=true
    environment:
      # Database and user for the telegraf agent; telegraf.conf's
      # [[outputs.influxdb]] section must use the same database name and credentials.
      INFLUXDB_DB: telegraf
      INFLUXDB_USER: telegraf
      INFLUXDB_USER_PASSWORD: 'compose'
      INFLUXDB_REPORTING_DISABLED: 'true'
    # Persistence is disabled: with the volume commented out, stored metrics do not
    # survive removal of the container.
    # volumes:
    #   - influxdb-volume:/var/lib/influxdb
  grafana:
    image: grafana/grafana:master
    depends_on: ['influxdb']
    ports: ["3000:3000"]
    # `monitor` to query influxdb, `frontend` to be routed by traefik.
    networks: ['monitor', 'frontend']
    # NOTE(review): runs the container as uid 0 (root) - confirm this is still required.
    user: "0"
    labels:
      # Must be spelled exactly like logspout's EXCLUDE_LABEL (`logspout.exclude`)
      # and carry the value `true`, otherwise logspout still collects these logs.
      - logspout.exclude=true
    environment:
      GF_SECURITY_ADMIN_PASSWORD: 'compose'
      GF_USERS_ALLOW_SIGN_UP: 'false'
      # registrator copies these tags onto the Consul service, where traefik reads them.
      # The router name must be unique per service: traefik discards a router that is
      # defined more than once with different rules, so a shared `router0` breaks every
      # service that uses it.
      # NOTE(review): serving Grafana under the `/grafana` prefix usually also needs
      # Grafana's root URL / sub-path settings - confirm the route renders correctly.
      SERVICE_TAGS: 'traefik.enable=true,traefik.docker.network=frontend,traefik.http.routers.grafana.rule=PathPrefix(`/grafana`)'
  ## TODO - get loki working (in grafana)
  # loki:
  #   image: grafana/loki:latest
  #   depends_on: ['registrator', 'consul']
  #   ports: ["3100:3100"]
  #   command: ["-config.file=/etc/loki/local-config.yaml"]
  #   networks: ['backend']
  #   environment:
  #     # LOGSPOUT: ignore
  #     SERVICE_TAGS: "traefik.enable=true,traefik.http.routers.router0.rule=PathPrefix(`/loki`)"
 # Create networks for infrastructure components
 networks:
  # Each network uses the default settings; `{}` spells out the empty definition.
  backend: {}
  monitor: {}
  frontend: {}
 # Create local persistent volumes
 volumes:
  # Declared with default settings. No service in this file mounts either volume
  # yet (the influxdb mount is commented out and grafana defines none).
  grafana-volume: {}
  influxdb-volume: {}
--- a/ultimate-instrumentation/telegraf.conf
+++ b/ultimate-instrumentation/telegraf.conf
@ -0,0 +1,153 @@
 # Configuration for telegraf agent
 [agent]
  ## --- Collection ---------------------------------------------------------
  ## Gather from all inputs every `interval`, spread by up to `collection_jitter`.
  interval = "10s"
  round_interval = true
  collection_jitter = "3s"
  ## --- Buffering and flushing ---------------------------------------------
  ## Write to outputs every `flush_interval`, spread by up to `flush_jitter`.
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  flush_interval = "10s"
  flush_jitter = "5s"
  ## --- Agent logging ------------------------------------------------------
  debug = false
  quiet = false
  ## Log to a file inside the container, rotated by size with 5 archives kept.
  logfile = "/var/log/telegraf/telegraf.log"
  logfile_rotation_interval = "0d"
  logfile_rotation_max_size = "1MB"
  logfile_rotation_max_archives = 5
  ## --- Identity -----------------------------------------------------------
  ## No hostname override is set.
  hostname = ""
 ###############################################################################
 #                            OUTPUT PLUGINS                                   #
 ###############################################################################
 # Configuration for sending metrics to InfluxDB
 [[outputs.influxdb]]
  ## `influxdb` is the compose service name, resolved on the shared `monitor` network.
  urls = ["http://influxdb:8086"] # required
  database = "telegraf" # required
  ## Credentials must match INFLUXDB_USER / INFLUXDB_USER_PASSWORD set on the
  ## influxdb service in docker-compose.yaml.
  username = "telegraf"
  password = "compose"
  ## If true, the database tag will not be added to the metric.
  exclude_database_tag = false
  retention_policy = ""
  write_consistency = "any"
  timeout = "5s"
  ## If true, no CREATE DATABASE queries will be sent.  Set to true when using
  ## Telegraf with a user without permissions to create databases or when the
  ## database already exists.
  skip_database_creation = false
 ###############################################################################
 #                            INPUT PLUGINS                                    #
 ###############################################################################
 # Read metrics about cpu usage
 [[inputs.cpu]]
  ## Report the aggregate across all CPUs ...
  totalcpu = true
  ## ... as well as one series per individual CPU.
  percpu = true
  ## Raw CPU time metrics are not collected.
  collect_cpu_time = false
  ## The `report_active` option is switched off.
  report_active = false
 # Read metrics about disk usage by mount point
 [[inputs.disk]]
  ## Mount points with these filesystem types are ignored, e.g. the (dev)tmpfs
  ## mounts usually found on /run, /var/run, /dev/shm or /dev.
  ignore_fs = [
    "tmpfs",
    "devtmpfs",
    "devfs",
    "iso9660",
    "overlay",
    "aufs",
    "squashfs",
  ]
 # Read metrics about disk IO by device
 [[inputs.diskio]]
  ## Setting devices will restrict the stats to the specified devices.
  ## Left commented out here, so no device filter is applied.
  # devices = ["sda", "sdb"]
 # Get kernel statistics from /proc/stat
 [[inputs.kernel]]
  # no configuration
 # Read metrics about memory usage
 [[inputs.mem]]
  # no configuration
 # Get the number of processes and group them by status
 [[inputs.processes]]
  # no configuration
 # Read metrics about swap memory usage
 [[inputs.swap]]
  # no configuration
 # Read metrics about system load & uptime
 [[inputs.system]]
  ## Uncomment the next line to remove deprecated metrics.
  # fielddrop = ["uptime_format"]
 [[inputs.internal]]  
  ## Memory statistics are included in this input's metrics.
  collect_memstats = true
 # Read metrics about network interface usage
 [[inputs.net]]
  ## Only interfaces whose name matches this pattern are reported.
  interfaces = ["eth*"]
 # Read metrics about docker containers
 [[inputs.docker]]
  ## Uses the Docker socket that docker-compose.yaml mounts read-only into the
  ## telegraf container at this path.
  endpoint = "unix:///var/run/docker.sock"
  timeout = "5s"
 # Statsd Server
 [[inputs.statsd]]
  ## UDP listener address and port.
  service_address = ":8125"
  ## Queue depth for pending UDP messages; once it is full the statsd server
  ## starts dropping packets.
  allowed_pending_messages = 10000
  ## Percentiles calculated for timing & histogram stats.
  percentiles = [50.0, 75.0, 99.0, 99.9]
  ## Number of timing/histogram values tracked per measurement when computing
  ## percentiles. A higher limit is more accurate but costs memory and cpu time.
  percentile_limit = 1000
  ## Gauges and counters are both deleted every interval (default=false for each).
  delete_gauges = true
  delete_counters = true
  ## Separator placed between the elements of a statsd metric name.
  metric_separator = "_"
  ## When enabled, measurement names are converted: "." to "_" and "-" to "__".
  convert_names = false
  ## Template used to parse StatsD metric names for InfluxDB, so that
  ## counters/gauges are grouped by measurement.
  templates = ["* measurement.field"]
  ## Parse the datadog statsd extensions (currently metrics and datadog tags).
  ## http://docs.datadoghq.com/guides/dogstatsd/
  datadog_extensions = true
 # [[inputs.docker_log]]  
 #   ##   To use TCP, set endpoint = "tcp://[ip]:[port]"
 #   ##   To use environment variables (ie, docker-machine), set endpoint = "ENV"
 #   endpoint = "unix:///var/run/docker.sock"
 #   ## When true, container logs are read from the beginning; otherwise
 #   ## reading begins at the end of the log.
 #   # from_beginning = false
 #   ## docker labels to include.  Globs accepted.
 #   ## Note that an empty array for both will include all labels as tags
 #   docker_label_include = ["logcapture"]
 #   ## Set the source tag for the metrics to the container ID hostname, eg first 12 chars
 #   source_tag = true