# 433 lines, 14 KiB, YAML
groups:
|
|
# system.* metrics
# Host uptime, reported as a gauge in seconds.
- id: metric.system.uptime
  type: metric
  metric_name: system.uptime
  stability: development
  brief: "The time the system has been running"
  # Literal block scalar: the line break between the two sentences is kept.
  note: |
    Instrumentations SHOULD use a gauge with type `double` and measure uptime in seconds as a floating point number with the highest precision available.
    The actual accuracy would depend on the instrumentation and operating system.
  instrument: gauge
  unit: "s"
  entity_associations:
  - host

# system.cpu.* metrics
# Physical core count for the host; declares no attributes.
- id: metric.system.cpu.physical.count
  type: metric
  metric_name: system.cpu.physical.count
  stability: development
  # Folded scalars (`>-`) keep each value a single line with no trailing newline.
  brief: >-
    Reports the number of actual physical processor cores on the hardware
  note: >-
    Calculated by multiplying the number of sockets by the number of cores
    per socket
  instrument: updowncounter
  unit: "{cpu}"
  attributes: []
  entity_associations:
  - host

# Logical (virtual) core count for the host; declares no attributes.
- id: metric.system.cpu.logical.count
  type: metric
  metric_name: system.cpu.logical.count
  stability: development
  # Folded scalars (`>-`) keep each value a single line with no trailing newline.
  brief: >-
    Reports the number of logical (virtual) processor cores created by the
    operating system to manage multitasking
  note: >-
    Calculated by multiplying the number of sockets by the number of cores
    per socket, and then by the number of threads per core
  instrument: updowncounter
  unit: "{cpu}"
  attributes: []
  entity_associations:
  - host

# system.memory.* metrics
# Memory in use, in bytes, split by the `system.memory.state` attribute.
- id: metric.system.memory.usage
  type: metric
  metric_name: system.memory.usage
  stability: development
  brief: "Reports memory in use by state."
  # Ties this metric to `system.memory.limit`.
  note: |
    The sum over all `system.memory.state` values SHOULD equal the total memory
    available on the system, that is `system.memory.limit`.
  instrument: updowncounter
  unit: "By"
  attributes:
  - ref: system.memory.state
  entity_associations:
  - host

# Total memory of the host in bytes; declares no attributes.
- id: metric.system.memory.limit
  type: metric
  metric_name: system.memory.limit
  stability: development
  brief: "Total memory available in the system."
  # `system.memory.state` is an attribute, not a metric, so the quantity being
  # summed is `system.memory.usage` across every state value.
  note: |
    Its value SHOULD equal the sum of `system.memory.usage` over all `system.memory.state` values.
  instrument: updowncounter
  unit: "By"
  attributes: []
  entity_associations:
  - host

# Shared memory in bytes; this entry declares no `attributes` key.
- id: metric.system.memory.shared
  type: metric
  metric_name: system.memory.shared
  stability: development
  brief: "Shared memory used (mostly by tmpfs)."
  # Inside a literal block scalar a quote character is plain text, so the
  # stray closing `"` that used to end this note has been removed.
  note: |
    Equivalent of `shared` from [`free` command](https://man7.org/linux/man-pages/man1/free.1.html) or
    `Shmem` from [`/proc/meminfo`](https://man7.org/linux/man-pages/man5/proc.5.html)
  instrument: updowncounter
  unit: "By"
  entity_associations:
  - host

# Dimensionless gauge (unit "1") split by the `system.memory.state` attribute.
- id: metric.system.memory.utilization
  type: metric
  metric_name: system.memory.utilization
  stability: development
  # NOTE(review): brief is empty; add a one-line description.
  brief: ""
  instrument: gauge
  # Quoted so the unit stays a string rather than the integer 1.
  unit: "1"
  attributes:
  - ref: system.memory.state
  entity_associations:
  - host

# system.paging.* metrics
# Paging space usage in bytes, split by paging state and device.
- id: metric.system.paging.usage
  type: metric
  metric_name: system.paging.usage
  stability: development
  # "Windows" is capitalised to match its spelling in the rest of the file.
  brief: "Unix swap or Windows pagefile usage"
  instrument: updowncounter
  unit: "By"
  attributes:
  - ref: system.paging.state
  # Overrides the brief and examples of `system.device` for paging devices.
  - ref: system.device
    brief: Unique identifier for the device responsible for managing paging operations.
    examples: ["/dev/dm-0"]
  entity_associations:
  - host

# Dimensionless gauge (unit "1") split by paging state and device.
- id: metric.system.paging.utilization
  type: metric
  metric_name: system.paging.utilization
  stability: development
  # NOTE(review): brief is empty; add a one-line description.
  brief: ""
  instrument: gauge
  # Quoted so the unit stays a string rather than the integer 1.
  unit: "1"
  attributes:
  - ref: system.paging.state
  # Overrides the brief and examples of `system.device` for paging devices.
  - ref: system.device
    brief: Unique identifier for the device responsible for managing paging operations.
    examples: ["/dev/dm-0"]
  entity_associations:
  - host

# Counter of `{fault}` events split by the `system.paging.type` attribute.
- id: metric.system.paging.faults
  type: metric
  metric_name: system.paging.faults
  stability: development
  # NOTE(review): brief is empty; add a one-line description.
  brief: ""
  instrument: counter
  unit: "{fault}"
  attributes:
  - ref: system.paging.type
  entity_associations:
  - host

# Counter of `{operation}` events split by paging type and direction.
- id: metric.system.paging.operations
  type: metric
  metric_name: system.paging.operations
  stability: development
  # NOTE(review): brief is empty; add a one-line description.
  brief: ""
  instrument: counter
  unit: "{operation}"
  attributes:
  - ref: system.paging.type
  - ref: system.paging.direction
  entity_associations:
  - host

# system.disk.* metrics
# Counter in bytes (`By`) split by device and disk I/O direction.
- id: metric.system.disk.io
  type: metric
  metric_name: system.disk.io
  stability: development
  # NOTE(review): brief is empty; add a one-line description.
  brief: ""
  instrument: counter
  unit: "By"
  attributes:
  - ref: system.device
  - ref: disk.io.direction
  entity_associations:
  - host

# Counter of `{operation}` events split by device and disk I/O direction.
- id: metric.system.disk.operations
  type: metric
  metric_name: system.disk.operations
  stability: development
  # NOTE(review): brief is empty; add a one-line description.
  brief: ""
  instrument: counter
  unit: "{operation}"
  attributes:
  - ref: system.device
  - ref: disk.io.direction
  entity_associations:
  - host

# Counter in seconds, per device, of wall-clock time spent in the I/O path.
- id: metric.system.disk.io_time
  type: metric
  metric_name: system.disk.io_time
  stability: development
  brief: "Time disk spent activated"
  # `note` sits before `instrument`/`unit` to match the key order of the other
  # entries in this file. The verb is singular to agree with "time".
  note: |
    The real elapsed time ("wall clock") used in the I/O path (time from operations running in parallel is not counted). Measured as:

    - Linux: Field 13 from [procfs-diskstats](https://www.kernel.org/doc/Documentation/ABI/testing/procfs-diskstats)
    - Windows: The complement of
      ["Disk\% Idle Time"](https://learn.microsoft.com/archive/blogs/askcore/windows-performance-monitor-disk-counters-explained#windows-performance-monitor-disk-counters-explained)
      performance counter: `uptime * (100 - "Disk\% Idle Time") / 100`
  instrument: counter
  unit: "s"
  attributes:
  - ref: system.device
  entity_associations:
  - host

# Counter in seconds split by device and disk I/O direction.
- id: metric.system.disk.operation_time
  type: metric
  metric_name: system.disk.operation_time
  stability: development
  brief: "Sum of the time each operation took to complete"
  # `note` sits before `instrument`/`unit` to match the key order of the other
  # entries in this file; mapping key order carries no meaning.
  note: |
    Because it is the sum of time each request took, parallel-issued requests each contribute to make the count grow. Measured as:

    - Linux: Fields 7 & 11 from [procfs-diskstats](https://www.kernel.org/doc/Documentation/ABI/testing/procfs-diskstats)
    - Windows: "Avg. Disk sec/Read" perf counter multiplied by "Disk Reads/sec" perf counter (similar for Writes)
  instrument: counter
  unit: "s"
  attributes:
  - ref: system.device
  - ref: disk.io.direction
  entity_associations:
  - host

# Counter of `{operation}` events split by device and disk I/O direction.
- id: metric.system.disk.merged
  type: metric
  metric_name: system.disk.merged
  stability: development
  # NOTE(review): brief is empty; add a one-line description.
  brief: ""
  instrument: counter
  unit: "{operation}"
  attributes:
  - ref: system.device
  - ref: disk.io.direction
  entity_associations:
  - host

# Disk capacity in bytes (`By`), reported per device.
- id: metric.system.disk.limit
  type: metric
  metric_name: system.disk.limit
  stability: development
  brief: "The total storage capacity of the disk"
  instrument: updowncounter
  unit: "By"
  attributes:
  - ref: system.device
  entity_associations:
  - host

# system.filesystem.* metrics
# Filesystem space in bytes, split by device, state, type, mode and mountpoint.
- id: metric.system.filesystem.usage
  type: metric
  metric_name: system.filesystem.usage
  stability: development
  brief: "Reports a filesystem's space usage across different states."
  # Ties this metric to `system.filesystem.limit`.
  note: |
    The sum of all `system.filesystem.usage` values over the different `system.filesystem.state` attributes
    SHOULD equal the total storage capacity of the filesystem, that is `system.filesystem.limit`.
  instrument: updowncounter
  unit: "By"
  attributes:
  # Overrides the brief and examples of `system.device` for filesystems.
  - ref: system.device
    brief: Identifier for the device where the filesystem resides.
    # In a double-quoted scalar `\\` decodes to ONE backslash, so four are
    # needed for the two leading backslashes of a UNC path.
    # NOTE(review): assumes a UNC-style `\\network-drive` was intended.
    examples: ["/dev/sda", "\\\\network-drive"]
  - ref: system.filesystem.state
  - ref: system.filesystem.type
  - ref: system.filesystem.mode
  - ref: system.filesystem.mountpoint
  entity_associations:
  - host

# Dimensionless gauge (unit "1") split by device, state, type, mode and
# mountpoint.
- id: metric.system.filesystem.utilization
  type: metric
  metric_name: system.filesystem.utilization
  stability: development
  # NOTE(review): brief is empty; add a one-line description.
  brief: ""
  instrument: gauge
  # Quoted so the unit stays a string rather than the integer 1.
  unit: "1"
  attributes:
  # Overrides the brief and examples of `system.device` for filesystems.
  - ref: system.device
    brief: Identifier for the device where the filesystem resides.
    # In a double-quoted scalar `\\` decodes to ONE backslash, so four are
    # needed for the two leading backslashes of a UNC path.
    # NOTE(review): assumes a UNC-style `\\network-drive` was intended.
    examples: ["/dev/sda", "\\\\network-drive"]
  - ref: system.filesystem.state
  - ref: system.filesystem.type
  - ref: system.filesystem.mode
  - ref: system.filesystem.mountpoint
  entity_associations:
  - host

# Filesystem capacity in bytes, split by device, type, mode and mountpoint.
# Unlike the usage and utilization entries it lists no
# `system.filesystem.state` attribute.
- id: metric.system.filesystem.limit
  type: metric
  metric_name: system.filesystem.limit
  stability: development
  brief: "The total storage capacity of the filesystem"
  instrument: updowncounter
  unit: "By"
  attributes:
  # Overrides the brief and examples of `system.device` for filesystems.
  - ref: system.device
    brief: Identifier for the device where the filesystem resides.
    # In a double-quoted scalar `\\` decodes to ONE backslash, so four are
    # needed for the two leading backslashes of a UNC path.
    # NOTE(review): assumes a UNC-style `\\network-drive` was intended.
    examples: ["/dev/sda", "\\\\network-drive"]
  - ref: system.filesystem.type
  - ref: system.filesystem.mode
  - ref: system.filesystem.mountpoint
  entity_associations:
  - host

# system.network.* metrics
# Counter of `{packet}` events split by interface name and I/O direction.
- id: metric.system.network.dropped
  type: metric
  metric_name: system.network.dropped
  stability: development
  brief: "Count of packets that are dropped or discarded even though there was no error"
  # `note` sits before `instrument`/`unit` to match the key order of the other
  # entries in this file. The Linux per-interface statistics file is
  # `/proc/net/dev`; the path components were transposed before.
  note: |
    Measured as:

    - Linux: the `drop` column in `/proc/net/dev` ([source](https://web.archive.org/web/20180321091318/http://www.onlamp.com/pub/a/linux/2000/11/16/LinuxAdmin.html))
    - Windows: [`InDiscards`/`OutDiscards`](https://docs.microsoft.com/windows/win32/api/netioapi/ns-netioapi-mib_if_row2)
      from [`GetIfEntry2`](https://docs.microsoft.com/windows/win32/api/netioapi/nf-netioapi-getifentry2)
  instrument: counter
  unit: "{packet}"
  attributes:
  - ref: network.interface.name
  - ref: network.io.direction
  entity_associations:
  - host

# Counter of `{packet}` events split by device and network I/O direction.
- id: metric.system.network.packets
  type: metric
  metric_name: system.network.packets
  stability: development
  # NOTE(review): brief is empty; add a one-line description.
  brief: ""
  instrument: counter
  unit: "{packet}"
  attributes:
  # NOTE(review): the other system.network.* entries in this file reference
  # `network.interface.name` here; confirm whether `system.device` is
  # intentional before changing it.
  - ref: system.device
  - ref: network.io.direction
  entity_associations:
  - host

# Counter of `{error}` events split by interface name and I/O direction.
- id: metric.system.network.errors
  type: metric
  metric_name: system.network.errors
  stability: development
  brief: "Count of network errors detected"
  # `note` sits before `instrument`/`unit` to match the key order of the other
  # entries in this file. The Linux per-interface statistics file is
  # `/proc/net/dev`; the path components were transposed before.
  note: |
    Measured as:

    - Linux: the `errs` column in `/proc/net/dev` ([source](https://web.archive.org/web/20180321091318/http://www.onlamp.com/pub/a/linux/2000/11/16/LinuxAdmin.html)).
    - Windows: [`InErrors`/`OutErrors`](https://docs.microsoft.com/windows/win32/api/netioapi/ns-netioapi-mib_if_row2)
      from [`GetIfEntry2`](https://docs.microsoft.com/windows/win32/api/netioapi/nf-netioapi-getifentry2).
  instrument: counter
  unit: "{error}"
  attributes:
  - ref: network.interface.name
  - ref: network.io.direction
  entity_associations:
  - host

# Counter in bytes (`By`) split by interface name and network I/O direction.
- id: metric.system.network.io
  type: metric
  metric_name: system.network.io
  stability: development
  # NOTE(review): brief is empty; add a one-line description.
  brief: ""
  instrument: counter
  unit: "By"
  attributes:
  - ref: network.interface.name
  - ref: network.io.direction
  entity_associations:
  - host

# Up-down counter of `{connection}` split by interface name, connection state
# and transport.
- id: metric.system.network.connections
  type: metric
  metric_name: system.network.connections
  stability: development
  # NOTE(review): brief is empty; add a one-line description.
  brief: ""
  instrument: updowncounter
  unit: "{connection}"
  attributes:
  - ref: network.interface.name
  - ref: network.connection.state
  - ref: network.transport
  entity_associations:
  - host

# system.process.* metrics
# Up-down counter of `{process}` split by the `system.process.status` attribute.
- id: metric.system.process.count
  type: metric
  metric_name: system.process.count
  stability: development
  brief: "Total number of processes in each state"
  instrument: updowncounter
  unit: "{process}"
  attributes:
  - ref: system.process.status
  entity_associations:
  - host

# Counter of `{process}`; declares no attributes.
- id: metric.system.process.created
  type: metric
  metric_name: system.process.created
  stability: development
  brief: "Total number of processes created over uptime of the host"
  instrument: counter
  unit: "{process}"
  attributes: []
  entity_associations:
  - host

# system.linux.* metrics
# Available-memory estimate in bytes; declares no attributes.
- id: metric.system.linux.memory.available
  type: metric
  metric_name: system.linux.memory.available
  stability: development
  # Folded scalar (`>-`) keeps the value a single line with no trailing newline.
  brief: >-
    An estimate of how much memory is available for starting new
    applications, without causing swapping
  # Literal block scalar: each sentence stays on its own line.
  note: |
    This is an alternative to `system.memory.usage` metric with `state=free`.
    Linux starting from 3.14 exports "available" memory. It takes "free" memory as a baseline, and then factors in kernel-specific values.
    This is supposed to be more accurate than just "free" memory.
    For reference, see the calculations [here](https://superuser.com/a/980821).
    See also `MemAvailable` in [/proc/meminfo](https://man7.org/linux/man-pages/man5/proc.5.html).
  instrument: updowncounter
  unit: "By"
  attributes: []
  entity_associations:
  - host

# Slab memory in bytes, split by the `linux.memory.slab.state` attribute.
- id: metric.system.linux.memory.slab.usage
  type: metric
  metric_name: system.linux.memory.slab.usage
  stability: development
  brief: "Reports the memory used by the Linux kernel for managing caches of frequently used objects."
  # The note refers to this metric by its full `metric_name`; it previously
  # omitted the `system.` prefix.
  note: |
    The sum over the `reclaimable` and `unreclaimable` state values in `system.linux.memory.slab.usage` SHOULD be equal to the total slab memory available on the system.
    Note that the total slab memory is not constant and may vary over time.
    See also the [Slab allocator](https://blogs.oracle.com/linux/post/understanding-linux-kernel-memory-statistics) and `Slab` in [/proc/meminfo](https://man7.org/linux/man-pages/man5/proc.5.html).
  instrument: updowncounter
  unit: "By"
  attributes:
  - ref: linux.memory.slab.state
  entity_associations:
  - host