Feat: add log processor docs (#6)

2021-12-16 16:07:09 +08:00 · 2021-12-16 16:07:09 +08:00 · f33f40ab7b
parent 59b4f3993f
commit f33f40ab7b
7 changed files with 541 additions and 22 deletions
--- a/docs/reference/pipelines/interceptor/jsonDecode.md
+++ b/docs/reference/pipelines/interceptor/jsonDecode.md
@ -1 +0,0 @@
-# jsonDecode
--- a/docs/reference/pipelines/interceptor/normalize.md
+++ b/docs/reference/pipelines/interceptor/normalize.md
@ -0,0 +1,229 @@
+# normalize
+
+## 功能
+用于日志切分处理。  
+属于source interceptor。可指定只被某些source使用。  
+
+## 具体参数
+
+### processors
+
+|  `类型`  |  `是否必填`  |  `默认值`  |  `含义`  |
+| ------- | ----------- | --------- | ------- |
+| map数组  |    必填    |  无    | 所有的处理processor数组 |
+
+配置的processor将按照顺序依次执行。包括以下：
+#### regex
+将指定字段进行正则提取。  
+
+|    `字段`   |    `类型`    |  `是否必填`  |  `默认值`  |  `含义`  |
+| ---------- | ----------- | ----------- | --------- | -------- |
+| regex.pattern | string  |    必填    |  无    | 正则解析规则 |
+| regex.target | string  |    非必填    |  body    | 正则解析的目标字段 |
+
+!!! example
+    ```yaml
+    interceptors:
+    - type: normalize
+      processors:
+      - regex:
+          pattern: '(?<ip>\S+) (?<id>\S+) (?<u>\S+) (?<time>\[.*?\]) (?<url>\".*?\") (?<status>\S+) (?<size>\S+)'
+    ```
+
+    使用以上的正则表达式，可以将以下示例的日志：
+    ```
+    10.244.0.1 - - [13/Dec/2021:12:40:48 +0000] "GET / HTTP/1.1" 404 683
+    ```
+    转换成：
+    ```
+    "ip": "10.244.0.1",
+    "id": "-",
+    "u": "-",
+    "time": "[13/Dec/2021:12:40:48 +0000]",
+    "url": "\"GET / HTTP/1.1\"",
+    "status": "404",
+    "size": "683"
+    ```
+
+具体配置的时候，建议先使用一些正则调试工具 (https://regex101.com/) 验证是否可以匹配。  
+
+#### jsonDecode
+将指定字段json解析提取。
+
+|    `字段`   |    `类型`    |  `是否必填`  |  `默认值`  |  `含义`  |
+| ---------- | ----------- | ----------- | --------- | -------- |
+| jsonDecode.target | string  |    非必填    |  body    | json decode的目标字段 |
+
+!!! example
+    ```yaml
+    interceptors:
+    - type: normalize
+      processors:
+      - jsonDecode: ~
+    ```
+
+
+#### split
+将指定字段通过分隔符进行提取。
+
+|    `字段`   |    `类型`    |  `是否必填`  |  `默认值`  |  `含义`  |
+| ---------- | ----------- | ----------- | --------- | -------- |
+| split.target | string  |    非必填    |  body    | split的目标字段 |
+| split.separator | string  |    必填    |  无    | 分隔符 |
+| split.max | int  |    非必填    |  -1    | 通过分隔符分割后最多得到的字段数 |
+| split.keys | string数组  |    必填    |  无   | 分割后字段对应的key |
+
+!!! example
+    === "base"
+        ```yaml
+        interceptors:
+        - type: normalize
+          processors:
+          - split:
+              separator: '|'
+              keys: ["time", "order", "service", "price"]
+        ```
+        使用以上split配置可以将日志：
+        ```
+        2021-08-08|U12345|storeCenter|13.14
+        ```
+        转换成：
+        ```
+        "time": "2021-08-08"
+        "order": "U12345"
+        "service": "storeCenter"
+        "price": "13.14"
+        ```
+
+    
+    === "max"
+        ```yaml
+        interceptors:
+        - type: normalize
+          processors:
+          - split:
+              separator: ' '
+              max: 2
+              keys: ["time", "content"]
+        ```
+        通过增加`max`参数，可以控制最多分割出的字段数。  
+        比如以下日志:
+        ```
+        2021-08-08 U12345 storeCenter 13.14
+        ```
+        可以通过以上配置提取为:
+        ```
+        "time": "2021-08-08"
+        "content": "U12345 storeCenter 13.14"
+        ```
+
+
+
+#### drop
+丢弃指定字段。
+
+|    `字段`   |    `类型`    |  `是否必填`  |  `默认值`  |  `含义`  |
+| ---------- | ----------- | ----------- | --------- | -------- |
+| drop.target | string数组  |    必填    |  无    | drop的字段 |
+
+!!! example
+    ```yaml
+    interceptors:
+    - type: normalize
+      processors:
+      - drop:
+          target: ["id", "body"]
+    ```
+
+
+#### rename
+重命名指定字段。
+
+|    `字段`   |    `类型`    |  `是否必填`  |  `默认值`  |  `含义`  |
+| ---------- | ----------- | ----------- | --------- | -------- |
+| rename.target | 数组  |    必填    |  无    |  |
+| rename.target[n].from | string  |    必填    |  无    | rename的目标 |
+| rename.target[n].to | string  |    必填    |  无    | rename后的名称 |
+
+!!! example
+    ```yaml
+    interceptors:
+    - type: normalize
+      processors:
+      - rename:
+          target:
+          - from: "hello"
+            to: "world"
+    ```
+
+#### add
+新增字段。  
+
+|    `字段`   |    `类型`    |  `是否必填`  |  `默认值`  |  `含义`  |
+| ---------- | ----------- | ----------- | --------- | -------- |
+| add.target | map  |    必填    |  无    | 新增的key:value值 |
+
+!!! example
+    ```yaml
+    interceptors:
+    - type: normalize
+      processors:
+      - add:
+          target:
+            hello: world
+    ```
+
+#### timestamp
+转换时间格式。
+
+|    `字段`   |    `类型`    |  `是否必填`  |  `默认值`  |  `含义`  |
+| ---------- | ----------- | ----------- | --------- | -------- |
+| timestamp.target | 数组  |    必填    |  无    |  |
+| timestamp.target[n].from | string  |    必填    |  无    | 指定转换时间格式的字段 |
+| timestamp.target[n].fromLayout | string  |    必填    |  无    | 指定字段的时间格式(golang形式) |
+| timestamp.target[n].toLayout | string  |    必填    |  无    | 转换后的时间格式(golang形式)，另外可为`unix`和`unix_ms` |
+| timestamp.target[n].toType | string  |    非必填    |  无    | 转换后的时间字段类型 |
+
+!!! example
+    ```yaml
+    interceptors:
+    - type: normalize
+      processors:
+      - timestamp:
+          target:
+          - from: logtime
+            fromLayout: ""
+            toLayout: "unix"
+    ```
+
+
+以上的layout参数需要填写golang形式，可参考：
+```
+const (
+	Layout      = "01/02 03:04:05PM '06 -0700" // The reference time, in numerical order.
+	ANSIC       = "Mon Jan _2 15:04:05 2006"
+	UnixDate    = "Mon Jan _2 15:04:05 MST 2006"
+	RubyDate    = "Mon Jan 02 15:04:05 -0700 2006"
+	RFC822      = "02 Jan 06 15:04 MST"
+	RFC822Z     = "02 Jan 06 15:04 -0700" // RFC822 with numeric zone
+	RFC850      = "Monday, 02-Jan-06 15:04:05 MST"
+	RFC1123     = "Mon, 02 Jan 2006 15:04:05 MST"
+	RFC1123Z    = "Mon, 02 Jan 2006 15:04:05 -0700" // RFC1123 with numeric zone
+	RFC3339     = "2006-01-02T15:04:05Z07:00"
+	RFC3339Nano = "2006-01-02T15:04:05.999999999Z07:00"
+	Kitchen     = "3:04PM"
+	// Handy time stamps.
+	Stamp      = "Jan _2 15:04:05"
+	StampMilli = "Jan _2 15:04:05.000"
+	StampMicro = "Jan _2 15:04:05.000000"
+	StampNano  = "Jan _2 15:04:05.000000000"
+)
+```
+还可以根据实际情况修改。  
+
+
+
+
+
+
+
--- a/docs/reference/pipelines/sink/codec.md
+++ b/docs/reference/pipelines/sink/codec.md
--- a/docs/user-guide/architecture/config.md
+++ b/docs/user-guide/architecture/config.md
@ -0,0 +1,3 @@
+# 配置设计
+
+
--- a/docs/user-guide/architecture/schema.md
+++ b/docs/user-guide/architecture/schema.md
@ -0,0 +1,64 @@
+# 数据格式
+
+!!! info
+    了解Loggie内部数据格式的设计，能帮助我们配置合适的日志处理和日志格式转换
+
+## 结构设计
+在Loggie内部的日志数据，会被存储为如下`key:value`的格式：
+
+!!! example
+    ```json
+    "body": "xxxxxxx",
+    "key1": "value1",
+    "key2": "value2"
+    
+    ```
+
+其中`body`为source接收到的原始内容，比如日志采集source，这里的`body`为文件采集到的原始日志数据。  
+其他的字段，在Loggie内部被统一称为header。header里一般包括：
+
+- 系统保留字段。统一以system开头，比如systemPipelineName、systemSourceName、systemState等
+- 用户使用的字段。比如用户自己添加到source里的fields字段，用户切分日志后得到的字段等
+
+默认Json格式输出示例如下：
+
+!!! example
+    ```json
+    {
+      "body": "01-Dec-2021 03:13:58.298 INFO [main] Starting service [Catalina]",
+      "fields": {
+          "test": "demo"
+      },
+      "systemPipelineName": "svcA",
+      "systemSourceName": "accesslog",
+      "systemState": {
+          "nextOffset": 2790,
+          "filename": "/tmp/log/a.log",
+          "collectTime": "2021-12-12T11:58:12.141267+08:00",
+          "contentBytes": 160,
+          "jobUid": "39100914-16777234",
+          "lineNumber": 39,
+          "offset": 2630
+      }
+    }
+    ```
+
+
+## 格式转换
+如果以上的格式不满足需求，可以使用sink里的 **codec** 修改输出的字段：
+
+- [配置](../../reference/pipelines/sink/codec.md)
+- [使用示例](../best-practice/log-format.md)
+
+## 日志切分
+对于原始日志数据的切分与处理，请使用 **normalize interceptor**：
+
+- [配置](../../reference/pipelines/interceptor/normalize.md)
+- [使用示例](../best-practice/log-process.md)
+
+
+!!! caution
+    格式转换和日志切分两者区别：
+
+    - normalize interceptor除了可以drop `body`字段外，无法修改系统保留的字段（修改可能影响内部逻辑）。codec.transformer可以任意修改字段，但存在反序列化的性能开销。
+    - normalize interceptor可以通过配置`belongTo`指定关联source使用，codec.transformer则为pipeline级别，同时可以在系统配置中使用defaults来全局生效。  
--- a/docs/user-guide/best-practice/log-process.md
+++ b/docs/user-guide/best-practice/log-process.md
@ -0,0 +1,223 @@
+# 日志切分与处理
+> Loggie可使用[normalize interceptor](../../reference/pipelines/interceptor/normalize.md)来进行日志的切分和处理，将日志数据进行结构化的提取，同时可以对提取后的字段进行处理。  
+> 建议先了解Loggie内部日志数据[schema设计](../architecture/schema.md)。  
+
+!!! caution
+    normalize interceptor只用于用户日志的处理，并不能转换system开头的系统保留字段，系统保留字段的处理请参考[日志格式转换](./log-format.md)。
+
+## 需求场景
+
+最主要的是对日志进行切分解析提取和处理。  
+
+比如以下日志：
+
+```
+01-Dec-2021 03:13:58.298 INFO [main] Starting service [Catalina]
+```
+
+我们可能会需要将其中的日期、日志级别解析出来，最终形成：
+
+```json
+{
+   "time": "01-Dec-2021 03:13:58.298",
+   "level": "INFO",
+   "message": "[main] Starting service [Catalina]"
+}
+```
+
+这种结构化的数据，存储的时候便于过滤查询，或者根据日志里的时间来排序，而不是采集的时间戳，或者根据日志级别进行一些过滤，可以方便查询到ERROR级别的日志等等。  
+当然不仅仅是像以上tomcat的运维类日志，还有诸如业务的一些订单等等日志，都有类似的需求和使用场景。  
+
+## 功能
+目前已经支持的功能有：
+
+- regex: 正则切分提取日志
+- jsonDecode: 解析提取json格式的日志
+- split: 通过分隔符来提取日志
+- add/rename/drop: 新增、重命名、丢弃指定字段
+- timestamp: 转换指定字段的时间格式
+
+normalize可以配置内部的processor顺序执行。
+
+## 配置示例
+
+日志切分处理在Loggie Agent端或者Loggie中转机侧均可，取决于我们是否需要中转机，以及希望日志处理这种CPU密集型的计算是分布在Agent上，由各个节点承担，还是希望在中转机集群中集中进行。  
+
+下面以采集tomcat服务的标准输出和access日志为例，展示如何对标准输出格式进行处理和access日志进行字段切分。  
+  
+简单起见，示例使用CRD实例配置下发在Agent，同时使用dev sink直接输出处理结果展示。
+
+### sink
+创建如下的sink用于演示，将采集处理后的日志打印在所属节点Loggie的日志中。  
+
+!!! example
+
+    ```yaml
+    apiVersion: loggie.io/v1beta1
+    kind: Sink
+    metadata:
+      name: default
+    spec:
+      sink: |
+        type: dev
+        printEvents: true
+        codec:
+          type: json
+          pretty: true
+    ```
+
+### interceptor
+根据实际环境创建如下的interceptor。  
+
+!!! example
+
+    === "docker"
+
+        ```yaml
+        apiVersion: loggie.io/v1beta1
+        kind: Interceptor
+        metadata:
+          name: tomcat
+        spec:
+          interceptors: |
+            - type: normalize
+              name: stdproc
+              belongTo: ["stdout"]
+              processors:
+              - jsonDecode: ~
+              - drop:
+                  target: ["stream", "time", "body"]
+              - rename:
+                  target:
+                  - from: "log"
+                    to: "message"
+            - type: normalize
+              name: accproc
+              belongTo: ["access"]
+              processors:
+              - regex:
+                  pattern: '(?<ip>\S+) (?<id>\S+) (?<u>\S+) (?<time>\[.*?\]) (?<url>\".*?\") (?<status>\S+) (?<size>\S+)'
+        ```
+
+    === "containerd"
+
+        ```yaml
+        apiVersion: loggie.io/v1beta1
+        kind: Interceptor
+        metadata:
+          name: tomcat
+        spec:
+          interceptors: |
+            - type: normalize
+              name: stdproc
+              belongTo: ["stdout"]
+              processors:
+              - split:
+                  separator: ' '
+                  max: 4
+                  keys: ["time", "std", "F", "message"]
+              - drop:
+                  target: ["time", "std", "F", "body"]
+            - type: normalize
+              name: accproc
+              belongTo: ["access"]
+              processors:
+              - regex:
+                  pattern: '(?<ip>\S+) (?<id>\S+) (?<u>\S+) (?<time>\[.*?\]) (?<url>\".*?\") (?<status>\S+) (?<size>\S+)'
+
+        ```
+
+如果使用docker运行时，默认采集的标准输出为json格式。  
+类似：
+```json
+{
+"log":"I0610 08:29:07.698664 Waiting for caches to sync\n",
+"stream":"stderr", 
+"time":"2021-06-10T08:29:07.698731204Z"
+}
+```
+我们需要将json格式的原始日志解析，一般还需要drop `stream`和`time`字段，并将其中的`log`字段key改为和其他格式一致的`body`或者`message`。  
+
+使用filesource采集后，在Loggie里的原始格式为：
+```
+"body": '{"log":"I0610 08:29:07.698664 Waiting for caches to sync\n","stream":"stderr", "time":"2021-06-10T08:29:07.698731204Z"}'
+"systemXXX": xxx
+...
+```
+
+在normalize interceptors里，使用:  
+
+1. **`jsonDecode`**:  
+解析并提取字段，将Loggie里存储的日志格式变成：
+```
+"body": '{"log":"I0610 08:29:07.698664 Waiting for caches to sync\n","stream":"stderr", "time":"2021-06-10T08:29:07.698731204Z"}'
+"log": "I0610 08:29:07.698664 Waiting for caches to sync\n"
+"stream":"stderr"
+"time":"2021-06-10T08:29:07.698731204Z"
+"systemXXX": xxx
+...
+```
+
+2. **`drop`**:  
+将`body`、`stream`和`time`丢弃。（`body`为内置字段表示原始日志数据，仅能丢弃，不能修改）
+
+3. **`rename`**:  
+将`log`改名为统一的字段比如`message`。 
+ 
+最终发送的日志格式变成：  
+```
+"message": "I0610 08:29:07.698664 Waiting for caches to sync\n"
+"systemXXX": xxx
+...
+```
+
+
+在runtime为containerd时，原始日志如下所示：
+
+```
+2021-12-01T03:13:58.298476921Z stderr F INFO [main] Starting service [Catalina]
+```
+
+会默认加上类似`2021-12-01T03:13:58.298476921Z stderr F`的前缀，一般我们并不需要这个前缀，发送的时候只保留后面的数据。  
+和docker的方式类似，我们可以配置split等方式来切分处理。  
+
+另外，由于stdout的日志和access日志格式不一样，所以使用了两个不同的normalize interceptor，配置了不同的processor，需要添加`name`来区分。  
+最重要的是，这里使用了`belongTo`指定关联的`source`，让stdout日志和access日志分别采用不同的处理逻辑。  
+
+### logconfig
+配置source如下所示：
+
+!!! example
+
+    ```yaml
+    apiVersion: loggie.io/v1beta1
+    kind: LogConfig
+    metadata:
+      name: tomcat
+      namespace: default
+    spec:
+      selector:
+        labelSelector:
+          app: tomcat
+        type: pod
+      pipeline:
+        interceptorRef: tomcat
+        sinkRef: default
+        sources: |
+          - type: file
+            name: stdout
+            paths:
+            - stdout
+          - type: file
+            name: access
+            paths:
+            - /usr/local/tomcat/logs/localhost_access_log.*.txt
+    ```
+
+注意这里的source.name和上面interceptor配置的belongTo需要关联上。  
+
+创建完以上的CR后，便可以采集指定的Pod日志，并进行切分处理。  
+
+
+
+
--- a/nav.yml
+++ b/nav.yml
@ -19,6 +19,8 @@ nav:
      - 架构与特性: 
        - 诞生背景: user-guide/architecture/background.md
        - 设计架构: user-guide/architecture/core-arch.md
+        - 配置设计: user-guide/architecture/config.md
+        - 数据格式: user-guide/architecture/schema.md
        - 优势与特性: user-guide/architecture/advantages.md
        - 开源项目对比: user-guide/architecture/compare.md

@ -31,7 +33,7 @@ nav:
        - 采集Kubernetes Events: user-guide/use-in-kubernetes/kube-event-source.md

      - 最佳实践:
-        - 发送日志格式转换: user-guide/best-practice/log-format.md
+        - 日志格式转换: user-guide/best-practice/log-format.md
        - 日志切分与处理: user-guide/best-practice/log-process.md
        - 日志定时清理: user-guide/best-practice/log-clean.md

@ -53,26 +55,25 @@ nav:
      - 全局: 
        - 启动参数与日志配置: reference/global/args.md
        - 系统配置: reference/global/system.md
-      - Pipelines:
-        - Overview: reference/pipelines/overview.md
-        - Source:
-          - file: reference/pipelines/source/file.md
-          - kafka: reference/pipelines/source/kafka.md
-          - kubeEvent: reference/pipelines/source/kubeEvent.md
-        - Sink:
-          - elasticsearch: reference/pipelines/sink/elasticsearch.md
-          - kafka: reference/pipelines/sink/kafka.md
-          - dev: reference/pipelines/sink/dev.md
-          - grpc: reference/pipelines/sink/grpc.md
-        - Interceptor:
-          - jsonDecode: reference/pipelines/interceptor/jsonDecode.md
-          - limit: reference/pipelines/interceptor/limit.md
-          - logalert: reference/pipelines/interceptor/logalert.md 
-          - metrics: reference/pipelines/interceptor/metrics.md
-          - retry: reference/pipelines/interceptor/retry.md
-        - Queue:
-          - channel: reference/pipelines/queue/channel.md
-          - memeory: reference/pipelines/queue/memory.md
+      - Source:
+        - file: reference/pipelines/source/file.md
+        - kafka: reference/pipelines/source/kafka.md
+        - kubeEvent: reference/pipelines/source/kubeEvent.md
+      - Sink:
+        - elasticsearch: reference/pipelines/sink/elasticsearch.md
+        - kafka: reference/pipelines/sink/kafka.md
+        - dev: reference/pipelines/sink/dev.md
+        - grpc: reference/pipelines/sink/grpc.md
+        - codec: reference/pipelines/sink/codec.md
+      - Interceptor:
+        - normalize: reference/pipelines/interceptor/normalize.md
+        - limit: reference/pipelines/interceptor/limit.md
+        - logalert: reference/pipelines/interceptor/logalert.md 
+        - metrics: reference/pipelines/interceptor/metrics.md
+        - retry: reference/pipelines/interceptor/retry.md
+      - Queue:
+        - channel: reference/pipelines/queue/channel.md
+        - memory: reference/pipelines/queue/memory.md
      - Kubernetes CRD: 
        - Overview: reference/discovery/kubernetes/overview.md
        - LogConfig: reference/discovery/kubernetes/logconfig.md