# # collection interval
# interval = 15

[[instances]]
# # scripts to execute; glob patterns are supported
commands = [
    # "/opt/categraf/scripts/*.sh"
]

# # time limit for each command to complete
# timeout = 5

# # effective interval = global.interval * interval_times
# interval_times = 1

# # format of the data the scripts print: influx, falcon or prometheus
data_format = "influx"

数据格式支持以下3种：

influx
falcon
prometheus

influx 格式

measurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3

influx格式说明

measurement，定义指标名称(或者前缀)，比如 connections；
measurement后面是逗号，逗号后面是标签，如果没有标签，则measurement后面不需要逗号
标签是k=v的格式，多个标签用逗号分隔，比如region=beijing,env=test
标签之后是空格
空格之后是属性字段，多个属性字段用逗号分隔
属性字段是字段名=值的格式，在categraf里值只能是数字

最终，measurement和各个属性字段名称拼接成metric名字

falcon格式

Open-Falcon的格式如下，举例：

[
    {
        "endpoint": "test-endpoint",
        "metric": "test-metric",
        "timestamp": 1658490609,
        "step": 60,
        "value": 1,
        "counterType": "GAUGE",
        "tags": "idc=lg,loc=beijing"
    },
    {
        "endpoint": "test-endpoint",
        "metric": "test-metric2",
        "timestamp": 1658490609,
        "step": 60,
        "value": 2,
        "counterType": "GAUGE",
        "tags": "idc=lg,loc=beijing"
    }
]

timestamp、step、counterType，这三个字段在categraf处理的时候会直接忽略掉，endpoint会放到labels里上报。

prometheus格式

prometheus 格式大家不陌生了，比如我这里准备一个监控脚本，输出 prometheus 的格式数据：

#!/bin/sh

# Print one counter sample, preceded by its HELP and TYPE lines, to stdout.
# printf '%s\n' writes each argument on its own line.
printf '%s\n' \
    '# HELP demo_http_requests_total Total number of http api requests' \
    '# TYPE demo_http_requests_total counter' \
    'demo_http_requests_total{api="add_product"} 4633433'

其中以 # 开头的注释行会被 categraf 忽略，可以省略。Prometheus 协议数据的具体格式，请参考 Prometheus 官方文档。

示例

再次提示，一般在复合型用途或独立的虚拟机启用此插件。

配置场景

本示例主要使用如下功能：

增加自定义标签，可通过自定义标签筛选数据及更加精确的告警推送。
响应超时时间为5秒。
commands字段正确引用脚本所在位置。

修改exec.toml文件配置

# # collect interval
# interval = 15

[[instances]]
# # commands, support glob
# # every entry ends with a comma, so any commented-out glob below can be
# # enabled just by removing its leading "#"
commands = [
    "/opt/categraf/scripts/cert/collect_*.sh",
    # "/opt/categraf/scripts/*/collect_*.py",
    # "/opt/categraf/scripts/*/collect_*.go",
    # "/opt/categraf/scripts/*/collect_*.lua",
    # "/opt/categraf/scripts/*/collect_*.java",
    # "/opt/categraf/scripts/*/collect_*.bat",
    # "/opt/categraf/scripts/*/collect_*.cmd",
    # "/opt/categraf/scripts/*/collect_*.ps1",
]

# # timeout for each command to complete (5 seconds in this example)
timeout = 5

# # interval = global.interval * interval_times
# interval_times = 1

# # influx stdout example: measurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3
data_format = "influx"

测试

以 cert/collect_cert_expiretime.sh 为例：执行 sh /opt/categraf/scripts/cert/collect_cert_expiretime.sh 输出:

cert,cloud=my-cloud,region=my-region,azone=az1,product=my-product,domain_name=www.baidu.com expire_days=116,reachable=1
cert,cloud=my-cloud,region=my-region,azone=az1,product=my-product,domain_name=www.weibo.com expire_days=37,reachable=1
cert,cloud=my-cloud,region=my-region,azone=az1,product=my-product,domain_name=www.csdn.net expire_days=233,reachable=1

执行./categraf --test --inputs exec 输出:

11:15:21 cert_expire_days agent_hostname=tt-fc-dev01.nj azone=az1 cloud=my-cloud domain_name=www.baidu.com product=my-product region=my-region scraper=contract-categraf 116
11:15:21 cert_reachable agent_hostname=tt-fc-dev01.nj azone=az1 cloud=my-cloud domain_name=www.baidu.com product=my-product region=my-region scraper=contract-categraf 1
11:15:21 cert_expire_days agent_hostname=tt-fc-dev01.nj azone=az1 cloud=my-cloud domain_name=www.weibo.com product=my-product region=my-region scraper=contract-categraf 37
11:15:21 cert_reachable agent_hostname=tt-fc-dev01.nj azone=az1 cloud=my-cloud domain_name=www.weibo.com product=my-product region=my-region scraper=contract-categraf 1
11:15:21 cert_reachable agent_hostname=tt-fc-dev01.nj azone=az1 cloud=my-cloud domain_name=www.csdn.net product=my-product region=my-region scraper=contract-categraf 1
11:15:21 cert_expire_days agent_hostname=tt-fc-dev01.nj azone=az1 cloud=my-cloud domain_name=www.csdn.net product=my-product region=my-region scraper=contract-categraf 233

启动categraf

启动categraf 方式见：categraf安装部署

检查数据呈现

如图：

远程下发脚本

企业版从 v0.3.34、开源版从 v0.3.62 起，支持远程配置脚本（会按照配置创建目录和脚本）

注意：

同名脚本会被覆盖, 不用的脚本需要手动清理
instances.scripts 放到最后，否则会影响data_format解析（TOML 中表头之后的键都归属于该表，写在 [instances.scripts] 之后的 data_format 会被当成脚本配置项）

# # collection interval
# interval = 15

[[instances]]
# # scripts to execute; glob patterns are supported
commands = [
    "/home/flashcat/categraf/scripts/*.sh"
]

# # time limit for each command to complete
# timeout = 5

# # effective interval = global.interval * interval_times
# interval_times = 1

# # choices: influx prometheus falcon
# # influx stdout example: measurement,labelkey1=labelval1,labelkey2=labelval2 field1=1.2,field2=2.3
data_format = "influx"

# # keep this table last in the instance: every key written after this header
# # belongs to instances.scripts, not to the instance itself
[instances.scripts]
# # each key is the path at which a script is created; the value is its content
# # creates the script at /home/flashcat/categraf/scripts/a.sh
"/home/flashcat/categraf/scripts/a.sh" = '''
#!/bin/sh

echo "test hello=1"
'''

# # creates the script at /home/flashcat/categraf/scripts/b.sh
"/home/flashcat/categraf/scripts/b.sh" = '''
#!/bin/sh

echo "test word=1"
'''

监控大盘和告警规则

脚本作用不同，监控大盘和规则就不同，先略过。