Compare commits
58 Commits
Author | SHA1 | Date |
---|---|---|
|
b5c329086e | |
|
189879f09c | |
|
b4d7410682 | |
|
1869034c55 | |
|
c21a069f30 | |
|
4191bd60a9 | |
|
9ba36d2707 | |
|
17522020ec | |
|
cc9b94c443 | |
|
b3d96f72af | |
|
ccaa483a4e | |
|
4ee04759e3 | |
|
19fdcfdce7 | |
|
cb913b6a6b | |
|
e1aaec8326 | |
|
a027b9ddcd | |
|
c74d040061 | |
|
5a4674a404 | |
![]() |
e3aaa5510a | |
|
97fd480824 | |
|
8ae18a1dd0 | |
|
367ad3e6a2 | |
|
5c8a6333b2 | |
|
81265ad8a4 | |
![]() |
beb02323c6 | |
![]() |
72516b8d42 | |
![]() |
dffa1284a2 | |
|
bfdc010cc1 | |
|
e519a3e46e | |
|
808cbe922f | |
![]() |
d52703ccbb | |
![]() |
b84209272b | |
![]() |
1a95972842 | |
![]() |
67ca28005c | |
![]() |
9128f7a3a3 | |
|
9c5bab33bf | |
|
7cd0e9b218 | |
|
ce51ab9d37 | |
|
63df94bcd0 | |
|
20fd689d1c | |
|
c82f41c869 | |
|
030494f792 | |
|
42901645ad | |
|
3617bf5857 | |
|
f3c9535713 | |
|
1cd7616e22 | |
|
dc2fb43987 | |
|
fc9c20d24a | |
|
93c15c0d30 | |
|
42b2204de9 | |
|
6f91fb52b2 | |
|
67aa0490de | |
|
43964c0387 | |
|
e6b9b84e24 | |
|
1c5155e15a | |
|
c74b061c83 | |
![]() |
230f7e97f1 | |
|
4e54cf49e3 |
|
@ -0,0 +1,5 @@
|
|||
*.o
|
||||
|
||||
bpf/include/vmlinux.h
|
||||
_output/
|
||||
cmd/**/bin/
|
|
@ -0,0 +1,128 @@
|
|||
---
|
||||
linters:
|
||||
disable-all: true
|
||||
enable:
|
||||
- goimports
|
||||
- gosimple
|
||||
- ineffassign # Detects when assignments to existing variables are not used
|
||||
- unconvert # Remove unnecessary type conversions
|
||||
- exportloopref # Checks for pointers to enclosing loop variables
|
||||
- tenv # Detects using os.Setenv instead of t.Setenv since Go 1.17
|
||||
- dupword # Checks for duplicate words in the source code
|
||||
- gofmt # Gofmt checks whether code was gofmt-ed
|
||||
- bodyclose # checks whether HTTP response body is closed successfully
|
||||
- misspell
|
||||
- staticcheck
|
||||
- typecheck
|
||||
- unused
|
||||
- loggercheck
|
||||
- nakedret
|
||||
- gofumpt
|
||||
- musttag
|
||||
- whitespace
|
||||
- dupword
|
||||
- gocritic
|
||||
- usestdlibvars
|
||||
- gosec
|
||||
- govet
|
||||
- nolintlint
|
||||
- unused
|
||||
- errcheck
|
||||
- errname
|
||||
- errorlint
|
||||
- fatcontext
|
||||
- gocheckcompilerdirectives
|
||||
- inamedparam
|
||||
|
||||
# Could be enabled later:
|
||||
# - gocyclo
|
||||
# - prealloc
|
||||
# - maligned
|
||||
|
||||
linters-settings:
|
||||
unused:
|
||||
# Mark all struct fields that have been written to as used.
|
||||
# Default: true
|
||||
field-writes-are-uses: false
|
||||
# Mark all local variables as used.
|
||||
# default: true
|
||||
local-variables-are-used: false
|
||||
misspell:
|
||||
# Correct spellings using locale preferences for US or UK.
|
||||
# Setting locale to US will correct the British spelling of 'colour' to 'color'.
|
||||
# Default is to use a neutral variety of English.
|
||||
locale: US
|
||||
gofumpt:
|
||||
# Choose whether to use the extra rules.
|
||||
# Default: false
|
||||
extra-rules: true
|
||||
# Module path which contains the source code being formatted.
|
||||
module-path: huatuo-bamai
|
||||
gocritic:
|
||||
enabled-tags:
|
||||
- diagnostic
|
||||
- style
|
||||
- performance
|
||||
- experimental
|
||||
- opinionated
|
||||
disabled-checks:
|
||||
- commentedOutCode
|
||||
- deferInLoop
|
||||
- evalOrder
|
||||
- exitAfterDefer
|
||||
- exposedSyncMutex
|
||||
- ifElseChain
|
||||
- importShadow
|
||||
- sloppyReassign
|
||||
- unnamedResult
|
||||
- whyNoLint
|
||||
- filepathJoin
|
||||
nolintlint:
|
||||
allow-unused: true
|
||||
gosec:
|
||||
# https://github.com/securego/gosec#available-rules
|
||||
#
|
||||
# The following issues surfaced when `gosec` linter
|
||||
# was enabled.
|
||||
# Disable G115:
|
||||
# "G115: integer overflow conversion int8 -> uint64 (gosec)"
|
||||
excludes:
|
||||
- G107
|
||||
- G115
|
||||
- G204
|
||||
- G401
|
||||
- G501
|
||||
exclude-dirs:
|
||||
- pkg/tracing
|
||||
- vendor
|
||||
issues:
|
||||
# List of regexps of issue texts to exclude.
|
||||
#
|
||||
# But independently of this option we use default exclude patterns,
|
||||
# it can be disabled by `exclude-use-default: false`.
|
||||
# To list all excluded by default patterns execute `golangci-lint run --help`
|
||||
#
|
||||
# Default: https://golangci-lint.run/usage/false-positives/#default-exclusions
|
||||
#
|
||||
# _xxx as used var.
|
||||
exclude:
|
||||
- "^(var|field) `_.*` is unused$"
|
||||
exclude-rules:
|
||||
- linters:
|
||||
- revive
|
||||
text: "if-return"
|
||||
- linters:
|
||||
- revive
|
||||
text: "empty-block"
|
||||
- linters:
|
||||
- revive
|
||||
text: "superfluous-else"
|
||||
- linters:
|
||||
- revive
|
||||
text: "unused-parameter"
|
||||
- linters:
|
||||
- revive
|
||||
text: "unreachable-code"
|
||||
- linters:
|
||||
- revive
|
||||
text: "redefines-builtin-id"
|
|
@ -0,0 +1,34 @@
|
|||
FROM golang:1.22.4-alpine AS base
|
||||
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.aliyun.com/g' /etc/apk/repositories
|
||||
RUN apk add --no-cache \
|
||||
make \
|
||||
clang15 \
|
||||
libbpf-dev \
|
||||
bpftool \
|
||||
curl \
|
||||
git
|
||||
ENV PATH=$PATH:/usr/lib/llvm15/bin
|
||||
|
||||
|
||||
FROM base AS build
|
||||
ARG BUILD_PATH=${BUILD_PATH:-/go/huatuo-bamai}
|
||||
ARG RUN_PATH=${RUN_PATH:-/home/huatuo-bamai}
|
||||
WORKDIR ${BUILD_PATH}
|
||||
COPY . .
|
||||
RUN make && \
|
||||
mkdir -p ${RUN_PATH}/bpf && \
|
||||
mkdir -p ${RUN_PATH}/tracer && \
|
||||
cp ${BUILD_PATH}/_output/bin/huatuo-bamai ${RUN_PATH}/huatuo-bamai && \
|
||||
cp ${BUILD_PATH}/huatuo-bamai.conf ${RUN_PATH}/huatuo-bamai.conf && \
|
||||
cp ${BUILD_PATH}/bpf/*.o ${RUN_PATH}/bpf/ && \
|
||||
find ${BUILD_PATH}/cmd -type f -name "*.bin" -exec cp {} ${RUN_PATH}/tracer/ \;
|
||||
# Comment following line if elasticsearch is needed and repalce the ES configs in huatuo-bamai.conf
|
||||
RUN sed -i 's/"http:\/\/127.0.0.1:9200"/""/' ${RUN_PATH}/huatuo-bamai.conf
|
||||
|
||||
|
||||
FROM alpine:3.22.0 AS run
|
||||
ARG RUN_PATH=${RUN_PATH:-/home/huatuo-bamai}
|
||||
RUN apk add --no-cache curl
|
||||
COPY --from=build ${RUN_PATH} ${RUN_PATH}
|
||||
WORKDIR ${RUN_PATH}
|
||||
CMD ["./huatuo-bamai", "--region", "example", "--config", "huatuo-bamai.conf"]
|
|
@ -0,0 +1,202 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "{}"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright {yyyy} Authors of Cilium
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
GO ?= go
|
||||
|
||||
# the root directory
|
||||
ROOT_DIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST))))
|
||||
|
||||
# bpf source code files
|
||||
BPF_DIR := $(ROOT_DIR)/bpf
|
||||
|
||||
# used for go generate to compile eBPF
|
||||
BPF_COMPILE := $(ROOT_DIR)/build/clang.sh
|
||||
BPF_INCLUDE := "-I$(BPF_DIR)/include"
|
||||
|
||||
APP_COMMIT ?= $(shell git describe --dirty --long --always)
|
||||
APP_BUILD_TIME=$(shell date "+%Y%m%d%H%M%S")
|
||||
APP_VERSION="1.0"
|
||||
|
||||
GO_BUILD_STATIC := CGO_ENABLED=1 $(GO) build -tags "netgo osusergo $(GO_TAGS)" -gcflags=all="-N -l" \
|
||||
-ldflags "-extldflags -static
|
||||
GO_BUILD_STATIC_WITH_VERSION := $(GO_BUILD_STATIC) \
|
||||
-X main.AppVersion=$(APP_VERSION) \
|
||||
-X main.AppGitCommit=$(APP_COMMIT) \
|
||||
-X main.AppBuildTime=$(APP_BUILD_TIME)"
|
||||
|
||||
# export
|
||||
export GO_BUILD_STATIC
|
||||
|
||||
all: gen-deps gen build tracer
|
||||
|
||||
gen-deps:
|
||||
# maybe need to install libbpf-devel
|
||||
|
||||
gen:
|
||||
@BPF_DIR=$(BPF_DIR) \
|
||||
BPF_COMPILE=$(BPF_COMPILE) \
|
||||
BPF_INCLUDE=$(BPF_INCLUDE) \
|
||||
$(GO) generate -x ./...
|
||||
|
||||
build:
|
||||
$(GO_BUILD_STATIC_WITH_VERSION) -o _output/bin/huatuo-bamai ./cmd/huatuo-bamai
|
||||
|
||||
TRACER_DIR := cmd
|
||||
BIN_DIR := bin
|
||||
|
||||
SUBDIRS := $(shell find $(TRACER_DIR) -mindepth 1 -maxdepth 1 -type d -not -path "$(BIN_DIR)" | grep -v 'depend\|huatuo-bamai')
|
||||
TARGETS := $(patsubst %,$(BIN_DIR)/%,$(notdir $(SUBDIRS)))
|
||||
COMBINED := $(foreach dir,$(SUBDIRS),$(dir)/$(BIN_DIR)/*.bin)
|
||||
|
||||
tracer: $(TARGETS)
|
||||
$(BIN_DIR)/%: $(TRACER_DIR)/%
|
||||
cd $< && make
|
||||
|
||||
check: imports fmt golangci-lint
|
||||
|
||||
imports:
|
||||
@echo "imports"
|
||||
@goimports -w -local huatuo-bamai $(shell find . -type f -name '*.go' -not -path "./vendor/*")
|
||||
|
||||
fmt: fmt-rewrite-rules
|
||||
@echo "gofumpt"
|
||||
gofumpt -l -w $(shell find . -type f -name '*.go' -not -path "./vendor/*")
|
||||
|
||||
fmt-rewrite-rules:
|
||||
@echo "fmt-rewrite-rules"
|
||||
gofmt -w -r 'interface{} -> any' $(shell find . -type f -name '*.go' -not -path "./vendor/*")
|
||||
|
||||
golangci-lint:
|
||||
@echo "golangci-lint"
|
||||
golangci-lint run --build-tags=$(GO_TAGS) -v ./... --timeout=5m --config .golangci.yaml
|
||||
|
||||
vendor:
|
||||
$(GO) mod tidy
|
||||
$(GO) mod verify
|
||||
$(GO) mod vendor
|
||||
|
||||
clean:
|
||||
rm -rf _output $(shell find . -type f -name "*.o") $(COMBINED)
|
||||
|
||||
.PHONY: all gen-deps gen build tracer check imports golint fmt golangci-lint vendor clean
|
|
@ -0,0 +1,13 @@
|
|||
v2.0 - WIP
|
||||
---------------------
|
||||
- 支持指标、事件、 追踪 region 字段
|
||||
- 支持 softirq percpu 指标
|
||||
- 支持 golangci 静态检查
|
||||
- 支持组件的 cgroupv2 资源限制
|
||||
- 支持独立的 cgroup package, 应用无感知 cgroup 运行时类型
|
||||
- 支持根据 kubelet cgroupdriver 配置,实现 cgroupfs, systemd cgroup 路径转换
|
||||
- 若干代码优化和 BUG 修复
|
||||
|
||||
v1.0 - 2025-07-13
|
||||
---------------------
|
||||
- 初始版本发布,主要涉及指标,事件,自动化追踪
|
70
README.md
|
@ -1,2 +1,70 @@
|
|||
# huatuo-bamai
|
||||
简体中文 | [English](./README_EN.md)
|
||||
|
||||

|
||||
|
||||
# 什么是 HUATUO
|
||||
**HUATUO(华佗)**是由**滴滴**开源并依托 **CCF 开源发展委员会**孵化的云原生操作系统可观测性项目,专注于为复杂云原生环境提供操作系统内核级深度观测能力。该项目基于 [eBPF](https://docs.kernel.org/userspace-api/ebpf/syscall.html) 技术,通过整合 [kprobe](https://www.kernel.org/doc/html/latest/trace/kprobes.html)、 [tracepoint](https://www.kernel.org/doc/html/latest/trace/tracepoints.html)、 [ftrace](https://www.kernel.org/doc/html/latest/trace/ftrace.html) 等内核动态追踪技术,实现了多维度的内核观测能力:**1.** 更精细化的内核子系统埋点指标 Metric **2.** 异常事件驱动的内核运行时上下文捕获 Events **3.** 针对系统突发毛刺的自动追踪 AutoTracing、AutoProfiling。该项目逐步构建了完整的 Linux 内核深度可观测体系架构。目前,HUATUO 已在滴滴生产环境中实现规模化部署,在诸多故障场景中发挥关键作用,有效保障了云原生操作系统的高可用性和性能优化。通过持续的技术演进,希望 HUATUO 能够推动 eBPF 技术在云原生可观测领域向更细粒度、更低开销、更高时效性的方向发展。更多信息访问官网 [https://huatuo.tech](https://huatuo.tech/)。
|
||||
|
||||
|
||||
# 核心特性
|
||||
- **低损耗内核全景观测**:基于 BPF 技术,保持性能损耗小于1%的基准水位,实现对内存管理、CPU 调度、网络及块 IO 子系统等核心模块的精细化、全维度、全景观测与性能剖析。通过自适应采样机制,实现系统资源损耗与观测精度的动态平衡。
|
||||
- **异常事件驱动诊断**:构建基于异常事件驱动的运行时上下文捕获机制,聚焦内核异常与慢速路径的精准埋点。当发生缺页异常、调度延迟、锁竞争等关键事件时,自动触发调用链追踪,生成包含寄存器状态、堆栈轨迹及资源占用的图谱诊断信息。
|
||||
- **全自动化追踪 AutoTracing**:AutoTracing 模块采用启发式追踪算法,解决云原生复杂场景下的典型性能毛刺故障。针对 CPU idle 掉底,CPU sys 突增,IO 突增,loadavg 突增等棘手问题,实现自动化快照留存机制和根因分析。
|
||||
- **持续性能剖析 Profiling**:持续对操作系统内核,应用程序进行全方位性能剖析,涉及系统 CPU、内存、I/O、 锁、以及各种解释性编程语言,力助业务持续的优化迭代更新。该功能在哨兵压测,防火演练,重要节假日护堤等场景发挥关键作用。
|
||||
- **分布式链路追踪 Tracing**:以网络为中心的面向服务请求的分布式链路追踪,能够清晰的划分系统调用层级关系,节点关联关系,耗时记账等,支持在大规模分布式系统中的跨节点追踪,提供微服务调用的全景视图,保障系统在复杂场景下的稳定性。
|
||||
- **开源技术生态融合**:无缝对接主流开源可观测技术栈,如 Prometheus、Grafana、Pyroscope、Elasticsearch等。支持独立物理机和云原生部署,自动感知 K8S 容器资源/标签/注解,自动关联操作系统内核事件指标,消除数据孤岛。通过零侵扰、内核可编程方式兼容主流硬件平台和内核版本,确保其适应性、应用性。
|
||||
|
||||
|
||||
|
||||
# 快速上手
|
||||
|
||||
- **极速体验**
|
||||
如果你只关心底层原理,不关心存储、前端展示等,我们提供了编译好的镜像,已包含 HUATUO 底层运行的必要组件,直接运行即可:
|
||||
```bash
|
||||
$ docker run --privileged --cgroupns=host --network=host -v /sys:/sys -v /run:/run huatuo/huatuo-bamai:latest
|
||||
```
|
||||
|
||||
- **快速搭建**
|
||||
如果你想更进一步了解 HUATUO 运行机制,架构设计等,可在本地很方便地搭建 HUATUO 完整运行的所有组件,我们提供容器镜像以及简单配置,方便用户开发者快速了解 HUATUO。
|
||||

|
||||
<div style="text-align: center; margin: 8px 0 20px 0; color: #777;">
|
||||
<small>
|
||||
HUATUO 组件运行示意图<br>
|
||||
</small>
|
||||
</div>
|
||||
|
||||
为快速搭建运行环境,我们提供一键运行的方式,该命令会启动 [elasticsearch](https://www.elastic.co), [prometheus](https://prometheus.io), [grafana](https://grafana.com) 以及 huatuo-bamai 组件。命令执行成功后,打开浏览器访问 [http://localhost:3000](http://localhost:3000) 即可浏览监控大盘。
|
||||
|
||||
```bash
|
||||
$ docker compose --project-directory ./build/docker up
|
||||
```
|
||||
|
||||
# 软件架构
|
||||

|
||||
|
||||
# 开源协议
|
||||
该项目采用 Apache License 2.0 协议开源,BPF 代码采用 GPL 协议。
|
||||
|
||||
# 内核版本
|
||||
理论支持 4.18 之后的所有版本,主要测试内核、和操作系统发行版如下:
|
||||
|
||||
| HUATUO | 内核版本 | 操作系统发行版 |
|
||||
| :--- | :---- | :--- |
|
||||
| 1.0 | 4.18.x | Centos 8.5 |
|
||||
| 1.0 | 5.10.x | OpenEuler 22.03/Anolis OS 8.10 |
|
||||
| 1.0 | 6.6.x | OpenEuler 24.03/Anolis OS 23.3 |
|
||||
| 1.0 | 6.8.x | Ubuntu 24.04 |
|
||||
| 1.0 | 6.14.x | Fedora 42 |
|
||||
|
||||
|
||||
# 文档
|
||||
|
||||
更多信息访问官网 [https://huatuo.tech](https://huatuo.tech/)
|
||||
|
||||
|
||||
# 联系我们
|
||||
|
||||
@[hao022](https://github.com/hao022)
|
||||
@[nashuiliang](https://github.com/nashuiliang)
|
||||
@[fanzu8](https://github.com/fanzuba)
|
||||
|
||||
|
|
|
@ -0,0 +1,108 @@
|
|||
[简体中文](./README_CN.md) | English
|
||||
|
||||
# Abstract
|
||||
**HuaTuo (华佗)** aims to provide in-depth observability for the OS Linux kernel in complex **cloud-native** scenarios. The project is based on [eBPF](https://docs.kernel.org/userspace-api/ebpf/syscall.html) technology and has built a set of deep observation service components for the Linux kernel. By leveraging kernel dynamic tracing technologies such as [kprobe](https://www.kernel.org/doc/html/latest/trace/kprobes.html), [tracepoint](https://www.kernel.org/doc/html/latest/trace/tracepoints.html), and [ftrace](https://www.kernel.org/doc/html/latest/trace/ftrace.html), HuaTuo provides more observation perspectives for the Linux kernel, including kernel runtime context capture driven by anomalous events and more granular, accurate kernel per subsystem metrics.
|
||||
|
||||
HuaTuo also integrates core technologies such as automated tracing, profiling, and distributed tracing for system performance spikes. HuaTuo has been successfully applied on a large scale within Didi (DiDi Global Inc.), solidly guaranteeing the stability and performance optimization of cloud-native operating systems and showcasing the distinct advantages of eBPF technology in cloud-native scenarios.
|
||||
|
||||
# Key Features
|
||||
- **Continuous** Kernel Observability: Achieves in-depth, low-overhead (less than 1% performance impact) instrumentation of various kernel subsystems, providing comprehensive metrics on memory, CPU scheduling, network stack, and disk I/O.
|
||||
- Kernel **Anomaly-Driven** Observability: Instruments the kernel's exception paths and slow paths to capture rich runtime context triggered by anomalous events, enabling more insightful observability data.
|
||||
- **Automated** Tracing (AutoTracing): Implements automated tracing capabilities to address system resource spikes and performance jitters (e.g., CPU idle drop, raising CPU sys utilization, I/O bursts, and Loadavg raising).
|
||||
- **Smooth Transition** to Popular Observability Stacks: Provides standard data sources for Prometheus and Pyroscope, integrates with Kubernetes container resources, and automatically correlates Kubernetes labels/annotations with kernel event metrics, eliminating data silos, ensuring seamless integration and analysis across various data sources for comprehensive system monitoring.
|
||||
|
||||
# Getting Started
|
||||
- **Instant Experience**
|
||||
If you only care about the underlying principles and not about storage backends or frontend display, we provide a pre-built image containing all necessary components for HUATO's core operation. Just run:
|
||||
|
||||
```bash
|
||||
$ docker run --privileged --cgroupns=host --network=host -v /sys:/sys -v /run:/run huatuo/huatuo-bamai:latest
|
||||
```
|
||||
|
||||
- **Quick Setup**
|
||||
If you want to dive deeper into HUATO's operation mechanisms and architecture, you can easily set up all components locally. We provide container images and simple configurations for developers to quickly understand HUATO.
|
||||

|
||||
<div style="text-align: center; margin: 8px 0 20px 0; color: #777;">
|
||||
<small>
|
||||
HUATUO Component Workflow<br>
|
||||
</small>
|
||||
</div>
|
||||
|
||||
For a quick setup, we provide a one-command solution to launch [elasticsearch](https://www.elastic.co), [prometheus](https://prometheus.io), [grafana](https://grafana.com) and huatuo-bamai. Once executed, click [http://localhost:3000](http://localhost:3000) to view the monitoring dashboards on your browser.
|
||||
|
||||
- Data related to event-driven operations Autotracing and Events, are stored in elasticsearch
|
||||
- Metrics-related data is actively collected and stored by prometheus
|
||||
- elasticsearch data reporting port: 9200
|
||||
- prometheus data source port: 9090
|
||||
- grafana port: 3000
|
||||
|
||||
## User-Defined Collection
|
||||
The built-in modules cover most monitoring needs. Additionally, HuaTuo supports custom data collection with easy integration. [How to Add Custom Collection](./docs/CUSTOM.md)
|
||||
|
||||
# Architectures
|
||||

|
||||
|
||||
# Observability Overview
|
||||
## Exception Totals
|
||||

|
||||

|
||||
## Profiling
|
||||

|
||||
## SKB dropwatch
|
||||

|
||||
## Net Latency
|
||||

|
||||
|
||||
# Functionality Overview
|
||||
## Autotracing
|
||||
| Tracing Name | Core Functionality | Scenarios |
|
||||
| ------------ | ----------------------- | ------------------------------------ |
|
||||
| cpu sys | Detects rising host cpu.sys utilization | Issues caused by abnormal cpu.sys load leading to jitters |
|
||||
| cpu idle | Detects low CPU idle in containers, provides call stack, flame graphs, process context info, etc. | Abnormal container CPU usage, helps identify process hotspots |
|
||||
| dload | Tracks processes in the D (uninterruptible) state, provides container runtime info, D-state process call stack, etc. | Issues caused by a sudden increase in the number of system D or R (runnable) state processes, leading to higher load. A spike in D-state processes is often related to unavailable resources or long-held locks, while R-state process spikes may indicate unreasonable user logic design |
|
||||
| waitrate | Detects CPU contention in containers, provides information about the contending containers | CPU contention in containers can cause jitters, and the existing contention metrics lack specific container info. Waitrate tracking can provide the info about the containers involved in the contention, which can be used as a reference for resource isolation in hybrid deployment scenarios |
|
||||
| mmburst | Records burst memory allocation context | Detects events where the host allocates a large amount of memory in a short time, which can lead to direct reclaim or OOM |
|
||||
| iotracer | When the host disk is full or I/O latency is abnormal, provides the file name, path, device, inode, and container context info for the abnormal I/O access | Frequent disk I/O bandwidth saturation or sudden I/O spikes can lead to application request latency or system performance jitters |
|
||||
|
||||
## Events
|
||||
| Event Name | Core Functionality | Scenarios |
|
||||
| -------------- | --------------------- | ------------------------------------ |
|
||||
| softirq | When the kernel delayed response in soft interrupts or prolonged shutdown, supports the call stack and process information of the soft interrupts that have been shut down for an extended period of time. | This type of issue can severely impact network receive/transmit, leading to jitters or latency |
|
||||
| dropwatch | Detects TCP packet drops, provides host and network context info when drops occur | This type of issue can cause jitters and latency |
|
||||
| netrecvlat | Captures latency events along the data packet receive path from the driver, TCP/IP stack, to user-level | For network latency issues, there is a class where the receive-side exhibits latency, but the location is unclear. The netrecvlat case calculates latency by timestamping the skb at the interface, driver, TCP/IP stack, and user-level copy, and filters timed-out packets to point the latency location |
|
||||
| oom | Detects OOM events in the host or containers | When OOM events occur at the host or container level, it can obtain information about the triggering process, the killed process, and container details, which is helpful for diagnosing process memory leaks, abnormal exits, etc. |
|
||||
| softlockup | When the system encounters a softlockup, it collects information about the target process, CPU, and kernel stack for per CPU | Used for investigating system softlockup incidents |
|
||||
| hungtask | Provides the number of processes in the D (uninterruptible) state and their kernel stack info | Used to identify and save the context of processes that suddenly enter the D state, for later investigation |
|
||||
| memreclaim | Records the latency when a process enters direct reclaim, if it exceeds a time threshold | When under memory pressure, if a process requests memory, it may enter direct reclaim, a synchronous reclaim phase that can cause process jitters. This records the time a process spends in direct reclaim, helping assess the impact on the affected process |
|
||||
|
||||
## Metrics
|
||||
Metrics collection involves various indicators from per subsystem, including CPU, memory, IO, network, etc. The primary sources of these metrics are procfs, eBPF, and computational aggregation, as follows is a summary. [for details](docs/metrics.md)
|
||||
|
||||
| Subsystem | Metric | Description | Dimension |
|
||||
| ------------| --------------- |------------------------------------ | ----------------------- |
|
||||
| cpu | sys, usr, util | Percentage | host, container |
|
||||
| cpu | burst, throttled | Number of periods burst occurs, times the group has been throttled/limited | container |
|
||||
| cpu | inner, exter_wait_rate | Wait rate caused by processes inside/outside the container | container |
|
||||
| cpu | nr_running, nr_uninterruptible | The number of running/uninterruptible tasks in the container | container |
|
||||
| cpu | load 1, 5, 15 | System load avg over the last x minute | container |
|
||||
| cpu | softirq_latency | The number of NET_RX/NET_TX irq latency happened | host |
|
||||
| cpu | runqlat_nlat | The number of times when schedule latency of processes in host/container is within x~xms | host, container |
|
||||
| cpu | reschedipi_oversell_probability | The possibility of cpu overselling exists on the host where the vm is located | host |
|
||||
| memory | direct_reclaim | Time speed in page allocation in memory cgroup | container |
|
||||
| memory | asyncreclaim | Memory cgroup's direct reclaim time in cgroup async memory reclaim | container |
|
||||
| memory | vmstat, memory_stat | Memory statistics | host, container |
|
||||
| memory | hungtask, oom, softlockup | Count of event happened | host, container |
|
||||
| IO | d2c | Statistics of io latency when accessing the disk, including the time consumed by the driver and hardware components | host, container |
|
||||
| IO | q2c | Statistics of io latency for the entire io lifecycle when accessing the disk | host, container |
|
||||
| IO | disk_freeze | Statistics of disk freeze events | host |
|
||||
| IO | disk_flush | Statistics of delay for flush operations on disk raid device | host, container |
|
||||
| network | arp | ARP entries | system, host, container |
|
||||
| network | tcp, udp mem | Socket memory | system |
|
||||
| network | qdisc | Qdisc statistics | host |
|
||||
| network | netdev | Network device metrics | host, container |
|
||||
| network | netstat | Network statistics | host, container |
|
||||
| network | sockstat | Socket statistics | host, container |
|
||||
|
||||
|
||||
# Contact Us
|
||||
You can report bugs, provide suggestions, or engage in discussions via Github Issues and Github Discussions. Alternatively, you can contact us using the following ways:
|
|
@ -0,0 +1,66 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
|
||||
#define CGROUP_KNODE_NAME_MAXLEN 64
|
||||
|
||||
struct cgroup_perf_event_t {
|
||||
u64 cgroup;
|
||||
u64 ops_type;
|
||||
s32 cgroup_root;
|
||||
s32 cgroup_level;
|
||||
u64 css[CGROUP_SUBSYS_COUNT];
|
||||
char knode_name[CGROUP_KNODE_NAME_MAXLEN + 2];
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(int));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} cgroup_perf_events SEC(".maps");
|
||||
|
||||
char __license[] SEC("license") = "GPL";
|
||||
|
||||
/* TP_PROTO(struct cgroup *cgrp, const char *path) */
|
||||
/*
 * Common handler for the cgroup_mkdir/cgroup_rmdir raw tracepoints.
 *
 * @ctx:  raw tracepoint args; args[0] is the struct cgroup being
 *        created/removed (see the TP_PROTO comment above).
 * @type: operation tag forwarded to user space (0 = mkdir, 1 = rmdir,
 *        set by the SEC() wrappers below).
 *
 * Emits one cgroup_perf_event_t record to the cgroup_perf_events perf
 * buffer. Always returns 0.
 */
static int
bpf_cgroup_event_class_prog(struct bpf_raw_tracepoint_args *ctx, u64 type)
{
	struct cgroup *cgrp = (void *)ctx->args[0];
	struct cgroup_perf_event_t data = {};
	int knode_len;

	/* knode name: copy the cgroup's kernfs node name (NUL-terminated) */
	knode_len =
	    bpf_probe_read_str(&data.knode_name, sizeof(data.knode_name),
			       BPF_CORE_READ(cgrp, kn, name));
	/*
	 * NOTE(review): bpf_probe_read_str() returns the copied length
	 * including the trailing NUL, so this condition keeps only names of
	 * exactly CGROUP_KNODE_NAME_MAXLEN characters and drops everything
	 * else. An error/overflow filter (e.g. `knode_len < 0`) looks like
	 * what was intended — confirm against the user-space consumer.
	 */
	if (knode_len != CGROUP_KNODE_NAME_MAXLEN + 1)
		return 0;

	data.ops_type = type;
	data.cgroup = (u64)cgrp;	/* raw kernel pointer of the cgroup */
	data.cgroup_root = BPF_CORE_READ(cgrp, root, hierarchy_id);
	data.cgroup_level = BPF_CORE_READ(cgrp, level);

	/* snapshot all per-subsystem css pointers */
	bpf_probe_read(&data.css, sizeof(u64) * CGROUP_SUBSYS_COUNT,
		       BPF_CORE_READ(cgrp, subsys));

	bpf_perf_event_output(ctx, &cgroup_perf_events, COMPAT_BPF_F_CURRENT_CPU,
			      &data, sizeof(data));
	return 0;
}
|
||||
|
||||
/* Raw tracepoint: a cgroup directory was created (ops_type 0). */
SEC("raw_tracepoint/cgroup_mkdir")
int bpf_cgroup_mkdir_prog(struct bpf_raw_tracepoint_args *ctx)
{
	return bpf_cgroup_event_class_prog(ctx, 0);
}

/* Raw tracepoint: a cgroup directory was removed (ops_type 1). */
SEC("raw_tracepoint/cgroup_rmdir")
int bpf_cgroup_rmdir_prog(struct bpf_raw_tracepoint_args *ctx)
{
	return bpf_cgroup_event_class_prog(ctx, 1);
}
|
|
@ -0,0 +1,56 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
|
||||
#define CGROUP_KNODE_NAME_MAXLEN 64
|
||||
|
||||
struct cgroup_perf_event_t {
|
||||
u64 cgroup;
|
||||
u64 ops_type;
|
||||
s32 cgroup_root;
|
||||
s32 cgroup_level;
|
||||
u64 css[CGROUP_SUBSYS_COUNT];
|
||||
char knode_name[CGROUP_KNODE_NAME_MAXLEN + 2];
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(int));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} cgroup_perf_events SEC(".maps");
|
||||
|
||||
char __license[] SEC("license") = "GPL";
|
||||
|
||||
/*
 * Kprobe on cgroup_clone_children_read(): used as a trigger to report an
 * existing cgroup (reached via the first css argument) with the same
 * cgroup_perf_event_t layout as the mkdir/rmdir tracepoints.
 */
SEC("kprobe/cgroup_clone_children_read")
int bpf_cgroup_clone_children_read_prog(struct pt_regs *ctx)
{
	struct cgroup_subsys_state *css = (void *)PT_REGS_PARM1(ctx);
	struct cgroup *cgrp = BPF_CORE_READ(css, cgroup);
	struct cgroup_perf_event_t data = {};
	int knode_len;

	/* knode name (NUL-terminated copy of the kernfs node name) */
	knode_len =
	    bpf_probe_read_str(&data.knode_name, sizeof(data.knode_name),
			       BPF_CORE_READ(cgrp, kn, name));
	/*
	 * NOTE(review): only names whose copied length (incl. NUL) equals
	 * CGROUP_KNODE_NAME_MAXLEN + 1 pass; shorter/longer names are
	 * dropped. Presumably an error/overflow check was intended —
	 * confirm (same pattern as cgroup.bpf.c).
	 */
	if (knode_len != CGROUP_KNODE_NAME_MAXLEN + 1)
		return 0;

	data.cgroup = (u64)cgrp;	/* raw kernel pointer of the cgroup */
	data.ops_type = 0;		/* same tag as mkdir events */
	data.cgroup_root = BPF_CORE_READ(cgrp, root, hierarchy_id);
	data.cgroup_level = BPF_CORE_READ(cgrp, level);

	/* css */
	bpf_probe_read(&data.css, sizeof(u64) * CGROUP_SUBSYS_COUNT,
		       BPF_CORE_READ(cgrp, subsys));

	/* output */
	bpf_perf_event_output(ctx, &cgroup_perf_events, COMPAT_BPF_F_CURRENT_CPU,
			      &data, sizeof(data));
	return 0;
}
|
|
@ -0,0 +1,181 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include "bpf_common.h"
|
||||
#include "bpf_ratelimit.h"
|
||||
#include "vmlinux_net.h"
|
||||
|
||||
#define TYPE_TCP_COMMON_DROP 1
|
||||
#define TYPE_TCP_SYN_FLOOD 2
|
||||
#define TYPE_TCP_LISTEN_OVERFLOW_HANDSHAKE1 3
|
||||
#define TYPE_TCP_LISTEN_OVERFLOW_HANDSHAKE3 4
|
||||
|
||||
#define SK_FL_PROTO_SHIFT 8
|
||||
#define SK_FL_PROTO_MASK 0x0000ff00
|
||||
#define SK_FL_TYPE_SHIFT 16
|
||||
#define SK_FL_TYPE_MASK 0xffff0000
|
||||
|
||||
struct perf_event_t {
|
||||
u64 tgid_pid;
|
||||
u32 saddr;
|
||||
u32 daddr;
|
||||
u16 sport;
|
||||
u16 dport;
|
||||
u32 seq;
|
||||
u32 ack_seq;
|
||||
u32 queue_mapping;
|
||||
u64 pkt_len;
|
||||
s64 stack_size;
|
||||
u64 stack[PERF_MAX_STACK_DEPTH];
|
||||
u32 sk_max_ack_backlog;
|
||||
u8 state;
|
||||
u8 type;
|
||||
char comm[COMPAT_TASK_COMM_LEN];
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(int));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} perf_events SEC(".maps");
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
||||
__uint(max_entries, 1);
|
||||
__uint(key_size, sizeof(u32));
|
||||
__uint(value_size, sizeof(struct perf_event_t));
|
||||
} dropwatch_stackmap SEC(".maps");
|
||||
|
||||
char __license[] SEC("license") = "Dual MIT/GPL";
|
||||
|
||||
static const struct perf_event_t zero_data = {};
|
||||
static const u32 stackmap_key = 0;
|
||||
|
||||
BPF_RATELIMIT(rate, 1, 100); // 100/s
|
||||
|
||||
struct sock___5_10 {
|
||||
u16 sk_type;
|
||||
u16 sk_protocol;
|
||||
} __attribute__((preserve_access_index));
|
||||
|
||||
/*
 * Read sk->sk_protocol and sk->sk_type in a kernel-version-portable way.
 * On kernels <= 4.18 both values are packed in a bitfield reachable only
 * through the zero-size __sk_flags_offset marker; newer kernels expose
 * plain u16 fields read via CO-RE. bpf_core_field_exists() selects the
 * layout at load time.
 */
static void sk_get_type_and_protocol(struct sock *sk, u16 *protocol, u16 *type)
{
	// kernel version <= 4.18
	//
	// struct sock {
	//     unsigned int __sk_flags_offset[0];
	// #ifdef __BIG_ENDIAN_BITFIELD
	// #define SK_FL_PROTO_SHIFT  16
	// #define SK_FL_PROTO_MASK   0x00ff0000
	// #
	// #define SK_FL_TYPE_SHIFT   0
	// #define SK_FL_TYPE_MASK    0x0000ffff
	// #else
	// #define SK_FL_PROTO_SHIFT  8
	// #define SK_FL_PROTO_MASK   0x0000ff00
	// #
	// #define SK_FL_TYPE_SHIFT   16
	// #define SK_FL_TYPE_MASK    0xffff0000
	// #endif
	//
	// unsigned int sk_padding : 1,
	//              sk_kern_sock : 1,
	//              sk_no_check_tx : 1,
	//              sk_no_check_rx : 1,
	//              sk_userlocks : 4,
	//              sk_protocol : 8,
	//              sk_type : 16;
	// }
	if (bpf_core_field_exists(sk->__sk_flags_offset)) {
		u32 sk_flags;

		bpf_probe_read(&sk_flags, sizeof(sk_flags),
			       &sk->__sk_flags_offset);
		/* little-endian shifts; caller truncates protocol to u8 */
		*protocol = sk_flags >> SK_FL_PROTO_SHIFT;
		*type = sk_flags >> SK_FL_TYPE_SHIFT;
		return;
	}

	// newer layout (e.g. 5.10):
	// struct sock {
	//     u16 sk_type;
	//     u16 sk_protocol;
	// }
	struct sock___5_10 *sk_new = (struct sock___5_10 *)sk;

	*protocol = BPF_CORE_READ(sk_new, sk_protocol);
	*type = BPF_CORE_READ(sk_new, sk_type);
	return;
}
|
||||
|
||||
/*
 * Tracepoint skb:kfree_skb — "drop watch" for IPv4/TCP packets.
 * For each freed skb that belongs to an AF_INET/SOCK_STREAM/TCP socket in
 * a non-CLOSE state, emits a perf_event_t (including the kernel stack at
 * the drop site), rate-limited to `rate` (100/s).
 */
SEC("tracepoint/skb/kfree_skb")
int bpf_kfree_skb_prog(struct trace_event_raw_kfree_skb *ctx)
{
	struct sk_buff *skb = ctx->skbaddr;
	struct perf_event_t *data = NULL;
	struct sock_common *sk_common;
	struct tcphdr tcphdr;
	struct iphdr iphdr;
	struct sock *sk;
	u16 protocol = 0;
	u16 type = 0;
	u8 state = 0;

	/* only for IP && TCP */
	if (ctx->protocol != ETH_P_IP)
		return 0;

	bpf_probe_read(&iphdr, sizeof(iphdr), skb_network_header(skb));
	if (iphdr.protocol != IPPROTO_TCP)
		return 0;

	/* skbs without an attached socket are not interesting here */
	sk = BPF_CORE_READ(skb, sk);
	if (!sk)
		return 0;

	sk_common = (struct sock_common *)sk;

	// filter the sock by AF_INET, SOCK_STREAM, IPPROTO_TCP
	if (BPF_CORE_READ(sk_common, skc_family) != AF_INET)
		return 0;

	sk_get_type_and_protocol(sk, &protocol, &type);
	if ((u8)protocol != IPPROTO_TCP || type != SOCK_STREAM)
		return 0;

	/* skip sockets already closed or with an uninitialized state */
	state = BPF_CORE_READ(sk_common, skc_state);
	if (state == TCP_CLOSE || state == 0)
		return 0;

	if (bpf_ratelimited(&rate))
		return 0;

	/*
	 * Per-CPU scratch slot: perf_event_t (stack array included) is far
	 * larger than the 512-byte BPF stack, so it lives in a map.
	 */
	data = bpf_map_lookup_elem(&dropwatch_stackmap, &stackmap_key);
	if (!data) {
		return 0;
	}

	bpf_probe_read(&tcphdr, sizeof(tcphdr), skb_transport_header(skb));

	/* event: addresses/ports/seqs copied raw, i.e. network byte order */
	data->tgid_pid = bpf_get_current_pid_tgid();
	bpf_get_current_comm(&data->comm, sizeof(data->comm));
	data->type = TYPE_TCP_COMMON_DROP;
	data->state = state;
	data->saddr = iphdr.saddr;
	data->daddr = iphdr.daddr;
	data->sport = tcphdr.source;
	data->dport = tcphdr.dest;
	data->seq = tcphdr.seq;
	data->ack_seq = tcphdr.ack_seq;
	data->pkt_len = BPF_CORE_READ(skb, len);
	data->queue_mapping = BPF_CORE_READ(skb, queue_mapping);
	data->stack_size =
	    bpf_get_stack(ctx, data->stack, sizeof(data->stack), 0);
	data->sk_max_ack_backlog = 0;	/* unused for this event type */

	bpf_perf_event_output(ctx, &perf_events, COMPAT_BPF_F_CURRENT_CPU, data,
			      sizeof(*data));

	/* re-zero the scratch slot for the next event on this CPU */
	bpf_map_update_elem(&dropwatch_stackmap, &stackmap_key, &zero_data,
			    COMPAT_BPF_EXIST);
	return 0;
}
|
|
@ -0,0 +1,38 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
#include "bpf_ratelimit.h"
|
||||
|
||||
char __license[] SEC("license") = "Dual MIT/GPL";
|
||||
|
||||
BPF_RATELIMIT_IN_MAP(rate, 1, COMPAT_CPU_NUM * 10000, 0);
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(int));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} hungtask_perf_events SEC(".maps");
|
||||
|
||||
struct hungtask_info {
|
||||
int32_t pid;
|
||||
char comm[COMPAT_TASK_COMM_LEN];
|
||||
};
|
||||
|
||||
/*
 * Tracepoint sched:sched_process_hang — fired by the kernel's hung-task
 * detector (per the tracepoint name). Forwards the hung task's pid and
 * comm to user space, rate-limited via `rate`.
 */
SEC("tracepoint/sched/sched_process_hang")
int tracepoint_sched_process_hang(struct trace_event_raw_sched_process_hang *ctx)
{
	struct hungtask_info info = {};

	if (bpf_ratelimited_in_map(ctx, rate))
		return 0;

	info.pid = ctx->pid;
	bpf_probe_read_str(&info.comm, COMPAT_TASK_COMM_LEN, ctx->comm);
	bpf_perf_event_output(ctx, &hungtask_perf_events,
			      COMPAT_BPF_F_CURRENT_CPU, &info, sizeof(info));
	return 0;
}
|
|
@ -0,0 +1,33 @@
|
|||
#ifndef __BPF_COMMON_H__
|
||||
#define __BPF_COMMON_H__
|
||||
|
||||
#ifndef NULL
|
||||
#define NULL ((void *)0)
|
||||
#endif
|
||||
|
||||
/* define COMPAT_XXX for compat old kernel vmlinux.h */
|
||||
#define COMPAT_BPF_F_CURRENT_CPU 0xffffffffULL
|
||||
|
||||
#define COMPAT_TASK_COMM_LEN 16
|
||||
#define PATH_MAX 4096 /* # chars in a path name including nul */
|
||||
#define COMPAT_CPU_NUM 128
|
||||
|
||||
/* include/uapi/linux/perf_event.h */
|
||||
#define PERF_MAX_STACK_DEPTH 127
|
||||
#define PERF_MIN_STACK_DEPTH 16
|
||||
|
||||
/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
|
||||
#define COMPAT_BPF_F_USER_STACK 256
|
||||
|
||||
/* flags for BPF_MAP_UPDATE_ELEM command */
|
||||
#define COMPAT_BPF_ANY 0 /* create new element or update existing */
|
||||
#define COMPAT_BPF_NOEXIST 1 /* create new element if it didn't exist */
|
||||
#define COMPAT_BPF_EXIST 2 /* update existing element */
|
||||
#define COMPAT_BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */
|
||||
|
||||
#define NR_SOFTIRQS_MAX 16
|
||||
|
||||
#define NSEC_PER_MSEC 1000000UL
|
||||
#define NSEC_PER_USEC 1000UL
|
||||
|
||||
#endif /* __BPF_COMMON_H__ */
|
|
@ -0,0 +1,48 @@
|
|||
#ifndef __BPF_FUNC_TRACE_H__
|
||||
#define __BPF_FUNC_TRACE_H__
|
||||
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
struct trace_entry_ctx {
|
||||
u64 id;
|
||||
u64 start_ns;
|
||||
u64 delta_ns;
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, u64);
|
||||
__type(value, struct trace_entry_ctx);
|
||||
__uint(max_entries, 10240);
|
||||
} func_trace_map SEC(".maps");
|
||||
|
||||
/*
 * Record the start of a traced call, keyed by @id (typically
 * bpf_get_current_pid_tgid()). Overwrites any stale entry for the same id.
 * NOTE(review): "begain" is a typo for "begin"; kept as-is because callers
 * in other compilation units use this name.
 */
static __always_inline void func_trace_begain(u64 id)
{
	struct trace_entry_ctx entry = {
		.start_ns = bpf_ktime_get_ns(),
		.id = id,
	};

	bpf_map_update_elem(&func_trace_map, &id, &entry, COMPAT_BPF_ANY);
}

/*
 * Close the trace opened by func_trace_begain(): compute delta_ns and
 * return the (still map-resident) entry, or NULL when no matching begin
 * was recorded. Callers must release the entry with func_trace_destroy().
 */
static __always_inline struct trace_entry_ctx *func_trace_end(u64 id)
{
	struct trace_entry_ctx *entry;

	entry = bpf_map_lookup_elem(&func_trace_map, &id);
	if (!entry) {
		return NULL;
	}

	// update any elem you need!
	entry->delta_ns = bpf_ktime_get_ns() - entry->start_ns;
	return entry;
}

/* Drop the trace entry for @id. */
static __always_inline void func_trace_destroy(u64 id)
{
	bpf_map_delete_elem(&func_trace_map, &id);
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,113 @@
|
|||
#ifndef __BPF_RATELIMIT_H__
|
||||
#define __BPF_RATELIMIT_H__
|
||||
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
struct bpf_ratelimit {
|
||||
uint64_t interval; // unit: second
|
||||
uint64_t begin;
|
||||
uint64_t burst; // max events/interval
|
||||
uint64_t max_burst; // max burst
|
||||
uint64_t events; // current events/interval
|
||||
uint64_t nmissed; // missed events/interval
|
||||
|
||||
uint64_t total_events; // total events
|
||||
uint64_t total_nmissed; // total missed events
|
||||
uint64_t total_interval; // total interval
|
||||
};
|
||||
|
||||
#define BPF_RATELIMIT(name, interval, burst) \
|
||||
struct bpf_ratelimit name = {interval, 0, burst, 0, 0, 0, 0, 0, 0}
|
||||
|
||||
// bpf_ratelimited: whether the threshold is exceeded
|
||||
//
|
||||
// @rate: struct bpf_ratelimit *
|
||||
// @return:
|
||||
// true: the threshold is exceeded
|
||||
// false: the threshold is not exceeded
|
||||
static __always_inline bool bpf_ratelimited(struct bpf_ratelimit *rate)
{
	// validate: interval == 0 disables rate limiting (never drops)
	if (rate == NULL || rate->interval == 0)
		return false;

	/* coarse 1-second-resolution clock */
	u64 curr = bpf_ktime_get_ns() / 1000000000;

	/* lazy initialization on the first call */
	if (rate->begin == 0)
		rate->begin = curr;

	/*
	 * Window rolled over: fold the elapsed time into total_interval and
	 * reset the per-window counters.
	 * NOTE(review): begin/events/nmissed are reset with plain stores, so
	 * concurrent CPUs can race on rollover; only the counters use atomic
	 * adds. Acceptable for statistics — confirm that is the intent.
	 */
	if (curr > rate->begin + rate->interval) {
		__sync_fetch_and_add(&rate->total_interval, curr - rate->begin);
		rate->begin = curr;
		rate->events = rate->nmissed = 0;
	}

	/* under budget: count the event and let it through */
	if (rate->burst && rate->burst > rate->events) {
		__sync_fetch_and_add(&rate->events, 1);
		__sync_fetch_and_add(&rate->total_events, 1);
		return false;
	}

	/* over budget: count the miss and report "exceeded" */
	__sync_fetch_and_add(&rate->nmissed, 1);
	__sync_fetch_and_add(&rate->total_nmissed, 1);
	return true;
}
|
||||
|
||||
#define BPF_RATELIMIT_IN_MAP(name, interval, burst, max_burst) \
|
||||
struct { \
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY); \
|
||||
__uint(key_size, sizeof(u32)); \
|
||||
__uint(value_size, sizeof(struct bpf_ratelimit)); \
|
||||
__uint(max_entries, 1); \
|
||||
} bpf_rlimit_##name SEC(".maps"); \
|
||||
struct { \
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); \
|
||||
__uint(key_size, sizeof(int)); \
|
||||
__uint(value_size, sizeof(u32)); \
|
||||
} event_bpf_rlimit_##name SEC(".maps"); \
|
||||
volatile const struct bpf_ratelimit ___bpf_rlimit_cfg_##name = { \
|
||||
interval, 0, burst, max_burst, 0, 0, 0, 0, 0}
|
||||
|
||||
// bpf_ratelimited_in_map: whether the threshold is exceeded
|
||||
//
|
||||
// @rate: struct bpf_ratelimit *
|
||||
// @return:
|
||||
// true: the threshold is exceeded
|
||||
// false: the threshold is not exceeded
|
||||
#define bpf_ratelimited_in_map(ctx, rate) \
|
||||
bpf_ratelimited_core_in_map(ctx, &bpf_rlimit_##rate, \
|
||||
&event_bpf_rlimit_##rate, \
|
||||
&___bpf_rlimit_cfg_##rate)
|
||||
|
||||
static __always_inline bool
bpf_ratelimited_core_in_map(void *ctx, void *map, void *perf_map,
			    const volatile struct bpf_ratelimit *cfg)
{
	u32 key = 0;
	struct bpf_ratelimit *rate = NULL;

	/* single-slot ARRAY map declared by BPF_RATELIMIT_IN_MAP() */
	rate = bpf_map_lookup_elem(map, &key);
	if (rate == NULL)
		return false;	/* no state: fail open, never drop */

	// init from cfg on first use: array values start zeroed, and a
	// configured limiter always has interval != 0
	if (rate->interval == 0) {
		rate->interval = cfg->interval;
		rate->burst = cfg->burst;
		rate->max_burst = cfg->max_burst;
	}

	// the threshold is not exceeded, return false
	u64 old_nmissed = rate->nmissed;
	if (!bpf_ratelimited(rate))
		return false;

	// the threshold/max_burst is exceeded: notify user space on the
	// first miss of a window, or once misses exceed the max_burst
	// headroom (max_burst - burst)
	if (old_nmissed == 0 || (rate->max_burst > 0 &&
				 rate->nmissed > rate->max_burst - rate->burst))
		bpf_perf_event_output(ctx, perf_map, COMPAT_BPF_F_CURRENT_CPU, rate,
				      sizeof(struct bpf_ratelimit));
	return true;
}
|
||||
|
||||
#endif
|
|
@ -0,0 +1,26 @@
|
|||
#ifndef __VMLINUX_NET_H__
|
||||
#define __VMLINUX_NET_H__
|
||||
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_core_read.h>
|
||||
|
||||
#define IFNAMSIZ 16
|
||||
|
||||
#define ETH_P_IP 0x0800 /* Internet Protocol packet */
|
||||
#define AF_INET 2 /* Internet IP Protocol */
|
||||
|
||||
#define IP_MF 0x2000 /* Flag: "More Fragments" */
|
||||
#define IP_OFFSET 0x1FFF /* "Fragment Offset" part */
|
||||
|
||||
// skb_network_header - get the network header from sk_buff
|
||||
// Mirrors the kernel's skb_network_header(): header pointer is skb->head
// plus the cached offset, read via CO-RE for cross-kernel portability.
static inline unsigned char *skb_network_header(struct sk_buff *skb)
{
	return BPF_CORE_READ(skb, head) + BPF_CORE_READ(skb, network_header);
}

// skb_transport_header - get the transport header from sk_buff
// (same construction as skb_network_header above)
static inline unsigned char *skb_transport_header(struct sk_buff *skb)
{
	return BPF_CORE_READ(skb, head) + BPF_CORE_READ(skb, transport_header);
}
|
||||
#endif
|
|
@ -0,0 +1,7 @@
|
|||
#ifndef __VMLINUX_SCHED_H__
|
||||
#define __VMLINUX_SCHED_H__
|
||||
|
||||
/* copy from include/linux/sched.h */
|
||||
#define PF_KSWAPD 0x00020000 /* I am kswapd */
|
||||
|
||||
#endif
|
|
@ -0,0 +1,25 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(int));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} ad_event_map SEC(".maps");
|
||||
|
||||
/*
 * Kprobe on ad_disable_collecting_distributing() — per the probe name this
 * is the 802.3ad/LACP path of the bonding module; confirm against the
 * target kernel. The payload is a dummy u64: the event itself is the signal.
 */
SEC("kprobe/ad_disable_collecting_distributing")
int ad_disable(struct pt_regs *ctx)
{
	// nothing to do here, only notify user space, because this is a
	// ko module and CO-RE relocation is not supported directly at old
	// kernel
	u64 nothing = 0;
	bpf_perf_event_output(ctx, &ad_event_map, COMPAT_BPF_F_CURRENT_CPU,
			      &nothing, sizeof(nothing));
	return 0;
}
|
||||
|
||||
char __license[] SEC("license") = "Dual MIT/GPL";
|
|
@ -0,0 +1,91 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
#include "bpf_func_trace.h"
|
||||
|
||||
struct mm_free_compact_entry {
|
||||
/* host: compaction latency */
|
||||
unsigned long compaction_stat;
|
||||
/* host: page alloc latency in direct reclaim */
|
||||
unsigned long allocstall_stat;
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, int);
|
||||
__type(value, struct mm_free_compact_entry);
|
||||
__uint(max_entries, 10240);
|
||||
} mm_free_compact_map SEC(".maps");
|
||||
|
||||
char __license[] SEC("license") = "Dual MIT/GPL";
|
||||
|
||||
/*
 * Accumulate reclaim/compaction latency into the single-entry metric map.
 *
 * @free_delta_ns:    ns spent in direct reclaim (0 when not applicable)
 * @compact_delta_ns: ns spent in compaction (0 when not applicable)
 *
 * The first caller creates the entry; later callers add atomically. The
 * lookup-miss/update pair can race across CPUs, losing at most one sample.
 */
static __always_inline void
update_metric_map(u64 free_delta_ns, u64 compact_delta_ns)
{
	struct mm_free_compact_entry *valp;
	int key = 0;

	valp = bpf_map_lookup_elem(&mm_free_compact_map, &key);
	if (!valp) {
		struct mm_free_compact_entry new_metrics = {
			.allocstall_stat = free_delta_ns,
			.compaction_stat = compact_delta_ns,
		};
		bpf_map_update_elem(&mm_free_compact_map, &key, &new_metrics,
				    COMPAT_BPF_ANY);
		return;
	}

	if (free_delta_ns)
		__sync_fetch_and_add(&valp->allocstall_stat, free_delta_ns);

	if (compact_delta_ns)
		__sync_fetch_and_add(&valp->compaction_stat, compact_delta_ns);
}
|
||||
|
||||
/*
 * Close the per-task trace and fold its latency into the metric map.
 * @free_pages: true  => account as direct-reclaim (allocstall) time,
 *              false => account as compaction time.
 * No-op when no matching func_trace_begain() was recorded.
 */
static __always_inline void func_trace_end_and_update_metric(bool free_pages)
{
	struct trace_entry_ctx *entry;

	entry = func_trace_end(bpf_get_current_pid_tgid());
	if (!entry)
		return;

	if (free_pages)
		update_metric_map(entry->delta_ns, 0);
	else
		update_metric_map(0, entry->delta_ns);

	func_trace_destroy(entry->id);
}
|
||||
|
||||
/* Direct reclaim entry: stamp the start time for the current task. */
SEC("tracepoint/vmscan/mm_vmscan_direct_reclaim_begin")
int tracepoint_try_to_free_pages_begin(struct pt_regs *ctx)
{
	func_trace_begain(bpf_get_current_pid_tgid());
	return 0;
}

/* Direct reclaim exit: account the elapsed ns as allocstall time. */
SEC("tracepoint/vmscan/mm_vmscan_direct_reclaim_end")
int tracepoint_try_to_free_pages_end(struct pt_regs *ctx)
{
	func_trace_end_and_update_metric(true);
	return 0;
}

/* Compaction entry: stamp the start time for the current task. */
SEC("kprobe/try_to_compact_pages")
int kprobe_try_to_compact_pages_host(struct pt_regs *ctx)
{
	func_trace_begain(bpf_get_current_pid_tgid());
	return 0;
}

/* Compaction exit: account the elapsed ns as compaction time. */
SEC("kretprobe/try_to_compact_pages")
int kretprobe_try_to_compact_pages_host(struct pt_regs *ctx)
{
	func_trace_end_and_update_metric(false);
	return 0;
}
|
|
@ -0,0 +1,56 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
#include "vmlinux_sched.h"
|
||||
|
||||
char __license[] SEC("license") = "Dual MIT/GPL";
|
||||
|
||||
struct mem_cgroup_metric {
|
||||
/* cg: direct reclaim count caused by try_charge */
|
||||
unsigned long directstall_count;
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, unsigned long);
|
||||
__type(value, struct mem_cgroup_metric);
|
||||
__uint(max_entries, 10240);
|
||||
} mem_cgroup_map SEC(".maps");
|
||||
|
||||
/*
 * Per-memcg direct reclaim counter: on each memcg reclaim begin (excluding
 * kswapd), bump a counter keyed by the current task's memory css pointer.
 */
SEC("tracepoint/vmscan/mm_vmscan_memcg_reclaim_begin")
int tracepoint_vmscan_mm_vmscan_memcg_reclaim_begin(struct pt_regs *ctx)
{
	struct cgroup_subsys_state *mm_subsys;
	struct mem_cgroup_metric *valp;
	struct task_struct *task;

	task = (struct task_struct *)bpf_get_current_task();
	/* kswapd is background reclaim; only count direct reclaim */
	if (BPF_CORE_READ(task, flags) & PF_KSWAPD)
		return 0;

	/* key: pointer value of the task's memory-controller css */
	mm_subsys = BPF_CORE_READ(task, cgroups, subsys[memory_cgrp_id]);
	valp = bpf_map_lookup_elem(&mem_cgroup_map, &mm_subsys);
	if (!valp) {
		struct mem_cgroup_metric new_metrics = {
			.directstall_count = 1,
		};
		bpf_map_update_elem(&mem_cgroup_map, &mm_subsys, &new_metrics,
				    COMPAT_BPF_ANY);
		return 0;
	}

	__sync_fetch_and_add(&valp->directstall_count, 1);
	return 0;
}

/*
 * Map GC: when a memcg css is released, delete its counter so the map
 * does not fill with entries for dead cgroups.
 */
SEC("kprobe/mem_cgroup_css_released")
int kprobe_mem_cgroup_css_released(struct pt_regs *ctx)
{
	u64 css = PT_REGS_PARM1(ctx);
	bpf_map_delete_elem(&mem_cgroup_map, &css);
	return 0;
}
|
|
@ -0,0 +1,64 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
#include "bpf_func_trace.h"
|
||||
#include "bpf_ratelimit.h"
|
||||
|
||||
char __license[] SEC("license") = "Dual MIT/GPL";
|
||||
|
||||
volatile const unsigned long deltath = 0;
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(int));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} reclaim_perf_events SEC(".maps");
|
||||
|
||||
struct reclaim_entry {
|
||||
char comm[COMPAT_TASK_COMM_LEN];
|
||||
u64 delta_time;
|
||||
u64 css;
|
||||
u64 pid;
|
||||
};
|
||||
|
||||
/* try_to_free_pages() entry: stamp the start time keyed by tgid/pid. */
SEC("kprobe/try_to_free_pages")
int kprobe_try_to_free_pages(struct pt_regs *ctx)
{
	func_trace_begain(bpf_get_current_pid_tgid());
	return 0;
}

/*
 * try_to_free_pages() return: when the reclaim latency exceeds the
 * user-configured threshold `deltath`, emit a reclaim_entry perf event
 * identifying the task and its cgroup css.
 */
SEC("kretprobe/try_to_free_pages")
int kretprobe_try_to_free_pages(struct pt_regs *ctx)
{
	struct trace_entry_ctx *entry;
	struct task_struct *task;

	entry = func_trace_end(bpf_get_current_pid_tgid());
	if (!entry)
		return 0;

	if (entry->delta_ns > deltath) {
		task = (struct task_struct *)bpf_get_current_task();

		struct reclaim_entry data = {
			.pid = entry->id,	/* full tgid_pid value */
			/*
			 * NOTE(review): reads the *cpu* controller's css
			 * (cpu_cgrp_id), not memory — confirm intended.
			 */
			.css = (u64)BPF_CORE_READ(task, cgroups,
						  subsys[cpu_cgrp_id]),
			.delta_time = entry->delta_ns,
		};

		bpf_get_current_comm(data.comm, sizeof(data.comm));

		bpf_perf_event_output(ctx, &reclaim_perf_events,
				      COMPAT_BPF_F_CURRENT_CPU, &data,
				      sizeof(struct reclaim_entry));
	}

	func_trace_destroy(entry->id);
	return 0;
}
|
|
@ -0,0 +1,177 @@
|
|||
//go:build ignore
|
||||
|
||||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_endian.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
#include "bpf_ratelimit.h"
|
||||
#include "vmlinux_net.h"
|
||||
|
||||
volatile const long long mono_wall_offset = 0;
|
||||
volatile const long long to_netif = 5 * 1000 * 1000; // 5ms
|
||||
volatile const long long to_tcpv4 = 10 * 1000 * 1000; // 10ms
|
||||
volatile const long long to_user_copy = 115 * 1000 * 1000; // 115ms
|
||||
|
||||
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
BPF_RATELIMIT(rate, 1, 100);
|
||||
|
||||
struct perf_event_t {
|
||||
char comm[COMPAT_TASK_COMM_LEN];
|
||||
u64 latency;
|
||||
u64 tgid_pid;
|
||||
u64 pkt_len;
|
||||
u16 sport;
|
||||
u16 dport;
|
||||
u32 saddr;
|
||||
u32 daddr;
|
||||
u32 seq;
|
||||
u32 ack_seq;
|
||||
u8 state;
|
||||
u8 where;
|
||||
};
|
||||
|
||||
enum skb_rcv_where {
|
||||
TO_NETIF_RCV,
|
||||
TO_TCPV4_RCV,
|
||||
TO_USER_COPY,
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(int));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} net_recv_lat_event_map SEC(".maps");
|
||||
|
||||
struct mix {
|
||||
struct iphdr *ip_hdr;
|
||||
u64 lat;
|
||||
u8 state;
|
||||
u8 where;
|
||||
};
|
||||
|
||||
/*
 * Nanoseconds from the skb's software RX timestamp (wall clock) to now.
 * Returns 0 when skb->tstamp is unset. mono_wall_offset (provided by user
 * space) converts bpf_ktime_get_ns()'s monotonic clock to wall time.
 */
static inline u64 delta_now_skb_tstamp(struct sk_buff *skb)
{
	u64 tstamp = BPF_CORE_READ(skb, tstamp);
	// although the skb->tstamp record is opened in user space by
	// SOF_TIMESTAMPING_RX_SOFTWARE, it is still 0 in the following cases:
	// unix recv, netlink recv, few virtual dev (e.g. tun dev, napi disabled)
	if (!tstamp)
		return 0;

	return bpf_ktime_get_ns() + mono_wall_offset - tstamp;
}

/* TCP state (TCP_ESTABLISHED, ...) of the socket owning this skb. */
static inline u8 get_state(struct sk_buff *skb)
{
	return BPF_CORE_READ(skb, sk, __sk_common.skc_state);
}
|
||||
|
||||
/*
 * Build and emit one perf_event_t for a delayed skb.
 *
 * @ctx:  program context (for bpf_perf_event_output)
 * @skb:  the delayed packet
 * @_mix: precomputed latency, TCP state, stage tag, plus a pointer to the
 *        caller's already-copied IP header
 *
 * Rate-limited via `rate`. tgid/comm are filled only for TO_USER_COPY,
 * where the program runs in the context of the receiving task.
 */
static inline void
fill_and_output_event(void *ctx, struct sk_buff *skb, struct mix *_mix)
{
	struct perf_event_t event = {};
	struct tcphdr tcp_hdr;

	// ratelimit
	if (bpf_ratelimited(&rate))
		return;

	if (likely(_mix->where == TO_USER_COPY)) {
		event.tgid_pid = bpf_get_current_pid_tgid();
		bpf_get_current_comm(&event.comm, sizeof(event.comm));
	}

	/* addresses/ports/seqs are copied raw, i.e. network byte order */
	bpf_probe_read(&tcp_hdr, sizeof(tcp_hdr), skb_transport_header(skb));
	event.latency = _mix->lat;
	event.saddr = _mix->ip_hdr->saddr;
	event.daddr = _mix->ip_hdr->daddr;
	event.sport = tcp_hdr.source;
	event.dport = tcp_hdr.dest;
	event.seq = tcp_hdr.seq;
	event.ack_seq = tcp_hdr.ack_seq;
	event.pkt_len = BPF_CORE_READ(skb, len);
	event.state = _mix->state;
	event.where = _mix->where;

	bpf_perf_event_output(ctx, &net_recv_lat_event_map,
			      COMPAT_BPF_F_CURRENT_CPU, &event,
			      sizeof(struct perf_event_t));
}
|
||||
|
||||
/*
 * Stage 1: net:netif_receive_skb tracepoint. Reports IPv4/TCP skbs whose
 * delay from the RX timestamp already exceeds to_netif (default 5 ms).
 * No socket is attached yet at this stage, so state is reported as 0.
 */
SEC("tracepoint/net/netif_receive_skb")
int netif_receive_skb_prog(struct trace_event_raw_net_dev_template *args)
{
	struct sk_buff *skb = (struct sk_buff *)args->skbaddr;
	struct iphdr ip_hdr;
	u64 delta;

	if (unlikely(BPF_CORE_READ(skb, protocol) !=
		     bpf_ntohs(ETH_P_IP))) // IPv4
		return 0;

	bpf_probe_read(&ip_hdr, sizeof(ip_hdr), skb_network_header(skb));
	if (ip_hdr.protocol != IPPROTO_TCP)
		return 0;

	delta = delta_now_skb_tstamp(skb);
	if (delta < to_netif)
		return 0;

	fill_and_output_event(args, skb,
			      &(struct mix){&ip_hdr, delta, 0, TO_NETIF_RCV});

	return 0;
}
|
||||
|
||||
/*
 * Stage 2: entry to tcp_v4_rcv(). Reports skbs whose delay exceeds
 * to_tcpv4 (default 10 ms); IPv4/TCP is implied by the probed function,
 * so no protocol filter is needed here.
 */
SEC("kprobe/tcp_v4_rcv")
int tcp_v4_rcv_prog(struct pt_regs *ctx)
{
	struct sk_buff *skb = (struct sk_buff *)PT_REGS_PARM1_CORE(ctx);
	struct iphdr ip_hdr;
	u64 delta;

	delta = delta_now_skb_tstamp(skb);
	if (delta < to_tcpv4)
		return 0;

	bpf_probe_read(&ip_hdr, sizeof(ip_hdr), skb_network_header(skb));
	fill_and_output_event(
	    ctx, skb,
	    &(struct mix){&ip_hdr, delta, get_state(skb), TO_TCPV4_RCV});

	return 0;
}

/*
 * Stage 3: skb:skb_copy_datagram_iovec tracepoint — payload is being
 * copied to user space. Reports IPv4/TCP skbs whose total in-stack delay
 * exceeds to_user_copy (default 115 ms).
 */
SEC("tracepoint/skb/skb_copy_datagram_iovec")
int skb_copy_datagram_iovec_prog(
    struct trace_event_raw_skb_copy_datagram_iovec *args)
{
	struct sk_buff *skb = (struct sk_buff *)args->skbaddr;
	struct iphdr ip_hdr;
	u64 delta;

	if (unlikely(BPF_CORE_READ(skb, protocol) != bpf_ntohs(ETH_P_IP)))
		return 0;

	bpf_probe_read(&ip_hdr, sizeof(ip_hdr), skb_network_header(skb));
	if (ip_hdr.protocol != IPPROTO_TCP)
		return 0;

	delta = delta_now_skb_tstamp(skb);
	if (delta < to_user_copy)
		return 0;

	fill_and_output_event(
	    args, skb,
	    &(struct mix){&ip_hdr, delta, get_state(skb), TO_USER_COPY});

	return 0;
}
|
||||
|
||||
char __license[] SEC("license") = "Dual MIT/GPL";
|
|
@ -0,0 +1,59 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
#include "bpf_ratelimit.h"
|
||||
|
||||
char __license[] SEC("license") = "Dual MIT/GPL";
|
||||
|
||||
BPF_RATELIMIT_IN_MAP(rate, 1, COMPAT_CPU_NUM * 10000, 0);
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(int));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} oom_perf_events SEC(".maps");
|
||||
|
||||
struct oom_info {
|
||||
char trigger_comm[COMPAT_TASK_COMM_LEN];
|
||||
char victim_comm[COMPAT_TASK_COMM_LEN];
|
||||
u32 trigger_pid;
|
||||
u32 victim_pid;
|
||||
u64 trigger_memcg_css;
|
||||
u64 victim_memcg_css;
|
||||
};
|
||||
|
||||
/*
 * Kprobe on oom_kill_process(): reports each OOM kill to user space with
 * the triggering task (current) and the chosen victim — pids, comms and
 * memory-controller css pointers. Rate-limited via `rate`.
 */
SEC("kprobe/oom_kill_process")
int kprobe_oom_kill_process(struct pt_regs *ctx)
{
	struct oom_control *oc;
	struct oom_info info = {};
	struct task_struct *trigger_task, *victim_task;

	if (bpf_ratelimited_in_map(ctx, rate))
		return 0;

	/*
	 * First argument of oom_kill_process(). Use the arch-independent
	 * PT_REGS_PARM1 macro (as kprobe_mem_cgroup_css_released does)
	 * instead of the x86-64-only ctx->di register access.
	 */
	oc = (void *)PT_REGS_PARM1(ctx);
	if (!oc)
		return 0;

	trigger_task = (struct task_struct *)bpf_get_current_task();
	victim_task = BPF_CORE_READ(oc, chosen);
	info.trigger_pid = BPF_CORE_READ(trigger_task, pid);
	info.victim_pid = BPF_CORE_READ(victim_task, pid);
	BPF_CORE_READ_STR_INTO(&info.trigger_comm, trigger_task, comm);
	BPF_CORE_READ_STR_INTO(&info.victim_comm, victim_task, comm);

	/*
	 * Memory controller css: index by the memory_cgrp_id enum from
	 * vmlinux.h (as the memcg reclaim program does) instead of the
	 * magic constant 4, which is only correct for one kernel config.
	 */
	info.victim_memcg_css =
	    (u64)BPF_CORE_READ(victim_task, cgroups, subsys[memory_cgrp_id]);
	info.trigger_memcg_css =
	    (u64)BPF_CORE_READ(trigger_task, cgroups, subsys[memory_cgrp_id]);

	bpf_perf_event_output(ctx, &oom_perf_events, COMPAT_BPF_F_CURRENT_CPU,
			      &info, sizeof(info));
	return 0;
}
|
|
@ -0,0 +1,337 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
|
||||
// defaultly, we use task_group address as key to operate map.
|
||||
#define TG_ADDR_KEY
|
||||
|
||||
#define TASK_RUNNING 0
|
||||
#define TASK_ON_RQ_QUEUED 1
|
||||
|
||||
#define _(P) \
|
||||
({ \
|
||||
typeof(P) val = 0; \
|
||||
bpf_probe_read(&val, sizeof(val), &(P)); \
|
||||
val; \
|
||||
})
|
||||
|
||||
char __license[] SEC("license") = "Dual MIT/GPL";
|
||||
|
||||
struct stat_t {
|
||||
unsigned long nvcsw; // task_group counts of voluntary context switch
|
||||
unsigned long nivcsw; // task_group counts of involuntary context switch
|
||||
unsigned long
|
||||
nlat_01; // task_group counts of sched latency range [0, 10)ms
|
||||
unsigned long
|
||||
nlat_02; // task_group counts of sched latency range [10, 20)ms
|
||||
unsigned long
|
||||
nlat_03; // task_group counts of sched latency range [20, 50)ms
|
||||
unsigned long
|
||||
nlat_04; // task_group counts of sched latency range [50, inf)ms
|
||||
};
|
||||
|
||||
struct g_stat_t {
|
||||
unsigned long g_nvcsw; // global counts of voluntary context switch
|
||||
unsigned long g_nivcsw; // global counts of involuntary context switch
|
||||
unsigned long
|
||||
g_nlat_01; // global counts of sched latency range [0, 10)ms
|
||||
unsigned long
|
||||
g_nlat_02; // global counts of sched latency range [10, 20)ms
|
||||
unsigned long
|
||||
g_nlat_03; // global counts of sched latency range [20, 50)ms
|
||||
unsigned long
|
||||
g_nlat_04; // global counts of sched latency range [50, inf)ms
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
__type(key, u32);
|
||||
__type(value, u64);
|
||||
// FIXME: is 10000 enough or too large?
|
||||
__uint(max_entries, 10000);
|
||||
} latency SEC(".maps");
|
||||
|
||||
struct stat_t;
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_HASH);
|
||||
#ifdef TG_ADDR_KEY
|
||||
__type(key, u64);
|
||||
#else
|
||||
__type(key, u32);
|
||||
#endif
|
||||
__type(value, struct stat_t);
|
||||
__uint(max_entries, 10000);
|
||||
} cpu_tg_metric SEC(".maps");
|
||||
|
||||
struct g_stat_t;
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_ARRAY);
|
||||
__type(key, u32);
|
||||
__type(value, struct g_stat_t);
|
||||
// all global counts are integrated in one g_stat_t struct
|
||||
__uint(max_entries, 1);
|
||||
} cpu_host_metric SEC(".maps");
|
||||
|
||||
// record enqueue timestamp
|
||||
static int trace_enqueue(u32 pid)
|
||||
{
|
||||
// u64 *valp;
|
||||
u64 ts;
|
||||
|
||||
if (pid == 0)
|
||||
return 0;
|
||||
|
||||
ts = bpf_ktime_get_ns();
|
||||
bpf_map_update_elem(&latency, &pid, &ts, COMPAT_BPF_ANY);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("tracepoint/sched/sched_wakeup_new")
|
||||
int sched_wakeup_new_entry(struct trace_event_raw_sched_wakeup_template *ctx)
|
||||
{
|
||||
return trace_enqueue(ctx->pid);
|
||||
}
|
||||
|
||||
struct sched_wakeup_args {
|
||||
unsigned long long pad;
|
||||
char comm[16];
|
||||
int pid;
|
||||
int prio;
|
||||
int success;
|
||||
int target_cpu;
|
||||
};
|
||||
|
||||
SEC("tracepoint/sched/sched_wakeup")
|
||||
int sched_wakeup_entry(struct trace_event_raw_sched_wakeup_template *ctx)
|
||||
{
|
||||
return trace_enqueue(ctx->pid);
|
||||
}
|
||||
|
||||
struct task_struct___5_14 {
|
||||
unsigned int __state;
|
||||
} __attribute__((preserve_access_index));
|
||||
|
||||
long get_task_state(struct task_struct *task)
|
||||
{
|
||||
long state;
|
||||
|
||||
if (task == NULL)
|
||||
return -1;
|
||||
|
||||
if (bpf_core_field_exists(task->state))
|
||||
state = BPF_CORE_READ(task, state);
|
||||
else {
|
||||
struct task_struct___5_14 *task_new = (struct task_struct___5_14 *)task;
|
||||
state = (long)BPF_CORE_READ(task_new, __state);
|
||||
}
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
SEC("raw_tracepoint/sched_switch")
|
||||
int sched_switch_entry(struct bpf_raw_tracepoint_args *ctx)
|
||||
{
|
||||
u32 prev_pid, next_pid, g_key = 0;
|
||||
u64 now, *tsp, delta;
|
||||
bool is_voluntary;
|
||||
long state;
|
||||
struct stat_t *entry;
|
||||
struct g_stat_t *g_entry;
|
||||
|
||||
// TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct
|
||||
// *next)
|
||||
struct task_struct *prev = (struct task_struct *)ctx->args[1];
|
||||
struct task_struct *next = (struct task_struct *)ctx->args[2];
|
||||
|
||||
#ifdef TG_ADDR_KEY
|
||||
// get task_group addr: task_struct->sched_task_group
|
||||
u64 key = (u64)_(prev->sched_task_group);
|
||||
#else
|
||||
// get pid ns id: task_struct->nsproxy->pid_ns_for_children->ns.inum
|
||||
u32 key = BPF_CORE_READ(prev, nsproxy, pid_ns_for_children, ns.inum);
|
||||
#endif
|
||||
|
||||
state = get_task_state(prev);
|
||||
|
||||
// ivcsw: treat like an enqueue event and store timestamp
|
||||
prev_pid = _(prev->pid);
|
||||
if (state == TASK_RUNNING) {
|
||||
if (prev_pid != 0) {
|
||||
now = bpf_ktime_get_ns();
|
||||
bpf_map_update_elem(&latency, &prev_pid, &now,
|
||||
COMPAT_BPF_ANY);
|
||||
}
|
||||
is_voluntary = 0;
|
||||
} else {
|
||||
is_voluntary = 1;
|
||||
}
|
||||
|
||||
g_entry = bpf_map_lookup_elem(&cpu_host_metric, &g_key);
|
||||
if (!g_entry) {
|
||||
// init global counts map
|
||||
struct g_stat_t g_new_stat = {
|
||||
.g_nvcsw = 0,
|
||||
.g_nivcsw = 0,
|
||||
.g_nlat_01 = 0,
|
||||
.g_nlat_02 = 0,
|
||||
.g_nlat_03 = 0,
|
||||
.g_nlat_04 = 0,
|
||||
};
|
||||
bpf_map_update_elem(&cpu_host_metric, &g_key, &g_new_stat,
|
||||
COMPAT_BPF_NOEXIST);
|
||||
g_entry = bpf_map_lookup_elem(&cpu_host_metric, &g_key);
|
||||
if (!g_entry)
|
||||
return 0;
|
||||
}
|
||||
|
||||
// When use pid namespace id as key, sometimes we would encounter
|
||||
// null id because task->nsproxy is freed, usually means that this
|
||||
// task is almost dead (zombie), so ignore it.
|
||||
if (key && prev_pid) {
|
||||
entry = bpf_map_lookup_elem(&cpu_tg_metric, &key);
|
||||
if (!entry) {
|
||||
struct stat_t new_stat = {
|
||||
.nvcsw = 0,
|
||||
.nivcsw = 0,
|
||||
.nlat_01 = 0,
|
||||
.nlat_02 = 0,
|
||||
.nlat_03 = 0,
|
||||
.nlat_04 = 0,
|
||||
};
|
||||
bpf_map_update_elem(&cpu_tg_metric, &key, &new_stat,
|
||||
COMPAT_BPF_NOEXIST);
|
||||
entry = bpf_map_lookup_elem(&cpu_tg_metric, &key);
|
||||
if (!entry)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (is_voluntary) {
|
||||
__sync_fetch_and_add(&entry->nvcsw, 1);
|
||||
__sync_fetch_and_add(&g_entry->g_nvcsw, 1);
|
||||
} else {
|
||||
__sync_fetch_and_add(&entry->nivcsw, 1);
|
||||
__sync_fetch_and_add(&g_entry->g_nivcsw, 1);
|
||||
}
|
||||
}
|
||||
|
||||
// trace_sched_switch is called under prev != next, no need to check
|
||||
// again.
|
||||
|
||||
next_pid = _(next->pid);
|
||||
// ignore idle
|
||||
if (next_pid == 0)
|
||||
return 0;
|
||||
|
||||
// fetch timestamp and calculate delta
|
||||
tsp = bpf_map_lookup_elem(&latency, &next_pid);
|
||||
if (tsp == 0 || *tsp == 0) {
|
||||
return 0; // missed enqueue
|
||||
}
|
||||
|
||||
now = bpf_ktime_get_ns();
|
||||
delta = now - *tsp;
|
||||
bpf_map_delete_elem(&latency, &next_pid);
|
||||
|
||||
#ifdef TG_ADDR_KEY
|
||||
key = (u64)_(next->sched_task_group);
|
||||
#else
|
||||
key = BPF_CORE_READ(next, nsproxy, pid_ns_for_children, ns.inum);
|
||||
#endif
|
||||
|
||||
if (key) {
|
||||
entry = bpf_map_lookup_elem(&cpu_tg_metric, &key);
|
||||
if (!entry) {
|
||||
struct stat_t new_stat = {
|
||||
.nvcsw = 0,
|
||||
.nivcsw = 0,
|
||||
.nlat_01 = 0,
|
||||
.nlat_02 = 0,
|
||||
.nlat_03 = 0,
|
||||
.nlat_04 = 0,
|
||||
};
|
||||
bpf_map_update_elem(&cpu_tg_metric, &key, &new_stat,
|
||||
COMPAT_BPF_NOEXIST);
|
||||
entry = bpf_map_lookup_elem(&cpu_tg_metric, &key);
|
||||
if (!entry)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (delta < 10 * NSEC_PER_MSEC) {
|
||||
__sync_fetch_and_add(&entry->nlat_01, 1);
|
||||
__sync_fetch_and_add(&g_entry->g_nlat_01, 1);
|
||||
} else if (delta < 20 * NSEC_PER_MSEC) {
|
||||
__sync_fetch_and_add(&entry->nlat_02, 1);
|
||||
__sync_fetch_and_add(&g_entry->g_nlat_02, 1);
|
||||
} else if (delta < 50 * NSEC_PER_MSEC) {
|
||||
__sync_fetch_and_add(&entry->nlat_03, 1);
|
||||
__sync_fetch_and_add(&g_entry->g_nlat_03, 1);
|
||||
} else {
|
||||
__sync_fetch_and_add(&entry->nlat_04, 1);
|
||||
__sync_fetch_and_add(&g_entry->g_nlat_04, 1);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("raw_tracepoint/sched_process_exit")
|
||||
int sched_process_exit_entry(struct bpf_raw_tracepoint_args *ctx)
|
||||
{
|
||||
u32 pid;
|
||||
|
||||
// TP_PROTO(struct task_struct *tsk)
|
||||
struct task_struct *p = (struct task_struct *)ctx->args[0];
|
||||
|
||||
pid = _(p->pid);
|
||||
/*
|
||||
* check latency table to fix latency table overflow in below scenario:
|
||||
* when wake up the target task, but the target task always running in
|
||||
* the other cpu, the target cpu will never be the next pid, because the
|
||||
* target task will be exiting, the latency item never delete.
|
||||
* To avoid latency table overflow, we should delete the latency item in
|
||||
* exit process.
|
||||
*/
|
||||
|
||||
if (bpf_map_lookup_elem(&latency, &pid)) {
|
||||
bpf_map_delete_elem(&latency, &pid);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef TG_ADDR_KEY
|
||||
// When cgroup is removed, the record should be deleted.
|
||||
SEC("kprobe/free_fair_sched_group")
|
||||
int free_fair_sched_group_entry(struct pt_regs *ctx)
|
||||
{
|
||||
struct task_group *tg = (void *)PT_REGS_PARM1(ctx);
|
||||
struct stat_t *entry;
|
||||
|
||||
entry = bpf_map_lookup_elem(&cpu_tg_metric, &tg);
|
||||
if (entry)
|
||||
bpf_map_delete_elem(&cpu_tg_metric, &tg);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
// When pid namespace is destroyed, the record should be deleted.
|
||||
SEC("kprobe/destroy_pid_namespace")
|
||||
int destroy_pid_namespace_entry(struct pt_regs *ctx)
|
||||
{
|
||||
struct pid_namespace *ns = (void *)PT_REGS_PARM1(ctx);
|
||||
struct stat_t *entry;
|
||||
|
||||
// ns->ns.inum
|
||||
u32 pidns = BPF_CORE_READ(ns, ns.inum);
|
||||
entry = bpf_map_lookup_elem(&cpu_tg_metric, &pidns);
|
||||
if (entry)
|
||||
bpf_map_delete_elem(&cpu_tg_metric, &pidns);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
|
@ -0,0 +1,71 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
|
||||
enum lat_zone {
|
||||
LAT_ZONE0 = 0, // 0 ~ 10us
|
||||
LAT_ZONE1, // 10us ~ 100us
|
||||
LAT_ZONE2, // 100us ~ 1ms
|
||||
LAT_ZONE3, // 1ms ~ inf
|
||||
LAT_ZONE_MAX,
|
||||
};
|
||||
|
||||
struct softirq_lat {
|
||||
u64 timestamp;
|
||||
u64 total_latency[LAT_ZONE_MAX];
|
||||
};
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
||||
__uint(key_size, sizeof(u32));
|
||||
__uint(value_size, sizeof(struct softirq_lat));
|
||||
__uint(max_entries, NR_SOFTIRQS_MAX);
|
||||
} softirq_percpu_lats SEC(".maps");
|
||||
|
||||
SEC("tracepoint/irq/softirq_raise")
|
||||
int probe_softirq_raise(struct trace_event_raw_softirq *ctx)
|
||||
{
|
||||
struct softirq_lat lat = {
|
||||
.timestamp = bpf_ktime_get_ns(),
|
||||
};
|
||||
u32 vec = ctx->vec;
|
||||
|
||||
if (vec >= NR_SOFTIRQS)
|
||||
return 0;
|
||||
|
||||
bpf_map_update_elem(&softirq_percpu_lats, &vec, &lat, COMPAT_BPF_ANY);
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("tracepoint/irq/softirq_entry")
|
||||
int probe_softirq_entry(struct trace_event_raw_softirq *ctx)
|
||||
{
|
||||
struct softirq_lat *lat;
|
||||
u32 vec = ctx->vec;
|
||||
|
||||
if (vec >= NR_SOFTIRQS)
|
||||
return 0;
|
||||
|
||||
lat = bpf_map_lookup_elem(&softirq_percpu_lats, &vec);
|
||||
if (!lat)
|
||||
return 0;
|
||||
|
||||
u64 latency = bpf_ktime_get_ns() - lat->timestamp;
|
||||
|
||||
if (latency < 10 * NSEC_PER_USEC) {
|
||||
__sync_fetch_and_add(&lat->total_latency[LAT_ZONE0], 1);
|
||||
} else if (latency < 100 * NSEC_PER_USEC) {
|
||||
__sync_fetch_and_add(&lat->total_latency[LAT_ZONE1], 1);
|
||||
} else if (latency < 1 * NSEC_PER_MSEC) {
|
||||
__sync_fetch_and_add(&lat->total_latency[LAT_ZONE2], 1);
|
||||
} else {
|
||||
__sync_fetch_and_add(&lat->total_latency[LAT_ZONE3], 1);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
char __license[] SEC("license") = "Dual MIT/GPL";
|
|
@ -0,0 +1,155 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
#include "bpf_ratelimit.h"
|
||||
|
||||
char __license[] SEC("license") = "Dual MIT/GPL";
|
||||
|
||||
#define NR_STACK_TRACE_MAX 0x4000
|
||||
#define MSEC_PER_NSEC 1000000UL
|
||||
#define TICK_DEP_MASK_NONE 0
|
||||
#define SOFTIRQ_THRESH 5000000UL
|
||||
|
||||
volatile const u64 softirq_thresh = SOFTIRQ_THRESH;
|
||||
|
||||
#define TICK 1000
|
||||
BPF_RATELIMIT(rate, 1, COMPAT_CPU_NUM *TICK * 1000);
|
||||
|
||||
struct timer_softirq_run_ts {
|
||||
u32 start_trace;
|
||||
u32 restarting_tick;
|
||||
u64 soft_ts;
|
||||
};
|
||||
|
||||
struct report_event {
|
||||
u64 stack[PERF_MAX_STACK_DEPTH];
|
||||
s64 stack_size;
|
||||
u64 now;
|
||||
u64 stall_time;
|
||||
char comm[COMPAT_TASK_COMM_LEN];
|
||||
u32 pid;
|
||||
u32 cpu;
|
||||
};
|
||||
|
||||
// the map for recording irq/softirq timer ts
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
||||
__uint(key_size, sizeof(u32));
|
||||
__uint(value_size, sizeof(struct timer_softirq_run_ts));
|
||||
__uint(max_entries, 1);
|
||||
} timerts_map SEC(".maps");
|
||||
|
||||
// the map use for storing struct report_event memory
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
|
||||
__uint(key_size, sizeof(u32)); // key = 0
|
||||
__uint(value_size, sizeof(struct report_event));
|
||||
__uint(max_entries, 1);
|
||||
} report_map SEC(".maps");
|
||||
|
||||
// the event map use for report userspace
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(int));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} irqoff_event_map SEC(".maps");
|
||||
|
||||
SEC("kprobe/account_process_tick")
|
||||
void probe_account_process_tick(struct pt_regs *ctx)
|
||||
{
|
||||
// verify bpf-ratelimit
|
||||
if (bpf_ratelimited(&rate))
|
||||
return;
|
||||
|
||||
// update soft timer timestamps
|
||||
int key = 0;
|
||||
struct timer_softirq_run_ts *ts;
|
||||
// struct thresh_data *tdata;
|
||||
struct report_event *event;
|
||||
u64 now;
|
||||
u64 delta;
|
||||
|
||||
ts = bpf_map_lookup_elem(&timerts_map, &key);
|
||||
if (!ts)
|
||||
return;
|
||||
|
||||
if (!ts->start_trace)
|
||||
return;
|
||||
|
||||
// update soft timer timestamps
|
||||
if (!ts->soft_ts) {
|
||||
ts->soft_ts = bpf_ktime_get_ns();
|
||||
return;
|
||||
}
|
||||
|
||||
event = bpf_map_lookup_elem(&report_map, &key);
|
||||
if (!event)
|
||||
return;
|
||||
|
||||
if (ts->restarting_tick) {
|
||||
ts->restarting_tick = 0;
|
||||
ts->soft_ts = bpf_ktime_get_ns();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
now = bpf_ktime_get_ns();
|
||||
delta = now - ts->soft_ts;
|
||||
|
||||
// if delta over threshold, dump important info to user
|
||||
if (delta >= softirq_thresh) {
|
||||
event->now = now;
|
||||
event->stall_time = delta;
|
||||
__builtin_memset(event->comm, 0, sizeof(event->comm));
|
||||
bpf_get_current_comm(&event->comm, sizeof(event->comm));
|
||||
event->pid = (u32)bpf_get_current_pid_tgid();
|
||||
event->cpu = bpf_get_smp_processor_id();
|
||||
event->stack_size =
|
||||
bpf_get_stack(ctx, event->stack, sizeof(event->stack), 0);
|
||||
|
||||
bpf_perf_event_output(ctx, &irqoff_event_map,
|
||||
COMPAT_BPF_F_CURRENT_CPU, event,
|
||||
sizeof(struct report_event));
|
||||
}
|
||||
|
||||
// update soft_ts, use for next trace
|
||||
ts->soft_ts = now;
|
||||
}
|
||||
|
||||
SEC("tracepoint/timer/tick_stop")
|
||||
void probe_tick_stop(struct trace_event_raw_tick_stop *ctx)
|
||||
{
|
||||
struct timer_softirq_run_ts *ts;
|
||||
int key = 0;
|
||||
|
||||
ts = bpf_map_lookup_elem(&timerts_map, &key);
|
||||
if (!ts)
|
||||
return;
|
||||
|
||||
if (ctx->success == 1 && ctx->dependency == TICK_DEP_MASK_NONE) {
|
||||
ts->start_trace = 0;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
SEC("kprobe/tick_nohz_restart_sched_tick")
|
||||
void probe_tick_nohz_restart_sched_tick(struct pt_regs *ctx)
|
||||
{
|
||||
struct timer_softirq_run_ts *ts;
|
||||
int key = 0;
|
||||
u64 now;
|
||||
|
||||
ts = bpf_map_lookup_elem(&timerts_map, &key);
|
||||
if (!ts)
|
||||
return;
|
||||
|
||||
now = bpf_ktime_get_ns();
|
||||
|
||||
ts->soft_ts = now;
|
||||
ts->start_trace = 1;
|
||||
ts->restarting_tick = 1;
|
||||
}
|
|
@ -0,0 +1,42 @@
|
|||
#include "vmlinux.h"
|
||||
|
||||
#include <bpf/bpf_core_read.h>
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
|
||||
#include "bpf_common.h"
|
||||
#include "bpf_ratelimit.h"
|
||||
|
||||
char __license[] SEC("license") = "Dual MIT/GPL";
|
||||
|
||||
BPF_RATELIMIT_IN_MAP(rate, 1, COMPAT_CPU_NUM * 10000, 0);
|
||||
|
||||
struct {
|
||||
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
|
||||
__uint(key_size, sizeof(int));
|
||||
__uint(value_size, sizeof(u32));
|
||||
} softlockup_perf_events SEC(".maps");
|
||||
|
||||
struct softlockup_info {
|
||||
u32 cpu;
|
||||
u32 pid;
|
||||
char comm[COMPAT_TASK_COMM_LEN];
|
||||
};
|
||||
|
||||
SEC("kprobe/watchdog_timer_fn+442")
|
||||
int kprobe_watchdog_timer_fn(struct pt_regs *ctx)
|
||||
{
|
||||
struct softlockup_info info = {};
|
||||
struct task_struct *task;
|
||||
|
||||
if (bpf_ratelimited_in_map(ctx, rate))
|
||||
return 0;
|
||||
|
||||
info.cpu = bpf_get_smp_processor_id();
|
||||
task = (struct task_struct *)bpf_get_current_task();
|
||||
info.pid = bpf_get_current_pid_tgid() & 0xffffffffUL;
|
||||
BPF_CORE_READ_STR_INTO(&info.comm, task, comm);
|
||||
bpf_perf_event_output(ctx, &softlockup_perf_events,
|
||||
COMPAT_BPF_F_CURRENT_CPU, &info, sizeof(info));
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,57 @@
|
|||
#!/bin/sh
|
||||
|
||||
usage() {
|
||||
echo "OVERVIEW: HuaTuo BPF compiler tool (clang LLVM)
|
||||
|
||||
USAGE: clang.sh -s <source.c> -o <output.o> -I [includes] -C '[compile_options]'
|
||||
EXAMPLE:
|
||||
clang.sh -s example.bpf.c -o example.o # run preprocess, compile, and assemble steps (-C '-c')
|
||||
clang.sh -s example.bpf.c -o example.o -I include -I include/4.18.0-193.6.3.el8_2.x86_64 # specify the headers, (-C '-c')
|
||||
clang.sh -s example.bpf.c -o example.o -C '-E' # only run the preprocessor
|
||||
clang.sh -s example.bpf.c -o example.o -C '-S' # only run preprocess and compilation steps"
|
||||
}
|
||||
|
||||
SRC=
|
||||
OBJ=
|
||||
INCLUDES=
|
||||
DEFAULT_INCLUDES="-I include -I include/4.18.0-193.6.3.el8_2.x86_64"
|
||||
COMPILE_OPTIONS=
|
||||
DEFAULT_COMPILE_OPTIONS="-Wall -O2 -g -target bpf -D__TARGET_ARCH_x86 -mcpu=v1 -c"
|
||||
|
||||
while getopts 'hs:o:C:I:' opt
|
||||
do
|
||||
case ${opt} in
|
||||
s)
|
||||
[ -n "${SRC}" ] && echo "-s(source) required 1 file (bpf.c)" && exit 1
|
||||
SRC=${OPTARG}
|
||||
;;
|
||||
o)
|
||||
[ -n "${OBJ}" ] && echo "-o(output) required 1 file (output.o)" && exit 1
|
||||
OBJ=${OPTARG}
|
||||
;;
|
||||
C)
|
||||
COMPILE_OPTIONS=${OPTARG}
|
||||
;;
|
||||
I)
|
||||
INCLUDES="${INCLUDES} -I ${OPTARG}"
|
||||
;;
|
||||
h)
|
||||
usage
|
||||
exit
|
||||
;;
|
||||
?)
|
||||
usage
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
[ -z "${SRC}" ] && echo -e "-s must be specified, such as -c example.bpf.c \n\n $(usage)" && exit 1
|
||||
[ -z "${OBJ}" ] && echo -e "-o must be specified, such as -o example.o \n\n $(usage)" && exit 1
|
||||
|
||||
|
||||
# Note: parameter ${DEFAULT_COMPILE_OPTIONS} will be overwritten by ${COMPILE_OPTIONS} in ${OPTIONS}
|
||||
OPTIONS="${DEFAULT_COMPILE_OPTIONS} ${COMPILE_OPTIONS}"
|
||||
[ -z "${INCLUDES}" ] && INCLUDES="${DEFAULT_INCLUDES}"
|
||||
|
||||
clang ${OPTIONS} ${SRC} -o ${OBJ} ${INCLUDES}
|
|
@ -0,0 +1,18 @@
|
|||
# elasticsearch
|
||||
ELASTIC_VERSION=8.15.5
|
||||
|
||||
# https://www.elastic.co/guide/en/elasticsearch/reference/current/built-in-users.html
|
||||
ELASTIC_PASSWORD='huatuo-bamai' # user 'elastic' (built-in)
|
||||
KIBANA_SYSTEM_PASSWORD='huatuo-bamai' # user 'kibana_system' (built-in)
|
||||
|
||||
# setup to init user
|
||||
ELASTICSEARCH_HOST='localhost'
|
||||
|
||||
# prometheus
|
||||
PROMETHEUS_VERSION=v2.53.3 # LTS v2.53
|
||||
|
||||
# Grafana
|
||||
GRAFANA_VERSION=11.0.0
|
||||
|
||||
# Run huatuo-bamai
|
||||
RUN_PATH=/home/huatuo-bamai
|
|
@ -0,0 +1,51 @@
|
|||
services:
|
||||
elasticsearch:
|
||||
image: docker.elastic.co/elasticsearch/elasticsearch:${ELASTIC_VERSION:-8.15.5}
|
||||
container_name: es
|
||||
network_mode: host
|
||||
environment:
|
||||
discovery.type: single-node
|
||||
ELASTIC_PASSWORD: ${ELASTIC_PASSWORD:-}
|
||||
KIBANA_SYSTEM_PASSWORD: ${KIBANA_SYSTEM_PASSWORD:-}
|
||||
volumes:
|
||||
- ./elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml:ro
|
||||
|
||||
prometheus:
|
||||
image: prom/prometheus:${PROMETHEUS_VERSION:-v2.53.3}
|
||||
container_name: prometheus
|
||||
network_mode: host
|
||||
volumes:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana-oss:${GRAFANA_VERSION:-11.0.0}
|
||||
container_name: grafana
|
||||
network_mode: host
|
||||
volumes:
|
||||
- ./grafana/datasources/elasticsearch.yaml:/etc/grafana/provisioning/datasources/elasticsearch.yaml:ro
|
||||
- ./grafana/datasources/prometheus.yaml:/etc/grafana/provisioning/datasources/prometheus.yaml:ro
|
||||
- ./grafana/dashboards:/etc/grafana/provisioning/dashboards:ro
|
||||
depends_on:
|
||||
- prometheus
|
||||
- elasticsearch
|
||||
|
||||
huatuo-bamai:
|
||||
image: huatuo/huatuo-bamai:latest
|
||||
container_name: huatuo-bamai
|
||||
network_mode: host
|
||||
cgroup: host
|
||||
privileged: true
|
||||
environment:
|
||||
ELASTICSEARCH_HOST: ${ELASTICSEARCH_HOST:-}
|
||||
ELASTIC_PASSWORD: ${ELASTIC_PASSWORD:-}
|
||||
RUN_PATH: ${RUN_PATH:-}
|
||||
volumes:
|
||||
- /sys:/sys:rw
|
||||
- /run:/run:rw
|
||||
- ../../huatuo-bamai.conf:${RUN_PATH}/huatuo-bamai.conf:rw
|
||||
- ./run.sh:${RUN_PATH}/run.sh:ro
|
||||
command: ["./run.sh"]
|
||||
depends_on:
|
||||
- elasticsearch
|
||||
- prometheus
|
||||
- grafana
|
|
@ -0,0 +1,4 @@
|
|||
cluster.name: "docker-cluster"
|
||||
network.host: 0.0.0.0
|
||||
http.port: 9200
|
||||
xpack.security.enabled: true
|
|
@ -0,0 +1,24 @@
|
|||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
# <string> an unique provider name. Required
|
||||
- name: 'huatuo-bamai'
|
||||
# <int> Org id. Default to 1
|
||||
orgId: 1
|
||||
# <string> name of the dashboard folder.
|
||||
folder: ''
|
||||
# <string> folder UID. will be automatically generated if not specified
|
||||
folderUid: ''
|
||||
# <string> provider type. Default to 'file'
|
||||
type: file
|
||||
# <bool> disable dashboard deletion
|
||||
disableDeletion: false
|
||||
# <int> how often Grafana will scan for changed dashboards
|
||||
updateIntervalSeconds: 10
|
||||
# <bool> allow updating provisioned dashboards from the UI
|
||||
allowUiUpdates: false
|
||||
options:
|
||||
# <string, required> path to dashboard files on disk. Required when using the 'file' type
|
||||
path: /etc/grafana/provisioning/dashboards
|
||||
# <bool> use folder names from filesystem to create folders in Grafana
|
||||
foldersFromFilesStructure: true
|
|
@ -0,0 +1,63 @@
|
|||
# https://grafana.com/docs/grafana/latest/datasources/elasticsearch/
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
# List of data sources to delete from the database.
|
||||
deleteDatasources:
|
||||
- name: huatuo-bamai-es
|
||||
|
||||
# Mark provisioned data sources for deletion if they are no longer in a provisioning file.
|
||||
# It takes no effect if data sources are already listed in the deleteDatasources section.
|
||||
prune: true
|
||||
|
||||
# List of data sources to insert/update depending on what's
|
||||
# available in the database.
|
||||
datasources:
|
||||
# <string, required> Sets the name you use to refer to
|
||||
# the data source in panels and queries.
|
||||
- name: huatuo-bamai-es
|
||||
# <string, required> Sets the data source type.
|
||||
type: elasticsearch
|
||||
# <string, required> Sets the access mode, either
|
||||
# proxy or direct (Server or Browser in the UI).
|
||||
# Some data sources are incompatible with any setting
|
||||
# but proxy (Server).
|
||||
access: proxy
|
||||
# <int> Sets the organization id. Defaults to orgId 1.
|
||||
orgId: 1
|
||||
# <string> Sets a custom UID to reference this
|
||||
# data source in other parts of the configuration.
|
||||
# If not specified, Grafana generates one.
|
||||
uid: huatuo-bamai-es
|
||||
# <string> Sets the data source's URL, including the
|
||||
# port.
|
||||
url: http://localhost:9200
|
||||
# <string> Sets the database user, if necessary.
|
||||
user: elastic
|
||||
# <string> Sets the database name, if necessary.
|
||||
database:
|
||||
# <bool> Enables credential headers.
|
||||
withCredentials:
|
||||
# <bool> Toggles whether the data source is pre-selected
|
||||
# for new panels. You can set only one default
|
||||
# data source per organization.
|
||||
isDefault:
|
||||
# <map> Fields to convert to JSON and store in jsonData.
|
||||
jsonData:
|
||||
index: 'huatuo_bamai*'
|
||||
timeField: 'uploaded_time'
|
||||
# <map> Fields to encrypt before storing in jsonData.
|
||||
secureJsonData:
|
||||
# <string> Defines the CA cert, client cert, and
|
||||
# client key for encrypted authentication.
|
||||
tlsCACert: '...'
|
||||
tlsClientCert: '...'
|
||||
tlsClientKey: '...'
|
||||
# <string> Sets the database password, if necessary.
|
||||
password: huatuo-bamai
|
||||
# <int> Sets the version. Used to compare versions when
|
||||
# updating. Ignored when creating a new data source.
|
||||
version: 1
|
||||
# <bool> Allows users to edit data sources from the
|
||||
# Grafana UI.
|
||||
editable: false
|
|
@ -0,0 +1,29 @@
|
|||
# https://grafana.com/docs/grafana/latest/datasources/prometheus/
|
||||
|
||||
apiVersion: 1
|
||||
|
||||
# List of data sources to delete from the database.
|
||||
deleteDatasources:
|
||||
- name: huatuo-bamai-prom
|
||||
|
||||
# Mark provisioned data sources for deletion if they are no longer in a provisioning file.
|
||||
# It takes no effect if data sources are already listed in the deleteDatasources section.
|
||||
prune: true
|
||||
|
||||
datasources:
|
||||
- name: huatuo-bamai-prom
|
||||
type: prometheus
|
||||
access: proxy
|
||||
# <int> Sets the organization id. Defaults to orgId 1.
|
||||
orgId: 1
|
||||
# <string> Sets a custom UID to reference this
|
||||
# data source in other parts of the configuration.
|
||||
# If not specified, Grafana generates one.
|
||||
uid: huatuo-bamai-prom
|
||||
url: http://localhost:9090
|
||||
jsonData:
|
||||
httpMethod: POST
|
||||
prometheusType: Prometheus
|
||||
cacheLevel: 'High'
|
||||
disableRecordingRules: false
|
||||
incrementalQueryOverlapWindow: 10m
|
|
@ -0,0 +1,33 @@
|
|||
# my global config
|
||||
global:
|
||||
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
|
||||
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
|
||||
# scrape_timeout is set to the global default (10s).
|
||||
|
||||
# Alertmanager configuration
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
# - alertmanager:9093
|
||||
|
||||
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
|
||||
rule_files:
|
||||
# - "first_rules.yml"
|
||||
# - "second_rules.yml"
|
||||
|
||||
# A scrape configuration containing exactly one endpoint to scrape:
|
||||
# Here it's Prometheus itself.
|
||||
scrape_configs:
|
||||
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
|
||||
- job_name: "prometheus"
|
||||
|
||||
# metrics_path defaults to '/metrics'
|
||||
# scheme defaults to 'http'.
|
||||
|
||||
static_configs:
|
||||
- targets: ["localhost:9090"]
|
||||
# add huatuo
|
||||
- job_name: "huatuo"
|
||||
static_configs:
|
||||
- targets: ["localhost:19704"]
|
|
@ -0,0 +1,69 @@
|
|||
#!/bin/sh
|
||||
|
||||
ELASTICSEARCH_HOST=${ELASTICSEARCH_HOST:-localhost}
|
||||
ELASTIC_PASSWORD=${ELASTIC_PASSWORD:-huatuo-bamai}
|
||||
|
||||
RUN_PATH=${RUN_PATH:-/home/huatuo-bamai}
|
||||
|
||||
# Wait for Elasticsearch to be ready
|
||||
wait_for_elasticsearch() {
|
||||
args="-s -D- -m15 -w '%{http_code}' http://${ELASTICSEARCH_HOST}:9200/"
|
||||
if [ -n "${ELASTIC_PASSWORD}" ]; then
|
||||
args="$args -u elastic:${ELASTIC_PASSWORD}"
|
||||
fi
|
||||
|
||||
result=1
|
||||
output=""
|
||||
|
||||
# retry for up to 180 seconds
|
||||
for sec in $(seq 1 180); do
|
||||
exit_code=0
|
||||
output=$(eval "curl $args") || exit_code=$?
|
||||
# echo "exec curl $args, exit code: $exit_code, output: $output"
|
||||
if [ $exit_code -ne 0 ]; then
|
||||
result=$exit_code
|
||||
fi
|
||||
|
||||
# Extract the last three characters of the output to check the HTTP status code
|
||||
http_code=$(echo "$output" | tail -c 4)
|
||||
if [ "$http_code" -eq 200 ]; then
|
||||
result=0
|
||||
break
|
||||
fi
|
||||
|
||||
echo "Waiting for Elasticsearch ready... ${sec}s"
|
||||
sleep 1
|
||||
done
|
||||
|
||||
if [ $result -ne 0 ] && [ "$http_code" -ne 000 ]; then
|
||||
echo "$output" | head -c -3
|
||||
fi
|
||||
|
||||
if [ $result -ne 0 ]; then
|
||||
case $result in
|
||||
6)
|
||||
echo 'Could not resolve host. Is Elasticsearch running?'
|
||||
;;
|
||||
7)
|
||||
echo 'Failed to connect to host. Is Elasticsearch healthy?'
|
||||
;;
|
||||
28)
|
||||
echo 'Timeout connecting to host. Is Elasticsearch healthy?'
|
||||
;;
|
||||
*)
|
||||
echo "Connection to Elasticsearch failed. Exit code: ${result}"
|
||||
;;
|
||||
esac
|
||||
|
||||
exit $result
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
wait_for_elasticsearch
|
||||
sleep 5 # Waiting for initialization of Elasticsearch built-in users
|
||||
echo "Elasticsearch is ready."
|
||||
|
||||
# Run huatuo-bamai
|
||||
cd $RUN_PATH
|
||||
exec ./huatuo-bamai --region example --config huatuo-bamai.conf
|
|
@ -0,0 +1,250 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/signal"
|
||||
"runtime"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
||||
_ "huatuo-bamai/core/autotracing"
|
||||
_ "huatuo-bamai/core/events"
|
||||
_ "huatuo-bamai/core/metrics"
|
||||
"huatuo-bamai/internal/bpf"
|
||||
"huatuo-bamai/internal/cgroups"
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/internal/services"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/internal/utils/pidutil"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
|
||||
"github.com/urfave/cli/v2"
|
||||
)
|
||||
|
||||
func mainAction(ctx *cli.Context) error {
|
||||
if ctx.NArg() > 0 {
|
||||
return fmt.Errorf("invalid param %v", ctx.Args())
|
||||
}
|
||||
|
||||
if err := pidutil.LockPidFile(ctx.App.Name); err != nil {
|
||||
return fmt.Errorf("failed to lock pid file: %w", err)
|
||||
}
|
||||
defer pidutil.RemovePidFile(ctx.App.Name)
|
||||
|
||||
// init cpu quota
|
||||
cgr, err := cgroups.NewCgroupManager()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := cgr.NewRuntime(ctx.App.Name,
|
||||
cgroups.ToSpec(
|
||||
conf.Get().RuntimeCgroup.LimitInitCPU,
|
||||
conf.Get().RuntimeCgroup.LimitMem,
|
||||
),
|
||||
); err != nil {
|
||||
return fmt.Errorf("new runtime cgroup: %w", err)
|
||||
}
|
||||
defer func() {
|
||||
_ = cgr.DeleteRuntime()
|
||||
}()
|
||||
|
||||
if err := cgr.AddProc(uint64(os.Getpid())); err != nil {
|
||||
return fmt.Errorf("cgroup add pid to cgroups.proc")
|
||||
}
|
||||
|
||||
// initialize the storage clients.
|
||||
storageInitCtx := storage.InitContext{
|
||||
EsAddresses: conf.Get().Storage.ES.Address,
|
||||
EsUsername: conf.Get().Storage.ES.Username,
|
||||
EsPassword: conf.Get().Storage.ES.Password,
|
||||
EsIndex: conf.Get().Storage.ES.Index,
|
||||
LocalPath: conf.Get().Storage.LocalFile.Path,
|
||||
LocalMaxRotation: conf.Get().Storage.LocalFile.MaxRotation,
|
||||
LocalRotationSize: conf.Get().Storage.LocalFile.RotationSize,
|
||||
Region: conf.Region,
|
||||
}
|
||||
|
||||
if err := storage.InitDefaultClients(&storageInitCtx); err != nil {
|
||||
return fmt.Errorf("storage.InitDefaultClients: %w", err)
|
||||
}
|
||||
|
||||
// init the bpf manager.
|
||||
if err := bpf.InitBpfManager(); err != nil {
|
||||
return fmt.Errorf("failed to init bpf manager: %w", err)
|
||||
}
|
||||
|
||||
if err := pod.ContainerCgroupCssInit(); err != nil {
|
||||
return fmt.Errorf("init pod cgroup metadata: %w", err)
|
||||
}
|
||||
|
||||
podListInitCtx := pod.PodContainerInitCtx{
|
||||
PodListReadOnlyPort: conf.Get().Pod.KubeletPodListURL,
|
||||
PodListAuthorizedPort: conf.Get().Pod.KubeletPodListHTTPSURL,
|
||||
PodClientCertPath: conf.Get().Pod.KubeletPodClientCertPath,
|
||||
PodCACertPath: conf.Get().Pod.KubeletPodCACertPath,
|
||||
}
|
||||
|
||||
if err := pod.ContainerPodMgrInit(&podListInitCtx); err != nil {
|
||||
return fmt.Errorf("init podlist and sync module: %w", err)
|
||||
}
|
||||
|
||||
blacklisted := conf.Get().Blacklist
|
||||
prom, err := InitMetricsCollector(blacklisted, conf.Region)
|
||||
if err != nil {
|
||||
return fmt.Errorf("InitMetricsCollector: %w", err)
|
||||
}
|
||||
|
||||
mgr, err := tracing.NewMgrTracingEvent(blacklisted)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := mgr.MgrTracingEventStartAll(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
log.Infof("Initialize the Metrics collector: %v", prom)
|
||||
services.Start(conf.Get().APIServer.TCPAddr, mgr, prom)
|
||||
|
||||
// update cpu quota
|
||||
if err := cgr.UpdateRuntime(cgroups.ToSpec(conf.Get().RuntimeCgroup.LimitCPU, 0)); err != nil {
|
||||
return fmt.Errorf("update runtime: %w", err)
|
||||
}
|
||||
|
||||
waitExit := make(chan os.Signal, 1)
|
||||
signal.Notify(waitExit, syscall.SIGHUP, syscall.SIGQUIT, syscall.SIGUSR1, syscall.SIGINT, syscall.SIGTERM)
|
||||
for {
|
||||
s := <-waitExit
|
||||
switch s {
|
||||
case syscall.SIGQUIT, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM:
|
||||
log.Infof("huatuo-bamai exit by signal %d", s)
|
||||
bpf.CloseBpfManager()
|
||||
pod.ContainerPodMgrClose()
|
||||
return nil
|
||||
case syscall.SIGUSR1:
|
||||
return nil
|
||||
default:
|
||||
return nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var (
|
||||
// AppGitCommit will be the hash that the binary was built from
|
||||
// and will be populated by the Makefile
|
||||
AppGitCommit string
|
||||
// AppBuildTime will be populated by the Makefile
|
||||
AppBuildTime string
|
||||
// AppVersion will be populated by the Makefile, read from
|
||||
// VERSION file of the source code.
|
||||
AppVersion string
|
||||
AppUsage = "An In-depth Observation of Linux Kernel Application"
|
||||
)
|
||||
|
||||
func main() {
|
||||
app := cli.NewApp()
|
||||
app.Usage = AppUsage
|
||||
|
||||
if AppVersion == "" {
|
||||
panic("the value of AppVersion must be specified")
|
||||
}
|
||||
|
||||
v := []string{
|
||||
"",
|
||||
fmt.Sprintf(" app_version: %s", AppVersion),
|
||||
fmt.Sprintf(" go_version: %s", runtime.Version()),
|
||||
fmt.Sprintf(" git_commit: %s", AppGitCommit),
|
||||
fmt.Sprintf(" build_time: %s", AppBuildTime),
|
||||
}
|
||||
app.Version = strings.Join(v, "\n")
|
||||
|
||||
app.Flags = []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "config",
|
||||
Value: "huatuo-bamai.conf",
|
||||
Usage: "huatuo-bamai config file",
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "region",
|
||||
Required: true,
|
||||
Usage: "the host and containers are in this region",
|
||||
},
|
||||
&cli.StringSliceFlag{
|
||||
Name: "disable-tracing",
|
||||
Usage: "disable tracing. This is related to Blacklist in config, and complement each other",
|
||||
},
|
||||
&cli.BoolFlag{
|
||||
Name: "log-debug",
|
||||
Usage: "enable debug output for logging",
|
||||
},
|
||||
}
|
||||
|
||||
app.Before = func(ctx *cli.Context) error {
|
||||
if err := conf.LoadConfig(ctx.String("config")); err != nil {
|
||||
return fmt.Errorf("failed to load config: %w", err)
|
||||
}
|
||||
|
||||
// set Region
|
||||
conf.Region = ctx.String("region")
|
||||
|
||||
// log level
|
||||
if conf.Get().LogLevel != "" {
|
||||
log.SetLevel(conf.Get().LogLevel)
|
||||
log.Infof("log level [%s] configured in file, use it", log.GetLevel())
|
||||
}
|
||||
|
||||
logFile := conf.Get().LogFile
|
||||
if logFile != "" {
|
||||
file, err := os.OpenFile(logFile, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o666)
|
||||
if err == nil {
|
||||
log.SetOutput(file)
|
||||
} else {
|
||||
log.SetOutput(os.Stdout)
|
||||
log.Infof("Failed to log to file, using default stdout")
|
||||
}
|
||||
}
|
||||
|
||||
// tracer
|
||||
disabledTracing := ctx.StringSlice("disable-tracing")
|
||||
if len(disabledTracing) > 0 {
|
||||
definedTracers := conf.Get().Blacklist
|
||||
definedTracers = append(definedTracers, disabledTracing...)
|
||||
|
||||
conf.Set("Blacklist", definedTracers)
|
||||
log.Infof("The tracer black list by cli: %v", conf.Get().Blacklist)
|
||||
}
|
||||
|
||||
if ctx.Bool("log-debug") {
|
||||
log.SetLevel("Debug")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// core
|
||||
app.Action = mainAction
|
||||
|
||||
// run
|
||||
if err := app.Run(os.Args); err != nil {
|
||||
log.Errorf("Error: %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"huatuo-bamai/pkg/metric"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/collectors"
|
||||
)
|
||||
|
||||
var promNamespace = "huatuo_bamai"
|
||||
|
||||
// InitMetricsCollector creates a new MetricsCollector instance.
|
||||
func InitMetricsCollector(blackListed []string, region string) (*prometheus.Registry, error) {
|
||||
nc, err := metric.NewCollectorManager(blackListed, region)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create collector: %w", err)
|
||||
}
|
||||
|
||||
promRegistry := prometheus.NewRegistry()
|
||||
promRegistry.MustRegister(
|
||||
nc,
|
||||
collectors.NewGoCollector(),
|
||||
collectors.NewProcessCollector(
|
||||
collectors.ProcessCollectorOpts{Namespace: promNamespace}))
|
||||
|
||||
return promRegistry, nil
|
||||
}
|
|
@ -0,0 +1,334 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package autotracing
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/cgroups"
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/flamegraph"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
"huatuo-bamai/pkg/types"
|
||||
)
|
||||
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("cpuidle", newCPUIdle)
|
||||
}
|
||||
|
||||
func newCPUIdle() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &cpuIdleTracing{},
|
||||
Internal: 20,
|
||||
Flag: tracing.FlagTracing,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// GetCPUCoresInCgroup function returns the number of cgroup cores
|
||||
func GetCPUCoresInCgroup(cgroupPath string) (uint64, error) {
|
||||
periodPath := cgroupPath + "/cpu.cfs_period_us"
|
||||
quotaPath := cgroupPath + "/cpu.cfs_quota_us"
|
||||
|
||||
period, err := readIntFromFile(periodPath)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
quota, err := readIntFromFile(quotaPath)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
if quota == -1 {
|
||||
return uint64(runtime.NumCPU()), nil
|
||||
}
|
||||
|
||||
if period == 0 {
|
||||
return 0, fmt.Errorf("period not zero")
|
||||
}
|
||||
|
||||
return uint64(quota / period), nil
|
||||
}
|
||||
|
||||
func readIntFromFile(filePath string) (int, error) {
|
||||
data, err := os.ReadFile(filePath)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
str := strings.TrimSpace(string(data))
|
||||
value, err := strconv.Atoi(str)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
return value, nil
|
||||
}
|
||||
|
||||
func readCPUUsage(path string) (map[string]uint64, error) {
|
||||
// FIXME!!!
|
||||
cgr, err := cgroups.NewCgroupManager()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
usage, err := cgr.CpuUsage(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return map[string]uint64{
|
||||
"user": usage.User,
|
||||
"system": usage.System,
|
||||
"total": uint64(time.Now().UnixNano()),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// UserHZtons because kernel USER_HZ = 100, the default value set to 10,000,000
|
||||
const (
|
||||
UserHZtons = 10000000
|
||||
USERHZ = 100
|
||||
)
|
||||
|
||||
func calculateCPUUsage(info *containerCPUInfo, currUsage map[string]uint64) error {
|
||||
deltaTotal := currUsage["total"] - info.prevUsage["total"]
|
||||
deltaUser := currUsage["user"] - info.prevUsage["user"]
|
||||
deltaSys := currUsage["system"] - info.prevUsage["system"]
|
||||
|
||||
cpuCores, err := GetCPUCoresInCgroup(info.path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("get cgroup cpu err")
|
||||
}
|
||||
|
||||
if cpuCores == 0 || deltaTotal == 0 {
|
||||
return fmt.Errorf("division by zero error")
|
||||
}
|
||||
|
||||
log.Debugf("cpuidle calculate core %v currUsage %v prevUsage %v", cpuCores, currUsage, info.prevUsage)
|
||||
info.nowUsageP["cpuUser"] = deltaUser * UserHZtons * USERHZ / deltaTotal / cpuCores
|
||||
info.nowUsageP["cpuSys"] = deltaSys * UserHZtons * USERHZ / deltaTotal / cpuCores
|
||||
return nil
|
||||
}
|
||||
|
||||
type containerCPUInfo struct {
|
||||
prevUsage map[string]uint64
|
||||
prevUsageP map[string]uint64
|
||||
nowUsageP map[string]uint64
|
||||
deltaUser int64
|
||||
deltaSys int64
|
||||
timestamp int64
|
||||
path string
|
||||
alive bool
|
||||
}
|
||||
|
||||
// cpuIdleIDMap is the container information
|
||||
type cpuIdleIDMap map[string]*containerCPUInfo
|
||||
|
||||
func updateCPUIdleIDMap(m cpuIdleIDMap) error {
|
||||
containers, err := pod.GetNormalContainers()
|
||||
if err != nil {
|
||||
return fmt.Errorf("GetNormalContainers: %w", err)
|
||||
}
|
||||
|
||||
for _, container := range containers {
|
||||
_, ok := m[container.ID]
|
||||
if ok {
|
||||
m[container.ID].path = container.CgroupSuffix
|
||||
m[container.ID].alive = true
|
||||
} else {
|
||||
temp := &containerCPUInfo{
|
||||
prevUsage: map[string]uint64{
|
||||
"user": 0,
|
||||
"system": 0,
|
||||
"total": 0,
|
||||
},
|
||||
prevUsageP: map[string]uint64{
|
||||
"cpuUser": 0,
|
||||
"cpuSys": 0,
|
||||
},
|
||||
nowUsageP: map[string]uint64{
|
||||
"cpuUser": 0,
|
||||
"cpuSys": 0,
|
||||
},
|
||||
deltaUser: 0,
|
||||
deltaSys: 0,
|
||||
timestamp: 0,
|
||||
path: container.CgroupSuffix,
|
||||
alive: true,
|
||||
}
|
||||
m[container.ID] = temp
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
var cpuIdleIdMap = make(cpuIdleIDMap)
|
||||
|
||||
func cpuIdleDetect(ctx context.Context) (string, error) {
|
||||
// get config info
|
||||
userth := conf.Get().Tracing.Cpuidle.CgUserth
|
||||
deltauserth := conf.Get().Tracing.Cpuidle.CgDeltaUserth
|
||||
systh := conf.Get().Tracing.Cpuidle.CgSysth
|
||||
deltasysth := conf.Get().Tracing.Cpuidle.CgDeltaSysth
|
||||
usageth := conf.Get().Tracing.Cpuidle.CgUsageth
|
||||
deltausageth := conf.Get().Tracing.Cpuidle.CgDeltaUsageth
|
||||
step := conf.Get().Tracing.Cpuidle.CgStep
|
||||
graceth := conf.Get().Tracing.Cpuidle.CgGrace
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return "", types.ErrExitByCancelCtx
|
||||
case <-time.After(time.Duration(step) * time.Second):
|
||||
if err := updateCPUIdleIDMap(cpuIdleIdMap); err != nil {
|
||||
return "", err
|
||||
}
|
||||
for containerID, v := range cpuIdleIdMap {
|
||||
if !v.alive {
|
||||
delete(cpuIdleIdMap, containerID)
|
||||
} else {
|
||||
v.alive = false
|
||||
currUsage, err := readCPUUsage(v.path)
|
||||
if err != nil {
|
||||
log.Debugf("cpuidle failed to read %s CPU usage: %s", v.path, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if v.prevUsage["user"] == 0 && v.prevUsage["system"] == 0 && v.prevUsage["total"] == 0 {
|
||||
v.prevUsage = currUsage
|
||||
continue
|
||||
}
|
||||
|
||||
err = calculateCPUUsage(v, currUsage)
|
||||
if err != nil {
|
||||
log.Debugf("cpuidle calculate err %s", err)
|
||||
continue
|
||||
}
|
||||
|
||||
v.deltaUser = int64(v.nowUsageP["cpuUser"] - v.prevUsageP["cpuUser"])
|
||||
v.deltaSys = int64(v.nowUsageP["cpuSys"] - v.prevUsageP["cpuSys"])
|
||||
v.prevUsageP["cpuUser"] = v.nowUsageP["cpuUser"]
|
||||
v.prevUsageP["cpuSys"] = v.nowUsageP["cpuSys"]
|
||||
v.prevUsage = currUsage
|
||||
nowtime := time.Now().Unix()
|
||||
gracetime := nowtime - v.timestamp
|
||||
nowUsage := v.nowUsageP["cpuUser"] + v.nowUsageP["cpuSys"]
|
||||
nowDeltaUsage := v.deltaUser + v.deltaSys
|
||||
|
||||
log.Debugf("cpuidle ctID %v user %v deltauser %v sys %v deltasys %v usage %v deltausage %v grace %v graceth %v",
|
||||
containerID, v.nowUsageP["cpuUser"], v.deltaUser, v.nowUsageP["cpuSys"], v.deltaSys, nowUsage, nowDeltaUsage, gracetime, graceth)
|
||||
|
||||
if gracetime > graceth {
|
||||
if (v.nowUsageP["cpuUser"] > userth && v.deltaUser > deltauserth) ||
|
||||
(v.nowUsageP["cpuSys"] > systh && v.deltaSys > deltasysth) ||
|
||||
(nowUsage > usageth && nowDeltaUsage > deltausageth) {
|
||||
v.timestamp = nowtime
|
||||
for key := range v.prevUsage {
|
||||
v.prevUsage[key] = 0
|
||||
}
|
||||
return containerID, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type cpuIdleTracing struct{}
|
||||
|
||||
// Cpuidle is an instance of cpuIdleTracer
|
||||
var (
|
||||
tracerTime time.Time
|
||||
)
|
||||
|
||||
type CPUIdleTracingData struct {
|
||||
NowUser uint64 `json:"nowuser"`
|
||||
UserThreshold uint64 `json:"userthreshold"`
|
||||
DeltaUser int64 `json:"deltauser"`
|
||||
DeltaUserTH int64 `json:"deltauserth"`
|
||||
NowSys uint64 `json:"nowsys"`
|
||||
SysThreshold uint64 `json:"systhreshold"`
|
||||
DeltaSys int64 `json:"deltasys"`
|
||||
DeltaSysTH int64 `json:"deltasysth"`
|
||||
NowUsage uint64 `json:"nowusage"`
|
||||
UsageThreshold uint64 `json:"usagethreshold"`
|
||||
DeltaUsage int64 `json:"deltausage"`
|
||||
DeltaUsageTH int64 `json:"deltausageth"`
|
||||
FlameData []flamegraph.FrameData `json:"flamedata"`
|
||||
}
|
||||
|
||||
// Start detect work, load bpf and wait data form perfevent
|
||||
func (c *cpuIdleTracing) Start(ctx context.Context) error {
|
||||
// TODO: Verify the conditions for startup.
|
||||
containerID, err := cpuIdleDetect(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
tracerTime = time.Now()
|
||||
dur := conf.Get().Tracing.Cpuidle.CgUsageToolduration
|
||||
durstr := strconv.FormatInt(dur, 10)
|
||||
|
||||
// exec tracerperf
|
||||
cmdctx, cancel := context.WithTimeout(ctx, time.Duration(dur+30)*time.Second)
|
||||
defer cancel()
|
||||
|
||||
log.Infof("cpuidle exec tracerperf ctid %v dur %v", containerID, durstr)
|
||||
cmd := exec.CommandContext(cmdctx, "./tracer/perf.bin", "--casename", "cpuidle.o", "--container-id", containerID, "--dur", durstr)
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
log.Errorf("cpuidle cmd output %v", strings.TrimSuffix(string(output), "\n"))
|
||||
return fmt.Errorf("cpuidle tracerperf exec err: %w", err)
|
||||
}
|
||||
|
||||
// parse json
|
||||
log.Infof("cpuidle parse json")
|
||||
tracerData := CPUIdleTracingData{}
|
||||
err = json.Unmarshal(output, &tracerData.FlameData)
|
||||
if err != nil {
|
||||
return fmt.Errorf("parse JSON err: %w", err)
|
||||
}
|
||||
|
||||
// save
|
||||
log.Infof("cpuidle upload ES")
|
||||
log.Debugf("cpuidle FlameData %v", tracerData.FlameData)
|
||||
tracerData.NowUser = cpuIdleIdMap[containerID].nowUsageP["cpuUser"]
|
||||
tracerData.UserThreshold = conf.Get().Tracing.Cpuidle.CgUserth
|
||||
tracerData.DeltaUser = cpuIdleIdMap[containerID].deltaUser
|
||||
tracerData.DeltaUserTH = conf.Get().Tracing.Cpuidle.CgDeltaUserth
|
||||
tracerData.NowSys = cpuIdleIdMap[containerID].nowUsageP["cpuSys"]
|
||||
tracerData.SysThreshold = conf.Get().Tracing.Cpuidle.CgSysth
|
||||
tracerData.DeltaSys = cpuIdleIdMap[containerID].deltaSys
|
||||
tracerData.DeltaSysTH = conf.Get().Tracing.Cpuidle.CgDeltaSysth
|
||||
tracerData.NowUsage = cpuIdleIdMap[containerID].nowUsageP["cpuSys"] + cpuIdleIdMap[containerID].nowUsageP["cpuUser"]
|
||||
tracerData.UsageThreshold = conf.Get().Tracing.Cpuidle.CgUsageth
|
||||
tracerData.DeltaUsage = cpuIdleIdMap[containerID].deltaUser + cpuIdleIdMap[containerID].deltaSys
|
||||
tracerData.DeltaUsageTH = conf.Get().Tracing.Cpuidle.CgDeltaUsageth
|
||||
storage.Save("cpuidle", containerID, tracerTime, &tracerData)
|
||||
log.Infof("cpuidle upload ES end")
|
||||
return err
|
||||
}
|
|
@ -0,0 +1,182 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package autotracing
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/flamegraph"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
"huatuo-bamai/pkg/types"
|
||||
)
|
||||
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("cpusys", newCpuSys)
|
||||
}
|
||||
|
||||
func newCpuSys() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &cpuSysTracing{},
|
||||
Internal: 20,
|
||||
Flag: tracing.FlagTracing,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// CPUStats structure that records cpu usage
|
||||
type CPUStats struct {
|
||||
system uint64
|
||||
total uint64
|
||||
}
|
||||
|
||||
func CpuSysDetect(ctx context.Context) (uint64, int64, error) {
|
||||
var (
|
||||
percpuStats CPUStats
|
||||
pervSys uint64
|
||||
deltaSys int64
|
||||
err error
|
||||
)
|
||||
sysdelta := conf.Get().Tracing.Cpusys.CPUSysDelta
|
||||
sysstep := conf.Get().Tracing.Cpusys.CPUSysStep
|
||||
systh := conf.Get().Tracing.Cpusys.CPUSysth
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return 0, 0, types.ErrExitByCancelCtx
|
||||
case <-time.After(time.Duration(sysstep) * time.Second):
|
||||
if percpuStats.total == 0 {
|
||||
percpuStats, err = getCPUStats()
|
||||
if err != nil {
|
||||
return 0, 0, fmt.Errorf("get cpuStats err %w", err)
|
||||
}
|
||||
time.Sleep(1 * time.Second)
|
||||
continue
|
||||
}
|
||||
cpuStats, err := getCPUStats()
|
||||
if err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
systotal := cpuStats.total - percpuStats.total
|
||||
if systotal == 0 {
|
||||
return 0, 0, fmt.Errorf("systotal is ZERO")
|
||||
}
|
||||
sys := (cpuStats.system - percpuStats.system) * 100 / systotal
|
||||
if pervSys != 0 {
|
||||
deltaSys = int64(sys - pervSys)
|
||||
}
|
||||
|
||||
log.Debugf("cpusys alarm sys %v pervsys %v deltasys %v", sys, pervSys, deltaSys)
|
||||
pervSys = sys
|
||||
percpuStats = cpuStats
|
||||
|
||||
if sys > systh || deltaSys > sysdelta {
|
||||
return sys, deltaSys, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func getCPUStats() (CPUStats, error) {
|
||||
statData, err := os.ReadFile("/proc/stat")
|
||||
if err != nil {
|
||||
return CPUStats{}, err
|
||||
}
|
||||
|
||||
lines := strings.Split(string(statData), "\n")
|
||||
for _, line := range lines {
|
||||
fields := strings.Fields(line)
|
||||
if len(fields) < 5 {
|
||||
continue
|
||||
}
|
||||
|
||||
if fields[0] == "cpu" {
|
||||
var cpuStats CPUStats
|
||||
for i := 1; i < len(fields); i++ {
|
||||
value, err := strconv.ParseUint(fields[i], 10, 64)
|
||||
if err != nil {
|
||||
return CPUStats{}, err
|
||||
}
|
||||
cpuStats.total += value
|
||||
if i == 3 {
|
||||
cpuStats.system = value
|
||||
}
|
||||
}
|
||||
return cpuStats, nil
|
||||
}
|
||||
}
|
||||
return CPUStats{}, fmt.Errorf("failed to parse /proc/stat")
|
||||
}
|
||||
|
||||
type cpuSysTracing struct{}
|
||||
|
||||
type CpuSysTracingData struct {
|
||||
NowSys string `json:"now_sys"`
|
||||
SysThreshold string `json:"sys_threshold"`
|
||||
DeltaSys string `json:"delta_sys"`
|
||||
DeltaSysTh string `json:"delta_sys_th"`
|
||||
FlameData []flamegraph.FrameData `json:"flamedata"`
|
||||
}
|
||||
|
||||
// Start the tcpconnlat task.
|
||||
func (c *cpuSysTracing) Start(ctx context.Context) error {
|
||||
// TODO: Verify the conditions for startup.
|
||||
cpuSys, delta, err := CpuSysDetect(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
tracerTime := time.Now()
|
||||
dur := conf.Get().Tracing.Cpusys.CPUSysToolduration
|
||||
durstr := strconv.FormatInt(dur, 10)
|
||||
|
||||
// exec tracerperf
|
||||
cmdctx, cancel := context.WithTimeout(ctx, time.Duration(dur+30)*time.Second)
|
||||
defer cancel()
|
||||
|
||||
log.Infof("cpusys exec tracerperf dur %v", durstr)
|
||||
cmd := exec.CommandContext(cmdctx, "./tracer/perf.bin", "--casename", "cpusys.o", "--dur", durstr)
|
||||
output, err := cmd.CombinedOutput()
|
||||
if err != nil {
|
||||
log.Errorf("cpusys cmd output %v", strings.TrimSuffix(string(output), "\n"))
|
||||
return fmt.Errorf("cpusys tracerperf exec err: %w", err)
|
||||
}
|
||||
|
||||
// parse json
|
||||
log.Infof("cpusys parse json")
|
||||
tracerData := CpuSysTracingData{}
|
||||
err = json.Unmarshal(output, &tracerData.FlameData)
|
||||
if err != nil {
|
||||
return fmt.Errorf("parse JSON err: %w", err)
|
||||
}
|
||||
|
||||
// save
|
||||
log.Infof("cpusys upload ES")
|
||||
tracerData.NowSys = fmt.Sprintf("%d", cpuSys)
|
||||
tracerData.SysThreshold = fmt.Sprintf("%d", conf.Get().Tracing.Cpusys.CPUSysth)
|
||||
tracerData.DeltaSys = fmt.Sprintf("%d", delta)
|
||||
tracerData.DeltaSysTh = fmt.Sprintf("%d", conf.Get().Tracing.Cpusys.CPUSysDelta)
|
||||
storage.Save("cpusys", "", tracerTime, &tracerData)
|
||||
log.Infof("cpusys upload ES end")
|
||||
return err
|
||||
}
|
|
@ -0,0 +1,366 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package autotracing
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/cgroups"
|
||||
"huatuo-bamai/internal/cgroups/paths"
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
"huatuo-bamai/pkg/types"
|
||||
|
||||
cadvisorV1 "github.com/google/cadvisor/info/v1"
|
||||
"github.com/google/cadvisor/utils/cpuload/netlink"
|
||||
"github.com/prometheus/procfs"
|
||||
"github.com/shirou/gopsutil/process"
|
||||
)
|
||||
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("dload", newDload)
|
||||
}
|
||||
|
||||
func newDload() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &dloadTracing{},
|
||||
Internal: 30,
|
||||
Flag: tracing.FlagTracing,
|
||||
}, nil
|
||||
}
|
||||
|
||||
type containerDloadInfo struct {
|
||||
path string
|
||||
name string
|
||||
container *pod.Container
|
||||
avgnrun [2]uint64
|
||||
load [2]float64
|
||||
avgnuni [2]uint64
|
||||
loaduni [2]float64
|
||||
alive bool
|
||||
}
|
||||
|
||||
type DloadTracingData struct {
|
||||
Threshold float64 `json:"threshold"`
|
||||
NrSleeping uint64 `json:"nr_sleeping"`
|
||||
NrRunning uint64 `json:"nr_running"`
|
||||
NrStopped uint64 `json:"nr_stopped"`
|
||||
NrUninterruptible uint64 `json:"nr_uninterruptible"`
|
||||
NrIoWait uint64 `json:"nr_iowait"`
|
||||
LoadAvg float64 `json:"load_avg"`
|
||||
DLoadAvg float64 `json:"dload_avg"`
|
||||
KnowIssue string `json:"known_issue"`
|
||||
InKnownList uint64 `json:"in_known_list"`
|
||||
Stack string `json:"stack"`
|
||||
}
|
||||
|
||||
const (
|
||||
taskHostType = 1
|
||||
taskCgroupType = 2
|
||||
)
|
||||
|
||||
const debugDload = false
|
||||
|
||||
type containersDloadMap map[string]*containerDloadInfo
|
||||
|
||||
var containersDloads = make(containersDloadMap)
|
||||
|
||||
func updateContainersDload() error {
|
||||
containers, err := pod.GetAllContainers()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, container := range containers {
|
||||
if _, ok := containersDloads[container.ID]; ok {
|
||||
containersDloads[container.ID].name = container.CgroupSuffix
|
||||
containersDloads[container.ID].path = paths.Path("cpu", container.CgroupSuffix)
|
||||
containersDloads[container.ID].container = container
|
||||
containersDloads[container.ID].alive = true
|
||||
continue
|
||||
}
|
||||
|
||||
containersDloads[container.ID] = &containerDloadInfo{
|
||||
path: paths.Path("cpu", container.CgroupSuffix),
|
||||
name: container.CgroupSuffix,
|
||||
container: container,
|
||||
alive: true,
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func detectDloadContainer(thresh float64, interval int) (*containerDloadInfo, cadvisorV1.LoadStats, error) {
|
||||
empty := cadvisorV1.LoadStats{}
|
||||
|
||||
n, err := netlink.New()
|
||||
if err != nil {
|
||||
return nil, empty, err
|
||||
}
|
||||
defer n.Stop()
|
||||
|
||||
for containerId, dload := range containersDloads {
|
||||
if !dload.alive {
|
||||
delete(containersDloads, containerId)
|
||||
} else {
|
||||
dload.alive = false
|
||||
|
||||
timeStart := dload.container.StartedAt.Add(time.Second * time.Duration(interval))
|
||||
if time.Now().Before(timeStart) {
|
||||
log.Debugf("%s were just started, we'll start monitoring it later.", dload.container.Hostname)
|
||||
continue
|
||||
}
|
||||
|
||||
stats, err := n.GetCpuLoad(dload.name, dload.path)
|
||||
if err != nil {
|
||||
log.Debugf("failed to get %s load, probably the container has been deleted: %s", dload.container.Hostname, err)
|
||||
continue
|
||||
}
|
||||
|
||||
updateLoad(dload, stats.NrRunning, stats.NrUninterruptible)
|
||||
|
||||
if dload.loaduni[0] > thresh || debugDload {
|
||||
log.Infof("dload event: Threshold=%0.2f %+v, LoadAvg=%0.2f, DLoadAvg=%0.2f",
|
||||
thresh, stats, dload.load[0], dload.loaduni[0])
|
||||
|
||||
return dload, stats, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil, empty, fmt.Errorf("no dload containers")
|
||||
}
|
||||
|
||||
func buildAndSaveDloadContainer(thresh float64, container *containerDloadInfo, loadstat cadvisorV1.LoadStats) error {
|
||||
cgrpPath := container.name
|
||||
containerID := container.container.ID
|
||||
containerHostNamespace := container.container.LabelHostNamespace()
|
||||
|
||||
stackCgrp, err := dumpUninterruptibleTaskStack(taskCgroupType, cgrpPath, debugDload)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if stackCgrp == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
stackHost, err := dumpUninterruptibleTaskStack(taskHostType, "", debugDload)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
data := &DloadTracingData{
|
||||
NrSleeping: loadstat.NrSleeping,
|
||||
NrRunning: loadstat.NrRunning,
|
||||
NrStopped: loadstat.NrStopped,
|
||||
NrUninterruptible: loadstat.NrUninterruptible,
|
||||
NrIoWait: loadstat.NrIoWait,
|
||||
LoadAvg: container.load[0],
|
||||
DLoadAvg: container.loaduni[0],
|
||||
Threshold: thresh,
|
||||
Stack: fmt.Sprintf("%s%s", stackCgrp, stackHost),
|
||||
}
|
||||
|
||||
// Check if this is caused by known issues.
|
||||
knownIssue, inKnownList := conf.KnownIssueSearch(stackCgrp, containerHostNamespace, "")
|
||||
if knownIssue != "" {
|
||||
data.KnowIssue = knownIssue
|
||||
data.InKnownList = inKnownList
|
||||
} else {
|
||||
data.KnowIssue = "none"
|
||||
data.InKnownList = inKnownList
|
||||
}
|
||||
|
||||
storage.Save("dload", containerID, time.Now(), data)
|
||||
return nil
|
||||
}
|
||||
|
||||
const (
|
||||
fShift = 11
|
||||
fixed1 = 1 << fShift
|
||||
exp1 = 1884
|
||||
exp5 = 2014
|
||||
exp15 = 2037
|
||||
)
|
||||
|
||||
func calcLoad(load, exp, active uint64) uint64 {
|
||||
var newload uint64
|
||||
|
||||
newload = load*exp + active*(fixed1-exp)
|
||||
newload += 1 << (fShift - 1)
|
||||
|
||||
return newload / fixed1
|
||||
}
|
||||
|
||||
func calcLoadavg(avgnrun [2]uint64, active uint64) (avgnresult [2]uint64) {
|
||||
if active > 0 {
|
||||
active *= fixed1
|
||||
} else {
|
||||
active = 0
|
||||
}
|
||||
|
||||
avgnresult[0] = calcLoad(avgnrun[0], exp1, active)
|
||||
avgnresult[1] = calcLoad(avgnrun[1], exp5, active)
|
||||
|
||||
return avgnresult
|
||||
}
|
||||
|
||||
func loadInt(x uint64) (r uint64) {
|
||||
r = x >> fShift
|
||||
return r
|
||||
}
|
||||
|
||||
func loadFrac(x uint64) (r uint64) {
|
||||
r = loadInt((x & (fixed1 - 1)) * 100)
|
||||
return r
|
||||
}
|
||||
|
||||
func getAvenrun(avgnrun [2]uint64, offset uint64, shift int) (loadavgNew [2]float64) {
|
||||
var loads [2]uint64
|
||||
|
||||
loads[0] = (avgnrun[0] + offset) << shift
|
||||
loads[1] = (avgnrun[1] + offset) << shift
|
||||
|
||||
loadavgNew[0] = float64(loadInt(loads[0])) +
|
||||
float64(loadFrac(loads[0]))/float64(100)
|
||||
|
||||
loadavgNew[1] = float64(loadInt(loads[1])) +
|
||||
float64(loadFrac(loads[1]))/float64(100)
|
||||
|
||||
return loadavgNew
|
||||
}
|
||||
|
||||
// updateLoad folds one sample of task counts into a container's load state:
// info.load tracks running+uninterruptible tasks, info.loaduni tracks
// uninterruptible (D-state) tasks only. The fixed1/200 offset appears to
// mirror the kernel's FIXED_1/200 rounding term used when formatting
// /proc/loadavg — TODO confirm against fs/proc/loadavg.c.
func updateLoad(info *containerDloadInfo, nrRunning, nrUninterruptible uint64) {
	info.avgnrun = calcLoadavg(info.avgnrun, nrRunning+nrUninterruptible)
	info.load = getAvenrun(info.avgnrun, fixed1/200, 0)
	info.avgnuni = calcLoadavg(info.avgnuni, nrUninterruptible)
	info.loaduni = getAvenrun(info.avgnuni, fixed1/200, 0)
}
|
||||
|
||||
// pidStack returns the kernel stack of a task as read from
// /proc/<pid>/stack. Read failures (task exited, insufficient privileges)
// deliberately yield "" rather than an error: callers skip empty stacks.
func pidStack(pid int32) string {
	path := fmt.Sprintf("/proc/%d/stack", pid)
	content, _ := os.ReadFile(path)
	return string(content)
}
|
||||
|
||||
func cgroupHostTasks(where int, path string) ([]int32, error) {
|
||||
switch where {
|
||||
case taskCgroupType:
|
||||
cgroup, err := cgroups.NewCgroupManager()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return cgroup.Pids(path)
|
||||
case taskHostType:
|
||||
var pidList []int32
|
||||
|
||||
procs, err := procfs.AllProcs()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, p := range procs {
|
||||
pidList = append(pidList, int32(p.PID))
|
||||
}
|
||||
return pidList, err
|
||||
default:
|
||||
return nil, fmt.Errorf("type not supported")
|
||||
}
|
||||
}
|
||||
|
||||
func dumpUninterruptibleTaskStack(where int, path string, all bool) (string, error) {
|
||||
var appended bool = false
|
||||
|
||||
stacks := new(bytes.Buffer)
|
||||
|
||||
tasks, err := cgroupHostTasks(where, path)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
for _, pid := range tasks {
|
||||
proc, err := process.NewProcess(pid)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
status, err := proc.Status()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if status == "D" || status == "U" || all {
|
||||
comm, err := proc.Name()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
stack := pidStack(pid)
|
||||
if stack == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
fmt.Fprintf(stacks, "Comm: %s\tPid: %d\n%s\n", comm, pid, stack)
|
||||
appended = true
|
||||
}
|
||||
}
|
||||
|
||||
if appended {
|
||||
title := "\nstacktrace of D task in cgroup:\n"
|
||||
if where == taskHostType {
|
||||
title = "\nstacktrace of D task in host:\n"
|
||||
}
|
||||
|
||||
return fmt.Sprintf("%s%s", title, stacks), nil
|
||||
}
|
||||
|
||||
return "", nil
|
||||
}
|
||||
|
||||
type dloadTracing struct{}
|
||||
|
||||
// Start detect work, monitor the load of containers
|
||||
func (c *dloadTracing) Start(ctx context.Context) error {
|
||||
thresh := conf.Get().Tracing.Dload.ThresholdLoad
|
||||
interval := conf.Get().Tracing.Dload.MonitorGap
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return types.ErrExitByCancelCtx
|
||||
default:
|
||||
time.Sleep(5 * time.Second)
|
||||
|
||||
if err := updateContainersDload(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
container, loadstat, err := detectDloadContainer(thresh, interval)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
_ = buildAndSaveDloadContainer(thresh, container, loadstat)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,246 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package autotracing
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"os"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
|
||||
"github.com/shirou/gopsutil/process"
|
||||
)
|
||||
|
||||
// init registers the memory-burst tracer under the name "membust".
func init() {
	tracing.RegisterEventTracing("membust", newMemBurst)
}
|
||||
|
||||
// newMemBurst builds the tracing attributes for the memory-burst tracer.
func newMemBurst() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &memBurstTracing{},
		// NOTE(review): Internal's semantics are defined by the tracing
		// framework (other tracers in this repo use 5-60) — presumably an
		// interval in seconds; confirm against pkg/tracing.
		Internal: 10,
		Flag:     tracing.FlagTracing,
	}, nil
}
|
||||
|
||||
// memBurstTracing implements the "membust" tracer.
type memBurstTracing struct{}

// MemoryTracingData is the record saved to storage when a burst is detected.
type MemoryTracingData struct {
	TopMemoryUsage []ProcessMemoryInfo `json:"top_memory_usage"`
}

// ProcessMemoryInfo holds process information for sorting
type ProcessMemoryInfo struct {
	PID         int32  `json:"pid"`
	ProcessName string `json:"process_name"`
	MemorySize  uint64 `json:"memory_size"` // RSS as reported by gopsutil MemoryInfo
}

// ByMemory sorts processes by memory usage.
type ByMemory []ProcessMemoryInfo

func (a ByMemory) Len() int      { return len(a) }
func (a ByMemory) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

// Less orders descending: largest memory consumers first.
func (a ByMemory) Less(i, j int) bool { return a[i].MemorySize > a[j].MemorySize }
|
||||
|
||||
// getTopMemoryProcesses returns the top N processes consuming the most memory.
|
||||
func getTopMemoryProcesses(topN int) ([]ProcessMemoryInfo, error) {
|
||||
processes, err := process.Processes()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var pmInfos []ProcessMemoryInfo
|
||||
for _, p := range processes {
|
||||
memInfo, err := p.MemoryInfo()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
name, err := p.Name()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
pmInfos = append(pmInfos, ProcessMemoryInfo{
|
||||
PID: p.Pid,
|
||||
ProcessName: name,
|
||||
MemorySize: memInfo.RSS,
|
||||
})
|
||||
}
|
||||
|
||||
// Sort the processes by memory usage
|
||||
sort.Sort(ByMemory(pmInfos))
|
||||
|
||||
if len(pmInfos) < topN {
|
||||
return pmInfos, nil
|
||||
}
|
||||
return pmInfos[:topN], nil
|
||||
}
|
||||
|
||||
// readMemInfo scans /proc/meminfo and returns the integer value for each
// requested key (values are in kB, as reported by the kernel). Scanning
// stops early once every requested key has been seen; keys absent from
// /proc/meminfo are simply missing from the result.
func readMemInfo(requiredKeys map[string]bool) (map[string]int, error) {
	f, err := os.Open("/proc/meminfo")
	if err != nil {
		return nil, err
	}
	defer f.Close()

	values := make(map[string]int)
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		// Lines look like "MemTotal:       16280636 kB".
		fields := strings.Fields(sc.Text())
		if len(fields) < 2 {
			continue
		}

		name := strings.Trim(fields[0], ":")
		if _, wanted := requiredKeys[name]; !wanted {
			continue
		}

		v, convErr := strconv.Atoi(strings.Trim(fields[1], " kB"))
		if convErr != nil {
			return nil, convErr
		}
		values[name] = v

		if len(values) == len(requiredKeys) {
			break
		}
	}

	if err := sc.Err(); err != nil {
		return nil, err
	}
	return values, nil
}
|
||||
|
||||
// checkAndRecordMemoryUsage samples anonymous memory usage into the caller's
// circular buffer and, once the window is full, returns the top memory
// consumers when a burst is detected: current usage grew to at least
// burstRatio times the oldest sample AND exceeds anonThreshold percent of
// memTotal. Returns an empty slice when no burst is detected.
//
// currentIndex and isHistoryFull are caller-owned cursor/state for the
// circular buffer `history` (length historyWindowLength) and are advanced
// in place. memTotal and the sampled sums are in kB (from /proc/meminfo).
func checkAndRecordMemoryUsage(currentIndex *int, isHistoryFull *bool,
	memTotal int, history []int, historyWindowLength, topNProcesses int,
	burstRatio float64, anonThreshold int,
) ([]ProcessMemoryInfo, error) {
	memInfo, err := readMemInfo(map[string]bool{
		"Active(anon)":   true,
		"Inactive(anon)": true,
	})
	if err != nil {
		// Best effort: a transient meminfo read failure is logged, not fatal.
		log.Errorf("Error reading memory info: %v\n", err)
		return []ProcessMemoryInfo{}, nil
	}

	currentSum := memInfo["Active(anon)"] + memInfo["Inactive(anon)"]
	history[*currentIndex] = currentSum

	// The window is full once its last slot has been written.
	if *currentIndex == historyWindowLength-1 {
		*isHistoryFull = true
	}

	// Advance the ring cursor; it now points at the oldest sample.
	*currentIndex = (*currentIndex + 1) % historyWindowLength

	log.Debugf("Checked memory status. active_anon=%v KiB inactive_anon=%v KiB\n", memInfo["Active(anon)"], memInfo["Inactive(anon)"])

	if *isHistoryFull {
		oldestSum := history[*currentIndex] // current index is the oldest element
		if float64(currentSum) >= burstRatio*float64(oldestSum) && currentSum >= (anonThreshold*memTotal/100) {
			topProcesses, err := getTopMemoryProcesses(topNProcesses)
			if err == nil {
				return topProcesses, nil
			}
			log.Errorf("Fail to getTopMemoryProcesses")
			return []ProcessMemoryInfo{}, err
		}
	}
	return []ProcessMemoryInfo{}, nil
}
|
||||
|
||||
// Core function
|
||||
func (c *memBurstTracing) Start(ctx context.Context) error {
|
||||
var err error
|
||||
|
||||
historyWindowLength := conf.Get().Tracing.MemoryBurst.HistoryWindowLength
|
||||
sampleInterval := conf.Get().Tracing.MemoryBurst.SampleInterval
|
||||
silencePeriod := conf.Get().Tracing.MemoryBurst.SilencePeriod
|
||||
topNProcesses := conf.Get().Tracing.MemoryBurst.TopNProcesses
|
||||
burstRatio := conf.Get().Tracing.MemoryBurst.BurstRatio
|
||||
anonThreshold := conf.Get().Tracing.MemoryBurst.AnonThreshold
|
||||
|
||||
memInfo, err := readMemInfo(map[string]bool{"MemTotal": true})
|
||||
if err != nil {
|
||||
log.Infof("Error reading MemTotal from memory info: %v\n", err)
|
||||
return err
|
||||
}
|
||||
memTotal := memInfo["MemTotal"]
|
||||
history := make([]int, historyWindowLength) // circular buffer
|
||||
var currentIndex int
|
||||
var isHistoryFull bool // don't check memory burst until we have enough data
|
||||
var topProcesses []ProcessMemoryInfo
|
||||
lastReportTime := time.Now().Add(-24 * time.Hour)
|
||||
|
||||
_, err = checkAndRecordMemoryUsage(¤tIndex, &isHistoryFull, memTotal, history, historyWindowLength, topNProcesses, burstRatio, anonThreshold)
|
||||
if err != nil {
|
||||
log.Errorf("Fail to checkAndRecordMemoryUsage")
|
||||
return err
|
||||
}
|
||||
|
||||
for {
|
||||
ticker := time.NewTicker(time.Duration(sampleInterval) * time.Second)
|
||||
stoppedByUser := false
|
||||
|
||||
for range ticker.C {
|
||||
topProcesses, err = checkAndRecordMemoryUsage(¤tIndex, &isHistoryFull, memTotal, history, historyWindowLength, topNProcesses, burstRatio, anonThreshold)
|
||||
if err != nil {
|
||||
log.Errorf("Fail to checkAndRecordMemoryUsage")
|
||||
return err
|
||||
}
|
||||
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
log.Info("Caller request to stop")
|
||||
stoppedByUser = true
|
||||
default:
|
||||
}
|
||||
|
||||
if len(topProcesses) > 0 || stoppedByUser {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
ticker.Stop()
|
||||
|
||||
if stoppedByUser {
|
||||
break
|
||||
}
|
||||
|
||||
currentTime := time.Now()
|
||||
diff := currentTime.Sub(lastReportTime).Seconds()
|
||||
if diff < float64(silencePeriod) {
|
||||
continue
|
||||
}
|
||||
|
||||
lastReportTime = currentTime
|
||||
|
||||
storage.Save("memburst", "", time.Now(), &MemoryTracingData{TopMemoryUsage: topProcesses})
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,256 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package events
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/bpf"
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/internal/utils/bpfutil"
|
||||
"huatuo-bamai/internal/utils/netutil"
|
||||
"huatuo-bamai/internal/utils/symbolutil"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
const (
	tracerName = "dropwatch"
	logPrefix  = tracerName + ": "

	// Drop categories carried in perfEventT.Type; values presumably must
	// match the constants emitted by bpf/dropwatch.c — confirm when editing
	// either side.
	typeTCPCommonDrop               = 1
	typeTCPSynFlood                 = 2
	typeTCPListenOverflowHandshake1 = 3
	typeTCPListenOverflowHandshake3 = 4
)
|
||||
|
||||
// tcpstateMap maps kernel TCP state numbers to human-readable names,
// from include/net/tcp_states.h. Index 0 is not a valid kernel state,
// hence the "<nil>" placeholder.
var tcpstateMap = []string{
	"<nil>", // 0
	"ESTABLISHED",
	"SYN_SENT",
	"SYN_RECV",
	"FIN_WAIT1",
	"FIN_WAIT2",
	"TIME_WAIT",
	"CLOSE",
	"CLOSE_WAIT",
	"LAST_ACK",
	"LISTEN",
	"CLOSING",
	"NEW_SYN_RECV",
}
|
||||
|
||||
// typeMap maps the numeric drop category from the BPF event to the string
// stored in DropWatchTracingData.Type.
var typeMap = map[uint8]string{
	typeTCPCommonDrop:               "common_drop",
	typeTCPSynFlood:                 "syn_flood",
	typeTCPListenOverflowHandshake1: "listen_overflow_handshake1",
	typeTCPListenOverflowHandshake3: "listen_overflow_handshake3",
}
||||
|
||||
// perfEventT mirrors the raw event record read from the "perf_events" pipe
// via reader.ReadInto, so field order and widths must match the C struct in
// bpf/dropwatch.c. Addresses, ports and sequence numbers arrive in network
// byte order and are converted in formatEvent.
type perfEventT struct {
	TgidPid         uint64                                   `json:"tgid_pid"` // upper 32 bits: tgid (pid in userspace terms)
	Saddr           uint32                                   `json:"saddr"`
	Daddr           uint32                                   `json:"daddr"`
	Sport           uint16                                   `json:"sport"`
	Dport           uint16                                   `json:"dport"`
	Seq             uint32                                   `json:"seq"`
	AckSeq          uint32                                   `json:"ack_seq"`
	QueueMapping    uint32                                   `json:"queue_mapping"`
	PktLen          uint64                                   `json:"pkt_len"`
	StackSize       int64                                    `json:"stack_size"`
	Stack           [symbolutil.KsymbolStackMaxDepth]uint64  `json:"stack"` // raw kernel stack addresses
	SkMaxAckBacklog uint32                                   `json:"sk_max_ack_backlog"`
	State           uint8                                    `json:"state"` // TCP state number, see tcpstateMap
	Type            uint8                                    `json:"type"`  // drop category, see typeMap
	Comm            [bpfutil.TaskCommLen]byte                `json:"comm"`  // NUL-padded task comm
}
|
||||
|
||||
// DropWatchTracingData is the formatted, storage-ready form of one packet
// drop event: addresses as dotted strings, ports/seqs in host byte order,
// the kernel stack symbolized, and best-effort reverse-DNS hostnames.
type DropWatchTracingData struct {
	Type          string `json:"type"`
	Comm          string `json:"comm"`
	Pid           uint64 `json:"pid"`
	Saddr         string `json:"saddr"`
	Daddr         string `json:"daddr"`
	Sport         uint16 `json:"sport"`
	Dport         uint16 `json:"dport"`
	SrcHostname   string `json:"src_hostname"`  // "<nil>" when reverse DNS fails
	DestHostname  string `json:"dest_hostname"` // "<nil>" when reverse DNS fails
	MaxAckBacklog uint32 `json:"max_ack_backlog"`
	Seq           uint32 `json:"seq"`
	AckSeq        uint32 `json:"ack_seq"`
	QueueMapping  uint32 `json:"queue_mapping"`
	PktLen        uint64 `json:"pkt_len"`
	State         string `json:"state"`
	Stack         string `json:"stack"` // newline-separated symbolized frames
}
|
||||
|
||||
// dropWatchTracing implements the dropwatch packet-drop tracer.
type dropWatchTracing struct{}

//go:generate $BPF_COMPILE $BPF_INCLUDE -s $BPF_DIR/dropwatch.c -o $BPF_DIR/dropwatch.o

// init registers the tracer under tracerName ("dropwatch").
func init() {
	tracing.RegisterEventTracing(tracerName, newDropWatch)
}
|
||||
|
||||
// newDropWatch builds the tracing attributes for the dropwatch tracer.
func newDropWatch() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &dropWatchTracing{},
		// NOTE(review): Internal's semantics come from the tracing framework;
		// presumably an interval in seconds — confirm against pkg/tracing.
		Internal: 10,
		Flag:     tracing.FlagTracing,
	}, nil
}
|
||||
|
||||
// Start starts the tracer: it loads the dropwatch BPF object, attaches it
// and streams events from the "perf_events" pipe, formatting each event and
// saving it to storage unless it matches a known-benign pattern (ignore).
// Runs until childCtx is cancelled (by the caller or by the breaker wired
// up via WaitDetachByBreaker); returns nil on clean shutdown.
func (c *dropWatchTracing) Start(ctx context.Context) error {
	b, err := bpf.LoadBpf(bpfutil.ThisBpfOBJ(), nil)
	if err != nil {
		return fmt.Errorf("load bpf: %w", err)
	}
	defer b.Close()

	childCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	// attach and open the perf event pipe (8192-byte buffer)
	reader, err := b.AttachAndEventPipe(childCtx, "perf_events", 8192)
	if err != nil {
		return fmt.Errorf("attach and event pipe: %w", err)
	}
	defer reader.Close()

	// Let the framework cancel childCtx when the tracer should detach.
	b.WaitDetachByBreaker(childCtx, cancel)

	for {
		select {
		case <-childCtx.Done():
			log.Info(logPrefix + "tracer is stopped.")
			return nil
		default:
			// ReadInto decodes one fixed-layout event; an error here is
			// treated as fatal for the tracer (the framework restarts it).
			var event perfEventT
			if err := reader.ReadInto(&event); err != nil {
				return fmt.Errorf(logPrefix+"failed to read from perf: %w", err)
			}

			// format raw event into storage-ready data
			tracerData := c.formatEvent(&event)

			if c.ignore(tracerData) {
				log.Debugf(logPrefix+"ignore dropwatch data: %v", tracerData)
				continue
			}

			storage.Save(tracerName, "", time.Now(), tracerData)
		}
	}
}
|
||||
|
||||
func (c *dropWatchTracing) formatEvent(event *perfEventT) *DropWatchTracingData {
|
||||
// hostname
|
||||
saddr := netutil.InetNtop(event.Saddr).String()
|
||||
daddr := netutil.InetNtop(event.Daddr).String()
|
||||
srcHostname := "<nil>"
|
||||
destHostname := "<nil>"
|
||||
h, err := net.LookupAddr(saddr)
|
||||
if err == nil && len(h) > 0 {
|
||||
srcHostname = h[0]
|
||||
}
|
||||
|
||||
h, err = net.LookupAddr(daddr)
|
||||
if err == nil && len(h) > 0 {
|
||||
destHostname = h[0]
|
||||
}
|
||||
|
||||
// stack
|
||||
stacks := strings.Join(symbolutil.DumpKernelBackTrace(event.Stack[:], symbolutil.KsymbolStackMaxDepth).BackTrace, "\n")
|
||||
|
||||
// tracer data
|
||||
data := &DropWatchTracingData{
|
||||
Type: typeMap[event.Type],
|
||||
Comm: strings.TrimRight(string(event.Comm[:]), "\x00"),
|
||||
Pid: event.TgidPid >> 32,
|
||||
Saddr: saddr,
|
||||
Daddr: daddr,
|
||||
Sport: netutil.InetNtohs(event.Sport),
|
||||
Dport: netutil.InetNtohs(event.Dport),
|
||||
SrcHostname: srcHostname,
|
||||
DestHostname: destHostname,
|
||||
Seq: netutil.InetNtohl(event.Seq),
|
||||
AckSeq: netutil.InetNtohl(event.AckSeq),
|
||||
QueueMapping: event.QueueMapping,
|
||||
PktLen: event.PktLen,
|
||||
State: tcpstateMap[event.State],
|
||||
Stack: stacks,
|
||||
MaxAckBacklog: event.SkMaxAckBacklog,
|
||||
}
|
||||
|
||||
log.Debugf(logPrefix+"tracing data: %v", data)
|
||||
return data
|
||||
}
|
||||
|
||||
func (c *dropWatchTracing) ignore(data *DropWatchTracingData) bool {
|
||||
stack := strings.Split(data.Stack, "\n")
|
||||
// state: CLOSE_WAIT
|
||||
// stack:
|
||||
// 1. kfree_skb/ffffffff963047b0
|
||||
// 2. kfree_skb/ffffffff963047b0
|
||||
// 3. skb_rbtree_purge/ffffffff963089e0
|
||||
// 4. tcp_fin/ffffffff963ac200
|
||||
// 5. ...
|
||||
if data.State == "CLOSE_WAIT" {
|
||||
if len(stack) >= 3 && strings.HasPrefix(stack[2], "skb_rbtree_purge/") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// stack:
|
||||
// 1. kfree_skb/ffffffff96d127b0
|
||||
// 2. kfree_skb/ffffffff96d127b0
|
||||
// 3. neigh_invalidate/ffffffff96d388b0
|
||||
// 4. neigh_timer_handler/ffffffff96d3a870
|
||||
// 5. ...
|
||||
if conf.Get().Tracing.Dropwatch.IgnoreNeighInvalidate {
|
||||
if len(stack) >= 3 && strings.HasPrefix(stack[2], "neigh_invalidate/") {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// stack:
|
||||
// 1. kfree_skb/ffffffff82283d10
|
||||
// 2. kfree_skb/ffffffff82283d10
|
||||
// 3. bnxt_tx_int/ffffffffc05c6f20
|
||||
// 4. __bnxt_poll_work_done/ffffffffc05c50c0
|
||||
// 5. ...
|
||||
|
||||
// stack:
|
||||
// 1. kfree_skb/ffffffffaba83d10
|
||||
// 2. kfree_skb/ffffffffaba83d10
|
||||
// 3. __bnxt_tx_int/ffffffffc045df90
|
||||
// 4. bnxt_tx_int/ffffffffc045e250
|
||||
// 5. ...
|
||||
if len(stack) >= 3 &&
|
||||
(strings.HasPrefix(stack[2], "bnxt_tx_int/") || strings.HasPrefix(stack[2], "__bnxt_tx_int/")) {
|
||||
return true
|
||||
}
|
||||
|
||||
// default: false
|
||||
return false
|
||||
}
|
|
@ -0,0 +1,128 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package events
|
||||
|
||||
import (
	"context"
	"fmt"
	"os"
	"strings"
	"sync/atomic"
	"time"

	"huatuo-bamai/internal/bpf"
	"huatuo-bamai/internal/storage"
	"huatuo-bamai/internal/utils/bpfutil"
	"huatuo-bamai/internal/utils/kmsgutil"
	"huatuo-bamai/pkg/metric"
	"huatuo-bamai/pkg/tracing"
)
|
||||
|
||||
//go:generate $BPF_COMPILE $BPF_INCLUDE -s $BPF_DIR/hungtask.c -o $BPF_DIR/hungtask.o

// hungTaskPerfEventData mirrors the record read from the
// "hungtask_perf_events" pipe via reader.ReadInto; the layout must match
// the C struct in bpf/hungtask.c.
type hungTaskPerfEventData struct {
	Pid  int32                     // pid of the hung task
	Comm [bpfutil.TaskCommLen]byte // NUL-padded task comm
}

// HungTaskTracerData is the full data structure saved to storage for one
// hung-task event.
type HungTaskTracerData struct {
	Pid                   int32  `json:"pid"`
	Comm                  string `json:"comm"`
	CPUsStack             string `json:"cpus_stack"`              // all-CPU backtraces from kmsg
	BlockedProcessesStack string `json:"blocked_processes_stack"` // D-state process backtraces from kmsg
}
|
||||
|
||||
// hungTaskTracing implements the hungtask tracer and carries its metric slot.
type hungTaskTracing struct {
	metric []*metric.Data
}

func init() {
	// Some OS distributions such as Fedora-42 may disable this feature;
	// without the hung-task sysctl the kernel never reports hung tasks,
	// so registering the tracer would be pointless.
	hungTaskSysctl := "/proc/sys/kernel/hung_task_timeout_secs"
	if _, err := os.Stat(hungTaskSysctl); err != nil {
		return
	}

	tracing.RegisterEventTracing("hungtask", newHungTask)
}
|
||||
|
||||
// newHungTask builds the tracing attributes for the hungtask tracer, which
// is both a metric provider (hung-task counter gauge) and an event tracer.
func newHungTask() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &hungTaskTracing{
			metric: []*metric.Data{
				metric.NewGaugeData("counter", 0, "hungtask counter", nil),
			},
		},
		Internal: 10,
		Flag:     tracing.FlagMetric | tracing.FlagTracing,
	}, nil
}
|
||||
|
||||
var hungtaskCounter float64
|
||||
|
||||
func (c *hungTaskTracing) Update() ([]*metric.Data, error) {
|
||||
c.metric[0].Value = hungtaskCounter
|
||||
return c.metric, nil
|
||||
}
|
||||
|
||||
func (c *hungTaskTracing) Start(ctx context.Context) error {
|
||||
b, err := bpf.LoadBpf(bpfutil.ThisBpfOBJ(), nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer b.Close()
|
||||
|
||||
childCtx, cancel := context.WithCancel(ctx)
|
||||
defer cancel()
|
||||
|
||||
reader, err := b.AttachAndEventPipe(childCtx, "hungtask_perf_events", 8192)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer reader.Close()
|
||||
|
||||
b.WaitDetachByBreaker(childCtx, cancel)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-childCtx.Done():
|
||||
return nil
|
||||
default:
|
||||
var data hungTaskPerfEventData
|
||||
if err := reader.ReadInto(&data); err != nil {
|
||||
return fmt.Errorf("hungtask ReadFromPerfEvent: %w", err)
|
||||
}
|
||||
|
||||
cpusBT, err := kmsgutil.GetAllCPUsBT()
|
||||
if err != nil {
|
||||
cpusBT = err.Error()
|
||||
}
|
||||
|
||||
blockedProcessesBT, err := kmsgutil.GetBlockedProcessesBT()
|
||||
if err != nil {
|
||||
blockedProcessesBT = err.Error()
|
||||
}
|
||||
|
||||
hungtaskCounter++
|
||||
|
||||
storage.Save("hungtask", "", time.Now(), &HungTaskTracerData{
|
||||
Pid: data.Pid,
|
||||
Comm: strings.TrimRight(string(data.Comm[:]), "\x00"),
|
||||
CPUsStack: cpusBT,
|
||||
BlockedProcessesStack: blockedProcessesBT,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,147 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package events
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/bpf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/internal/utils/bpfutil"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
|
||||
"github.com/vishvananda/netlink"
|
||||
)
|
||||
|
||||
//go:generate $BPF_COMPILE $BPF_INCLUDE -s $BPF_DIR/lacp.c -o $BPF_DIR/lacp.o

// lacpTracing implements the lacp tracer; count accumulates observed LACP
// events and is read atomically by Update.
type lacpTracing struct {
	count uint64
}
|
||||
|
||||
func init() {
	// bond mode4 (802.3ad) requires bonding.ko module;
	// the kprobe point is in the bonding module, so if no 802.3ad bond
	// exists on this host the BPF program should not be loaded at all.
	if !isLacpEnv() {
		return
	}

	tracing.RegisterEventTracing("lacp", newLACPTracing)
}
|
||||
|
||||
// newLACPTracing builds the tracing attributes for the lacp tracer, which is
// both a metric provider (event counter) and an event tracer.
func newLACPTracing() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &lacpTracing{},
		Internal:    60,
		Flag:        tracing.FlagTracing | tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
// Start loads the lacp BPF program and, on every event from "ad_event_map",
// increments the counter and snapshots /proc/net/bonding/* into a tracing
// record. Runs until childCtx is cancelled; returns nil on clean shutdown.
//
// NOTE(review): unlike the other tracers in this package, this one does not
// call b.WaitDetachByBreaker(childCtx, cancel) — confirm whether that is
// intentional or an omission.
func (lacp *lacpTracing) Start(ctx context.Context) (err error) {
	b, err := bpf.LoadBpf(bpfutil.ThisBpfOBJ(), nil)
	if err != nil {
		return fmt.Errorf("load bpf: %w", err)
	}
	defer b.Close()

	childCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	reader, err := b.AttachAndEventPipe(childCtx, "ad_event_map", 8192)
	if err != nil {
		return fmt.Errorf("attach and event pipe: %w", err)
	}
	defer reader.Close()

	for {
		select {
		case <-childCtx.Done():
			log.Info("lacp tracing is stopped.")
			return nil
		default:
			// The event payload itself is not used — reading it only
			// consumes one record from the pipe.
			var tmp uint64
			if err := reader.ReadInto(&tmp); err != nil {
				return fmt.Errorf("read lacp perf event fail: %w", err)
			}

			atomic.AddUint64(&lacp.count, 1)

			// Snapshot the bonding state; a read failure skips this event
			// but keeps the tracer running.
			bondInfo, err := readAllFiles("/proc/net/bonding")
			if err != nil {
				log.Warnf("read dir /proc/net/bonding err: %v", err)
				continue
			}

			tracerData := struct {
				Content string `json:"content"`
			}{
				Content: bondInfo,
			}

			log.Debugf("bond info: %s", tracerData.Content)
			storage.Save("lacp", "", time.Now(), tracerData)
		}
	}
}
|
||||
|
||||
// Update exposes the accumulated LACP event count as a gauge; the atomic
// load pairs with the atomic add in Start.
func (lacp *lacpTracing) Update() ([]*metric.Data, error) {
	return []*metric.Data{
		metric.NewGaugeData("lacp", float64(atomic.LoadUint64(&lacp.count)),
			"lacp disabled count", nil),
	}, nil
}
|
||||
|
||||
func readAllFiles(dir string) (string, error) {
|
||||
var content string
|
||||
|
||||
return content, filepath.Walk(dir, func(path string, info os.FileInfo, err error) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info.IsDir() {
|
||||
return nil
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
content += path + "\n" + string(data)
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
func isLacpEnv() bool {
|
||||
links, err := netlink.LinkList()
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
|
||||
for _, l := range links {
|
||||
if l.Type() == "bond" &&
|
||||
l.(*netlink.Bond).Mode == netlink.BOND_MODE_802_3AD {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
|
@ -0,0 +1,117 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package events
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/bpf"
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/internal/utils/bpfutil"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
// memoryReclaimTracing implements the memory_reclaim_events tracer.
type memoryReclaimTracing struct{}

// memoryReclaimPerfEvent mirrors the record read from the
// "reclaim_perf_events" pipe via reader.ReadInto; the layout must match the
// C struct in bpf/memory_reclaim_events.c.
type memoryReclaimPerfEvent struct {
	Comm      [bpfutil.TaskCommLen]byte // NUL-padded task comm
	Deltatime uint64                    // reclaim duration reported by BPF (units defined by the C side)
	CSS       uint64                    // cgroup subsys state pointer, used to resolve the container
	Pid       uint64
}

// MemoryReclaimTracingData is the full data structure saved to storage.
type MemoryReclaimTracingData struct {
	Pid       uint64 `json:"pid"`
	Comm      string `json:"comm"`
	Deltatime uint64 `json:"deltatime"`
}
|
||||
|
||||
// init registers the memory-reclaim tracer under "memory_reclaim_events".
func init() {
	tracing.RegisterEventTracing("memory_reclaim_events", newMemoryReclaim)
}
|
||||
|
||||
// newMemoryReclaim builds the tracing attributes for the memory-reclaim tracer.
func newMemoryReclaim() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &memoryReclaimTracing{},
		Internal:    5,
		Flag:        tracing.FlagTracing,
	}, nil
}
|
||||
|
||||
//go:generate $BPF_COMPILE $BPF_INCLUDE -s $BPF_DIR/memory_reclaim_events.c -o $BPF_DIR/memory_reclaim_events.o
|
||||
|
||||
// Start detect work, load bpf and wait data form perfevent
|
||||
func (c *memoryReclaimTracing) Start(ctx context.Context) error {
|
||||
b, err := bpf.LoadBpf(bpfutil.ThisBpfOBJ(), map[string]any{
|
||||
"deltath": conf.Get().Tracing.MemoryReclaim.Deltath,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer b.Close()
|
||||
|
||||
childCtx, cancel := context.WithCancel(ctx)
|
||||
defer cancel()
|
||||
|
||||
reader, err := b.AttachAndEventPipe(childCtx, "reclaim_perf_events", 8192)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer reader.Close()
|
||||
|
||||
b.WaitDetachByBreaker(childCtx, cancel)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-childCtx.Done():
|
||||
return nil
|
||||
default:
|
||||
var data memoryReclaimPerfEvent
|
||||
if err := reader.ReadInto(&data); err != nil {
|
||||
return fmt.Errorf("ReadFromPerfEvent fail: %w", err)
|
||||
}
|
||||
|
||||
container, err := pod.GetContainerByCSS(data.CSS, "cpu")
|
||||
if err != nil {
|
||||
return fmt.Errorf("GetContainerByCSS by CSS %d: %w", data.CSS, err)
|
||||
}
|
||||
|
||||
// We only care about the container and nothing else.
|
||||
// Though it may be unfair, that's just how life is.
|
||||
//
|
||||
// -- Tonghao Zhang, tonghao@bamaicloud.com
|
||||
if container == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// save storage
|
||||
tracingData := &MemoryReclaimTracingData{
|
||||
Pid: data.Pid,
|
||||
Comm: strings.Trim(string(data.Comm[:]), "\x00"),
|
||||
Deltatime: data.Deltatime,
|
||||
}
|
||||
|
||||
log.Infof("memory_reclaim saves storage: %+v", tracingData)
|
||||
storage.Save("memory_reclaim", container.ID, time.Now(), tracingData)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,229 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package events
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"slices"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
|
||||
"github.com/vishvananda/netlink"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// linkStatusType enumerates the admin/carrier state transitions counted
// per network interface.
type linkStatusType uint8

const (
	linkStatusUnknown linkStatusType = iota
	linkStatusAdminUp
	linkStatusAdminDown
	linkStatusCarrierUp
	linkStatusCarrierDown
	maxLinkStatus // sentinel: number of valid status values
)
|
||||
|
||||
func (l linkStatusType) String() string {
|
||||
return [...]string{"linkstatus_unknown", "linkstatus_adminup", "linkstatus_admindown", "linkstatus_carrierup", "linkstatus_carrierdown"}[l]
|
||||
}
|
||||
|
||||
func flags2status(flags, change uint32) []linkStatusType {
|
||||
var status []linkStatusType
|
||||
|
||||
if change&unix.IFF_UP != 0 {
|
||||
if flags&unix.IFF_UP != 0 {
|
||||
status = append(status, linkStatusAdminUp)
|
||||
} else {
|
||||
status = append(status, linkStatusAdminDown)
|
||||
}
|
||||
}
|
||||
|
||||
if change&unix.IFF_LOWER_UP != 0 {
|
||||
if flags&unix.IFF_LOWER_UP != 0 {
|
||||
status = append(status, linkStatusCarrierUp)
|
||||
} else {
|
||||
status = append(status, linkStatusCarrierDown)
|
||||
}
|
||||
}
|
||||
|
||||
return status
|
||||
}
|
||||
|
||||
// netdevTracing watches netlink link updates for whitelisted interfaces
// and exports per-interface up/down transition counters.
type netdevTracing struct {
	name         string
	linkUpdateCh chan netlink.LinkUpdate // fed by netlink.LinkSubscribe
	linkDoneCh   chan struct{}           // closed to stop the subscription
	// mu guards metricsLinkStatusCountMap (written by Start, read by Update).
	mu                        sync.Mutex
	ifFlagsMap                map[string]uint32                 // [ifname]ifinfomsg::if_flags
	metricsLinkStatusCountMap map[linkStatusType]map[string]int // [netdevEventType][ifname]count
}

// netdevEventData is one link event; the exported fields form the JSON
// document persisted to storage.
type netdevEventData struct {
	linkFlags   uint32 // current ifinfomsg flags
	flagsChange uint32 // bits that differ from the previous event
	Ifname      string `json:"ifname"`
	Index       int    `json:"index"`
	LinkStatus  string `json:"linkstatus"`
	Mac         string `json:"mac"`
	AtStart     bool   `json:"start"` // true: be scanned at start, false: event trigger
}
|
||||
|
||||
// Register the tracer with the framework at package load time.
func init() {
	tracing.RegisterEventTracing("netdev_events", newNetdevTracing)
}
|
||||
|
||||
func newNetdevTracing() (*tracing.EventTracingAttr, error) {
|
||||
initMap := make(map[linkStatusType]map[string]int)
|
||||
for i := linkStatusUnknown; i < maxLinkStatus; i++ {
|
||||
initMap[i] = make(map[string]int)
|
||||
}
|
||||
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &netdevTracing{
|
||||
linkUpdateCh: make(chan netlink.LinkUpdate),
|
||||
linkDoneCh: make(chan struct{}),
|
||||
ifFlagsMap: make(map[string]uint32),
|
||||
metricsLinkStatusCountMap: initMap,
|
||||
name: "netdev_events",
|
||||
},
|
||||
Internal: 10,
|
||||
Flag: tracing.FlagTracing | tracing.FlagMetric,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (nt *netdevTracing) Start(ctx context.Context) (err error) {
|
||||
if err := nt.checkLinkStatus(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if err := netlink.LinkSubscribe(nt.linkUpdateCh, nt.linkDoneCh); err != nil {
|
||||
return err
|
||||
}
|
||||
defer nt.close()
|
||||
|
||||
for {
|
||||
update, ok := <-nt.linkUpdateCh
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
switch update.Header.Type {
|
||||
case unix.NLMSG_ERROR:
|
||||
return fmt.Errorf("NLMSG_ERROR")
|
||||
case unix.RTM_NEWLINK:
|
||||
ifname := update.Link.Attrs().Name
|
||||
if _, ok := nt.ifFlagsMap[ifname]; !ok {
|
||||
// new interface
|
||||
continue
|
||||
}
|
||||
nt.handleEvent(&update)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Update implement Collector
|
||||
func (nt *netdevTracing) Update() ([]*metric.Data, error) {
|
||||
nt.mu.Lock()
|
||||
defer nt.mu.Unlock()
|
||||
|
||||
var metrics []*metric.Data
|
||||
|
||||
for typ, value := range nt.metricsLinkStatusCountMap {
|
||||
for ifname, count := range value {
|
||||
metrics = append(metrics, metric.NewGaugeData(
|
||||
typ.String(), float64(count), typ.String(), map[string]string{"device": ifname}))
|
||||
}
|
||||
}
|
||||
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
// checkLinkStatus scans the current links at startup and seeds
// ifFlagsMap for every whitelisted interface. The synthetic events carry
// flagsChange == 0 and AtStart == true, so record neither counts nor
// persists them — the scan only establishes the baseline flags.
func (nt *netdevTracing) checkLinkStatus() error {
	links, err := netlink.LinkList()
	if err != nil {
		return err
	}

	for _, link := range links {
		ifname := link.Attrs().Name
		// only interfaces in the configured whitelist are tracked
		if !slices.Contains(conf.Get().Tracing.Netdev.Whitelist,
			ifname) {
			continue
		}

		flags := link.Attrs().RawFlags
		nt.ifFlagsMap[ifname] = flags

		data := &netdevEventData{
			linkFlags: flags,
			Ifname:    ifname,
			Index:     link.Attrs().Index,
			Mac:       link.Attrs().HardwareAddr.String(),
			AtStart:   true,
		}
		nt.record(data)
	}

	return nil
}
|
||||
|
||||
func (nt *netdevTracing) record(data *netdevEventData) {
|
||||
for _, status := range flags2status(data.linkFlags, data.flagsChange) {
|
||||
nt.mu.Lock()
|
||||
nt.metricsLinkStatusCountMap[status][data.Ifname]++
|
||||
nt.mu.Unlock()
|
||||
|
||||
if data.LinkStatus == "" {
|
||||
data.LinkStatus = status.String()
|
||||
} else {
|
||||
data.LinkStatus = data.LinkStatus + ", " + status.String()
|
||||
}
|
||||
}
|
||||
|
||||
if !data.AtStart && data.LinkStatus != "" {
|
||||
log.Infof("%s %+v", data.LinkStatus, data)
|
||||
storage.Save(nt.name, "", time.Now(), data)
|
||||
}
|
||||
}
|
||||
|
||||
// handleEvent computes which flag bits changed since the last event for
// this interface (XOR against the cached flags), refreshes the cache and
// records the resulting transitions.
// Only called from the Start loop, so ifFlagsMap needs no locking here.
func (nt *netdevTracing) handleEvent(ev *netlink.LinkUpdate) {
	ifname := ev.Link.Attrs().Name

	currFlags := ev.Attrs().RawFlags
	lastFlags := nt.ifFlagsMap[ifname]
	change := currFlags ^ lastFlags
	nt.ifFlagsMap[ifname] = currFlags

	data := &netdevEventData{
		linkFlags:   currFlags,
		flagsChange: change,
		Ifname:      ifname,
		Index:       ev.Link.Attrs().Index,
		Mac:         ev.Link.Attrs().HardwareAddr.String(),
		AtStart:     false,
	}
	nt.record(data)
}
|
||||
|
||||
// close tears down the netlink subscription.
func (nt *netdevTracing) close() {
	// Closing linkDoneCh signals the netlink subscription to stop.
	close(nt.linkDoneCh)
	// NOTE(review): linkUpdateCh is closed here on the receiver side; if
	// the netlink library goroutine can still send after linkDoneCh is
	// closed, this would panic — TODO confirm the subscription is fully
	// stopped before this close.
	close(nt.linkUpdateCh)
}
|
|
@ -0,0 +1,292 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package events
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/bpf"
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/internal/utils/bpfutil"
|
||||
"huatuo-bamai/internal/utils/netutil"
|
||||
"huatuo-bamai/internal/utils/procfsutil"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
//go:generate $BPF_COMPILE $BPF_INCLUDE -s $BPF_DIR/netrecvlat.c -o $BPF_DIR/netrecvlat.o

// netRecvLatTracing traces abnormal latencies in the kernel packet
// receive path.
type netRecvLatTracing struct{}

// NetTracingData is the full data structure.
// It is the JSON document persisted to storage for one slow packet.
type NetTracingData struct {
	Comm    string `json:"comm"`
	Pid     uint64 `json:"pid"`
	Where   string `json:"where"` // receive stage that crossed the threshold (see toWhere)
	Latency uint64 `json:"latency_ms"`
	State   string `json:"state"` // TCP state name (see tcpStateMap)
	Saddr   string `json:"saddr"`
	Daddr   string `json:"daddr"`
	Sport   uint16 `json:"sport"`
	Dport   uint16 `json:"dport"`
	Seq     uint32 `json:"seq"`
	AckSeq  uint32 `json:"ack_seq"`
	PktLen  uint64 `json:"pkt_len"`
}
|
||||
|
||||
// from bpf perf
// netRcvPerfEvent mirrors the C event struct emitted by netrecvlat.c;
// field order and widths must match the bpf side exactly.
type netRcvPerfEvent struct {
	Comm    [bpfutil.TaskCommLen]byte
	Latency uint64
	TgidPid uint64 // upper 32 bits tgid, lower 32 pid; 0 when not in process context
	PktLen  uint64
	Sport   uint16
	Dport   uint16
	Saddr   uint32
	Daddr   uint32
	Seq     uint32
	AckSeq  uint32
	State   uint8 // index into tcpStateMap
	Where   uint8 // index into toWhere
}
|
||||
|
||||
// from include/net/tcp_states.h
// tcpStateMap maps the kernel TCP state number to its name; index 0 is
// unused by the kernel and rendered as "<nil>".
var tcpStateMap = []string{
	"<nil>", // 0
	"ESTABLISHED",
	"SYN_SENT",
	"SYN_RECV",
	"FIN_WAIT1",
	"FIN_WAIT2",
	"TIME_WAIT",
	"CLOSE",
	"CLOSE_WAIT",
	"LAST_ACK",
	"LISTEN",
	"CLOSING",
	"NEW_SYN_RECV",
}

// userCopyCase is the toWhere index of the copy-to-userspace stage — the
// only stage where a process (and thus a container) context exists.
const userCopyCase = 2

// toWhere names the receive-path stages reported by the bpf program.
var toWhere = []string{
	"TO_NETIF_RCV",
	"TO_TCPV4_RCV",
	"TO_USER_COPY",
}
|
||||
|
||||
// Register the tracer with the framework at package load time.
func init() {
	tracing.RegisterEventTracing("netrecvlat", newNetRcvLat)
}

// newNetRcvLat builds the receive-latency tracer (tracing only, no
// metrics), scraped every 10s by the framework.
func newNetRcvLat() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &netRecvLatTracing{},
		Internal:    10,
		Flag:        tracing.FlagTracing,
	}, nil
}
|
||||
|
||||
func (c *netRecvLatTracing) Start(ctx context.Context) error {
|
||||
toNetIf := conf.Get().Tracing.NetRecvLat.ToNetIf // ms, before RPS to a core recv(__netif_receive_skb)
|
||||
toTCPV4 := conf.Get().Tracing.NetRecvLat.ToTCPV4 // ms, before RPS to TCP recv(tcp_v4_rcv)
|
||||
toUserCopy := conf.Get().Tracing.NetRecvLat.ToUserCopy // ms, before RPS to user recv(skb_copy_datagram_iovec)
|
||||
|
||||
if toNetIf == 0 || toTCPV4 == 0 || toUserCopy == 0 {
|
||||
return fmt.Errorf("netrecvlat threshold [%v %v %v]ms invalid", toNetIf, toTCPV4, toUserCopy)
|
||||
}
|
||||
log.Infof("netrecvlat start, latency threshold [%v %v %v]ms", toNetIf, toTCPV4, toUserCopy)
|
||||
|
||||
monoWallOffset, err := estMonoWallOffset()
|
||||
if err != nil {
|
||||
return fmt.Errorf("estimate monoWallOffset failed: %w", err)
|
||||
}
|
||||
|
||||
log.Infof("netrecvlat offset of mono to walltime: %v ns", monoWallOffset)
|
||||
|
||||
args := map[string]any{
|
||||
"mono_wall_offset": monoWallOffset,
|
||||
"to_netif": toNetIf * 1000 * 1000,
|
||||
"to_tcpv4": toTCPV4 * 1000 * 1000,
|
||||
"to_user_copy": toUserCopy * 1000 * 1000,
|
||||
}
|
||||
b, err := bpf.LoadBpf(bpfutil.ThisBpfOBJ(), args)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer b.Close()
|
||||
|
||||
childCtx, cancel := context.WithCancel(ctx)
|
||||
defer cancel()
|
||||
|
||||
reader, err := b.AttachAndEventPipe(childCtx, "net_recv_lat_event_map", 8192)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer reader.Close()
|
||||
|
||||
b.WaitDetachByBreaker(childCtx, cancel)
|
||||
|
||||
// save host netns
|
||||
hostNetNsInode, err := procfsutil.NetNSInodeByPid(1)
|
||||
if err != nil {
|
||||
return fmt.Errorf("get host netns inode: %w", err)
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-childCtx.Done():
|
||||
return nil
|
||||
default:
|
||||
var pd netRcvPerfEvent
|
||||
if err := reader.ReadInto(&pd); err != nil {
|
||||
return fmt.Errorf("read rrom perf event fail: %w", err)
|
||||
}
|
||||
tracerTime := time.Now()
|
||||
|
||||
comm := "<nil>" // not in process context
|
||||
var pid uint64
|
||||
var containerID string
|
||||
if pd.TgidPid != 0 {
|
||||
comm = strings.TrimRight(string(pd.Comm[:]), "\x00")
|
||||
pid = pd.TgidPid >> 32
|
||||
|
||||
// check if its netns same as host netns
|
||||
if pd.Where == userCopyCase {
|
||||
cid, skip, err := ignore(pid, comm, hostNetNsInode)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if skip {
|
||||
continue
|
||||
}
|
||||
containerID = cid
|
||||
}
|
||||
}
|
||||
|
||||
where := toWhere[pd.Where]
|
||||
lat := pd.Latency / 1000 / 1000 // ms
|
||||
state := tcpStateMap[pd.State]
|
||||
saddr, daddr := netutil.InetNtop(pd.Saddr).String(), netutil.InetNtop(pd.Daddr).String()
|
||||
sport, dport := netutil.InetNtohs(pd.Sport), netutil.InetNtohs(pd.Dport)
|
||||
seq, ackSeq := netutil.InetNtohl(pd.Seq), netutil.InetNtohl(pd.AckSeq)
|
||||
pktLen := pd.PktLen
|
||||
|
||||
title := fmt.Sprintf("comm=%s:%d to=%s lat(ms)=%v state=%s saddr=%s sport=%d daddr=%s dport=%d seq=%d ackSeq=%d pktLen=%d",
|
||||
comm, pid, where, lat, state, saddr, sport, daddr, dport, seq, ackSeq, pktLen)
|
||||
|
||||
// tcp state filter
|
||||
if (state != "ESTABLISHED") && (state != "<nil>") {
|
||||
continue
|
||||
}
|
||||
|
||||
// known issue filter
|
||||
caseName, _ := conf.KnownIssueSearch(title, "", "")
|
||||
if caseName == "netrecvlat" {
|
||||
log.Debugf("netrecvlat known issue")
|
||||
continue
|
||||
}
|
||||
|
||||
tracerData := &NetTracingData{
|
||||
Comm: comm,
|
||||
Pid: pid,
|
||||
Where: where,
|
||||
Latency: lat,
|
||||
State: state,
|
||||
Saddr: saddr,
|
||||
Daddr: daddr,
|
||||
Sport: sport,
|
||||
Dport: dport,
|
||||
Seq: seq,
|
||||
AckSeq: ackSeq,
|
||||
PktLen: pktLen,
|
||||
}
|
||||
log.Debugf("netrecvlat tracerData: %+v", tracerData)
|
||||
|
||||
// save storage
|
||||
storage.Save("netrecvlat", containerID, tracerTime, tracerData)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ignore decides whether a TO_USER_COPY event for pid should be dropped.
// It returns the container ID the pid belongs to (empty when none), skip
// to drop the event, and a non-nil error only for unexpected failures.
func ignore(pid uint64, comm string, hostNetnsInode uint64) (containerID string, skip bool, err error) {
	// check if its netns same as host netns
	dstInode, err := procfsutil.NetNSInodeByPid(int(pid))
	if err != nil {
		// ignore the missing program
		// (the process may already have exited by the time we look)
		if errors.Is(err, syscall.ENOENT) {
			return "", true, nil
		}
		return "", skip, fmt.Errorf("get netns inode of pid %v failed: %w", pid, err)
	}
	if conf.Get().Tracing.NetRecvLat.IgnoreHost && dstInode == hostNetnsInode {
		log.Debugf("ignore %s:%v the same netns as host", comm, pid)
		return "", true, nil
	}

	// check container level
	// Best effort: a container lookup failure only logs; the event still flows.
	var container *pod.Container
	if container, err = pod.GetContainerByNetNamespaceInode(dstInode); err != nil {
		log.Warnf("get container info by netns inode %v pid %v, failed: %v", dstInode, pid, err)
	}
	if container != nil {
		// Drop events from containers whose QoS level is configured as ignored.
		for _, level := range conf.Get().Tracing.NetRecvLat.IgnoreContainerLevel {
			if container.Qos.Int() == level {
				log.Debugf("ignore container %+v", container)
				skip = true
				break
			}
		}
		containerID = container.ID
	}

	return containerID, skip, nil
}
|
||||
|
||||
// estimate the offset between clock monotonic and real time
|
||||
// bpf_ktime_get_ns() access to clock monotonic, but skb->tstamp = ktime_get_real() at netif_receive_skb_internal
|
||||
// ref: https://github.com/torvalds/linux/blob/v4.18/net/core/dev.c#L4736
|
||||
// t3 - t2 + (t3 - t1) / 2 => (t3 + t1) / 2 - t2
|
||||
func estMonoWallOffset() (int64, error) {
|
||||
var t1, t2, t3 unix.Timespec
|
||||
var bestDelta int64
|
||||
var offset int64
|
||||
|
||||
for i := 0; i < 10; i++ {
|
||||
err1 := unix.ClockGettime(unix.CLOCK_REALTIME, &t1)
|
||||
err2 := unix.ClockGettime(unix.CLOCK_MONOTONIC, &t2)
|
||||
err3 := unix.ClockGettime(unix.CLOCK_REALTIME, &t3)
|
||||
if err1 != nil || err2 != nil || err3 != nil {
|
||||
return 0, fmt.Errorf("%w, %w, %w", err1, err2, err3)
|
||||
}
|
||||
|
||||
delta := unix.TimespecToNsec(t3) - unix.TimespecToNsec(t1)
|
||||
if i == 0 || delta < bestDelta {
|
||||
bestDelta = delta
|
||||
offset = (unix.TimespecToNsec(t3)+unix.TimespecToNsec(t1))/2 - unix.TimespecToNsec(t2)
|
||||
}
|
||||
}
|
||||
|
||||
return offset, nil
|
||||
}
|
|
@ -0,0 +1,188 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package events
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/bpf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/internal/utils/bpfutil"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
//go:generate $BPF_COMPILE $BPF_INCLUDE -s $BPF_DIR/oom.c -o $BPF_DIR/oom.o

// perfEventData mirrors the C event struct emitted by oom.c; field order
// and widths must match the bpf side exactly. "Trigger" is the task that
// entered the OOM path, "Victim" the task the kernel chose to kill.
type perfEventData struct {
	TriggerProcessName [16]byte
	VictimProcessName  [16]byte
	TriggerPid         int32
	VictimPid          int32
	TriggerMemcgCSS    uint64
	VictimMemcgCSS     uint64
}

// OOMTracingData is the JSON document persisted to storage for one OOM
// kill event.
type OOMTracingData struct {
	TriggerMemcgCSS          string `json:"trigger_memcg_css"`
	TriggerContainerID       string `json:"trigger_container_id"`
	TriggerContainerHostname string `json:"trigger_container_hostname"`
	TriggerPid               int32  `json:"trigger_pid"`
	TriggerProcessName       string `json:"trigger_process_name"`

	VictimMemcgCSS          string `json:"victim_memcg_css"`
	VictimContainerID       string `json:"victim_container_id"`
	VictimContainerHostname string `json:"victim_container_hostname"`
	VictimPid               int32  `json:"victim_pid"`
	VictimProcessName       string `json:"victim_process_name"`
}

// oomMetric accumulates per-container kill counts between two metric
// scrapes.
type oomMetric struct {
	count             int
	victimProcessName string // comma-joined names of killed processes
}

// oomCollector implements both the tracing and the metric interfaces.
type oomCollector struct{}
||||
|
||||
// Register the tracer with the framework at package load time.
func init() {
	tracing.RegisterEventTracing("oom", newOOMCollector)
}

// newOOMCollector builds the OOM tracer/metric collector, scraped every 10s.
func newOOMCollector() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &oomCollector{},
		Internal:    10,
		Flag:        tracing.FlagTracing | tracing.FlagMetric,
	}, nil
}

var (
	// hostOOMCounter counts OOM kills that hit no container cgroup.
	hostOOMCounter float64
	// containerOOMCounter maps container ID -> kills since the last scrape.
	containerOOMCounter = make(map[string]oomMetric)
	// mutex guards the two counters above (written by Start, read and
	// reset by Update).
	mutex sync.Mutex
)
|
||||
|
||||
// Update implements the metric Collector: it emits the host OOM counter
// plus one gauge per container seen since the last scrape, then resets
// the per-container map — the container values are deltas per scrape.
func (c *oomCollector) Update() ([]*metric.Data, error) {
	containers, err := pod.GetNormalContainers()
	if err != nil {
		return nil, fmt.Errorf("get normal container: %w", err)
	}
	metrics := []*metric.Data{}
	mutex.Lock()
	metrics = append(metrics, metric.NewGaugeData("host_counter", hostOOMCounter, "host oom counter", nil))
	for _, container := range containers {
		// only containers that actually had kills contribute a gauge
		if val, exists := containerOOMCounter[container.ID]; exists {
			metrics = append(metrics,
				metric.NewContainerGaugeData(container, "counter", float64(val.count), "containers oom counter", map[string]string{"process": val.victimProcessName}),
			)
		}
	}

	// reset: per-container counts are reported as per-scrape deltas
	containerOOMCounter = make(map[string]oomMetric)
	mutex.Unlock()
	return metrics, nil
}
|
||||
|
||||
// Start loads the oom bpf object and consumes OOM-kill perf events until
// the context is cancelled, resolving trigger/victim memcg css pointers
// to containers, bumping the counters read by Update, and persisting the
// event to storage.
func (c *oomCollector) Start(ctx context.Context) error {
	b, err := bpf.LoadBpf(bpfutil.ThisBpfOBJ(), nil)
	if err != nil {
		return err
	}
	defer b.Close()

	childCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	reader, err := b.AttachAndEventPipe(childCtx, "oom_perf_events", 8192)
	if err != nil {
		return err
	}
	defer reader.Close()

	// cancel childCtx when the tracing breaker asks us to detach
	b.WaitDetachByBreaker(childCtx, cancel)

	for {
		select {
		case <-childCtx.Done():
			return nil
		default:
			var data perfEventData
			if err := reader.ReadInto(&data); err != nil {
				return fmt.Errorf("ReadFromPerfEvent fail: %w", err)
			}
			// Refresh the css -> container-ID mapping per event; a lookup
			// failure only skips this event.
			cssToCtMap, err := pod.GetCSSToContainerID("memory")
			if err != nil {
				log.Errorf("failed to GetCSSToContainerID, err: %v", err)
				continue
			}
			cts, err := pod.GetAllContainers()
			if err != nil {
				log.Errorf("Can't get GetAllContainers, err: %v", err)
				return err
			}
			caseData := &OOMTracingData{
				TriggerMemcgCSS:    fmt.Sprintf("0x%x", data.TriggerMemcgCSS),
				TriggerPid:         data.TriggerPid,
				TriggerProcessName: strings.TrimRight(string(data.TriggerProcessName[:]), "\x00"),
				TriggerContainerID: cssToCtMap[data.TriggerMemcgCSS],
				VictimMemcgCSS:     fmt.Sprintf("0x%x", data.VictimMemcgCSS),
				VictimPid:          data.VictimPid,
				VictimProcessName:  strings.TrimRight(string(data.VictimProcessName[:]), "\x00"),
				VictimContainerID:  cssToCtMap[data.VictimMemcgCSS],
			}

			// NOTE(review): the cts[...] lookups below assume the ID from
			// cssToCtMap is always present in cts — confirm GetAllContainers
			// is a superset, otherwise a missing key could misbehave here.
			if caseData.TriggerContainerID == "" {
				caseData.TriggerContainerID = "None"
				caseData.TriggerContainerHostname = "Non-Container Cgroup"
			} else {
				caseData.TriggerContainerHostname = cts[caseData.TriggerContainerID].Hostname
				if caseData.TriggerContainerHostname == "" {
					caseData.TriggerContainerHostname = "unknown"
				}
			}
			mutex.Lock()
			if caseData.VictimContainerID == "" {
				// no victim container: count against the host
				hostOOMCounter++
				caseData.VictimContainerID = "None"
				caseData.VictimContainerHostname = "Non-Container Cgroup"
			} else {
				// accumulate per-container kill count and victim names
				if val, exists := containerOOMCounter[cts[caseData.VictimContainerID].ID]; exists {
					val.count++
					val.victimProcessName = val.victimProcessName + "," + caseData.VictimProcessName
					containerOOMCounter[cts[caseData.VictimContainerID].ID] = val
				} else {
					containerOOMCounter[cts[caseData.VictimContainerID].ID] = oomMetric{
						count:             1,
						victimProcessName: caseData.VictimProcessName,
					}
				}
				caseData.VictimContainerHostname = cts[caseData.VictimContainerID].Hostname
				if caseData.VictimContainerHostname == "" {
					caseData.VictimContainerHostname = "unknown"
				}
			}
			mutex.Unlock()

			storage.Save("oom", "", time.Now(), caseData)
		}
	}
}
|
|
@ -0,0 +1,181 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package events
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/bpf"
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/internal/utils/bpfutil"
|
||||
"huatuo-bamai/internal/utils/symbolutil"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
//go:generate $BPF_COMPILE $BPF_INCLUDE -s $BPF_DIR/softirq_tracing.c -o $BPF_DIR/softirq_tracing.o

// softirqTracing traces long softirq stalls.
type softirqTracing struct{}

// softirqPerfEvent mirrors the C event struct emitted by
// softirq_tracing.c; field order and widths must match the bpf side.
type softirqPerfEvent struct {
	Stack     [symbolutil.KsymbolStackMaxDepth]uint64 // raw kernel stack addresses
	StackSize int64                                   // number of valid stack bytes; <= 0 means no stack
	Now       uint64
	StallTime uint64
	Comm      [bpfutil.TaskCommLen]byte
	Pid       uint32
	CPU       uint32
}

// SoftirqTracingData is the full data structure.
// It is the JSON document persisted to storage for one stall.
type SoftirqTracingData struct {
	OffTime   uint64 `json:"offtime"`
	Threshold uint64 `json:"threshold"`
	Comm      string `json:"comm"`
	Pid       uint32 `json:"pid"`
	CPU       uint32 `json:"cpu"`
	Now       uint64 `json:"now"`
	Stack     string `json:"stack"`
}
|
||||
|
||||
// Register the tracer with the framework at package load time.
func init() {
	tracing.RegisterEventTracing("softirq_tracing", newSoftirq)
}

// newSoftirq builds the softirq stall tracer (tracing only, no metrics).
func newSoftirq() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &softirqTracing{},
		Internal:    10,
		Flag:        tracing.FlagTracing,
	}, nil
}
|
||||
|
||||
func (c *softirqTracing) Start(ctx context.Context) error {
|
||||
softirqThresh := conf.Get().Tracing.Softirq.ThresholdTime
|
||||
|
||||
b, err := bpf.LoadBpf(bpfutil.ThisBpfOBJ(), map[string]any{"softirq_thresh": softirqThresh})
|
||||
if err != nil {
|
||||
return fmt.Errorf("load bpf: %w", err)
|
||||
}
|
||||
defer b.Close()
|
||||
|
||||
childCtx, cancel := context.WithCancel(ctx)
|
||||
defer cancel()
|
||||
|
||||
reader, err := attachIrqAndEventPipe(childCtx, b)
|
||||
if err != nil {
|
||||
return fmt.Errorf("attach irq and event pipe: %w", err)
|
||||
}
|
||||
defer reader.Close()
|
||||
|
||||
b.WaitDetachByBreaker(childCtx, cancel)
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-childCtx.Done():
|
||||
return nil
|
||||
default:
|
||||
var data softirqPerfEvent
|
||||
|
||||
if err := reader.ReadInto(&data); err != nil {
|
||||
return fmt.Errorf("Read From Perf Event fail: %w", err)
|
||||
}
|
||||
comm := fmt.Sprintf("%s", data.Comm)
|
||||
index := strings.Index(comm, "ksoftirqd")
|
||||
|
||||
if index == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
// stop recording the noise from swapper
|
||||
index = strings.Index(comm, "swapper")
|
||||
|
||||
if index == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
var stack string
|
||||
|
||||
if data.StackSize > 0 {
|
||||
stack = softirqDumpTrace(data.Stack[:])
|
||||
}
|
||||
|
||||
storage.Save("softirq_tracing", "", time.Now(), &SoftirqTracingData{
|
||||
OffTime: data.StallTime,
|
||||
Threshold: softirqThresh,
|
||||
Comm: strings.TrimRight(comm, "\x00"),
|
||||
Pid: data.Pid,
|
||||
CPU: data.CPU,
|
||||
Now: data.Now,
|
||||
Stack: fmt.Sprintf("stack:\n%s", stack),
|
||||
})
|
||||
}
|
||||
} // forever
|
||||
}
|
||||
|
||||
// softirqDumpTrace resolves the raw kernel stack addresses to symbolized
// frames (with offset and module info) and joins them one per line.
func softirqDumpTrace(addrs []uint64) string {
	stacks := symbolutil.DumpKernelBackTrace(addrs, symbolutil.KsymbolStackMaxDepth)
	return strings.Join(stacks.BackTrace, "\n")
}
|
||||
|
||||
// attachIrqAndEventPipe opens the irqoff perf event pipe first, then
// attaches the probes in a strictly required order (see NOTE below).
// On attach failure the already-opened reader is closed before returning.
func attachIrqAndEventPipe(ctx context.Context, b bpf.BPF) (bpf.PerfEventReader, error) {
	var err error

	reader, err := b.EventPipeByName(ctx, "irqoff_event_map", 8192)
	if err != nil {
		return nil, err
	}

	// close the reader if the attach below fails
	defer func() {
		if err != nil {
			reader.Close()
		}
	}()

	/*
	 * NOTE: There might be more than 100ms gap between the attachment of hooks,
	 * so the order of attaching the kprobe and tracepoint is important for us.
	 * probe_scheduler_tick should not be attached before probe_tick_stop and not be
	 * attached later than probe_tick_nohz_restart_sched_tick. So only
	 * probe_tick_stop -> probe_scheduler_tick -> probe_tick_nohz_restart_sched_tick
	 * works for the scenario.
	 *
	 * But we can't control the order of detachment, as it is executed in a random
	 * sequence in HuaTuo. Therefore, when we exit due to some special reasons, a
	 * small number of false alarm might be hit.
	 */
	if err := b.AttachWithOptions([]bpf.AttachOption{
		{
			ProgramName: "probe_account_process_tick",
			Symbol:      "account_process_tick",
		},
		{
			ProgramName: "probe_tick_nohz_restart_sched_tick",
			Symbol:      "tick_nohz_restart_sched_tick",
		},
		{
			ProgramName: "probe_tick_stop",
			Symbol:      "timer/tick_stop",
		},
	}); err != nil {
		return nil, err
	}

	return reader, nil
}
|
|
@ -0,0 +1,117 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package events
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/bpf"
|
||||
"huatuo-bamai/internal/storage"
|
||||
"huatuo-bamai/internal/utils/bpfutil"
|
||||
"huatuo-bamai/internal/utils/kmsgutil"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
//go:generate $BPF_COMPILE $BPF_INCLUDE -s $BPF_DIR/softlockup.c -o $BPF_DIR/softlockup.o

// softLockupPerfEventData mirrors the C event struct emitted by
// softlockup.c; field order and widths must match the bpf side.
type softLockupPerfEventData struct {
	CPU  int32
	Pid  int32
	Comm [16]byte
}

// SoftLockupTracerData is the full data structure.
// It is the JSON document persisted to storage for one softlockup.
type SoftLockupTracerData struct {
	CPU       int32  `json:"cpu"`
	Pid       int32  `json:"pid"`
	Comm      string `json:"comm"`
	CPUsStack string `json:"cpus_stack"`
}

// softLockupTracing traces kernel softlockups and exports a counter metric.
type softLockupTracing struct {
	// softlockupMetric holds the single reusable gauge slot filled on Update.
	softlockupMetric []*metric.Data
}
|
||||
|
||||
// Register the tracer with the framework at package load time.
func init() {
	tracing.RegisterEventTracing("softlockup", newSoftLockup)
}

// newSoftLockup builds the softlockup tracer/metric collector, scraped
// every 10s.
func newSoftLockup() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &softLockupTracing{
			softlockupMetric: []*metric.Data{
				metric.NewGaugeData("counter", 0, "softlockup counter", nil),
			},
		},
		Internal: 10,
		Flag:     tracing.FlagTracing | tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
// softlockupCounter counts softlockup events since process start
// (monotonically increasing; never reset).
// NOTE(review): it is incremented from the Start goroutine and read in
// Update without synchronization — looks like a data race; confirm
// whether an atomic or mutex is warranted.
var softlockupCounter float64

// Update implements the metric Collector: it publishes the current
// softlockup count into the reusable gauge slot.
func (c *softLockupTracing) Update() ([]*metric.Data, error) {
	c.softlockupMetric[0].Value = softlockupCounter
	return c.softlockupMetric, nil
}
|
||||
|
||||
// Start loads the softlockup bpf object and consumes lockup events until
// the context is cancelled. For every event it captures all-CPU kernel
// backtraces from the kernel log, bumps the counter read by Update, and
// persists the event.
func (c *softLockupTracing) Start(ctx context.Context) error {
	b, err := bpf.LoadBpf(bpfutil.ThisBpfOBJ(), nil)
	if err != nil {
		return err
	}
	defer b.Close()

	childCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	reader, err := b.AttachAndEventPipe(childCtx, "softlockup_perf_events", 8192)
	if err != nil {
		return err
	}
	defer reader.Close()

	// cancel childCtx when the tracing breaker asks us to detach
	b.WaitDetachByBreaker(childCtx, cancel)

	for {
		select {
		case <-childCtx.Done():
			return nil
		default:
			var data softLockupPerfEventData
			if err := reader.ReadInto(&data); err != nil {
				return fmt.Errorf("ReadFromPerfEvent fail: %w", err)
			}

			// Best effort: if the backtrace capture fails, store the
			// error text in place of the stacks rather than dropping
			// the event.
			bt, err := kmsgutil.GetAllCPUsBT()
			if err != nil {
				bt = err.Error()
			}

			softlockupCounter++

			storage.Save("softlockup", "", time.Now(), &SoftLockupTracerData{
				CPU:       data.CPU,
				Pid:       data.Pid,
				Comm:      strings.TrimRight(string(data.Comm[:]), "\x00"),
				CPUsStack: bt,
			})
		}
	}
}
|
|
@ -0,0 +1,121 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
// arpCachePath is the procfs file with kernel neighbour-table statistics;
// kept as a package-level var so it could be pointed elsewhere in tests.
var arpCachePath = "/proc/net/stat/arp_cache"

// arpCollector exports ARP/neighbour-cache gauges for the host and for
// each normal container.
type arpCollector struct {
	// metric[0] = host "entries" (lines in /proc/1/net/arp minus header),
	// metric[1] = "total" entries from /proc/net/stat/arp_cache.
	metric []*metric.Data
}

func init() {
	tracing.RegisterEventTracing("arp", newArp)
}

// newArp builds the collector with its two pre-allocated host gauges.
func newArp() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &arpCollector{
			metric: []*metric.Data{
				metric.NewGaugeData("entries", 0, "host init namespace", nil),
				metric.NewGaugeData("total", 0, "arp_cache entries", nil),
			},
		},
		Flag: tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
// NetStat contains statistics for all the counters from one file.
|
||||
// should be exported for /proc/net/stat/ndisc_cache
|
||||
type NetStat struct {
|
||||
Stats map[string]uint64
|
||||
Filename string
|
||||
}
|
||||
|
||||
func parseNetstatCache(filePath string) (NetStat, error) {
|
||||
netStat := NetStat{
|
||||
Stats: make(map[string]uint64),
|
||||
}
|
||||
|
||||
file, err := os.Open(filePath)
|
||||
if err != nil {
|
||||
return netStat, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
scanner.Scan()
|
||||
|
||||
// First string is always a header for stats
|
||||
var headers []string
|
||||
headers = append(headers, strings.Fields(scanner.Text())...)
|
||||
|
||||
// Fast path ...
|
||||
scanner.Scan()
|
||||
for num, counter := range strings.Fields(scanner.Text()) {
|
||||
value, err := strconv.ParseUint(counter, 16, 64)
|
||||
if err != nil {
|
||||
return NetStat{}, err
|
||||
}
|
||||
netStat.Stats[headers[num]] = value
|
||||
}
|
||||
|
||||
return netStat, nil
|
||||
}
|
||||
|
||||
// Update gathers ARP table sizes: one gauge per normal container (read
// from the container's network namespace via /proc/<initpid>/net/arp),
// then the two host gauges (host table size and arp_cache total entries).
func (c *arpCollector) Update() ([]*metric.Data, error) {
	arpMetric := []*metric.Data{}

	containers, err := pod.GetNormalContainers()
	if err != nil {
		return nil, fmt.Errorf("GetNormalContainers: %w", err)
	}

	for _, container := range containers {
		// Count the container's ARP entries through its init PID.
		count, err := fileLineCounter(fmt.Sprintf("/proc/%d/net/arp", container.InitPid))
		if err != nil {
			return nil, err
		}

		// count-1 drops the header line of /proc/<pid>/net/arp.
		arpMetric = append(arpMetric, metric.NewContainerGaugeData(container, "entries", float64(count-1), "arp for container and host", nil))
	}

	// Host view: PID 1's network namespace.
	count, err := fileLineCounter("/proc/1/net/arp")
	if err != nil {
		return nil, err
	}

	stat, err := parseNetstatCache(arpCachePath)
	if err != nil {
		return nil, err
	}

	c.metric[0].Value = float64(count - 1) // minus the header line
	c.metric[1].Value = float64(stat.Stats["entries"])

	arpMetric = append(arpMetric, c.metric...)
	return arpMetric, nil
}
|
|
@ -0,0 +1,70 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
|
||||
"github.com/prometheus/procfs"
|
||||
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
// buddyInfoCollector exposes /proc/buddyinfo (free memory blocks per
// node/zone/order) as gauges.
type buddyInfoCollector struct {
	fs procfs.FS // handle to the default /proc mount
}

func init() {
	tracing.RegisterEventTracing("buddyinfo", newBuddyInfo)
}

// newBuddyInfo opens the default procfs mount and wraps the collector in
// metric-only tracing attributes (no background tracing goroutine).
func newBuddyInfo() (*tracing.EventTracingAttr, error) {
	fs, err := procfs.NewDefaultFS()
	if err != nil {
		return nil, fmt.Errorf("open procfs: %w", err)
	}

	return &tracing.EventTracingAttr{
		TracingData: &buddyInfoCollector{fs: fs},
		Flag:        tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
func (c *buddyInfoCollector) Update() ([]*metric.Data, error) {
|
||||
buddyInfo, err := c.fs.BuddyInfo()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var (
|
||||
buddyLabel = make(map[string]string)
|
||||
metrics = []*metric.Data{}
|
||||
)
|
||||
|
||||
for _, entry := range buddyInfo {
|
||||
for size, value := range entry.Sizes {
|
||||
buddyLabel["node"] = entry.Node
|
||||
buddyLabel["zone"] = entry.Zone
|
||||
buddyLabel["size"] = strconv.Itoa(size)
|
||||
|
||||
metrics = append(metrics,
|
||||
metric.NewGaugeData("blocks", value, "buddy info", buddyLabel))
|
||||
}
|
||||
}
|
||||
|
||||
return metrics, nil
|
||||
}
|
|
@ -0,0 +1,169 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/cgroups"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
// cpuStat is the per-container snapshot of cgroup cpu.stat counters plus
// the wait-rate percentages derived from the delta against the previous
// snapshot. One instance is attached to each container through
// pod.RegisterContainerLifeResources.
type cpuStat struct {
	nrThrottled   uint64 // cpu.stat "nr_throttled"
	throttledTime uint64 // cpu.stat "throttled_time"
	nrBursts      uint64 // cpu.stat "nr_bursts"
	burstTime     uint64 // cpu.stat "burst_time"

	// calculated values
	hierarchyWaitSum uint64 // cpu.stat "hierarchy_wait_sum"
	innerWaitSum     uint64 // cpu.stat "inner_wait_sum"
	cpuTotal         uint64 // cumulative usage (CpuUsage.Usage * 1000; unit TODO confirm)

	// wait-rate percentages (0-100) over the last refresh interval
	waitrateHierarchy float64
	waitrateInner     float64
	waitrateExter     float64
	waitrateThrottled float64

	lastUpdate time.Time // when this snapshot was taken
}

// cpuStatCollector computes throttle/wait-rate metrics per container.
type cpuStatCollector struct {
	cgroup cgroups.Cgroup
	mutex  sync.Mutex // serializes cpuMetricUpdate across callers
}

func init() {
	tracing.RegisterEventTracing("cpu_stat", newCPUStat)
	// Attach a cpuStat snapshot to every container's lifecycle resources.
	_ = pod.RegisterContainerLifeResources("collector_cpu_stat", reflect.TypeOf(&cpuStat{}))
}

// newCPUStat wires the collector to the cgroup manager.
func newCPUStat() (*tracing.EventTracingAttr, error) {
	cgroup, err := cgroups.NewCgroupManager()
	if err != nil {
		return nil, err
	}

	return &tracing.EventTracingAttr{
		TracingData: &cpuStatCollector{
			cgroup: cgroup,
		},
		Flag: tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
// cpuMetricUpdate refreshes *cpu with a new cpu.stat snapshot for the
// given container and recomputes the wait-rate percentages from the delta
// against the previous snapshot. Refreshes are rate-limited to at most
// once per second per container; within that window it is a no-op.
func (c *cpuStatCollector) cpuMetricUpdate(cpu *cpuStat, container *pod.Container) error {
	var (
		deltaThrottledSum     uint64
		deltaHierarchyWaitSum uint64
		deltaInnerWaitSum     uint64
		deltaExterWaitSum     uint64
	)

	c.mutex.Lock()
	defer c.mutex.Unlock()

	// Skip if the last refresh was less than one second ago.
	now := time.Now()
	if now.Sub(cpu.lastUpdate).Nanoseconds() < 1000000000 {
		return nil
	}

	raw, err := c.cgroup.CpuStatRaw(container.CgroupSuffix)
	if err != nil {
		return err
	}

	usage, err := c.cgroup.CpuUsage(container.CgroupSuffix)
	if err != nil {
		return err
	}

	stat := cpuStat{
		nrThrottled:      raw["nr_throttled"],
		throttledTime:    raw["throttled_time"],
		hierarchyWaitSum: raw["hierarchy_wait_sum"],
		innerWaitSum:     raw["inner_wait_sum"],
		nrBursts:         raw["nr_bursts"],
		burstTime:        raw["burst_time"],
		// NOTE(review): *1000 presumably converts CpuUsage units to match
		// the wait-sum counters — confirm against cgroups.CpuUsage.
		cpuTotal:   usage.Usage * 1000,
		lastUpdate: now,
	}

	// NOTE(review): deltaHierarchyWaitSum is unsigned, so "<= 0" only
	// ever matches exactly zero; if the kernel counter resets (e.g. a
	// recycled container), the subtraction wraps to a huge value instead
	// of going negative — confirm the counters are monotonic for the
	// lifetime of a cpuStat instance.
	deltaHierarchyWaitSum = stat.hierarchyWaitSum - cpu.hierarchyWaitSum
	if deltaHierarchyWaitSum <= 0 {
		deltaThrottledSum = 0
		deltaHierarchyWaitSum = 0
		deltaInnerWaitSum = 0
		deltaExterWaitSum = 0
	} else {
		deltaThrottledSum = stat.throttledTime - cpu.throttledTime
		deltaInnerWaitSum = stat.innerWaitSum - cpu.innerWaitSum

		// Clamp so the "external" component below cannot underflow.
		if deltaHierarchyWaitSum < deltaThrottledSum+deltaInnerWaitSum {
			deltaHierarchyWaitSum = deltaThrottledSum + deltaInnerWaitSum
		}

		deltaExterWaitSum = deltaHierarchyWaitSum - deltaThrottledSum - deltaInnerWaitSum
	}

	// Denominator: total wait time plus run time over the interval.
	deltaWaitRunSum := deltaHierarchyWaitSum + stat.cpuTotal - cpu.cpuTotal
	if deltaWaitRunSum == 0 {
		stat.waitrateHierarchy = 0
		stat.waitrateInner = 0
		stat.waitrateExter = 0
		stat.waitrateThrottled = 0
	} else {
		stat.waitrateHierarchy = float64(deltaHierarchyWaitSum) * 100 / float64(deltaWaitRunSum)
		stat.waitrateInner = float64(deltaInnerWaitSum) * 100 / float64(deltaWaitRunSum)
		stat.waitrateExter = float64(deltaExterWaitSum) * 100 / float64(deltaWaitRunSum)
		stat.waitrateThrottled = float64(deltaThrottledSum) * 100 / float64(deltaWaitRunSum)
	}

	// Publish the whole snapshot at once (still under the mutex).
	*cpu = stat
	return nil
}
|
||||
|
||||
func (c *cpuStatCollector) Update() ([]*metric.Data, error) {
|
||||
metrics := []*metric.Data{}
|
||||
|
||||
containers, err := pod.GetContainersByType(pod.ContainerTypeNormal | pod.ContainerTypeSidecar)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, container := range containers {
|
||||
containerMetric := container.LifeResouces("collector_cpu_stat").(*cpuStat)
|
||||
if err := c.cpuMetricUpdate(containerMetric, container); err != nil {
|
||||
log.Infof("failed to update cpu info of %s, %v", container, err)
|
||||
continue
|
||||
}
|
||||
|
||||
metrics = append(metrics, metric.NewContainerGaugeData(container, "wait_rate", containerMetric.waitrateHierarchy, "wait rate for containers", nil),
|
||||
metric.NewContainerGaugeData(container, "inner_wait_rate", containerMetric.waitrateInner, "inner wait rate for container", nil),
|
||||
metric.NewContainerGaugeData(container, "exter_wait_rate", containerMetric.waitrateExter, "exter wait rate for container", nil),
|
||||
metric.NewContainerGaugeData(container, "throttle_wait_rate", containerMetric.waitrateThrottled, "throttle wait rate for container", nil),
|
||||
metric.NewContainerGaugeData(container, "nr_throttled", float64(containerMetric.nrThrottled), "throttle nr for container", nil),
|
||||
metric.NewContainerGaugeData(container, "nr_bursts", float64(containerMetric.nrBursts), "burst nr for container", nil),
|
||||
metric.NewContainerGaugeData(container, "burst_time", float64(containerMetric.burstTime), "burst time for container", nil),
|
||||
)
|
||||
}
|
||||
|
||||
return metrics, nil
|
||||
}
|
|
@ -0,0 +1,185 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"math"
|
||||
"reflect"
|
||||
"runtime"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/cgroups"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
// cpuMetric keeps the previous cumulative CPU usage counters for one
// scope (a container, or the host) together with the utilization
// percentages computed from the last delta.
type cpuMetric struct {
	lastUsrTime  uint64 // previous user-mode usage counter
	lastSysTime  uint64 // previous system-mode usage counter
	lastCPUTotal uint64 // previous total usage counter
	// NOTE(review): "lasTimestamp" is a typo for lastTimestamp; kept as-is
	// because other methods in this file reference the field by this name.
	lasTimestamp time.Time
	utilTotal    float64 // percent of allotted CPUs, 0-100
	utilSys      float64
	utilUsr      float64
}

// cpuUtilCollector exports usr/sys/total CPU utilization for the host and
// for every normal/sidecar container.
type cpuUtilCollector struct {
	cpuUtil []*metric.Data // host gauges: [0]=usr, [1]=sys, [2]=total
	cgroup  cgroups.Cgroup

	// included struct for used in multi modules
	hostCPUCount  int       // logical CPU count sampled at startup
	hostCPUMetric cpuMetric // host-wide counters

	mutex sync.Mutex // serializes cpuMetricUpdate across callers
}

func init() {
	tracing.RegisterEventTracing("cpu_util", newCPUUtil)
	// Attach a cpuMetric to every container's lifecycle resources.
	_ = pod.RegisterContainerLifeResources("collector_cpu_util", reflect.TypeOf(&cpuMetric{}))
}

// newCPUUtil wires the collector to the cgroup manager and pre-allocates
// the three host gauges.
func newCPUUtil() (*tracing.EventTracingAttr, error) {
	cgroup, err := cgroups.NewCgroupManager()
	if err != nil {
		return nil, err
	}

	return &tracing.EventTracingAttr{
		TracingData: &cpuUtilCollector{
			cpuUtil: []*metric.Data{
				metric.NewGaugeData("usr", 0, "usr for container and host", nil),
				metric.NewGaugeData("sys", 0, "sys for container and host", nil),
				metric.NewGaugeData("total", 0, "total for container and host", nil),
			},
			hostCPUCount: runtime.NumCPU(),
			cgroup:       cgroup,
		},
		Flag: tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
func (c *cpuUtilCollector) cpuMetricUpdate(cpuMetric *cpuMetric, container *pod.Container, cpuCount int) error {
|
||||
var (
|
||||
utilUsr float64
|
||||
utilSys float64
|
||||
utilTotal float64
|
||||
cgroupPath string
|
||||
)
|
||||
|
||||
c.mutex.Lock()
|
||||
defer c.mutex.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
if now.Sub(cpuMetric.lasTimestamp).Nanoseconds() < 1000000000 {
|
||||
return nil
|
||||
}
|
||||
|
||||
if container != nil {
|
||||
cgroupPath = container.CgroupSuffix
|
||||
}
|
||||
|
||||
stat, err := c.cgroup.CpuUsage(cgroupPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
usageTotal := stat.Usage
|
||||
usageUsr := stat.User
|
||||
usageSys := stat.System
|
||||
|
||||
// allow statistics 0
|
||||
deltaTotal := usageTotal - cpuMetric.lastCPUTotal
|
||||
deltaUsrTime := usageUsr - cpuMetric.lastUsrTime
|
||||
deltaSysTime := usageSys - cpuMetric.lastSysTime
|
||||
deltaUsageSum := float64(cpuCount) * float64(now.Sub(cpuMetric.lasTimestamp).Nanoseconds())
|
||||
|
||||
if (float64(deltaTotal) > deltaUsageSum) || (float64(deltaUsrTime+deltaSysTime) > deltaUsageSum) {
|
||||
cpuMetric.lastUsrTime = usageUsr
|
||||
cpuMetric.lastSysTime = usageSys
|
||||
cpuMetric.lastCPUTotal = usageTotal
|
||||
cpuMetric.lasTimestamp = now
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
utilTotal = float64(deltaTotal) * 100 / deltaUsageSum
|
||||
utilUsr = float64(deltaUsrTime) * 100 / deltaUsageSum
|
||||
utilSys = float64(deltaSysTime) * 100 / deltaUsageSum
|
||||
|
||||
cpuMetric.lastUsrTime = usageUsr
|
||||
cpuMetric.lastSysTime = usageSys
|
||||
cpuMetric.lastCPUTotal = usageTotal
|
||||
cpuMetric.utilTotal = utilTotal
|
||||
cpuMetric.utilUsr = utilUsr
|
||||
cpuMetric.utilSys = utilSys
|
||||
cpuMetric.lasTimestamp = now
|
||||
return nil
|
||||
}
|
||||
|
||||
// hostMetricUpdate refreshes the host-wide cpuMetric (root cgroup, all
// logical CPUs) and copies the percentages into the exported gauges.
func (c *cpuUtilCollector) hostMetricUpdate() error {
	if err := c.cpuMetricUpdate(&c.hostCPUMetric, nil, c.hostCPUCount); err != nil {
		return err
	}

	c.cpuUtil[0].Value = c.hostCPUMetric.utilUsr
	c.cpuUtil[1].Value = c.hostCPUMetric.utilSys
	c.cpuUtil[2].Value = c.hostCPUMetric.utilTotal
	return nil
}
|
||||
|
||||
// Update emits per-container CPU count and usr/sys/total utilization for
// every normal/sidecar container with a finite CPU quota, followed by the
// host-wide gauges. Containers whose quota cannot be read are logged and
// skipped; unlimited containers are skipped silently.
func (c *cpuUtilCollector) Update() ([]*metric.Data, error) {
	metrics := []*metric.Data{}

	containers, err := pod.GetContainersByType(pod.ContainerTypeNormal | pod.ContainerTypeSidecar)
	if err != nil {
		return nil, err
	}

	for _, container := range containers {
		cpuQuota, err := c.cgroup.CpuQuotaAndPeriod(container.CgroupSuffix)
		if err != nil {
			log.Infof("fetch container [%s] cpu quota and period: %v", container, err)
			continue
		}

		// MaxUint64 marks "no quota" (unlimited); utilization relative to
		// a CPU count is meaningless then, so skip the container.
		if cpuQuota.Quota == math.MaxUint64 {
			continue
		}

		// Integer CPUs granted by the quota.
		// NOTE(review): quota < period truncates to 0, which would zero
		// the utilization denominator — cpuMetricUpdate must tolerate it.
		count := int(cpuQuota.Quota / cpuQuota.Period)

		containerMetric := container.LifeResouces("collector_cpu_util").(*cpuMetric)
		if err := c.cpuMetricUpdate(containerMetric, container, count); err != nil {
			log.Infof("failed to update cpu info of %s, %v", container, err)
			continue
		}

		metrics = append(metrics, metric.NewContainerGaugeData(container, "count", float64(count), "cpu count for containers", nil),
			metric.NewContainerGaugeData(container, "usr", containerMetric.utilUsr, "usr for container and host", nil),
			metric.NewContainerGaugeData(container, "sys", containerMetric.utilSys, "sys for container and host", nil),
			metric.NewContainerGaugeData(container, "total", containerMetric.utilTotal, "total for container and host", nil))
	}

	if err := c.hostMetricUpdate(); err != nil {
		return nil, err
	}

	metrics = append(metrics, c.cpuUtil...)
	return metrics, nil
}
|
|
@ -0,0 +1,41 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import "regexp"
|
||||
|
||||
// fieldFilter decides whether a metric field name should be dropped,
// based on an optional ignore pattern and an optional accept pattern.
type fieldFilter struct {
	ignorePattern *regexp.Regexp // names matching this are dropped
	acceptPattern *regexp.Regexp // when set, names NOT matching this are dropped
}

// newFieldFilter compiles the two optional patterns; an empty string
// disables the corresponding check. A malformed pattern panics
// (regexp.MustCompile), same as the original behaviour.
func newFieldFilter(ignoredPattern, acceptPattern string) *fieldFilter {
	filter := &fieldFilter{}

	if len(ignoredPattern) > 0 {
		filter.ignorePattern = regexp.MustCompile(ignoredPattern)
	}
	if len(acceptPattern) > 0 {
		filter.acceptPattern = regexp.MustCompile(acceptPattern)
	}

	return filter
}

// ignored reports whether name should be filtered out: either it matches
// the ignore pattern, or an accept pattern exists and name fails it.
func (f *fieldFilter) ignored(name string) bool {
	if f.ignorePattern != nil && f.ignorePattern.MatchString(name) {
		return true
	}
	if f.acceptPattern != nil && !f.acceptPattern.MatchString(name) {
		return true
	}
	return false
}
|
|
@ -0,0 +1,117 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"huatuo-bamai/internal/cgroups"
|
||||
"huatuo-bamai/internal/cgroups/paths"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
|
||||
"github.com/google/cadvisor/utils/cpuload/netlink"
|
||||
"github.com/prometheus/procfs"
|
||||
)
|
||||
|
||||
// loadavgCollector exports the host load averages and, on cgroup v1
// hosts, per-container nr_running/nr_uninterruptible counts.
type loadavgCollector struct {
	loadAvg []*metric.Data // [0]=load1, [1]=load5, [2]=load15
}

func init() {
	tracing.RegisterEventTracing("loadavg", newLoadavg)
}

// NewLoadavgCollector returns a new Collector exposing load average stats.
func newLoadavg() (*tracing.EventTracingAttr, error) {
	collector := &loadavgCollector{
		// Load average of last 1, 5 & 15 minutes.
		// See linux kernel Documentation/filesystems/proc.rst
		loadAvg: []*metric.Data{
			metric.NewGaugeData("load1", 0, "1m load average", nil),
			metric.NewGaugeData("load5", 0, "5m load average", nil),
			metric.NewGaugeData("load15", 0, "15m load average", nil),
		},
	}

	return &tracing.EventTracingAttr{
		TracingData: collector, Flag: tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
// Read loadavg from /proc.
|
||||
// Read loadavg from /proc.
// hostLoadAvg refreshes the three host gauges from /proc/loadavg.
func (c *loadavgCollector) hostLoadAvg() error {
	fs, err := procfs.NewDefaultFS()
	if err != nil {
		return err
	}

	load, err := fs.LoadAvg()
	if err != nil {
		return err
	}

	c.loadAvg[0].Value = load.Load1
	c.loadAvg[1].Value = load.Load5
	c.loadAvg[2].Value = load.Load15
	return nil
}
|
||||
|
||||
// containerLoadavg reads nr_running / nr_uninterruptible for every normal
// and sidecar container through the cadvisor netlink taskstats client.
// Per-container failures are skipped (best effort) so one bad container
// does not break collection for the rest.
func containerLoadavg() ([]*metric.Data, error) {
	n, err := netlink.New()
	if err != nil {
		return nil, err
	}
	defer n.Stop()

	containers, err := pod.GetContainersByType(pod.ContainerTypeNormal | pod.ContainerTypeSidecar)
	if err != nil {
		return nil, err
	}

	loadavgs := []*metric.Data{}
	for _, container := range containers {
		// Resolve the container's cpu cgroup path for the taskstats query.
		stats, err := n.GetCpuLoad(container.Hostname, paths.Path("cpu", container.CgroupSuffix))
		if err != nil {
			// Deliberately ignored — see the best-effort note above.
			continue
		}

		loadavgs = append(loadavgs,
			metric.NewContainerGaugeData(container,
				"nr_running", float64(stats.NrRunning),
				"nr_running of container", nil),
			metric.NewContainerGaugeData(container,
				"nr_uninterruptible", float64(stats.NrUninterruptible),
				"nr_uninterruptible of container", nil))
	}

	return loadavgs, nil
}
|
||||
|
||||
func (c *loadavgCollector) Update() ([]*metric.Data, error) {
|
||||
loadavgs := []*metric.Data{}
|
||||
|
||||
if cgroups.CgroupMode() == cgroups.Legacy {
|
||||
if containersLoads, err := containerLoadavg(); err == nil {
|
||||
loadavgs = append(loadavgs, containersLoads...)
|
||||
}
|
||||
}
|
||||
|
||||
if err := c.hostLoadAvg(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
loadavgs = append(loadavgs, c.loadAvg...)
|
||||
return loadavgs, nil
|
||||
}
|
|
@ -0,0 +1,75 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"huatuo-bamai/internal/cgroups"
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
// memEventsCollector exports cgroup memory event counters per container.
type memEventsCollector struct {
	cgroup cgroups.Cgroup
}

func init() {
	tracing.RegisterEventTracing("memory_events", newMemEvents)
}

// newMemEvents wires the collector to the cgroup manager (metric-only).
func newMemEvents() (*tracing.EventTracingAttr, error) {
	cgroup, err := cgroups.NewCgroupManager()
	if err != nil {
		return nil, err
	}

	return &tracing.EventTracingAttr{
		TracingData: &memEventsCollector{
			cgroup: cgroup,
		}, Flag: tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
// Update emits one gauge per memory event counter per normal container,
// filtered by the include/exclude patterns from the configuration.
func (c *memEventsCollector) Update() ([]*metric.Data, error) {
	// Rebuilt every cycle so configuration changes take effect without a
	// restart (at the cost of recompiling the regexps each Update).
	filter := newFieldFilter(conf.Get().MetricCollector.MemoryEvents.ExcludedMetrics,
		conf.Get().MetricCollector.MemoryEvents.IncludedMetrics)

	containers, err := pod.GetNormalContainers()
	if err != nil {
		return nil, fmt.Errorf("get normal container: %w", err)
	}

	metrics := []*metric.Data{}
	for _, container := range containers {
		raw, err := c.cgroup.MemoryEventRaw(container.CgroupSuffix)
		if err != nil {
			return nil, err
		}

		for key, value := range raw {
			if filter.ignored(key) {
				continue
			}

			metrics = append(metrics,
				metric.NewContainerGaugeData(container, key, float64(value), fmt.Sprintf("memory events %s", key), nil))
		}
	}

	return metrics, nil
}
|
|
@ -0,0 +1,110 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
|
||||
"huatuo-bamai/internal/bpf"
|
||||
"huatuo-bamai/internal/utils/bpfutil"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
func init() {
	tracing.RegisterEventTracing("memory_free", newMemoryHost)
}

// newMemoryHost builds the host memory-latency tracer with its two gauges
// (compaction and allocstall time; Update divides the raw counters by
// 1e6 before publishing). Internal is 10 — presumably an interval in
// seconds, TODO confirm against the tracing framework.
func newMemoryHost() (*tracing.EventTracingAttr, error) {
	mm := &memoryHost{
		metrics: []*metric.Data{
			metric.NewGaugeData("compaction", 0, "time elapsed in memory compaction", nil),
			metric.NewGaugeData("allocstall", 0, "time elapsed in memory allocstall", nil),
		},
	}
	return &tracing.EventTracingAttr{
		TracingData: mm,
		Internal:    10,
		Flag:        tracing.FlagTracing | tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
//go:generate $BPF_COMPILE $BPF_INCLUDE -s $BPF_DIR/memory_free_compact.c -o $BPF_DIR/memory_free_compact.o
|
||||
|
||||
// memoryHost owns the loaded BPF object and the two exported gauges.
type memoryHost struct {
	metrics []*metric.Data
	bpf     bpf.BPF
	// isRuning (typo for "isRunning", kept because methods in this file
	// reference it) is true while Start has the BPF program attached;
	// Update returns nothing while it is false.
	// NOTE(review): written by Start and read by Update without
	// synchronization — confirm the tracing framework serializes them.
	isRuning bool
}

// memoryHostMetric mirrors the value layout decoded from the BPF map
// "mm_free_compact_map" (read little-endian in Update).
type memoryHostMetric struct {
	/* host: compaction latency */
	CompactionStat uint64
	/* host: page alloc latency in direct reclaim */
	AllocstallStat uint64
}
|
||||
|
||||
// Update dumps the BPF map and publishes the two latency counters divided
// by 1e6 (ns -> ms, assuming the BPF side records nanoseconds — TODO
// confirm against memory_free_compact.c). Returns no data while the
// tracer is not running.
func (c *memoryHost) Update() ([]*metric.Data, error) {
	if !c.isRuning {
		return nil, nil
	}

	items, err := c.bpf.DumpMapByName("mm_free_compact_map")
	if err != nil {
		return nil, fmt.Errorf("dump map mm_free_compact_map: %w", err)
	}

	if len(items) == 0 {
		// No samples yet: report explicit zeros rather than stale values.
		c.metrics[0].Value = float64(0)
		c.metrics[1].Value = float64(0)
	} else {
		// Decode the first map entry (only one is read).
		mmMetric := memoryHostMetric{}
		buf := bytes.NewReader(items[0].Value)
		err := binary.Read(buf, binary.LittleEndian, &mmMetric)
		if err != nil {
			return nil, fmt.Errorf("read mem_cgroup_map: %w", err)
		}
		c.metrics[0].Value = float64(mmMetric.CompactionStat) / 1000 / 1000
		c.metrics[1].Value = float64(mmMetric.AllocstallStat) / 1000 / 1000
	}
	return c.metrics, nil
}
|
||||
|
||||
// Start detect work, load bpf and wait data form perfevent
|
||||
// Start detect work, load bpf and wait data form perfevent
// (loads the BPF object, attaches it, marks the tracer as running, then
// blocks until the context is cancelled or the breaker detaches).
func (c *memoryHost) Start(ctx context.Context) error {
	var err error
	c.bpf, err = bpf.LoadBpf(bpfutil.ThisBpfOBJ(), nil)
	if err != nil {
		return fmt.Errorf("load bpf: %w", err)
	}
	defer c.bpf.Close()

	if err = c.bpf.Attach(); err != nil {
		return fmt.Errorf("attach: %w", err)
	}

	childCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Cancel childCtx when the breaker requests detach.
	c.bpf.WaitDetachByBreaker(childCtx, cancel)

	// Flag Update that map data is available, then park until shutdown.
	c.isRuning = true
	<-childCtx.Done()
	c.isRuning = false
	return nil
}
|
|
@ -0,0 +1,97 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"huatuo-bamai/internal/cgroups/paths"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/internal/utils/parseutil"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
// memOthersCollector exports vendor-specific memory cgroup counters
// (directstall / asyncreclaim / local direct reclaim times).
type memOthersCollector struct{}

func init() {
	// only for didicloud
	tracing.RegisterEventTracing("memory_others", newMemOthersCollector)
}

// newMemOthersCollector returns metric-only tracing attributes; the
// collector itself is stateless.
func newMemOthersCollector() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &memOthersCollector{},
		Flag:        tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
// parseValueWithKey reads one value from a memory-cgroup file: the whole
// file as a single unsigned integer when key is empty, otherwise the
// value associated with key in a "key value" formatted file.
func parseValueWithKey(cgroupPath, cgroupFile, key string) (uint64, error) {
	filePath := paths.Path("memory", cgroupPath, cgroupFile)
	if key == "" {
		return parseutil.ReadUint(filePath)
	}

	raw, err := parseutil.RawKV(filePath)
	if err != nil {
		return 0, err
	}

	// NOTE(review): a key absent from the file silently yields 0 — confirm
	// that is intended rather than an error.
	return raw[key], nil
}
|
||||
|
||||
func (c *memOthersCollector) Update() ([]*metric.Data, error) {
|
||||
containers, err := pod.GetNormalContainers()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Can't get normal container: %w", err)
|
||||
}
|
||||
|
||||
metrics := []*metric.Data{}
|
||||
|
||||
for _, container := range containers {
|
||||
for _, t := range []struct {
|
||||
path string
|
||||
key string
|
||||
name string
|
||||
}{
|
||||
{
|
||||
path: "memory.directstall_stat",
|
||||
key: "directstall_time",
|
||||
name: "directstall_time",
|
||||
},
|
||||
{
|
||||
path: "memory.asynreclaim_stat",
|
||||
key: "asyncreclaim_time",
|
||||
name: "asyncreclaim_time",
|
||||
},
|
||||
{
|
||||
path: "memory.local_direct_reclaim_time",
|
||||
key: "",
|
||||
name: "local_direct_reclaim_time",
|
||||
},
|
||||
} {
|
||||
value, err := parseValueWithKey(container.CgroupSuffix, t.path, t.key)
|
||||
if err != nil {
|
||||
// FIXME: os maynot support this metric
|
||||
continue
|
||||
}
|
||||
|
||||
metrics = append(metrics,
|
||||
metric.NewContainerGaugeData(container, t.name, float64(value), fmt.Sprintf("memory cgroup %s", t.name), nil))
|
||||
}
|
||||
}
|
||||
|
||||
return metrics, nil
|
||||
}
|
|
@ -0,0 +1,129 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
|
||||
"huatuo-bamai/internal/bpf"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/internal/utils/bpfutil"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("memory_reclaim", newMemoryCgroup)
|
||||
}
|
||||
|
||||
func newMemoryCgroup() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &memoryCgroup{},
|
||||
Internal: 10,
|
||||
Flag: tracing.FlagTracing | tracing.FlagMetric,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// memoryCgroupMetric mirrors the value layout of the BPF "mem_cgroup_map"
// entry; it is filled via binary.Read, so the field order and widths must
// match the C struct in bpf/memory_reclaim.c.
type memoryCgroupMetric struct {
	DirectstallCount uint64 // count of try_charge direct-reclaim events
}

//go:generate $BPF_COMPILE $BPF_INCLUDE -s $BPF_DIR/memory_reclaim.c -o $BPF_DIR/memory_reclaim.o

// memoryCgroup drives the memory_reclaim BPF program: Start attaches it and
// blocks; Update reports the counters it collects.
type memoryCgroup struct {
	bpf bpf.BPF // loaded BPF object, owned by Start
	// NOTE(review): field name has a typo ("isRuning" -> "isRunning");
	// renaming would also touch Update/Start, so it is kept here.
	isRuning bool // set while the BPF program is attached; read by Update
}
|
||||
|
||||
// Update reports the per-container direct-stall counter collected by the
// memory_reclaim BPF program. It returns nil while the program is not
// attached. Containers whose memory-cgroup css pointer appears in the BPF
// map get the counter value; when the map is empty, every known container
// is reported with zero so the time series does not go stale.
func (c *memoryCgroup) Update() ([]*metric.Data, error) {
	// NOTE(review): isRuning is written by Start from another goroutine
	// without synchronization — confirm the tracing framework serializes
	// Start/Update, otherwise this is a data race.
	if !c.isRuning {
		return nil, nil
	}

	// Index containers by their memory-cgroup css pointer, which is also
	// the key of the BPF map.
	containersMap := make(map[uint64]*pod.Container)
	containers, err := pod.GetNormalContainers()
	if err != nil {
		return nil, fmt.Errorf("get container: %w", err)
	}

	for _, container := range containers {
		containersMap[container.CSS["memory"]] = container
	}

	items, err := c.bpf.DumpMapByName("mem_cgroup_map")
	if err != nil {
		return nil, fmt.Errorf("dump mem_cgroup_map: %w", err)
	}

	var (
		cgroupMetric     memoryCgroupMetric
		containersMetric []*metric.Data
		css              uint64
	)
	for _, v := range items {
		// Map key: the css pointer of the memory cgroup (little-endian u64).
		keyBuf := bytes.NewReader(v.Key)
		if err := binary.Read(keyBuf, binary.LittleEndian, &css); err != nil {
			return nil, fmt.Errorf("mem_cgroup_map key: %w", err)
		}

		// Map value: decoded into memoryCgroupMetric (layout must match).
		valBuf := bytes.NewReader(v.Value)
		if err := binary.Read(valBuf, binary.LittleEndian, &cgroupMetric); err != nil {
			return nil, fmt.Errorf("mem_cgroup_map value: %w", err)
		}

		// Entries for cgroups that are not normal containers are ignored.
		if container, exist := containersMap[css]; exist {
			containersMetric = append(containersMetric,
				metric.NewContainerGaugeData(container, "directstall",
					float64(cgroupMetric.DirectstallCount),
					"counting of cgroup try_charge reclaim", nil))
		}
	}

	// if events haven't happened, upload zero for all containers.
	if len(items) == 0 {
		for _, container := range containersMap {
			containersMetric = append(containersMetric,
				metric.NewContainerGaugeData(container, "directstall", float64(0),
					"counting of cgroup try_charge reclaim", nil))
		}
	}

	return containersMetric, nil
}
|
||||
|
||||
// Start loads and attaches the memory_reclaim BPF program, then blocks
// until the context (or the breaker) cancels it. While attached, isRuning
// is true and Update serves data from the BPF map. The BPF object is
// closed on return via defer.
func (c *memoryCgroup) Start(ctx context.Context) error {
	var err error
	c.bpf, err = bpf.LoadBpf(bpfutil.ThisBpfOBJ(), nil)
	if err != nil {
		return fmt.Errorf("load bpf: %w", err)
	}
	defer c.bpf.Close()

	if err = c.bpf.Attach(); err != nil {
		return fmt.Errorf("attach: %w", err)
	}

	childCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	// The breaker detaches the program and cancels childCtx when tripped.
	c.bpf.WaitDetachByBreaker(childCtx, cancel)
	c.isRuning = true
	// Block until cancelled; Update only reports while we wait here.
	<-childCtx.Done()
	c.isRuning = false
	return nil
}
|
|
@ -0,0 +1,78 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"huatuo-bamai/internal/cgroups"
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
type memStatCollector struct {
|
||||
cgroup cgroups.Cgroup
|
||||
}
|
||||
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("memory_stat", newMemStat)
|
||||
}
|
||||
|
||||
func newMemStat() (*tracing.EventTracingAttr, error) {
|
||||
cgroup, err := cgroups.NewCgroupManager()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &memStatCollector{
|
||||
cgroup: cgroup,
|
||||
},
|
||||
Flag: tracing.FlagMetric,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *memStatCollector) Update() ([]*metric.Data, error) {
|
||||
filter := newFieldFilter(conf.Get().MetricCollector.MemoryStat.ExcludedMetrics,
|
||||
conf.Get().MetricCollector.MemoryStat.IncludedMetrics)
|
||||
|
||||
metrics := []*metric.Data{}
|
||||
containers, err := pod.GetNormalContainers()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for _, container := range containers {
|
||||
raw, err := c.cgroup.MemoryStatRaw(container.CgroupSuffix)
|
||||
if err != nil {
|
||||
log.Infof("parse %s memory.stat %v", container.CgroupSuffix, err)
|
||||
continue
|
||||
}
|
||||
|
||||
for m, v := range raw {
|
||||
if filter.ignored(m) {
|
||||
log.Debugf("Ignoring memory_stat metric: %s", m)
|
||||
continue
|
||||
}
|
||||
|
||||
metrics = append(metrics, metric.NewContainerGaugeData(container, m, float64(v), fmt.Sprintf("memory stat %s", m), nil))
|
||||
}
|
||||
}
|
||||
|
||||
return metrics, nil
|
||||
}
|
|
@ -0,0 +1,62 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"github.com/prometheus/procfs"
|
||||
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
type mountPointStatCollector struct{}
|
||||
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("mountpoint_perm", newMountPointStat)
|
||||
}
|
||||
|
||||
func newMountPointStat() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &mountPointStatCollector{},
|
||||
Flag: tracing.FlagMetric,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *mountPointStatCollector) Update() ([]*metric.Data, error) {
|
||||
mountinfo, err := procfs.GetMounts()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
filter := newFieldFilter("", conf.Get().MetricCollector.MountPointStat.IncludedMountPoints)
|
||||
|
||||
metrics := []*metric.Data{}
|
||||
for _, v := range mountinfo {
|
||||
if filter.ignored(v.MountPoint) {
|
||||
continue
|
||||
}
|
||||
|
||||
mountTag := map[string]string{"mountpoint": v.MountPoint}
|
||||
ro := 0
|
||||
if _, ok := v.Options["ro"]; ok {
|
||||
ro = 1
|
||||
}
|
||||
|
||||
metrics = append(metrics,
|
||||
metric.NewGaugeData("ro", float64(ro), "whether mountpoint is readonly or not", mountTag))
|
||||
}
|
||||
return metrics, nil
|
||||
}
|
|
@ -0,0 +1,261 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
// ref: https://github.com/prometheus/node_exporter/tree/master/collector
|
||||
// - netdev_common.go
|
||||
// - netdev_linuxt.go
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
|
||||
"github.com/jsimonetti/rtnetlink"
|
||||
"github.com/mdlayher/netlink"
|
||||
"github.com/prometheus/procfs"
|
||||
)
|
||||
|
||||
type (
|
||||
netdevStats map[string]map[string]uint64
|
||||
netdevCollector struct{}
|
||||
)
|
||||
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("netdev", newNetdevCollector)
|
||||
}
|
||||
|
||||
func newNetdevCollector() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &netdevCollector{},
|
||||
Flag: tracing.FlagMetric,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Update gathers network-device statistics for the host and for every
// normal container, applying the configured device include/exclude filter.
// Container metrics carry the container's tags; host metrics are plain
// gauges. The host is represented as a nil container stored under the
// empty key.
func (c *netdevCollector) Update() ([]*metric.Data, error) {
	filter := newFieldFilter(conf.Get().MetricCollector.Netdev.IgnoredDevices,
		conf.Get().MetricCollector.Netdev.AcceptDevices)

	log.Debugf("Updating netdev metrics by filter: %v", filter)

	// normal containers
	containers, err := pod.GetNormalContainers()
	if err != nil {
		return nil, fmt.Errorf("GetNormalContainers: %w", err)
	}

	// support the empty container
	if containers == nil {
		containers = make(map[string]*pod.Container)
	}
	// append host into containers: the nil value under the empty key makes
	// the loop below take the host (plain gauge) branch.
	containers[""] = nil

	var metrics []*metric.Data
	for _, container := range containers {
		devStats, err := c.getStats(container, filter)
		if err != nil {
			return nil, fmt.Errorf("couldn't get netdev statistic for container %v: %w", container, err)
		}

		for dev, stats := range devStats {
			for key, val := range stats {
				tags := map[string]string{"device": dev}
				if container != nil {
					metrics = append(metrics,
						metric.NewContainerGaugeData(container, key+"_total", float64(val), fmt.Sprintf("Network device statistic %s.", key), tags))
				} else {
					metrics = append(metrics,
						metric.NewGaugeData(key+"_total", float64(val), fmt.Sprintf("Network device statistic %s.", key), tags))
				}
			}
		}
	}

	log.Debugf("Updated netdev metrics by filter %v: %v", filter, metrics)
	return metrics, nil
}
|
||||
|
||||
func (c *netdevCollector) getStats(container *pod.Container, filter *fieldFilter) (netdevStats, error) {
|
||||
if conf.Get().MetricCollector.Netdev.EnableNetlink {
|
||||
return c.netlinkStats(container, filter)
|
||||
}
|
||||
return c.procStats(container, filter)
|
||||
}
|
||||
|
||||
// netlinkStats collects per-device counters over rtnetlink from the
// network namespace of the container's init process (pid 1 for the host).
// Devices rejected by the filter or lacking stats are skipped. When a link
// only reports 32-bit stats, they are widened to a LinkStats64 so the rest
// of the function has a single code path.
func (c *netdevCollector) netlinkStats(container *pod.Container, filter *fieldFilter) (netdevStats, error) {
	pid := 1 // host
	if container != nil {
		pid = container.InitPid
	}

	// The open fd of /proc/<pid>/ns/net selects the namespace for Dial.
	file, err := os.Open(filepath.Join("/proc", strconv.Itoa(pid), "ns/net"))
	if err != nil {
		return nil, err
	}
	defer file.Close()

	conn, err := rtnetlink.Dial(&netlink.Config{NetNS: int(file.Fd())})
	if err != nil {
		return nil, err
	}
	defer conn.Close()

	links, err := conn.Link.List()
	if err != nil {
		return nil, err
	}

	metrics := netdevStats{}
	for _, msg := range links {
		if msg.Attributes == nil {
			log.Debug("No netlink attributes, skipping")
			continue
		}
		name := msg.Attributes.Name
		stats := msg.Attributes.Stats64
		// Fallback: widen 32-bit stats when 64-bit stats are absent.
		if stats32 := msg.Attributes.Stats; stats == nil && stats32 != nil {
			stats = &rtnetlink.LinkStats64{
				RXPackets:          uint64(stats32.RXPackets),
				TXPackets:          uint64(stats32.TXPackets),
				RXBytes:            uint64(stats32.RXBytes),
				TXBytes:            uint64(stats32.TXBytes),
				RXErrors:           uint64(stats32.RXErrors),
				TXErrors:           uint64(stats32.TXErrors),
				RXDropped:          uint64(stats32.RXDropped),
				TXDropped:          uint64(stats32.TXDropped),
				Multicast:          uint64(stats32.Multicast),
				Collisions:         uint64(stats32.Collisions),
				RXLengthErrors:     uint64(stats32.RXLengthErrors),
				RXOverErrors:       uint64(stats32.RXOverErrors),
				RXCRCErrors:        uint64(stats32.RXCRCErrors),
				RXFrameErrors:      uint64(stats32.RXFrameErrors),
				RXFIFOErrors:       uint64(stats32.RXFIFOErrors),
				RXMissedErrors:     uint64(stats32.RXMissedErrors),
				TXAbortedErrors:    uint64(stats32.TXAbortedErrors),
				TXCarrierErrors:    uint64(stats32.TXCarrierErrors),
				TXFIFOErrors:       uint64(stats32.TXFIFOErrors),
				TXHeartbeatErrors:  uint64(stats32.TXHeartbeatErrors),
				TXWindowErrors:     uint64(stats32.TXWindowErrors),
				RXCompressed:       uint64(stats32.RXCompressed),
				TXCompressed:       uint64(stats32.TXCompressed),
				RXNoHandler:        uint64(stats32.RXNoHandler),
				RXOtherhostDropped: 0, // not present in the 32-bit struct
			}
		}

		if filter.ignored(name) {
			log.Debugf("Ignoring device: %s", name)
			continue
		}

		// Make sure we don't panic when accessing `stats` attributes below.
		if stats == nil {
			log.Debug("No netlink stats, skipping")
			continue
		}

		// Field semantics follow the kernel's rtnl_link_stats64:
		// https://github.com/torvalds/linux/blob/master/include/uapi/linux/if_link.h#L42-L246
		metrics[name] = map[string]uint64{
			"receive_packets":  stats.RXPackets,
			"transmit_packets": stats.TXPackets,
			"receive_bytes":    stats.RXBytes,
			"transmit_bytes":   stats.TXBytes,
			"receive_errors":   stats.RXErrors,
			"transmit_errors":  stats.TXErrors,
			"receive_dropped":  stats.RXDropped,
			"transmit_dropped": stats.TXDropped,
			"multicast":        stats.Multicast,
			"collisions":       stats.Collisions,

			// detailed rx_errors
			"receive_length_errors": stats.RXLengthErrors,
			"receive_over_errors":   stats.RXOverErrors,
			"receive_crc_errors":    stats.RXCRCErrors,
			"receive_frame_errors":  stats.RXFrameErrors,
			"receive_fifo_errors":   stats.RXFIFOErrors,
			"receive_missed_errors": stats.RXMissedErrors,

			// detailed tx_errors
			"transmit_aborted_errors":   stats.TXAbortedErrors,
			"transmit_carrier_errors":   stats.TXCarrierErrors,
			"transmit_fifo_errors":      stats.TXFIFOErrors,
			"transmit_heartbeat_errors": stats.TXHeartbeatErrors,
			"transmit_window_errors":    stats.TXWindowErrors,

			// for cslip etc
			"receive_compressed":  stats.RXCompressed,
			"transmit_compressed": stats.TXCompressed,
			"receive_nohandler":   stats.RXNoHandler,
		}
	}

	return metrics, nil
}
|
||||
|
||||
func (c *netdevCollector) procStats(container *pod.Container, filter *fieldFilter) (netdevStats, error) {
|
||||
pid := 1 // host
|
||||
if container != nil {
|
||||
pid = container.InitPid
|
||||
}
|
||||
|
||||
fs, err := procfs.NewProc(pid)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open procfs: %w", err)
|
||||
}
|
||||
|
||||
netdev, err := fs.NetDev()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse /proc/[%d]/net/dev: %w", pid, err)
|
||||
}
|
||||
|
||||
metrics := netdevStats{}
|
||||
for name := range netdev {
|
||||
stats := netdev[name]
|
||||
|
||||
if filter.ignored(name) {
|
||||
log.Debugf("Ignoring device: %s", name)
|
||||
continue
|
||||
}
|
||||
|
||||
metrics[name] = map[string]uint64{
|
||||
"receive_bytes": stats.RxBytes,
|
||||
"receive_packets": stats.RxPackets,
|
||||
"receive_errors": stats.RxErrors,
|
||||
"receive_dropped": stats.RxDropped,
|
||||
"receive_fifo": stats.RxFIFO,
|
||||
"receive_frame": stats.RxFrame,
|
||||
"receive_compressed": stats.RxCompressed,
|
||||
"receive_multicast": stats.RxMulticast,
|
||||
"transmit_bytes": stats.TxBytes,
|
||||
"transmit_packets": stats.TxPackets,
|
||||
"transmit_errors": stats.TxErrors,
|
||||
"transmit_dropped": stats.TxDropped,
|
||||
"transmit_fifo": stats.TxFIFO,
|
||||
"transmit_colls": stats.TxCollisions,
|
||||
"transmit_carrier": stats.TxCarrier,
|
||||
"transmit_compressed": stats.TxCompressed,
|
||||
}
|
||||
}
|
||||
|
||||
return metrics, nil
|
||||
}
|
|
@ -0,0 +1,161 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
// ref: https://github.com/prometheus/node_exporter/tree/master/collector
|
||||
// - netstat_linux.go
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
type netstatCollector struct{}
|
||||
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("netstat", newNetstatCollector)
|
||||
}
|
||||
|
||||
func newNetstatCollector() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &netstatCollector{},
|
||||
Flag: tracing.FlagMetric,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (c *netstatCollector) Update() ([]*metric.Data, error) {
|
||||
filter := newFieldFilter(conf.Get().MetricCollector.Netstat.ExcludedMetrics, conf.Get().MetricCollector.Netstat.IncludedMetrics)
|
||||
log.Debugf("Updating netstat metrics by filter: %v", filter)
|
||||
|
||||
containers, err := pod.GetNormalContainers()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// support the empty container
|
||||
if containers == nil {
|
||||
containers = make(map[string]*pod.Container)
|
||||
}
|
||||
// append host into containers
|
||||
containers[""] = nil
|
||||
|
||||
var metrics []*metric.Data
|
||||
for _, container := range containers {
|
||||
m, err := c.getStatMetrics(container, filter)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("couldn't get netstat metrics for container %v: %w", container, err)
|
||||
}
|
||||
metrics = append(metrics, m...)
|
||||
}
|
||||
|
||||
log.Debugf("Updated netstat metrics by filter %v: %v", filter, metrics)
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
// getStatMetrics reads net/netstat and net/snmp from the proc tree of the
// container's init process (pid 1 for the host), merges them, filters the
// resulting protocol_field keys, and converts every kept value into a
// gauge. Container metrics carry container tags; host metrics (container
// == nil) are plain gauges.
func (c *netstatCollector) getStatMetrics(container *pod.Container, filter *fieldFilter) ([]*metric.Data, error) {
	pid := 1 // host
	if container != nil {
		pid = container.InitPid
	}

	pidProc := filepath.Join("/proc", strconv.Itoa(pid))
	netStats, err := c.procNetstats(filepath.Join(pidProc, "net/netstat"))
	if err != nil {
		return nil, fmt.Errorf("couldn't get netstats for %v: %w", container, err)
	}
	snmpStats, err := c.procNetstats(filepath.Join(pidProc, "net/snmp"))
	if err != nil {
		return nil, fmt.Errorf("couldn't get SNMP stats for %v: %w", container, err)
	}

	// Merge the results of snmpStats into netStats (collisions are possible, but
	// we know that the keys are always unique for the given use case).
	for k, v := range snmpStats {
		netStats[k] = v
	}

	var metrics []*metric.Data
	for protocol, protocolStats := range netStats {
		for name, value := range protocolStats {
			key := protocol + "_" + name
			v, err := strconv.ParseFloat(value, 64)
			if err != nil {
				return nil, fmt.Errorf("invalid value %s in netstats for %v: %w", value, container, err)
			}

			if filter.ignored(key) {
				log.Debugf("Ignoring netstat metric %s", key)
				continue
			}

			if container != nil {
				metrics = append(metrics,
					metric.NewContainerGaugeData(container, key, v, fmt.Sprintf("Statistic %s.", protocol+name), nil))
			} else {
				metrics = append(metrics,
					metric.NewGaugeData(key, v, fmt.Sprintf("Statistic %s.", protocol+name), nil))
			}
		}
	}

	return metrics, nil
}
|
||||
|
||||
func (c *netstatCollector) procNetstats(fileName string) (map[string]map[string]string, error) {
|
||||
file, err := os.Open(fileName)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
var (
|
||||
netStats = map[string]map[string]string{}
|
||||
scanner = bufio.NewScanner(file)
|
||||
)
|
||||
|
||||
for scanner.Scan() {
|
||||
nameParts := strings.Split(scanner.Text(), " ")
|
||||
scanner.Scan()
|
||||
valueParts := strings.Split(scanner.Text(), " ")
|
||||
// Remove trailing :.
|
||||
protocol := nameParts[0][:len(nameParts[0])-1]
|
||||
|
||||
// protocol: only for Tcp/TcpExt
|
||||
if protocol != "Tcp" && protocol != "TcpExt" {
|
||||
continue
|
||||
}
|
||||
|
||||
netStats[protocol] = map[string]string{}
|
||||
if len(nameParts) != len(valueParts) {
|
||||
return nil, fmt.Errorf("mismatch field count mismatch in %s: %s",
|
||||
fileName, protocol)
|
||||
}
|
||||
for i := 1; i < len(nameParts); i++ {
|
||||
netStats[protocol][nameParts[i]] = valueParts[i]
|
||||
}
|
||||
}
|
||||
|
||||
return netStats, scanner.Err()
|
||||
}
|
|
@ -0,0 +1,132 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
// ref: https://github.com/prometheus/node_exporter/tree/master/collector
|
||||
// - qdisc_linux.go
|
||||
|
||||
import (
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
|
||||
"github.com/ema/qdisc"
|
||||
)
|
||||
|
||||
// qdiscStats accumulates the counters of every qdisc that shares a parent
// major handle on one device (see the aggregation comment on Update).
type qdiscStats struct {
	ifaceName  string // network device name
	kind       string // qdisc kind of the first qdisc seen in this group
	bytes      uint64 // bytes sent
	packets    uint32 // packets sent
	drops      uint32 // packets dropped
	requeues   uint32 // packets dequeued but requeued
	overlimits uint32 // packet overlimit events
	qlen       uint32 // packets currently queued
	backlog    uint32 // bytes currently queued
}

// tcHMajMask extracts the major number from a tc handle
// (major:minor packed into 32 bits, major in the high 16 bits).
const tcHMajMask = 0xFFFF0000

// qdiscCollector reports aggregated qdisc statistics per device.
type qdiscCollector struct{}

func init() {
	tracing.RegisterEventTracing("qdisc", newQdiscCollector)
}

// newQdiscCollector registers the collector as a metric-only source.
func newQdiscCollector() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &qdiscCollector{},
		Flag:        tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
// Update sums qdisc counters of the same level (parent major) for each
// device, then emits one gauge set per (device, parent-major) group.
// Example:
//
// <device0> (1+2, 3)
// 1: qdisc <kind> handle0 parent0
// 2: qdisc <kind> handle1 parent0
// 3: qdisc <kind> handle2 parent1
//
// <device1> (1, 2+3)
// 1: qdisc <kind> handle0 parent0
// 2: qdisc <kind> handle1 parent1
// 3: qdisc <kind> handle2 parent1
func (c *qdiscCollector) Update() ([]*metric.Data, error) {
	filter := newFieldFilter(conf.Get().MetricCollector.Qdisc.IgnoredDevices,
		conf.Get().MetricCollector.Qdisc.AcceptDevices)

	allQdisc, err := qdisc.Get()
	if err != nil {
		return nil, err
	}

	// device name -> parent major -> accumulated stats
	allQdiscMap := make(map[string]map[uint32]*qdiscStats)
	for _, q := range allQdisc {
		// "noqueue" qdiscs carry no useful counters.
		if filter.ignored(q.IfaceName) || q.Kind == "noqueue" {
			continue
		}

		// Group key: the major part of the parent handle.
		parentMaj := (q.Parent & tcHMajMask) >> 16
		if _, ok := allQdiscMap[q.IfaceName]; !ok {
			allQdiscMap[q.IfaceName] = make(map[uint32]*qdiscStats)
		}
		netQdisc, ok := allQdiscMap[q.IfaceName][parentMaj]
		if !ok {
			// First qdisc of this group: its kind labels the whole group.
			allQdiscMap[q.IfaceName][parentMaj] = &qdiscStats{
				ifaceName:  q.IfaceName,
				kind:       q.Kind,
				bytes:      q.Bytes,
				packets:    q.Packets,
				drops:      q.Drops,
				requeues:   q.Requeues,
				overlimits: q.Overlimits,
				qlen:       q.Qlen,
				backlog:    q.Backlog,
			}
		} else {
			netQdisc.bytes += q.Bytes
			netQdisc.packets += q.Packets
			netQdisc.drops += q.Drops
			netQdisc.requeues += q.Requeues
			netQdisc.overlimits += q.Overlimits
			netQdisc.qlen += q.Qlen
			netQdisc.backlog += q.Backlog
		}
	}

	var metrics []*metric.Data
	for _, netdevQdisc := range allQdiscMap {
		for _, oneQdisc := range netdevQdisc {
			tags := map[string]string{"device": oneQdisc.ifaceName, "kind": oneQdisc.kind}
			metrics = append(metrics,
				metric.NewGaugeData("bytes_total", float64(oneQdisc.bytes),
					"Number of bytes sent.", tags),
				metric.NewGaugeData("packets_total", float64(oneQdisc.packets),
					"Number of packets sent.", tags),
				metric.NewGaugeData("drops_total", float64(oneQdisc.drops),
					"Number of packet drops.", tags),
				metric.NewGaugeData("requeues_total", float64(oneQdisc.requeues),
					"Number of packets dequeued, not transmitted, and requeued.", tags),
				metric.NewGaugeData("overlimits_total", float64(oneQdisc.overlimits),
					"Number of packet overlimits.", tags),
				metric.NewGaugeData("current_queue_length", float64(oneQdisc.qlen),
					"Number of packets currently in queue to be sent.", tags),
				metric.NewGaugeData("backlog", float64(oneQdisc.backlog),
					"Number of bytes currently in queue to be sent.", tags),
			)
		}
	}

	return metrics, nil
}
|
|
@ -0,0 +1,81 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"reflect"
|
||||
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
// runqlatCollector reports run-queue latency bucket counters collected by
// the runqlat BPF tracer.
type runqlatCollector struct {
	runqlatMetric []*metric.Data // reusable host-level gauges g_nlat_01..04
}

func init() {
	// Register a per-container life resource so each container carries its
	// own latencyBpfData, filled by the tracer.
	_ = pod.RegisterContainerLifeResources("runqlat", reflect.TypeOf(&latencyBpfData{}))
	tracing.RegisterEventTracing("runqlat", newRunqlatCollector)
}

// newRunqlatCollector pre-allocates the four host-level latency-bucket
// gauges and registers the collector as both a tracing task and a metric
// source, polled every 10 seconds.
func newRunqlatCollector() (*tracing.EventTracingAttr, error) {
	collector := &runqlatCollector{
		runqlatMetric: []*metric.Data{
			metric.NewGaugeData("g_nlat_01", 0, "nlat_01 of host", nil),
			metric.NewGaugeData("g_nlat_02", 0, "nlat_02 of host", nil),
			metric.NewGaugeData("g_nlat_03", 0, "nlat_03 of host", nil),
			metric.NewGaugeData("g_nlat_04", 0, "nlat_04 of host", nil),
		},
	}

	return &tracing.EventTracingAttr{
		TracingData: collector,
		Internal:    10,
		Flag:        tracing.FlagTracing | tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
// Update reports the per-container latency buckets (filled into each
// container's "runqlat" life resource by the tracer) plus the four
// host-level buckets from globalRunqlat. It returns nil while the tracer
// is not running.
func (c *runqlatCollector) Update() ([]*metric.Data, error) {
	runqlatMetric := []*metric.Data{}

	if !runqlatRunning {
		return nil, nil
	}

	containers, err := pod.GetContainersByType(pod.ContainerTypeNormal)
	if err != nil {
		return nil, err
	}

	for _, container := range containers {
		// "LifeResouces" (sic) is the project API name; the tracer writes
		// into this latencyBpfData in place.
		metrics := container.LifeResouces("runqlat").(*latencyBpfData)

		runqlatMetric = append(runqlatMetric,
			metric.NewContainerGaugeData(container, "nlat_01", float64(metrics.NumLatency01), "nlat_01", nil),
			metric.NewContainerGaugeData(container, "nlat_02", float64(metrics.NumLatency02), "nlat_02", nil),
			metric.NewContainerGaugeData(container, "nlat_03", float64(metrics.NumLatency03), "nlat_03", nil),
			metric.NewContainerGaugeData(container, "nlat_04", float64(metrics.NumLatency04), "nlat_04", nil))
	}

	// Refresh the reusable host-level gauges in place.
	c.runqlatMetric[0].Value = float64(globalRunqlat.NumLatency01)
	c.runqlatMetric[1].Value = float64(globalRunqlat.NumLatency02)
	c.runqlatMetric[2].Value = float64(globalRunqlat.NumLatency03)
	c.runqlatMetric[3].Value = float64(globalRunqlat.NumLatency04)

	runqlatMetric = append(runqlatMetric, c.runqlatMetric...)

	return runqlatMetric, nil
}
|
|
@ -0,0 +1,120 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/bpf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/internal/utils/bpfutil"
|
||||
)
|
||||
|
||||
//go:generate $BPF_COMPILE $BPF_INCLUDE -s $BPF_DIR/runqlat_tracing.c -o $BPF_DIR/runqlat_tracing.o
|
||||
|
||||
// latencyBpfData mirrors the per-cgroup record produced by the
// runqlat_tracing BPF program. Map values are decoded straight into it with
// binary.Read, so field order and widths must match the C struct exactly.
type latencyBpfData struct {
	NumVoluntarySwitch   uint64 // voluntary context switches
	NumInVoluntarySwitch uint64 // involuntary context switches
	NumLatency01         uint64 // run-queue latency bucket 1 count (bucket bounds defined in runqlat_tracing.c)
	NumLatency02         uint64 // run-queue latency bucket 2 count
	NumLatency03         uint64 // run-queue latency bucket 3 count
	NumLatency04         uint64 // run-queue latency bucket 4 count
}

var (
	// globalRunqlat holds the host-wide counters read from the
	// "cpu_host_metric" BPF map by startRunqlatTracerWork.
	globalRunqlat latencyBpfData
	// runqlatRunning is set once the tracer loop is attached and cleared by
	// Start when it exits; the metric Update path no-ops while it is false.
	runqlatRunning bool
)
|
||||
|
||||
// startRunqlatTracerWork loads and attaches the runqlat BPF object, then
// polls its maps every 2 seconds until ctx is cancelled: per-cgroup counters
// from "cpu_tg_metric" are decoded into each container's "runqlat" lifetime
// resource, and host-wide counters from "cpu_host_metric" into globalRunqlat.
// runqlatRunning is set true here but only cleared by the caller (Start).
func startRunqlatTracerWork(ctx context.Context) error {
	// load bpf.
	b, err := bpf.LoadBpf(bpfutil.ThisBpfOBJ(), nil)
	if err != nil {
		return fmt.Errorf("load bpf: %w", err)
	}
	defer b.Close()

	if err = b.Attach(); err != nil {
		return err
	}

	childCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Arrange for detach when the framework's breaker fires
	// (presumably a watchdog/circuit-breaker — see bpf package).
	b.WaitDetachByBreaker(childCtx, cancel)

	runqlatRunning = true

	for {
		select {
		case <-ctx.Done():
			return nil
		default:
			var css uint64

			items, err := b.DumpMapByName("cpu_tg_metric")
			if err != nil {
				return fmt.Errorf("failed to dump cpu_tg_metric: %w", err)
			}
			for _, v := range items {
				// The map key is a cgroup css identifier; resolve it to a
				// container and skip entries we don't manage.
				buf := bytes.NewReader(v.Key)
				if err = binary.Read(buf, binary.LittleEndian, &css); err != nil {
					return fmt.Errorf("can't read cpu_tg_metric key: %w", err)
				}
				container, _ := pod.GetContainerByCSS(css, "cpu")
				if container == nil {
					continue
				}

				// Decode the counters in place into the container's cached
				// latencyBpfData, which Update later reads.
				buf = bytes.NewReader(v.Value)
				if err = binary.Read(buf, binary.LittleEndian, container.LifeResouces("runqlat").(*latencyBpfData)); err != nil {
					return fmt.Errorf("can't read cpu_tg_metric value: %w", err)
				}
			}

			// Host-wide counters live at key 0 of "cpu_host_metric".
			item, err := b.ReadMap(b.MapIDByName("cpu_host_metric"), []byte{0, 0, 0, 0})
			if err != nil {
				return fmt.Errorf("failed to read cpu_host_metric: %w", err)
			}
			buf := bytes.NewReader(item)
			if err = binary.Read(buf, binary.LittleEndian, &globalRunqlat); err != nil {
				log.Errorf("can't read cpu_host_metric: %v", err)
				return err
			}

			time.Sleep(2 * time.Second)
		}
	}
}
|
||||
|
||||
// Start runqlat work: load bpf and wait for data from perf events.
// It blocks inside startRunqlatTracerWork until the context is cancelled or
// the tracer fails, then clears every container's cached runqlat counters
// and marks the collector stopped so Update won't serve stale data.
func (c *runqlatCollector) Start(ctx context.Context) error {
	err := startRunqlatTracerWork(ctx)

	// Reset per-container counters now that tracing has stopped.
	containers, _ := pod.GetContainersByType(pod.ContainerTypeNormal)
	for _, container := range containers {
		runqlatData := container.LifeResouces("runqlat").(*latencyBpfData)
		*runqlatData = latencyBpfData{}
	}

	runqlatRunning = false

	return err
}
|
|
@ -0,0 +1,122 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"huatuo-bamai/internal/cgroups"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/utils/parseutil"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
|
||||
"github.com/prometheus/procfs"
|
||||
)
|
||||
|
||||
const (
	// CLK_TCK is a constant on Linux for all architectures except alpha and ia64.
	// See e.g.
	// https://git.musl-libc.org/cgit/musl/tree/src/conf/sysconf.c#n30
	// https://github.com/containerd/cgroups/pull/12
	// https://lore.kernel.org/lkml/agtlq6$iht$1@penguin.transmeta.com/
	userHZ int64 = 100
)

// runtimeCollector exports the agent's own CPU and memory usage as gauges.
type runtimeCollector struct {
	oldStat *procfs.ProcStat // stat snapshot from the previous Update; nil on first call
	oldTs   int64            // unix timestamp of the previous snapshot; 0 on first call
}
|
||||
|
||||
// Register this collector under the "runtime" tracing name at startup.
func init() {
	tracing.RegisterEventTracing("runtime", newQosCollector)
}

// newQosCollector creates the metric-only tracing attribute for the runtime
// (agent self-monitoring) collector.
func newQosCollector() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &runtimeCollector{},
		Flag:        tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
func (c *runtimeCollector) Update() ([]*metric.Data, error) {
|
||||
runtimeMetric := make([]*metric.Data, 0)
|
||||
|
||||
p, err := procfs.Self()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
runtimeMetric = append(runtimeMetric, getCPUMetric(c, &p)...)
|
||||
runtimeMetric = append(runtimeMetric, getMemoryMetric(&p)...)
|
||||
|
||||
return runtimeMetric, nil
|
||||
}
|
||||
|
||||
func getCPUMetric(c *runtimeCollector, p *procfs.Proc) []*metric.Data {
|
||||
stat, err := p.Stat()
|
||||
if err != nil {
|
||||
log.Warnf("not get process stat: %v", err)
|
||||
return nil
|
||||
}
|
||||
ts := time.Now().Unix()
|
||||
|
||||
if c.oldStat == nil {
|
||||
c.oldStat = &stat
|
||||
}
|
||||
|
||||
if c.oldTs == 0 {
|
||||
c.oldTs = ts
|
||||
return nil
|
||||
}
|
||||
|
||||
data := make([]*metric.Data, 2)
|
||||
duration := ts - c.oldTs
|
||||
|
||||
// huatuo-bamai.cpu.user(*100)
|
||||
user := float64(stat.UTime-c.oldStat.UTime) / float64(userHZ*duration)
|
||||
data[0] = metric.NewGaugeData("cpu_user", user*100, "user cpu", nil)
|
||||
|
||||
// huatuo-bamai.cpu.sys(*100)
|
||||
sys := float64(stat.STime-c.oldStat.STime) / float64(userHZ*duration)
|
||||
data[1] = metric.NewGaugeData("cpu_sys", sys*100, "sys cpu", nil)
|
||||
|
||||
// save stat
|
||||
c.oldStat = &stat
|
||||
c.oldTs = ts
|
||||
|
||||
return data
|
||||
}
|
||||
|
||||
// getMemoryMetric reports this process's memory usage: VSS and RSS from
// /proc/self status plus the huatuo-bamai memory cgroup's usage_in_bytes.
// Values are divided by 1024 — NOTE(review): procfs reports Vm* in bytes,
// so these gauges appear to be KiB; confirm the intended unit.
// Returns nil on any read failure, dropping already-computed values too —
// NOTE(review): confirm this all-or-nothing behavior is intentional.
func getMemoryMetric(p *procfs.Proc) []*metric.Data {
	data := make([]*metric.Data, 3)
	status, err := p.NewStatus()
	if err != nil {
		log.Warnf("not get process status: %v", err)
		return nil
	}

	data[0] = metric.NewGaugeData("memory_vss", float64(status.VmSize)/1024, "memory vss", nil)
	data[1] = metric.NewGaugeData("memory_rss", float64(status.VmRSS)/1024, "memory rss", nil)

	// Cgroup-accounted usage for the agent's own memory cgroup.
	rssI, err := parseutil.ReadUint(cgroups.RootFsFilePath("memory") + "/huatuo-bamai/memory.usage_in_bytes")
	if err != nil {
		log.Warnf("can't ParseUint, err: %v", err)
		return nil
	}
	data[2] = metric.NewGaugeData("memory_cgroup_rss", float64(rssI)/1024, "memory cgroup rss", nil)

	return data
}
|
|
@ -0,0 +1,187 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
// ref: https://github.com/prometheus/node_exporter/tree/master/collector
|
||||
// - sockstat_linux.go
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/internal/pod"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
|
||||
"github.com/prometheus/procfs"
|
||||
)
|
||||
|
||||
// sockstatCollector exports /proc/net/sockstat gauges for the host and for
// every normal container (read through each container's net namespace).
type sockstatCollector struct{}

// Register the collector under the "sockstat" tracing name at startup.
func init() {
	tracing.RegisterEventTracing("sockstat", newSockstatCollector)
}

// newSockstatCollector creates the metric-only tracing attribute for the
// sockstat collector.
func newSockstatCollector() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &sockstatCollector{},
		Flag:        tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
func (c *sockstatCollector) Update() ([]*metric.Data, error) {
|
||||
log.Debugf("Updating sockstat metrics")
|
||||
|
||||
containers, err := pod.GetNormalContainers()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// support the empty container
|
||||
if containers == nil {
|
||||
containers = make(map[string]*pod.Container)
|
||||
}
|
||||
// append host into containers
|
||||
containers[""] = nil
|
||||
|
||||
var metrics []*metric.Data
|
||||
for _, container := range containers {
|
||||
m, err := c.procStatMetrics(container)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("couldn't get sockstat metrics for container %v: %w", container, err)
|
||||
}
|
||||
metrics = append(metrics, m...)
|
||||
}
|
||||
|
||||
log.Debugf("Updated sockstat metrics: %v", metrics)
|
||||
return metrics, nil
|
||||
}
|
||||
|
||||
// procStatMetrics builds sockstat gauges for a single container, or for the
// host when container is nil. Stats are read from the target's own network
// namespace via /proc/<init pid>/net/sockstat (pid 1 for the host).
// Returns (nil, nil) when the kernel exposes no sockstat file.
func (c *sockstatCollector) procStatMetrics(container *pod.Container) ([]*metric.Data, error) {
	pid := 1 // host
	if container != nil {
		pid = container.InitPid
	}

	// NOTE: non-standard using procfs.NewFS — rooted at /proc/<pid> instead
	// of /proc so the per-namespace net files are visible.
	fs, err := procfs.NewFS(filepath.Join("/proc", strconv.Itoa(pid)))
	if err != nil {
		return nil, fmt.Errorf("failed to open procfs: %w", err)
	}

	// If IPv4 and/or IPv6 are disabled on this kernel, handle it gracefully.
	stat, err := fs.NetSockstat()
	switch {
	case err == nil:
	case errors.Is(err, os.ErrNotExist):
		log.Debug("IPv4 sockstat statistics not found, skipping")
	default:
		return nil, fmt.Errorf("failed to get IPv4 sockstat data: %w", err)
	}

	if stat == nil { // nothing to do.
		return nil, nil
	}

	var metrics []*metric.Data

	// If sockstat contains the number of used sockets, export it.
	if stat.Used != nil {
		if container != nil {
			metrics = append(metrics,
				metric.NewContainerGaugeData(container, "sockets_used", float64(*stat.Used), "Number of IPv4 sockets in use.", nil))
		} else {
			metrics = append(metrics,
				metric.NewGaugeData("sockets_used", float64(*stat.Used), "Number of IPv4 sockets in use.", nil))
		}
	}

	// A name and optional value for a sockstat metric.
	type ssPair struct {
		name string
		v    *int
	}

	// Previously these metric names were generated directly from the file output.
	// In order to keep the same level of compatibility, we must map the fields
	// to their correct names.
	for i := range stat.Protocols {
		p := stat.Protocols[i]
		pairs := []ssPair{
			{
				name: "inuse",
				v:    &p.InUse,
			},
			{
				name: "orphan",
				v:    p.Orphan,
			},
			{
				name: "tw",
				v:    p.TW,
			},
			{
				name: "alloc",
				v:    p.Alloc,
			},
			{
				name: "mem",
				v:    p.Mem,
			},
			{
				name: "memory",
				v:    p.Memory,
			},
		}

		// Also export mem_bytes values for sockets which have a mem value
		// stored in pages.
		if p.Mem != nil {
			v := *p.Mem * skMemQuantum
			pairs = append(pairs, ssPair{
				name: "mem_bytes",
				v:    &v,
			})
		}

		for _, pair := range pairs {
			if pair.v == nil {
				// This value is not set for this protocol; nothing to do.
				continue
			}

			// mem, mem_bytes are only for `Host` environment.
			if container != nil && (pair.name == "mem" || pair.name == "mem_bytes") {
				continue
			}

			if container != nil {
				metrics = append(metrics,
					metric.NewContainerGaugeData(container, fmt.Sprintf("%s_%s", p.Protocol, pair.name), float64(*pair.v),
						fmt.Sprintf("Number of %s sockets in state %s.", p.Protocol, pair.name), nil))
			} else {
				metrics = append(metrics,
					metric.NewGaugeData(fmt.Sprintf("%s_%s", p.Protocol, pair.name), float64(*pair.v),
						fmt.Sprintf("Number of %s sockets in state %s.", p.Protocol, pair.name), nil))
			}
		}
	}

	return metrics, nil
}
|
|
@ -0,0 +1,193 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/binary"
|
||||
"fmt"
|
||||
"strconv"
|
||||
|
||||
"huatuo-bamai/internal/bpf"
|
||||
"huatuo-bamai/internal/utils/bpfutil"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
|
||||
"github.com/tklauser/numcpus"
|
||||
)
|
||||
|
||||
// Register the softirq latency tracer/collector at startup.
func init() {
	tracing.RegisterEventTracing("softirq", newSoftirq)
}
|
||||
|
||||
func newSoftirq() (*tracing.EventTracingAttr, error) {
|
||||
cpuPossible, err := numcpus.GetPossible()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("fetch possible cpu num")
|
||||
}
|
||||
|
||||
cpuOnline, err := numcpus.GetOnline()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("fetch possible cpu num")
|
||||
}
|
||||
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &softirqLatency{
|
||||
bpf: nil,
|
||||
isRunning: false,
|
||||
cpuPossible: cpuPossible,
|
||||
cpuOnline: cpuOnline,
|
||||
},
|
||||
Internal: 10,
|
||||
Flag: tracing.FlagTracing | tracing.FlagMetric,
|
||||
}, nil
|
||||
}
|
||||
|
||||
//go:generate $BPF_COMPILE $BPF_INCLUDE -s $BPF_DIR/softirq.c -o $BPF_DIR/softirq.o
|
||||
|
||||
// softirqLatency tracks per-CPU softirq handling latency via BPF and
// exports it as gauges.
type softirqLatency struct {
	bpf         bpf.BPF // loaded softirq BPF object; valid only while isRunning
	isRunning   bool    // set by Start after attach; Update no-ops while false
	cpuPossible int     // possible CPU count (length of each per-CPU map value)
	cpuOnline   int     // online CPU count at startup; bounds exported entries
}

// softirqLatencyData mirrors one per-CPU record of the
// "softirq_percpu_lats" BPF map; decoded with binary.Read, so the layout
// must match the C struct.
type softirqLatencyData struct {
	Timestamp uint64
	// TotalLatency holds cumulative latency per zone; the 4 zone boundaries
	// are defined in softirq.c — TODO confirm bucket meaning there.
	TotalLatency [4]uint64
}
|
||||
|
||||
// Softirq vector indices, in the kernel's enum order (HI .. RCU) so they
// can be used directly as BPF map keys.
const (
	softirqHi = iota
	softirqTime
	softirqNetTx
	softirqNetRx
	softirqBlock
	softirqIrqPoll
	softirqTasklet
	softirqSched
	softirqHrtimer
	sofirqRcu
	softirqMax
)

// irqTypeName translates a softirq vector index into its display name, or
// "ERR_TYPE" for anything outside the known range.
func irqTypeName(id int) string {
	names := [...]string{
		softirqHi:      "HI",
		softirqTime:    "TIMER",
		softirqNetTx:   "NET_TX",
		softirqNetRx:   "NET_RX",
		softirqBlock:   "BLOCK",
		softirqIrqPoll: "IRQ_POLL",
		softirqTasklet: "TASKLET",
		softirqSched:   "SCHED",
		softirqHrtimer: "HRTIMER",
		sofirqRcu:      "RCU",
	}
	if id < 0 || id >= len(names) {
		return "ERR_TYPE"
	}
	return names[id]
}

// irqAllowed reports whether latency metrics are exported for this vector;
// only the networking softirqs are of interest.
func irqAllowed(id int) bool {
	return id == softirqNetTx || id == softirqNetRx
}
|
||||
|
||||
// Update exports per-CPU softirq latency gauges for the allowed vectors
// (NET_TX/NET_RX). It dumps the "softirq_percpu_lats" BPF map — key: softirq
// vector, value: one softirqLatencyData per possible CPU — and emits one
// gauge per (vector, online cpu, latency zone).
func (s *softirqLatency) Update() ([]*metric.Data, error) {
	if !s.isRunning {
		// BPF not attached yet (or already detached); nothing to report.
		return nil, nil
	}

	items, err := s.bpf.DumpMapByName("softirq_percpu_lats")
	if err != nil {
		return nil, fmt.Errorf("dump map: %w", err)
	}

	// NOTE(review): the same labels map is passed to every NewGaugeData call
	// below; this is only correct if NewGaugeData copies its labels argument
	// — confirm, otherwise all gauges end up with the last labels written.
	labels := make(map[string]string)
	metricData := []*metric.Data{}

	// IRQ: 0 ... NR_SOFTIRQS_MAX
	for _, item := range items {
		var irqVector uint32
		latencyOnAllCPU := make([]softirqLatencyData, s.cpuPossible)

		if err = binary.Read(bytes.NewReader(item.Key), binary.LittleEndian, &irqVector); err != nil {
			return nil, fmt.Errorf("read map key: %w", err)
		}

		if !irqAllowed(int(irqVector)) {
			continue
		}

		if err = binary.Read(bytes.NewReader(item.Value), binary.LittleEndian, &latencyOnAllCPU); err != nil {
			return nil, fmt.Errorf("read map value: %w", err)
		}

		labels["type"] = irqTypeName(int(irqVector))

		for cpuid, lat := range latencyOnAllCPU {
			// Only export the first cpuOnline entries — assumes online CPUs
			// are numbered 0..online-1; TODO confirm for hotplug gaps.
			if cpuid >= s.cpuOnline {
				break
			}
			labels["cpuid"] = strconv.Itoa(cpuid)
			for zoneid, zone := range lat.TotalLatency {
				labels["zone"] = strconv.Itoa(zoneid)
				metricData = append(metricData, metric.NewGaugeData("latency", float64(zone), "softirq latency", labels))
			}
		}
	}

	return metricData, nil
}
|
||||
|
||||
func (s *softirqLatency) Start(ctx context.Context) error {
|
||||
b, err := bpf.LoadBpf(bpfutil.ThisBpfOBJ(), nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer b.Close()
|
||||
|
||||
if err = b.Attach(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
s.bpf = b
|
||||
s.isRunning = true
|
||||
|
||||
childCtx, cancel := context.WithCancel(ctx)
|
||||
defer cancel()
|
||||
|
||||
b.WaitDetachByBreaker(childCtx, cancel)
|
||||
|
||||
<-childCtx.Done()
|
||||
|
||||
s.isRunning = false
|
||||
return nil
|
||||
}
|
|
@ -0,0 +1,102 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
|
||||
"github.com/prometheus/procfs"
|
||||
)
|
||||
|
||||
const (
	// skMemQuantum converts sockstat "mem" pages to bytes.
	// NOTE(review): assumes 4 KiB pages — confirm on non-4K-page arches.
	skMemQuantum = 4096
)

// tcpMemCollector exports host TCP memory usage and limit gauges.
type tcpMemCollector struct {
	// tcpMemMetric holds the four pre-allocated gauges (usage pages,
	// usage bytes, limit pages, usage percent) that Update refreshes
	// in place on every scrape.
	tcpMemMetric []*metric.Data
}

// Register the collector under the "tcp_mem" tracing name at startup.
func init() {
	tracing.RegisterEventTracing("tcp_mem", newTCPMemCollector)
}

// newTCPMemCollector creates the metric-only tracing attribute with the
// four gauges that Update fills.
func newTCPMemCollector() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &tcpMemCollector{
			tcpMemMetric: []*metric.Data{
				metric.NewGaugeData("usage_pages", 0, "tcp mem usage(pages)", nil),
				metric.NewGaugeData("usage_bytes", 0, "tcp mem usage(bytes)", nil),
				metric.NewGaugeData("limit_pages", 0, "tcp mem limit(pages)", nil),
				metric.NewGaugeData("usage_percent", 0, "tcp mem usage percent", nil),
			},
		},
		Flag: tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
func (c *tcpMemCollector) getTCPMem() (tcpMem, tcpMemBytes, tcpMemLimit float64, err error) {
|
||||
fs, err := procfs.NewDefaultFS()
|
||||
if err != nil {
|
||||
log.Infof("failed to open sysfs: %v", err)
|
||||
return -1, -1, -1, err
|
||||
}
|
||||
|
||||
values, err := fs.SysctlInts("net.ipv4.tcp_mem")
|
||||
if err != nil {
|
||||
log.Infof("error obtaining sysctl info: %v", err)
|
||||
return -1, -1, -1, err
|
||||
}
|
||||
|
||||
tcpMemLimit = float64(values[2])
|
||||
|
||||
stat4, err := fs.NetSockstat()
|
||||
if err != nil {
|
||||
log.Infof("failed to get NetSockstat: %v", err)
|
||||
return -1, -1, -1, err
|
||||
}
|
||||
|
||||
for _, p := range stat4.Protocols {
|
||||
if p.Protocol != "TCP" {
|
||||
continue
|
||||
}
|
||||
|
||||
if p.Mem == nil {
|
||||
return -1, -1, -1, fmt.Errorf("failed to read tcpmem usage")
|
||||
}
|
||||
|
||||
tcpMem = float64(*p.Mem)
|
||||
tcpMemBytes = float64(*p.Mem * skMemQuantum)
|
||||
}
|
||||
|
||||
return tcpMem, tcpMemBytes, tcpMemLimit, nil
|
||||
}
|
||||
|
||||
func (c *tcpMemCollector) Update() ([]*metric.Data, error) {
|
||||
tcpMem, tcpMemBytes, tcpMemLimit, err := c.getTCPMem()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
c.tcpMemMetric[0].Value = tcpMem
|
||||
c.tcpMemMetric[1].Value = tcpMemBytes
|
||||
c.tcpMemMetric[2].Value = tcpMemLimit
|
||||
c.tcpMemMetric[3].Value = tcpMem / tcpMemLimit
|
||||
|
||||
return c.tcpMemMetric, nil
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"os"
|
||||
)
|
||||
|
||||
func fileLineCounter(filePath string) (int, error) {
|
||||
count := 0
|
||||
buf := make([]byte, 8*20*4096)
|
||||
|
||||
file, err := os.Open(filePath)
|
||||
if err != nil {
|
||||
return count, err
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
r := io.Reader(file)
|
||||
|
||||
for {
|
||||
c, err := r.Read(buf)
|
||||
count += bytes.Count(buf[:c], []byte("\n"))
|
||||
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return count, err
|
||||
}
|
||||
}
|
||||
|
||||
return count, nil
|
||||
}
|
|
@ -0,0 +1,94 @@
|
|||
// Copyright 2025 The HuaTuo Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package collector
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"huatuo-bamai/internal/conf"
|
||||
"huatuo-bamai/internal/log"
|
||||
"huatuo-bamai/pkg/metric"
|
||||
"huatuo-bamai/pkg/tracing"
|
||||
)
|
||||
|
||||
// vmStatCollector exports selected /proc/vmstat fields as gauges.
type vmStatCollector struct{}

// Register the collector under the "vmstat" tracing name at startup.
func init() {
	tracing.RegisterEventTracing("vmstat", newVMStatCollector)
}

// vmStatMetricDesc maps a vmstat field name to the help text used for its
// gauge. Fields not listed here are still exported (subject to the
// include/exclude filter) with an empty description.
var vmStatMetricDesc = map[string]string{
	"allocstall_normal":     "host direct reclaim count on normal zone",
	"allocstall_movable":    "host direct reclaim count on movable zone",
	"compact_stall":         "memory compaction count",
	"nr_active_anon":        "anonymous pages on active lru",
	"nr_active_file":        "file pages on active lru",
	"nr_boost_pages":        "kswapd boost pages",
	"nr_dirty":              "dirty pages",
	"nr_free_pages":         "free pages in buddy system",
	"nr_inactive_anon":      "anonymous pages on inactive lru",
	"nr_inactive_file":      "file pages on inactive lru",
	"nr_kswapd_boost":       "kswapd boosting count",
	"nr_mlock":              "mlocked pages",
	"nr_shmem":              "shared memory pages",
	"nr_slab_reclaimable":   "reclaimable slab pages",
	"nr_slab_unreclaimable": "unreclaimable slab pages",
	"nr_unevictable":        "unevictable pages",
	"nr_writeback":          "writing-back pages",
	"numa_pages_migrated":   "numa migrated pages",
	"pgdeactivate":          "pages deactivated from active lru to inactive lru",
	"pgrefill":              "pages scanned on active lru",
	"pgscan_direct":         "scanned pages in host direct reclaim",
	"pgscan_kswapd":         "scanned pages in host kswapd reclaim",
	"pgsteal_direct":        "reclaimed pages in host direct reclaim",
	"pgsteal_kswapd":        "reclaimed pages in host kswapd reclaim",
}
|
||||
|
||||
// newVMStatCollector creates the metric-only tracing attribute for the
// /proc/vmstat collector.
func newVMStatCollector() (*tracing.EventTracingAttr, error) {
	return &tracing.EventTracingAttr{
		TracingData: &vmStatCollector{},
		Flag:        tracing.FlagMetric,
	}, nil
}
|
||||
|
||||
func (c *vmStatCollector) Update() ([]*metric.Data, error) {
|
||||
filter := newFieldFilter(conf.Get().MetricCollector.Vmstat.ExcludedMetrics,
|
||||
conf.Get().MetricCollector.Vmstat.IncludedMetrics)
|
||||
|
||||
file, err := os.Open("/proc/vmstat")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer file.Close()
|
||||
scanner := bufio.NewScanner(file)
|
||||
var metrics []*metric.Data
|
||||
for scanner.Scan() {
|
||||
parts := strings.Fields(scanner.Text())
|
||||
if filter.ignored(parts[0]) {
|
||||
log.Debugf("Ignoring vmstat metric: %s", parts[0])
|
||||
continue
|
||||
}
|
||||
value, err := strconv.ParseFloat(parts[1], 64)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
metrics = append(metrics,
|
||||
metric.NewGaugeData(parts[0], value, vmStatMetricDesc[parts[0]], nil))
|
||||
}
|
||||
return metrics, nil
|
||||
}
|
|
@ -0,0 +1,136 @@
|
|||
[简体中文](./CUSTOM_CN.md) | English
|
||||
|
||||
HuaTuo framework provides three data collection modes: `autotracing`, `event`, and `metrics`, covering different monitoring scenarios, helping users gain comprehensive insights into system performance.
|
||||
|
||||
## Collection Mode Comparison
|
||||
| Mode | Type | Trigger Condition | Data Output | Use Case |
|
||||
|-----------------|----------------|-------------------|------------------|----------------|
|
||||
| **Autotracing** | Event-driven | Triggered on system anomalies | ES + Local Storage, Prometheus (optional) | Non-routine operations, triggered on anomalies |
|
||||
| **Event** | Event-driven | Continuously running, triggered on preset thresholds | ES + Local Storage, Prometheus (optional) | Continuous operations, directly dump context |
|
||||
| **Metrics** | Metric collection | Passive collection | Prometheus format | Monitoring system metrics |
|
||||
|
||||
- **Autotracing**
|
||||
- **Type**: Event-driven (tracing).
|
||||
- **Function**: Automatically tracks system anomalies and dump context when anomalies occur.
|
||||
- **Features**:
|
||||
- When a system anomaly occurs, `autotracing` is triggered automatically to dump relevant context.
|
||||
- Data is stored to ES in real-time and stored locally for subsequent analysis and troubleshooting. It can also be monitored in Prometheus format for statistics and alerts.
|
||||
- Suitable for scenarios with high performance overhead, such as triggering captures when metrics exceed a threshold or rise too quickly.
|
||||
- **Integrated Features**: CPU anomaly tracking (cpu idle), D-state tracking (dload), container contention (waitrate), memory burst allocation (memburst), disk anomaly tracking (iotracer).
|
||||
|
||||
- **Event**
|
||||
- **Type**: Event-driven (tracing).
|
||||
- **Function**: Continuously operates within the system context, directly dump context when preset thresholds are met.
|
||||
- **Features**:
|
||||
- Unlike `autotracing`, `event` continuously operates within the system context, rather than being triggered by anomalies.
|
||||
- Data is also stored to ES and locally, and can be monitored in Prometheus format.
|
||||
- Suitable for continuous monitoring and real-time analysis, enabling timely detection of abnormal behaviors. The performance impact of `event` collection is negligible.
|
||||
- **Integrated Features**: Soft interrupt anomalies (softirq), memory allocation anomalies (oom), soft lockups (softlockup), D-state processes (hungtask), memory reclamation (memreclaim), abnormal packet drops (dropwatch), network ingress latency (netrecvlat).
|
||||
|
||||
- **Metrics**
|
||||
- **Type**: Metric collection.
|
||||
- **Function**: Collects performance metrics from subsystems.
|
||||
- **Features**:
|
||||
- Metric data can be sourced from regular procfs collection or derived from `tracing` (autotracing, event) data.
|
||||
- Outputs in Prometheus format for easy integration into Prometheus monitoring systems.
|
||||
- Unlike `tracing` data, `metrics` primarily focus on system performance metrics such as CPU usage, memory usage, and network traffic, etc.
|
||||
- Suitable for monitoring system performance metrics, supporting real-time analysis and long-term trend observation.
|
||||
- **Integrated Features**: CPU (sys, usr, util, load, nr_running, etc.), memory (vmstat, memory_stat, directreclaim, asyncreclaim, etc.), IO (d2c, q2c, freeze, flush, etc.), network (arp, socket mem, qdisc, netstat, netdev, sockstat, etc.).
|
||||
|
||||
## Multiple Purpose of Tracing Mode
|
||||
Both `autotracing` and `event` belong to the **tracing** collection mode, offering the following dual purposes:
|
||||
1. **Real-time storage to ES and local storage**: For tracing and analyzing anomalies, helping users quickly identify root causes.
|
||||
2. **Output in Prometheus format**: As metric data integrated into Prometheus monitoring systems, providing comprehensive system monitoring capabilities.
|
||||
|
||||
By flexibly combining these three modes, users can comprehensively monitor system performance, capturing both contextual information during anomalies and continuous performance metrics to meet various monitoring needs.
|
||||
|
||||
# How to Add Custom Collection
|
||||
The framework provides convenient APIs, including module startup, data storage, container information, BPF-related (load, attach, read, detach, unload), etc. You can implement custom collection logic and flexibly choose the appropriate collection mode and storage method.
|
||||
|
||||
## Tracing Type
|
||||
Based on your scenarios, you can implement the `ITracingEvent` interface in the `core/autotracing` or `core/events` directory to complete tracing-type collection.
|
||||
```go
|
||||
// ITracingEvent represents a tracing/event
|
||||
type ITracingEvent interface {
|
||||
Start(ctx context.Context) error
|
||||
}
|
||||
```
|
||||
|
||||
example:
|
||||
```go
|
||||
type exampleTracing struct{}
|
||||
|
||||
// Register callback
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("example", newExample)
|
||||
}
|
||||
|
||||
// Create tracing
|
||||
func newExample() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &exampleTracing{},
|
||||
Internal: 10, // Interval for enable tracing again (in seconds)
|
||||
Flag: tracing.FlagTracing, // mark as tracing type
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Implement ITracingEvent
|
||||
func (t *exampleTracing) Start(ctx context.Context) error {
|
||||
// do something
|
||||
...
|
||||
|
||||
// Save data to ES and local file
|
||||
storage.Save("example", containerID, time.Now(), tracerData)
|
||||
}
|
||||
|
||||
// Implement Collector interface for Prometheus format output (optional)
|
||||
func (c *exampleTracing) Update() ([]*metric.Data, error) {
|
||||
// from tracerData to prometheus.Metric
|
||||
...
|
||||
|
||||
return data, nil
|
||||
}
|
||||
```
|
||||
|
||||
## Metric Type
|
||||
Implement the `Collector` interface in the path `core/metrics` to complete metric-type collection.
|
||||
|
||||
```go
|
||||
type Collector interface {
|
||||
// Get new metrics and expose them via prometheus registry.
|
||||
Update() ([]*Data, error)
|
||||
}
|
||||
```
|
||||
|
||||
example:
|
||||
```go
|
||||
type exampleMetric struct{}
|
||||
|
||||
// Register callback
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("example", newExample)
|
||||
}
|
||||
|
||||
// Create Metric
|
||||
func newExample() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &filenrCollector{
|
||||
metric: []*metric.Data{
|
||||
metric.NewGaugeData("name1", 0, "description of example_name1", nil),
|
||||
metric.NewGaugeData("name2", 0, "description of example_name2", nil),
|
||||
},
|
||||
},
|
||||
Flag: tracing.FlagMetric, // mark as Metric type
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Implement Collector interface for Prometheus format output
|
||||
func (c *exampleMetric) Update() ([]*metric.Data, error) {
|
||||
// do something
|
||||
...
|
||||
|
||||
return data, nil
|
||||
}
|
||||
```
|
||||
|
||||
The path `core` of the project includes multiple useful examples of the three collection modules, covering BPF code, map data interaction, container information, and more. For further details, refer to the corresponding code implementations.
|
|
@ -0,0 +1,136 @@
|
|||
[English](./CUSTOM.md) | 简体中文
|
||||
|
||||
本框架提供三种数据采集模式:`autotracing`、`event` 和 `metrics`,分别针对不同的监控场景和需求,帮助用户全面掌握系统的运行状态。
|
||||
|
||||
## 采集模式对比
|
||||
| 模式 | 类型 | 触发条件 | 数据输出 | 适用场景 |
|
||||
|------------- |----------------|--------------|------------------|-----------------|
|
||||
| **Autotracing** | 异常事件驱动 | 系统异常时触发 | ES + 本地存储,Prometheus(可选)| 不能常态运行,异常时触发运行 |
|
||||
| **Event** | 异常事件驱动 | 常态运行 | ES + 本地存储,Prometheus(可选)| 常态运行,直接抓取上下文信息 |
|
||||
| **Metrics** | 指标数据采集 | 被动采集 | Prometheus 格式 | 监控系统性能指标 |
|
||||
|
||||
- **Autotracing**
|
||||
- **类型**:异常事件驱动(tracing)。
|
||||
- **功能**:自动跟踪系统异常状态,并在异常发生时再触发抓取现场上下文信息。
|
||||
- **特点**:
|
||||
- 当系统出现异常时,`autotracing` 会自动触发,捕获相关的上下文信息。
|
||||
- 数据会实时上报到 ES 并存储在本地,便于后续分析和排查问题,也可通过 Prometheus 格式进行监控,便于统计和告警。
|
||||
- 适用于获取现场时性能开销较大的场景,例如检测到指标上升到一定阈值、上升速度过快再触发抓取。
|
||||
- **已集成**:cpu 异常使用跟踪(cpu idle)、D状态跟踪(dload)、容器内外部争抢(waitrate)、内存突发分配(memburst)、磁盘异常跟踪(iotracer)。
|
||||
|
||||
- **Event**
|
||||
- **类型**:异常事件驱动(tracing)。
|
||||
- **功能**:常态运行在系统上下文中,达到预设阈值直接抓取上下文信息。
|
||||
- **特点**:
|
||||
- 与 `autotracing` 不同,`event` 是常态运行,而不是在异常时再触发。
|
||||
- 数据同样会实时上报到 ES 并存储在本地,也可通过 Prometheus 格式进行监控。
|
||||
- 适合用于常态监控和实时分析,能够及时发现系统中的异常行为, `event` 类型的采集对系统性能影响可忽略。
|
||||
- **已集成**:软中断异常(softirq)、内存异常分配(oom)、软锁定(softlockup)、D 状态进程(hungtask)、内存回收(memreclaim)、异常丢包(dropwatch)、网络入向延迟(netrecvlat)。
|
||||
|
||||
- **Metrics**
|
||||
- **类型**:指标数据采集。
|
||||
- **功能**:采集各子系统的性能指标数据。
|
||||
- **特点**:
|
||||
- 指标数据可以来自常规 procfs 采集,也可以从 `tracing` (autotracing,event) 类型获取数据。
|
||||
- 以 Prometheus 格式输出,便于集成到 Prometheus 监控系统中。
|
||||
- 与 `tracing` 类数据不同,`metrics` 主要用于采集系统的性能指标,如 CPU 使用率、内存使用率、网络等。
|
||||
- 适合用于监控系统的性能指标,支持实时分析和长期趋势观察。
|
||||
- **已集成**:cpu (sys, usr, util, load, nr_running...), memory(vmstat, memory_stat, directreclaim, asyncreclaim...), IO(d2c, q2c, freeze, flush...), 网络(arp, socket mem, qdisc, netstat, netdev, socketstat...)
|
||||
|
||||
## Tracing 模式的多重用途
|
||||
`autotracing` 和 `event` 都属于 **tracing** 类数据采集模式,它们具备以下双重用途:
|
||||
1. **实时保存到 ES 和 本地存储**:用于异常事件的追踪和分析,帮助用户快速根因定位。
|
||||
2. **以 Prometheus 格式输出**:作为指标数据集成到 Prometheus 监控系统中,提供更全面的系统监控能力。
|
||||
|
||||
通过这三种模式的灵活组合,用户可以全面监控系统的运行状态,既能捕获异常事件的上下文信息,也能持续采集性能指标数据,满足不同场景下的监控需求。
|
||||
|
||||
# 如何添加自定义采集
|
||||
框架提供了非常便捷的 API,包括模块启动、数据存储、容器信息、bpf 相关 (load, attach, read, detach, unload)等,用户可通过自定义的采集逻辑,灵活选择合适的采集模式和数据存储的方式。
|
||||
|
||||
## tracing 类型
|
||||
根据实际场景,你可以在 `core/autotracing` 或 `core/events` 目录下实现接口 `ITracingEvent` 即可完成 tracing 类型的采集。
|
||||
```go
|
||||
// ITracingEvent represents a tracing/event
|
||||
type ITracingEvent interface {
|
||||
Start(ctx context.Context) error
|
||||
}
|
||||
```
|
||||
|
||||
步骤如下:
|
||||
```go
|
||||
type exampleTracing struct{}
|
||||
|
||||
// 注册回调
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("example", newExample)
|
||||
}
|
||||
|
||||
// 创建 tracing
|
||||
func newExample() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &exampleTracing{},
|
||||
Internal: 10, // 再次开启 tracing 的间隔时间 seconds
|
||||
Flag: tracing.FlagTracing, // 标记为 tracing 类型
|
||||
}, nil
|
||||
}
|
||||
|
||||
// 实现接口 ITracingEvent
|
||||
func (t *exampleTracing) Start(ctx context.Context) error {
|
||||
// do something
|
||||
...
|
||||
|
||||
// 存储数据到 ES 和 本地
|
||||
storage.Save("example", containerID, time.Now(), tracerData)
|
||||
}
|
||||
|
||||
// 也可同时实现接口 Collector 以 Prometheus 格式输出 (可选)
|
||||
func (c *exampleTracing) Update() ([]*metric.Data, error) {
|
||||
// from tracerData to prometheus.Metric
|
||||
...
|
||||
|
||||
return data, nil
|
||||
}
|
||||
```
|
||||
|
||||
## Metric 类型
|
||||
在 `core/metrics` 目录下添加接口 `Collector` 的实现即可完成 Metric 类型的采集。
|
||||
|
||||
```go
|
||||
type Collector interface {
|
||||
// Get new metrics and expose them via prometheus registry.
|
||||
Update() ([]*Data, error)
|
||||
}
|
||||
```
|
||||
|
||||
步骤如下:
|
||||
```go
|
||||
type exampleMetric struct{}
|
||||
|
||||
// 注册回调
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("example", newExample)
|
||||
}
|
||||
|
||||
// 创建 Metric
|
||||
func newExample() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &filenrCollector{
|
||||
metric: []*metric.Data{
|
||||
metric.NewGaugeData("name1", 0, "description of example_name1", nil),
|
||||
metric.NewGaugeData("name2", 0, "description of example_name2", nil),
|
||||
},
|
||||
},
|
||||
Flag: tracing.FlagMetric, // 标记为 Metric 类型
|
||||
}, nil
|
||||
}
|
||||
|
||||
// 实现接口 Collector 以 Prometheus 格式输出
|
||||
func (c *exampleMetric) Update() ([]*metric.Data, error) {
|
||||
// do something
|
||||
...
|
||||
|
||||
return data, nil
|
||||
}
|
||||
```
|
||||
|
||||
在项目 core 目录下已集成了 3 个采集模块的多种实际场景的示例,包括 bpf 代码、map 数据交互、容器信息等,更多详情可参考对应代码实现。
|
After Width: | Height: | Size: 630 KiB |
|
@ -0,0 +1,63 @@
|
|||
### 概述
|
||||
- **类型**:异常事件驱动(tracing/autotracing)
|
||||
- **功能**:自动跟踪系统异常状态,并在异常发生时再触发抓取现场上下文信息
|
||||
- **特点**:
|
||||
- 当系统出现异常时,`autotracing` 会自动触发,捕获相关的上下文信息
|
||||
- 事件数据会实时存储在本地并存储到远端ES,同时你也可以生成Prometheus 统计指标进行观测。
|
||||
- 适用于获取现场时**性能开销较大的场景**,例如检测到指标上升到一定阈值、上升速度过快再触发抓取
|
||||
- **已集成**:cpu 异常使用跟踪(cpu idle)、D状态跟踪(dload)、容器内外部争抢(waitrate)、内存突发分配(memburst)、磁盘异常跟踪(iotracer)
|
||||
|
||||
### 如何添加 Autotracing ?
|
||||
`AutoTracing` 只需实现 `ITracingEvent` 接口并完成注册,即可将事件添加到系统中。
|
||||
>`AutoTracing` 与 `Event` 类型在框架实现上没有任何区别,只是针对不同的场景进行了实际应用的区分。
|
||||
|
||||
```go
|
||||
// ITracingEvent represents an autotracing or event
|
||||
type ITracingEvent interface {
|
||||
Start(ctx context.Context) error
|
||||
}
|
||||
```
|
||||
|
||||
#### 1. 创建结构体
|
||||
```go
|
||||
type exampleTracing struct{}
|
||||
```
|
||||
|
||||
#### 2. 注册回调函数
|
||||
```go
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("example", newExample)
|
||||
}
|
||||
|
||||
func newExample() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &exampleTracing{},
|
||||
Internal: 10, // 再次开启 tracing 的间隔时间 seconds
|
||||
Flag: tracing.FlagTracing, // 标记为 tracing 类型; | tracing.FlagMetric(可选)
|
||||
}, nil
|
||||
}
|
||||
```
|
||||
|
||||
#### 3. 实现接口 ITracingEvent
|
||||
```go
|
||||
func (t *exampleTracing) Start(ctx context.Context) error {
|
||||
// detect your care about
|
||||
...
|
||||
|
||||
// 存储数据到 ES 和 本地
|
||||
storage.Save("example", containerID, time.Now(), tracerData)
|
||||
}
|
||||
```
|
||||
|
||||
另外也可同时实现接口 Collector 以 Prometheus 格式输出 (可选)
|
||||
|
||||
```go
|
||||
func (c *exampleTracing) Update() ([]*metric.Data, error) {
|
||||
// from tracerData to prometheus.Metric
|
||||
...
|
||||
|
||||
return data, nil
|
||||
}
|
||||
```
|
||||
|
||||
在项目 `core/autotracing` 目录下已集成了多种实际场景的 `autotracing` 示例,以及框架提供的丰富底层接口,包括 bpf prog,map 数据交互、容器信息等,更多详情可参考对应代码实现。
|
|
@ -0,0 +1,64 @@
|
|||
### 概述
|
||||
|
||||
- **类型**:异常事件驱动(tracing/event)
|
||||
- **功能**:常态运行,在系统达到预设阈值后抓取上下文信息
|
||||
- **特点**:
|
||||
- 与 `autotracing` 不同,`event` 是常态运行,而不是在异常时再触发。
|
||||
- 事件数据会实时存储在本地并存储到远端ES,同时你也可以生成Prometheus 统计指标进行观测。
|
||||
- 适合用于**常态监控**和**实时分析**,能够及时发现系统中的异常行为, `event` 类型的采集对系统性能影响可忽略。
|
||||
- **已集成**:软中断异常(softirq)、内存异常分配(oom)、软锁定(softlockup)、D 状态进程(hungtask)、内存回收(memreclaim)、异常丢包(dropwatch)、网络入向延迟(netrecvlat) 等
|
||||
|
||||
### 如何添加事件指标
|
||||
只需实现 `ITracingEvent` 接口并完成注册,即可将事件添加到系统。
|
||||
>`AutoTracing` 与 `Event` 类型在框架实现上没有任何区别,只是针对不同的场景进行了实际应用的区分。
|
||||
|
||||
```go
|
||||
// ITracingEvent represents a tracing/event
|
||||
type ITracingEvent interface {
|
||||
Start(ctx context.Context) error
|
||||
}
|
||||
```
|
||||
|
||||
#### 1. 创建 Event 结构体
|
||||
```go
|
||||
type exampleTracing struct{}
|
||||
```
|
||||
|
||||
#### 2. 注册回调函数
|
||||
```go
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("example", newExample)
|
||||
}
|
||||
|
||||
func newExample() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &exampleTracing{},
|
||||
Internal: 10, // 再次开启 tracing 的间隔时间 seconds
|
||||
Flag: tracing.FlagTracing, // 标记为 tracing 类型;| tracing.FlagMetric(可选)
|
||||
}, nil
|
||||
}
|
||||
```
|
||||
|
||||
#### 3. 实现接口 ITracingEvent
|
||||
```go
|
||||
func (t *exampleTracing) Start(ctx context.Context) error {
|
||||
// do something
|
||||
...
|
||||
|
||||
// 存储数据到 ES 和 本地
|
||||
storage.Save("example", containerID, time.Now(), tracerData)
|
||||
}
|
||||
```
|
||||
|
||||
另外也可同时实现接口 Collector 以 Prometheus 格式输出 (可选)
|
||||
|
||||
```go
|
||||
func (c *exampleTracing) Update() ([]*metric.Data, error) {
|
||||
// from tracerData to prometheus.Metric
|
||||
...
|
||||
|
||||
return data, nil
|
||||
}
|
||||
```
|
||||
|
||||
在项目 `core/events` 目录下已集成了多种实际场景的 `events` 示例,以及框架提供的丰富底层接口,包括 bpf prog, map 数据交互、容器信息等,更多详情可参考对应代码实现。
|
|
@ -0,0 +1,65 @@
|
|||
### 概述
|
||||
|
||||
Metrics 类型用于采集系统性能等指标数据,可输出为 Prometheus 格式,作为服务端对外提供数据,通过接口 `/metrics` (`curl localhost:<port>/metrics`) 获取。
|
||||
|
||||
- **类型**:指标数据采集
|
||||
- **功能**:采集各子系统的性能指标数据
|
||||
- **特点**:
|
||||
- metrics 主要用于采集系统的性能指标,如 CPU 使用率、内存使用率、网络等,适合用于监控系统的性能指标,支持实时分析和长期趋势观察。
|
||||
- 指标数据可以来自常规 procfs/sysfs 采集,也可以从 tracing (autotracing, event) 类型生成指标数据
|
||||
- Prometheus 格式输出,便于无缝集成到 Prometheus 观测体系
|
||||
|
||||
- **已集成**:
|
||||
- cpu (sys, usr, util, load, nr_running...)
|
||||
- memory(vmstat, memory_stat, directreclaim, asyncreclaim...)
|
||||
- IO (d2c, q2c, freeze, flush...)
|
||||
- 网络(arp, socket mem, qdisc, netstat, netdev, socketstat...)
|
||||
|
||||
### 如何添加统计指标
|
||||
|
||||
只需实现 `Collector` 接口并完成注册,即可将指标添加到系统中。
|
||||
|
||||
```go
|
||||
type Collector interface {
|
||||
// Get new metrics and expose them via prometheus registry.
|
||||
Update() ([]*Data, error)
|
||||
}
|
||||
```
|
||||
|
||||
#### 1. 创建结构体
|
||||
在 `core/metrics` 目录下创建实现 `Collector` 接口的结构体:
|
||||
|
||||
```go
|
||||
type exampleMetric struct{
|
||||
}
|
||||
```
|
||||
|
||||
#### 2. 注册回调函数
|
||||
```go
|
||||
func init() {
|
||||
tracing.RegisterEventTracing("example", newExample)
|
||||
}
|
||||
|
||||
func newExample() (*tracing.EventTracingAttr, error) {
|
||||
return &tracing.EventTracingAttr{
|
||||
TracingData: &exampleMetric{},
|
||||
Flag: tracing.FlagMetric, // 标记为 Metric 类型
|
||||
}, nil
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
#### 3. 实现 `Update` 方法
|
||||
|
||||
```go
|
||||
func (c *exampleMetric) Update() ([]*metric.Data, error) {
|
||||
// do something
|
||||
...
|
||||
return []*metric.Data{
|
||||
metric.NewGaugeData("example", value, "description of example", nil),
|
||||
}, nil
|
||||
|
||||
}
|
||||
```
|
||||
|
||||
在项目 `core/metrics` 目录下已集成了多种实际场景的 `Metrics` 示例,以及框架提供的丰富底层接口,包括 bpf prog, map 数据交互、容器信息等,更多详情可参考对应代码实现。
|
After Width: | Height: | Size: 629 KiB |
After Width: | Height: | Size: 781 KiB |
After Width: | Height: | Size: 1.7 MiB |
After Width: | Height: | Size: 107 KiB |
After Width: | Height: | Size: 1.4 MiB |
After Width: | Height: | Size: 20 KiB |
After Width: | Height: | Size: 786 KiB |
After Width: | Height: | Size: 1.2 MiB |
|
@ -0,0 +1,37 @@
|
|||
### 概述
|
||||
HUATUO 已支持自动追踪指标如下:
|
||||
|
||||
| 追踪名称 | 核心功能 | 场景 |
|
||||
| ---------------| --------------------- |-------------------------------------- |
|
||||
| cpusys | 宿主 sys 突增检测 | 由于系统负载异常导致业务毛刺问题 |
|
||||
| cpuidle | 容器 cpu idle 掉底检测,提供调用栈,火焰图,进程上下文信息等 | 容器 cpu 使用异常,帮助业务描绘进程热点 |
|
||||
| dload | 跟踪容器loadavg状态进程状态,自动抓取容器 D 状态进程调用信息 | 系统 D 状态突增通常和资源不可用或者锁被长期持有相关,R 状态进程数量突增往往是业务代码设计不合理导致 |
|
||||
| waitrate | 容器资源争抢检测,容器调度被争抢时提供正在争抢的容器信息 | 容器被争抢可能会引起业务毛刺,已存在争抢指标缺乏具体正在争抢的容器信息,通过 waitrate 追踪可以获取参与争抢的容器信息,给混部资源隔离提供参考 |
|
||||
| memburst | 记录内存突发分配时上下文信息 | 宿主机短时间内大量分配内存,检测宿主机上短时间内大量分配内存事件。突发性内存分配可能引发直接回收或者 oom 等 |
|
||||
| iotracing | 检测宿主磁盘 IO 延迟异常。输出访问的文件名和路径、磁盘设备、inode 号、容器等上下文信息 | 频繁出现磁盘 IO 带宽打满、磁盘访问突增,进而导致应用请求延迟或者系统性能抖动 |
|
||||
|
||||
### CPUSYS
|
||||
系统态 CPU 时间反映内核执行开销,包括系统调用、中断处理、内核线程调度、内存管理及锁竞争等操作。该指标异常升高,通常表明存在内核级性能瓶颈:高频系统调用、硬件设备异常、锁争用或内存回收压力(kswapd 直接回收)等。
|
||||
|
||||
cpusys 检测到该指标异常时,自动会触发抓取系统的调用栈并生成火焰图,帮助定位问题根因。 既考虑到系统 cpu sys 达到阈值,或者sys 突发毛刺带来的问题,其中触发条件如下:
|
||||
- CPU Sys 使用率 > 阈值 A
|
||||
- CPU Sys 使用率单位时间内增长 > 阈值 B
|
||||
|
||||
### CPUIDLE
|
||||
K8S 容器环境,CPU idle 时间(即 CPU 处于空闲状态的时间比例)的突然下降通常表明容器内进程正在过度消耗 CPU 资源,可能引发业务延迟、调度争抢甚至整体系统性能下降。
|
||||
|
||||
cpuidle 自动会触发抓取调用栈生成火焰图,触发条件:
|
||||
- CPU Sys 使用率 > 阈值 A
|
||||
- CPU User 使用率 > 阈值 B && CPU User 使用率单位时间增长 > 阈值 C
|
||||
- CPU Usage > 阈值 D && CPU Usage 单位时间增长 > 阈值 E
|
||||
|
||||
### DLOAD
|
||||
D 状态是一种特殊的进程状态,指进程因等待内核或硬件资源而进入的一种特殊阻塞状态。与普通睡眠(S 状态)不同,D 状态进程无法被强制终止(包括 SIGKILL),也不会响应中断信号。该状态通常发生在 I/O 操作(如直接读写磁盘)、硬件驱动故障时。系统 D 状态突增往往由资源不可用或者锁被长期持有导致,可运行进程突增往往是业务代码设计不合理导致。dload 借助 netlink 获取容器 running + uninterruptible 进程数量,通过滑动窗口算法计算出过去 1 分钟内容器 D 进程对负载做出的贡献值,当平滑计算后的 D 状态进程负载值超过阈值的时候,表示容器内的 D 状态进程数量出现异常,开始触发收集容器运行情况、D 状态进程信息。
|
||||
|
||||
### MemBurst
|
||||
memburst 用于检测宿主机上短时间内大量分配内存的情况,突发性内存分配可能引发直接回收甚至 OOM,所以一旦突发性内存分配就需要记录相关信息。
|
||||
|
||||
### IOTracing
|
||||
当 I/O 带宽被占满 或 磁盘访问量突增 时,系统可能因 I/O 资源竞争而出现 请求延迟升高、性能抖动,甚至影响整个系统的稳定性。
|
||||
|
||||
iotracing 在宿主磁盘负载高、IO 延迟异常时,输出异常时 IO 访问的文件名和路径、磁盘设备、inode 号,容器名等上下文信息。
|
|
@ -0,0 +1,510 @@
|
|||
### 总览
|
||||
HUATUO 目前支持的异常上下文捕获事件如下:
|
||||
|
||||
| 事件名称 | 核心功能 | 场景 |
|
||||
| ---------------| --------------------- |----------------------------------------|
|
||||
| softirq | 宿主软中断延迟响应或长期关闭,输出长时间关闭软中断的内核调用栈,进程信息等 | 该类问题会严重影响网络收发,进而导致业务毛刺或者超时等其他问题 |
|
||||
| dropwatch | TCP 数据包丢包检测,输出发生丢包时主机、网络上下文信息等 | 该类问题主要会引起业务毛刺和延迟 |
|
||||
| netrecvlat | 在网络收方向获取数据包从驱动、协议栈、到用户主动收过程的延迟事件 | 网络延迟问题中有一类是数据传输阶段收方向存在延迟,但不清楚是延迟位置,netrecvlat 根据 skb 入网卡时间戳依次在驱动、协议栈和用户拷贝数据等路径计算延迟,通过预先设定的阈值过滤超时的数据包,定位延迟位置 |
|
||||
| oom | 检测宿主或容器内 oom 事件 | 当宿主机层面或者容器维度发生 oom 事件时,能够获取触发 oom 的进程信息、被 kill 的进程信息以及容器信息,便于定位进程内存泄漏、异常退出等问题 |
|
||||
| softlockup | 当系统上发生 softlockup 时,收集目标进程信息以及 cpu 信息,同时获取各个 cpu 上的内核栈信息 | 系统发生 softlockup |
|
||||
| hungtask | 提供系统内所有 D 状态进程数量、内核栈信息 | 用于定位瞬时出现 D 进程的场景,能及时保留现场便于后期问题跟踪 |
|
||||
| memreclaim | 进程进入直接回收的耗时,超过时间阈值,记录进程信息 | 内存压力过大时,如果此时进程申请内存,有可能进入直接回收,此时处于同步回收阶段,可能会造成业务进程的卡顿,此时记录进程进入直接回收的时间,有助于我们判断此进程被直接回收影响的剧烈程度 |
|
||||
| netdev | 检测网卡状态变化 | 网卡抖动、bond 环境下 slave 异常等 |
|
||||
| lacp | 检测 lacp 状态变化 | bond 模式 4 下,监控 lacp 协商状态 |
|
||||
|
||||
|
||||
### 软中断关闭过长检测
|
||||
|
||||
**功能介绍**
|
||||
|
||||
Linux 内核存在进程上下文,中断上下文,软中断上下文,NMI 上下文等概念,这些上下文之间可能存在共享数据情况,因此为了确保数据的一致性,正确性,内核代码可能会关闭软中断或者硬中断。从理论角度,单次关闭中断或者软中断时间不能太长,但高频的系统调用,陷入内核态频繁执行关闭中断或软中断,同样会造成"长时间关闭"的现象,拖慢了系统的响应。“关闭中断,软中断时间过长”这类问题非常隐蔽,且定位手段有限,同时影响又非常大,体现在业务应用上一般为接收数据超时。针对这种场景我们基于BPF技术构建了检测硬件中断,软件中断关闭过长的能力。
|
||||
|
||||
**示例**
|
||||
|
||||
如下为抓取到的关闭中断过长的实例,这些信息被自动上传到 ES.
|
||||
|
||||
```
|
||||
{
|
||||
"_index": "***_2025-06-11",
|
||||
"_type": "_doc",
|
||||
"_id": "***",
|
||||
"_score": 0,
|
||||
"_source": {
|
||||
"uploaded_time": "2025-06-11T16:05:16.251152703+08:00",
|
||||
"hostname": "***",
|
||||
"tracer_data": {
|
||||
"comm": "observe-agent",
|
||||
"stack": "stack:\nscheduler_tick/ffffffffa471dbc0 [kernel]\nupdate_process_times/ffffffffa4789240 [kernel]\ntick_sched_handle.isra.8/ffffffffa479afa0 [kernel]\ntick_sched_timer/ffffffffa479b000 [kernel]\n__hrtimer_run_queues/ffffffffa4789b60 [kernel]\nhrtimer_interrupt/ffffffffa478a610 [kernel]\n__sysvec_apic_timer_interrupt/ffffffffa4661a60 [kernel]\nasm_call_sysvec_on_stack/ffffffffa5201130 [kernel]\nsysvec_apic_timer_interrupt/ffffffffa5090500 [kernel]\nasm_sysvec_apic_timer_interrupt/ffffffffa5200d30 [kernel]\ndump_stack/ffffffffa506335e [kernel]\ndump_header/ffffffffa5058eb0 [kernel]\noom_kill_process.cold.9/ffffffffa505921a [kernel]\nout_of_memory/ffffffffa48a1740 [kernel]\nmem_cgroup_out_of_memory/ffffffffa495ff70 [kernel]\ntry_charge/ffffffffa4964ff0 [kernel]\nmem_cgroup_charge/ffffffffa4968de0 [kernel]\n__add_to_page_cache_locked/ffffffffa4895c30 [kernel]\nadd_to_page_cache_lru/ffffffffa48961a0 [kernel]\npagecache_get_page/ffffffffa4897ad0 [kernel]\ngrab_cache_page_write_begin/ffffffffa4899d00 [kernel]\niomap_write_begin/ffffffffa49fddc0 [kernel]\niomap_write_actor/ffffffffa49fe980 [kernel]\niomap_apply/ffffffffa49fbd20 [kernel]\niomap_file_buffered_write/ffffffffa49fc040 [kernel]\nxfs_file_buffered_aio_write/ffffffffc0f3bed0 [xfs]\nnew_sync_write/ffffffffa497ffb0 [kernel]\nvfs_write/ffffffffa4982520 [kernel]\nksys_write/ffffffffa4982880 [kernel]\ndo_syscall_64/ffffffffa508d190 [kernel]\nentry_SYSCALL_64_after_hwframe/ffffffffa5200078 [kernel]",
|
||||
"now": 5532940660025295,
|
||||
"offtime": 237328905,
|
||||
"cpu": 1,
|
||||
"threshold": 100000000,
|
||||
"pid": 688073
|
||||
},
|
||||
"tracer_time": "2025-06-11 16:05:16.251 +0800",
|
||||
"tracer_type": "auto",
|
||||
"time": "2025-06-11 16:05:16.251 +0800",
|
||||
"region": "***",
|
||||
"tracer_name": "softirq",
|
||||
"es_index_time": 1749629116268
|
||||
},
|
||||
"fields": {
|
||||
"time": [
|
||||
"2025-06-11T08:05:16.251Z"
|
||||
]
|
||||
},
|
||||
"_ignored": [
|
||||
"tracer_data.stack"
|
||||
],
|
||||
"_version": 1,
|
||||
"sort": [
|
||||
1749629116251
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
本地物理机也会存储一份相同的数据:
|
||||
|
||||
```
|
||||
2025-06-11 16:05:16 *** Region=***
|
||||
{
|
||||
"hostname": "***",
|
||||
"region": "***",
|
||||
"uploaded_time": "2025-06-11T16:05:16.251152703+08:00",
|
||||
"time": "2025-06-11 16:05:16.251 +0800",
|
||||
"tracer_name": "softirq",
|
||||
"tracer_time": "2025-06-11 16:05:16.251 +0800",
|
||||
"tracer_type": "auto",
|
||||
"tracer_data": {
|
||||
"offtime": 237328905,
|
||||
"threshold": 100000000,
|
||||
"comm": "observe-agent",
|
||||
"pid": 688073,
|
||||
"cpu": 1,
|
||||
"now": 5532940660025295,
|
||||
"stack": "stack:\nscheduler_tick/ffffffffa471dbc0 [kernel]\nupdate_process_times/ffffffffa4789240 [kernel]\ntick_sched_handle.isra.8/ffffffffa479afa0 [kernel]\ntick_sched_timer/ffffffffa479b000 [kernel]\n__hrtimer_run_queues/ffffffffa4789b60 [kernel]\nhrtimer_interrupt/ffffffffa478a610 [kernel]\n__sysvec_apic_timer_interrupt/ffffffffa4661a60 [kernel]\nasm_call_sysvec_on_stack/ffffffffa5201130 [kernel]\nsysvec_apic_timer_interrupt/ffffffffa5090500 [kernel]\nasm_sysvec_apic_timer_interrupt/ffffffffa5200d30 [kernel]\ndump_stack/ffffffffa506335e [kernel]\ndump_header/ffffffffa5058eb0 [kernel]\noom_kill_process.cold.9/ffffffffa505921a [kernel]\nout_of_memory/ffffffffa48a1740 [kernel]\nmem_cgroup_out_of_memory/ffffffffa495ff70 [kernel]\ntry_charge/ffffffffa4964ff0 [kernel]\nmem_cgroup_charge/ffffffffa4968de0 [kernel]\n__add_to_page_cache_locked/ffffffffa4895c30 [kernel]\nadd_to_page_cache_lru/ffffffffa48961a0 [kernel]\npagecache_get_page/ffffffffa4897ad0 [kernel]\ngrab_cache_page_write_begin/ffffffffa4899d00 [kernel]\niomap_write_begin/ffffffffa49fddc0 [kernel]\niomap_write_actor/ffffffffa49fe980 [kernel]\niomap_apply/ffffffffa49fbd20 [kernel]\niomap_file_buffered_write/ffffffffa49fc040 [kernel]\nxfs_file_buffered_aio_write/ffffffffc0f3bed0 [xfs]\nnew_sync_write/ffffffffa497ffb0 [kernel]\nvfs_write/ffffffffa4982520 [kernel]\nksys_write/ffffffffa4982880 [kernel]\ndo_syscall_64/ffffffffa508d190 [kernel]\nentry_SYSCALL_64_after_hwframe/ffffffffa5200078 [kernel]"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 协议栈丢包检测
|
||||
|
||||
**功能介绍**
|
||||
|
||||
在数据包收发过程中由于各类原因,可能出现丢包的现象,丢包可能会导致业务请求延迟,甚至超时。dropwatch 借助 eBPF 观测内核网络数据包丢弃情况,输出丢包网络上下文,如:源目的地址,源目的端口,seq, seqack, pid, comm, stack 信息等。dropwatch 主要用于检测 TCP 协议相关的丢包,通过预先埋点过滤数据包,确定丢包位置以便于排查丢包根因。
|
||||
|
||||
**示例**
|
||||
|
||||
通过 dropwatch 抓取到的相关信息会自动上传到 ES。如下为抓取到的一案例:kubelet 在发送 SYN 时,由于设备丢包,导致数据包发送失败。
|
||||
|
||||
```
|
||||
{
|
||||
"_index": "***_2025-06-11",
|
||||
"_type": "_doc",
|
||||
"_id": "***",
|
||||
"_score": 0,
|
||||
"_source": {
|
||||
"uploaded_time": "2025-06-11T16:58:15.100223795+08:00",
|
||||
"hostname": "***",
|
||||
"tracer_data": {
|
||||
"comm": "kubelet",
|
||||
"stack": "kfree_skb/ffffffff9a0cd5c0 [kernel]\nkfree_skb/ffffffff9a0cd5c0 [kernel]\nkfree_skb_list/ffffffff9a0cd670 [kernel]\n__dev_queue_xmit/ffffffff9a0ea020 [kernel]\nip_finish_output2/ffffffff9a18a720 [kernel]\n__ip_queue_xmit/ffffffff9a18d280 [kernel]\n__tcp_transmit_skb/ffffffff9a1ad890 [kernel]\ntcp_connect/ffffffff9a1ae610 [kernel]\ntcp_v4_connect/ffffffff9a1b3450 [kernel]\n__inet_stream_connect/ffffffff9a1d25f0 [kernel]\ninet_stream_connect/ffffffff9a1d2860 [kernel]\n__sys_connect/ffffffff9a0c1170 [kernel]\n__x64_sys_connect/ffffffff9a0c1240 [kernel]\ndo_syscall_64/ffffffff9a2ea9f0 [kernel]\nentry_SYSCALL_64_after_hwframe/ffffffff9a400078 [kernel]",
|
||||
"saddr": "10.79.68.62",
|
||||
"pid": 1687046,
|
||||
"type": "common_drop",
|
||||
"queue_mapping": 11,
|
||||
"dport": 2052,
|
||||
"pkt_len": 74,
|
||||
"ack_seq": 0,
|
||||
"daddr": "10.179.142.26",
|
||||
"state": "SYN_SENT",
|
||||
"src_hostname": "***",
|
||||
"sport": 15402,
|
||||
"dest_hostname": "***",
|
||||
"seq": 1902752773,
|
||||
"max_ack_backlog": 0
|
||||
},
|
||||
"tracer_time": "2025-06-11 16:58:15.099 +0800",
|
||||
"tracer_type": "auto",
|
||||
"time": "2025-06-11 16:58:15.099 +0800",
|
||||
"region": "***",
|
||||
"tracer_name": "dropwatch",
|
||||
"es_index_time": 1749632295120
|
||||
},
|
||||
"fields": {
|
||||
"time": [
|
||||
"2025-06-11T08:58:15.099Z"
|
||||
]
|
||||
},
|
||||
"_ignored": [
|
||||
"tracer_data.stack"
|
||||
],
|
||||
"_version": 1,
|
||||
"sort": [
|
||||
1749632295099
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
本地物理机也会存储一份相同的数据:
|
||||
|
||||
```
|
||||
2025-06-11 16:58:15 Host=*** Region=***
|
||||
{
|
||||
"hostname": "***",
|
||||
"region": "***",
|
||||
"uploaded_time": "2025-06-11T16:58:15.100223795+08:00",
|
||||
"time": "2025-06-11 16:58:15.099 +0800",
|
||||
"tracer_name": "dropwatch",
|
||||
"tracer_time": "2025-06-11 16:58:15.099 +0800",
|
||||
"tracer_type": "auto",
|
||||
"tracer_data": {
|
||||
"type": "common_drop",
|
||||
"comm": "kubelet",
|
||||
"pid": 1687046,
|
||||
"saddr": "10.79.68.62",
|
||||
"daddr": "10.179.142.26",
|
||||
"sport": 15402,
|
||||
"dport": 2052,
|
||||
"src_hostname": "***",
|
||||
"dest_hostname": "***",
|
||||
"max_ack_backlog": 0,
|
||||
"seq": 1902752773,
|
||||
"ack_seq": 0,
|
||||
"queue_mapping": 11,
|
||||
"pkt_len": 74,
|
||||
"state": "SYN_SENT",
|
||||
"stack": "kfree_skb/ffffffff9a0cd5c0 [kernel]\nkfree_skb/ffffffff9a0cd5c0 [kernel]\nkfree_skb_list/ffffffff9a0cd670 [kernel]\n__dev_queue_xmit/ffffffff9a0ea020 [kernel]\nip_finish_output2/ffffffff9a18a720 [kernel]\n__ip_queue_xmit/ffffffff9a18d280 [kernel]\n__tcp_transmit_skb/ffffffff9a1ad890 [kernel]\ntcp_connect/ffffffff9a1ae610 [kernel]\ntcp_v4_connect/ffffffff9a1b3450 [kernel]\n__inet_stream_connect/ffffffff9a1d25f0 [kernel]\ninet_stream_connect/ffffffff9a1d2860 [kernel]\n__sys_connect/ffffffff9a0c1170 [kernel]\n__x64_sys_connect/ffffffff9a0c1240 [kernel]\ndo_syscall_64/ffffffff9a2ea9f0 [kernel]\nentry_SYSCALL_64_after_hwframe/ffffffff9a400078 [kernel]"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 协议栈收包延迟
|
||||
|
||||
**功能介绍**
|
||||
|
||||
线上业务网络延迟问题是比较难定位的,任何方向,任何的阶段都有可能出现问题。比如收方向的延迟,驱动、协议栈、用户程序等都有可能出现问题,因此我们开发了 netrecvlat 检测功能,借助 skb 入网卡的时间戳,在驱动,协议栈层,用户态层检查延迟时间,当收包延迟达到阈值时,借助 eBPF 获取网络上下文信息(五元组、延迟位置、进程信息等)。收方向传输路径示意:**网卡 -> 驱动 -> 协议栈 -> 用户主动收**
|
||||
|
||||
**示例**
|
||||
|
||||
一个业务容器从内核收包延迟超过 90s,通过 netrecvlat 追踪,ES 查询输出如下:
|
||||
|
||||
```
|
||||
{
|
||||
"_index": "***_2025-06-11",
|
||||
"_type": "_doc",
|
||||
"_id": "***",
|
||||
"_score": 0,
|
||||
"_source": {
|
||||
"tracer_data": {
|
||||
"dport": 49000,
|
||||
"pkt_len": 26064,
|
||||
"comm": "nginx",
|
||||
"ack_seq": 689410995,
|
||||
"saddr": "10.156.248.76",
|
||||
"pid": 2921092,
|
||||
"where": "TO_USER_COPY",
|
||||
"state": "ESTABLISHED",
|
||||
"daddr": "10.134.72.4",
|
||||
"sport": 9213,
|
||||
"seq": 1009085774,
|
||||
"latency_ms": 95973
|
||||
},
|
||||
"container_host_namespace": "***",
|
||||
"container_hostname": "***.docker",
|
||||
"es_index_time": 1749628496541,
|
||||
"uploaded_time": "2025-06-11T15:54:56.404864955+08:00",
|
||||
"hostname": "***",
|
||||
"container_type": "normal",
|
||||
"tracer_time": "2025-06-11 15:54:56.404 +0800",
|
||||
"time": "2025-06-11 15:54:56.404 +0800",
|
||||
"region": "***",
|
||||
"container_level": "1",
|
||||
"container_id": "***",
|
||||
"tracer_name": "netrecvlat"
|
||||
},
|
||||
"fields": {
|
||||
"time": [
|
||||
"2025-06-11T07:54:56.404Z"
|
||||
]
|
||||
},
|
||||
"_version": 1,
|
||||
"sort": [
|
||||
1749628496404
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
本地物理机也会存储一份相同的数据:
|
||||
|
||||
```
|
||||
2025-06-11 15:54:46 Host=*** Region=*** ContainerHost=***.docker ContainerID=*** ContainerType=normal ContainerLevel=1
|
||||
{
|
||||
"hostname": "***",
|
||||
"region": "***",
|
||||
"container_id": "***",
|
||||
"container_hostname": "***.docker",
|
||||
"container_host_namespace": "***",
|
||||
"container_type": "normal",
|
||||
"container_level": "1",
|
||||
"uploaded_time": "2025-06-11T15:54:46.129136232+08:00",
|
||||
"time": "2025-06-11 15:54:46.129 +0800",
|
||||
"tracer_time": "2025-06-11 15:54:46.129 +0800",
|
||||
"tracer_name": "netrecvlat",
|
||||
"tracer_data": {
|
||||
"comm": "nginx",
|
||||
"pid": 2921092,
|
||||
"where": "TO_USER_COPY",
|
||||
"latency_ms": 95973,
|
||||
"state": "ESTABLISHED",
|
||||
"saddr": "10.156.248.76",
|
||||
"daddr": "10.134.72.4",
|
||||
"sport": 9213,
|
||||
"dport": 49000,
|
||||
"seq": 1009024958,
|
||||
"ack_seq": 689410995,
|
||||
"pkt_len": 20272
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 物理机、容器内存超用
|
||||
|
||||
**功能介绍**
|
||||
|
||||
程序运行时申请的内存超过了系统或进程可用的内存上限,导致系统或应用程序崩溃。常见于内存泄漏、大数据处理或资源配置不足的场景。通过在 oom 的内核流程插入 BPF 钩子,获取 oom 上下文的详细信息并传递到用户态。这些信息包括进程信息、被 kill 的进程信息、容器信息。
|
||||
|
||||
**示例**
|
||||
|
||||
一个容器内发生 oom 时,被抓取的信息如下:
|
||||
|
||||
```
|
||||
{
|
||||
"_index": "***_cases_2025-06-11",
|
||||
"_type": "_doc",
|
||||
"_id": "***",
|
||||
"_score": 0,
|
||||
"_source": {
|
||||
"uploaded_time": "2025-06-11T17:09:07.236482841+08:00",
|
||||
"hostname": "***",
|
||||
"tracer_data": {
|
||||
"victim_process_name": "java",
|
||||
"trigger_memcg_css": "0xff4b8d8be3818000",
|
||||
"victim_container_hostname": "***.docker",
|
||||
"victim_memcg_css": "0xff4b8d8be3818000",
|
||||
"trigger_process_name": "java",
|
||||
"victim_pid": 3218745,
|
||||
"trigger_pid": 3218804,
|
||||
"trigger_container_hostname": "***.docker",
|
||||
"victim_container_id": "***",
|
||||
"trigger_container_id": "***",
|
||||
"tracer_time": "2025-06-11 17:09:07.236 +0800",
|
||||
"tracer_type": "auto",
|
||||
"time": "2025-06-11 17:09:07.236 +0800",
|
||||
"region": "***",
|
||||
"tracer_name": "oom",
|
||||
"es_index_time": 1749632947258
|
||||
},
|
||||
"fields": {
|
||||
"time": [
|
||||
"2025-06-11T09:09:07.236Z"
|
||||
]
|
||||
},
|
||||
"_version": 1,
|
||||
"sort": [
|
||||
1749632947236
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
另外 oom event 还实现了 `Collector` 接口,这样还会通过 Prometheus 统计宿主 oom 发生的次数,并区分宿主机和容器的事件。
|
||||
|
||||
### 内核 softlockup
|
||||
|
||||
**功能介绍**
|
||||
|
||||
softlockup 是 Linux 内核检测到的一种异常状态,指某个 CPU 核心上的内核线程(或进程)长时间占用 CPU 且不调度,导致系统无法正常响应其他任务。如内核代码 bug、cpu 过载、设备驱动问题等都会导致 softlockup。当系统发生 softlockup 时,收集目标进程的信息以及 cpu 信息,获取各个 cpu 上的内核栈信息同时保存问题的发生次数。
|
||||
|
||||
### 进程阻塞
|
||||
|
||||
**功能介绍**
|
||||
|
||||
D 状态进程(也称为不可中断睡眠状态,Uninterruptible)是一种特殊的进程状态,表示进程因等待某些系统资源而阻塞,且不能被信号或外部中断唤醒。常见场景如:磁盘 I/O 操作、内核阻塞、硬件故障等。hungtask 捕获系统内所有 D 状态进程的内核栈并保存 D 进程的数量。用于定位瞬间出现一些 D 进程的场景,可以在现场消失后仍然分析到问题根因。
|
||||
|
||||
**示例**
|
||||
|
||||
```
|
||||
{
|
||||
"_index": "***_2025-06-10",
|
||||
"_type": "_doc",
|
||||
"_id": "8yyOV5cBGoYArUxjSdvr",
|
||||
"_score": 0,
|
||||
"_source": {
|
||||
"uploaded_time": "2025-06-10T09:57:12.202191192+08:00",
|
||||
"hostname": "***",
|
||||
"tracer_data": {
|
||||
"cpus_stack": "2025-06-10 09:57:14 sysrq: Show backtrace of all active CPUs\n2025-06-10 09:57:14 NMI backtrace for cpu 33\n2025-06-10 09:57:14 CPU: 33 PID: 768309 Comm: huatuo-bamai Kdump: loaded Tainted: G S W OEL 5.10.0-216.0.0.115.v1.0.x86_64 #1\n2025-06-10 09:57:14 Hardware name: Inspur SA5212M5/YZMB-00882-104, BIOS 4.1.12 11/27/2019\n2025-06-10 09:57:14 Call Trace:\n2025-06-10 09:57:14 dump_stack+0x57/0x6e\n2025-06-10 09:57:14 nmi_cpu_backtrace.cold.0+0x30/0x65\n2025-06-10 09:57:14 ? lapic_can_unplug_cpu+0x80/0x80\n2025-06-10 09:57:14 nmi_trigger_cpumask_backtrace+0xdf/0xf0\n2025-06-10 09:57:14 arch_trigger_cpumask_backtrace+0x15/0x20\n2025-06-10 09:57:14 sysrq_handle_showallcpus+0x14/0x90\n2025-06-10 09:57:14 __handle_sysrq.cold.8+0x77/0xe8\n2025-06-10 09:57:14 write_sysrq_trigger+0x3d/0x60\n2025-06-10 09:57:14 proc_reg_write+0x38/0x80\n2025-06-10 09:57:14 vfs_write+0xdb/0x250\n2025-06-10 09:57:14 ksys_write+0x59/0xd0\n2025-06-10 09:57:14 do_syscall_64+0x39/0x80\n2025-06-10 09:57:14 entry_SYSCALL_64_after_hwframe+0x62/0xc7\n2025-06-10 09:57:14 RIP: 0033:0x4088ae\n2025-06-10 09:57:14 Code: 48 83 ec 38 e8 13 00 00 00 48 83 c4 38 5d c3 cc cc cc cc cc cc cc cc cc cc cc cc cc 49 89 f2 48 89 fa 48 89 ce 48 89 df 0f 05 <48> 3d 01 f0 ff ff 76 15 48 f7 d8 48 89 c1 48 c7 c0 ff ff ff ff 48\n2025-06-10 09:57:14 RSP: 002b:000000c000adcc60 EFLAGS: 00000212 ORIG_RAX: 0000000000000001\n2025-06-10 09:57:14 RAX: ffffffffffffffda RBX: 0000000000000013 RCX: 00000000004088ae\n2025-06-10 09:57:14 RDX: 0000000000000001 RSI: 000000000274ab18 RDI: 0000000000000013\n2025-06-10 09:57:14 RBP: 000000c000adcca0 R08: 0000000000000000 R09: 0000000000000000\n2025-06-10 09:57:14 R10: 0000000000000000 R11: 0000000000000212 R12: 000000c000adcdc0\n2025-06-10 09:57:14 R13: 0000000000000002 R14: 000000c000caa540 R15: 0000000000000000\n2025-06-10 09:57:14 Sending NMI from CPU 33 to CPUs 0-32,34-95:\n2025-06-10 09:57:14 NMI backtrace for cpu 52 skipped: idling at intel_idle+0x6f/0xc0\n2025-06-10 
09:57:14 NMI backtrace for cpu 54 skipped: idling at intel_idle+0x6f/0xc0\n2025-06-10 09:57:14 NMI backtrace for cpu 7 skipped: idling at intel_idle+0x6f/0xc0\n2025-06-10 09:57:14 NMI backtrace for cpu 81 skipped: idling at intel_idle+0x6f/0xc0\n2025-06-10 09:57:14 NMI backtrace for cpu 60 skipped: idling at intel_idle+0x6f/0xc0\n2025-06-10 09:57:14 NMI backtrace for cpu 2 skipped: idling at intel_idle+0x6f/0xc0\n2025-06-10 09:57:14 NMI backtrace for cpu 21 skipped: idling at intel_idle+0x6f/0xc0\n2025-06-10 09:57:14 NMI backtrace for cpu 69 skipped: idling at intel_idle+0x6f/0xc0\n2025-06-10 09:57:14 NMI backtrace for cpu 58 skipped: idling at intel_idle+0x6f/
|
||||
...
|
||||
"pid": 2567042
|
||||
},
|
||||
"tracer_time": "2025-06-10 09:57:12.202 +0800",
|
||||
"tracer_type": "auto",
|
||||
"time": "2025-06-10 09:57:12.202 +0800",
|
||||
"region": "***",
|
||||
"tracer_name": "hungtask",
|
||||
"es_index_time": 1749520632297
|
||||
},
|
||||
"fields": {
|
||||
"time": [
|
||||
"2025-06-10T01:57:12.202Z"
|
||||
]
|
||||
},
|
||||
"_ignored": [
|
||||
"tracer_data.blocked_processes_stack",
|
||||
"tracer_data.cpus_stack"
|
||||
],
|
||||
"_version": 1,
|
||||
"sort": [
|
||||
1749520632202
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
另外 hungtask event 还实现了 `Collector` 接口,这样还会通过 Prometheus 统计宿主 hungtask 发生的次数。
|
||||
|
||||
### 容器、物理机内存回收
|
||||
|
||||
**功能介绍**
|
||||
|
||||
内存压力过大时,如果此时进程申请内存,有可能进入直接回收,此时处于同步回收阶段,可能会造成业务进程的卡顿,在此记录进程进入直接回收的时间,有助于我们判断此进程被直接回收影响的剧烈程度。memreclaim event 计算同一个进程在 1s 周期,若进程处在直接回收状态超过 900ms, 则记录其上下文信息。
|
||||
|
||||
**示例**
|
||||
|
||||
业务容器的 chrome 进程进入直接回收状态,ES 查询输出如下:
|
||||
|
||||
```
|
||||
{
|
||||
"_index": "***_cases_2025-06-11",
|
||||
"_type": "_doc",
|
||||
"_id": "***",
|
||||
"_score": 0,
|
||||
"_source": {
|
||||
"tracer_data": {
|
||||
"comm": "chrome",
|
||||
"deltatime": 1412702917,
|
||||
"pid": 1896137
|
||||
},
|
||||
"container_host_namespace": "***",
|
||||
"container_hostname": "***.docker",
|
||||
"es_index_time": 1749641583290,
|
||||
"uploaded_time": "2025-06-11T19:33:03.26754495+08:00",
|
||||
"hostname": "***",
|
||||
"container_type": "normal",
|
||||
"tracer_time": "2025-06-11 19:33:03.267 +0800",
|
||||
"time": "2025-06-11 19:33:03.267 +0800",
|
||||
"region": "***",
|
||||
"container_level": "102",
|
||||
"container_id": "921d0ec0a20c",
|
||||
"tracer_name": "directreclaim"
|
||||
},
|
||||
"fields": {
|
||||
"time": [
|
||||
"2025-06-11T11:33:03.267Z"
|
||||
]
|
||||
},
|
||||
"_version": 1,
|
||||
"sort": [
|
||||
1749641583267
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### 网络设备状态
|
||||
|
||||
**功能介绍**
|
||||
|
||||
网卡状态变化通常容易造成严重的网络问题,直接影响整机网络质量,如 down/up, MTU 改变等。以 down 状态为例,可能是有权限的进程操作、底层线缆、光模块、对端交换机等问题导致,netdev event 用于检测网络设备的状态变化,目前已实现网卡 down, up 的监控,并区分管理员或底层原因导致的网卡状态变化。
|
||||
|
||||
**示例**
|
||||
|
||||
一次管理员操作导致 eth1 网卡 down 时,ES 查询到事件输出如下:
|
||||
|
||||
```
|
||||
{
|
||||
"_index": "***_cases_2025-05-30",
|
||||
"_type": "_doc",
|
||||
"_id": "***",
|
||||
"_score": 0,
|
||||
"_source": {
|
||||
"uploaded_time": "2025-05-30T17:47:50.406913037+08:00",
|
||||
"hostname": "localhost.localdomain",
|
||||
"tracer_data": {
|
||||
"ifname": "eth1",
|
||||
"start": false,
|
||||
"index": 3,
|
||||
"linkstatus": "linkStatusAdminDown, linkStatusCarrierDown",
|
||||
"mac": "5c:6f:69:34:dc:72"
|
||||
},
|
||||
"tracer_time": "2025-05-30 17:47:50.406 +0800",
|
||||
"tracer_type": "auto",
|
||||
"time": "2025-05-30 17:47:50.406 +0800",
|
||||
"region": "***",
|
||||
"tracer_name": "netdev_event",
|
||||
"es_index_time": 1748598470407
|
||||
},
|
||||
"fields": {
|
||||
"time": [
|
||||
"2025-05-30T09:47:50.406Z"
|
||||
]
|
||||
},
|
||||
"_version": 1,
|
||||
"sort": [
|
||||
1748598470406
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### LACP 协议状态
|
||||
|
||||
**功能介绍**
|
||||
|
||||
Bond 是 Linux 系统内核提供的一种将多个物理网络接口绑定为一个逻辑接口的技术。通过绑定,可以实现带宽叠加、故障切换或负载均衡。LACP 是 IEEE 802.3ad 标准定义的协议,用于动态管理链路聚合组(LAG)。目前没有优雅获取物理机 LACP 协议协商异常事件的方法,HUATUO 实现了 lacp event,通过 BPF 在协议关键路径插桩检测到链路聚合状态发生变化时,触发事件记录相关信息。
|
||||
|
||||
**示例**
|
||||
|
||||
在宿主网卡 eth1 出现物理层 down/up 抖动时,lacp 动态协商状态异常,ES 查询输出如下:
|
||||
|
||||
```
|
||||
{
|
||||
"_index": "***_cases_2025-05-30",
|
||||
"_type": "_doc",
|
||||
"_id": "***",
|
||||
"_score": 0,
|
||||
"_source": {
|
||||
"uploaded_time": "2025-05-30T17:47:48.513318579+08:00",
|
||||
"hostname": "***",
|
||||
"tracer_data": {
|
||||
"content": "/proc/net/bonding/bond0\nEthernet Channel Bonding Driver: v4.18.0 (Apr 7, 2025)\n\nBonding Mode: load balancing (round-robin)\nMII Status: down\nMII Polling Interval (ms): 0\nUp Delay (ms): 0\nDown Delay (ms): 0\nPeer Notification Delay (ms): 0\n/proc/net/bonding/bond4\nEthernet Channel Bonding Driver: v4.18.0 (Apr 7, 2025)\n\nBonding Mode: IEEE 802.3ad Dynamic link aggregation\nTransmit Hash Policy: layer3+4 (1)\nMII Status: up\nMII Polling Interval (ms): 100\nUp Delay (ms): 0\nDown Delay (ms): 0\nPeer Notification Delay (ms): 1000\n\n802.3ad info\nLACP rate: fast\nMin links: 0\nAggregator selection policy (ad_select): stable\nSystem priority: 65535\nSystem MAC address: 5c:6f:69:34:dc:72\nActive Aggregator Info:\n\tAggregator ID: 1\n\tNumber of ports: 2\n\tActor Key: 21\n\tPartner Key: 50013\n\tPartner Mac Address: 00:00:5e:00:01:01\n\nSlave Interface: eth0\nMII Status: up\nSpeed: 25000 Mbps\nDuplex: full\nLink Failure Count: 0\nPermanent HW addr: 5c:6f:69:34:dc:72\nSlave queue ID: 0\nSlave active: 1\nSlave sm_vars: 0x172\nAggregator ID: 1\nAggregator active: 1\nActor Churn State: none\nPartner Churn State: none\nActor Churned Count: 0\nPartner Churned Count: 0\ndetails actor lacp pdu:\n system priority: 65535\n system mac address: 5c:6f:69:34:dc:72\n port key: 21\n port priority: 255\n port number: 1\n port state: 63\ndetails partner lacp pdu:\n system priority: 200\n system mac address: 00:00:5e:00:01:01\n oper key: 50013\n port priority: 32768\n port number: 16397\n port state: 63\n\nSlave Interface: eth1\nMII Status: up\nSpeed: 25000 Mbps\nDuplex: full\nLink Failure Count: 17\nPermanent HW addr: 5c:6f:69:34:dc:73\nSlave queue ID: 0\nSlave active: 0\nSlave sm_vars: 0x172\nAggregator ID: 1\nAggregator active: 1\nActor Churn State: monitoring\nPartner Churn State: monitoring\nActor Churned Count: 2\nPartner Churned Count: 2\ndetails actor lacp pdu:\n system priority: 65535\n system mac address: 5c:6f:69:34:dc:72\n port key: 21\n port priority: 255\n 
port number: 2\n port state: 15\ndetails partner lacp pdu:\n system priority: 200\n system mac address: 00:00:5e:00:01:01\n oper key: 50013\n port priority: 32768\n port number: 32781\n port state: 31\n"
|
||||
},
|
||||
"tracer_time": "2025-05-30 17:47:48.513 +0800",
|
||||
"tracer_type": "auto",
|
||||
"time": "2025-05-30 17:47:48.513 +0800",
|
||||
"region": "***",
|
||||
"tracer_name": "lacp",
|
||||
"es_index_time": 1748598468514
|
||||
},
|
||||
"fields": {
|
||||
"time": [
|
||||
"2025-05-30T09:47:48.513Z"
|
||||
]
|
||||
},
|
||||
"_ignored": [
|
||||
"tracer_data.content"
|
||||
],
|
||||
"_version": 1,
|
||||
"sort": [
|
||||
1748598468513
|
||||
]
|
||||
}
|
||||
```
|
|
@ -0,0 +1,271 @@
|
|||
该文档汇总了当前 v1.0 版本支持的所有指标,涉及 CPU、内存、网络、IO。
|
||||
|
||||
|子系统|指标|描述|单位|统计维度|指标来源|
|
||||
|-------|-------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------|-----|-------------------------------------------------------------------|
|
||||
|cpu|cpu_util_sys|cpu 系统态利用率|%|宿主|基于 cgroup cpuacct.stat 和 cpuacct.usage 计算|
|
||||
|cpu|cpu_util_usr|cpu 用户态利用率|%|宿主|基于 cgroup cpuacct.stat 和 cpuacct.usage 计算|
|
||||
|cpu|cpu_util_total|容器 cpu 总利用率|%|宿主|基于 cgroup cpuacct.stat 和 cpuacct.usage 计算|
|
||||
|cpu|cpu_util_container_sys|容器 cpu 系统态利用率|%|容器|基于 cgroup cpuacct.stat 和 cpuacct.usage 计算|
|
||||
|cpu|cpu_util_container_usr|容器 cpu 用户态利用率|%|容器|基于 cgroup cpuacct.stat 和 cpuacct.usage 计算|
|
||||
|cpu|cpu_util_container_total|容器 cpu 总利用率|%|容器|基于 cgroup cpuacct.stat 和 cpuacct.usage 计算|
|
||||
|cpu|cpu_stat_container_burst_time|累计墙时(以纳秒为单位),周期内突发超出配额的时间|纳秒(ns)|容器|基于 cpu.stat 读取|
|
||||
|cpu|cpu_stat_container_nr_bursts|周期内突发次数|计数|容器|基于 cpu.stat 读取|
|
||||
|cpu|cpu_stat_container_nr_throttled|cgroup 被 throttled/limited 的次数|计数|容器|基于 cpu.stat 读取|
|
||||
|cpu|cpu_stat_container_exter_wait_rate|容器外进程导致的等待率|%|容器|基于 cpu.stat 读取的 throttled_time hierarchy_wait_sum inner_wait_sum 计算|
|
||||
|cpu|cpu_stat_container_inner_wait_rate|容器内部进程导致的等待率|%|容器|基于 cpu.stat 读取的 throttled_time hierarchy_wait_sum inner_wait_sum 计算|
|
||||
|cpu|cpu_stat_container_throttle_wait_rate|容器被限制而引起的等待率|%|容器|基于 cpu.stat 读取的 throttled_time hierarchy_wait_sum inner_wait_sum 计算|
|
||||
|cpu|cpu_stat_container_wait_rate|总的等待率: exter_wait_rate + inner_wait_rate + throttle_wait_rate|%|容器|基于 cpu.stat 读取的 throttled_time hierarchy_wait_sum inner_wait_sum 计算|
|
||||
|cpu|loadavg_container_container_nr_running|容器中运行的任务数量|计数|容器|从内核通过 netlink 获取|
|
||||
|cpu|loadavg_container_container_nr_uninterruptible|容器中不可中断任务的数量|计数|容器|从内核通过 netlink 获取|
|
||||
|cpu|loadavg_load1|系统过去 1 分钟的平均负载|计数|宿主|procfs|
|
||||
|cpu|loadavg_load5|系统过去 5 分钟的平均负载|计数|宿主|procfs|
|
||||
|cpu|loadavg_load15|系统过去 15 分钟的平均负载|计数|宿主|procfs|
|
||||
|cpu|softirq_latency|在不同时间域发生的 NET_RX/NET_TX 中断延迟次数:<br>0~10 us<br>100us ~ 1ms<br>10us ~ 100us<br>1ms ~ inf|计数|宿主|BPF 软中断埋点统计|
|
||||
|cpu|runqlat_container_nlat_01|容器中进程调度延迟在 0~10 毫秒内的次数|计数|容器|bpf 调度切换埋点统计|
|
||||
|cpu|runqlat_container_nlat_02|容器中进程调度延迟在 10~20 毫秒之间的次数|计数|容器|bpf 调度切换埋点统计|
|
||||
|cpu|runqlat_container_nlat_03|容器中进程调度延迟在 20~50 毫秒之间的次数|计数|容器|bpf 调度切换埋点统计|
|
||||
|cpu|runqlat_container_nlat_04|容器中进程调度延迟超过 50 毫秒的次数|计数|容器|bpf 调度切换埋点统计|
|
||||
|cpu|runqlat_g_nlat_01|宿主中进程调度延迟在范围内 0~10 毫秒的次数|计数|宿主|bpf 调度切换埋点统计|
|
||||
|cpu|runqlat_g_nlat_02|宿主中进程调度延迟在范围内 10~20 毫秒的次数|计数|宿主|bpf 调度切换埋点统计|
|
||||
|cpu|runqlat_g_nlat_03|宿主中进程调度延迟在范围内 20~50 毫秒的次数|计数|宿主|bpf 调度切换埋点统计|
|
||||
|cpu|runqlat_g_nlat_04|宿主中进程调度延迟超过 50 毫秒的次数|计数|宿主|bpf 调度切换埋点统计|
|
||||
|cpu|reschedipi_oversell_probability|vm 中 cpu 超卖检测|0-1|宿主|bpf 调度 ipi 埋点统计|
|
||||
|memory|buddyinfo_blocks|内核伙伴系统内存分配|页计数|宿主|procfs|
|
||||
|memory|memory_events_container_watermark_inc|内存水位计数|计数|容器|memory.events|
|
||||
|memory|memory_events_container_watermark_dec|内存水位计数|计数|容器|memory.events|
|
||||
|memory|memory_others_container_local_direct_reclaim_time|cgroup 中页分配速度|纳秒(ns)|容器|memory.local_direct_reclaim_time|
|
||||
|memory|memory_others_container_directstall_time|直接回收时间|纳秒(ns)|容器|memory.directstall_stat|
|
||||
|memory|memory_others_container_asyncreclaim_time|异步回收时间|纳秒(ns)|容器|memory.asynreclaim_stat|
|
||||
|memory|memory_stat_container_writeback|匿名/文件 cache sync 到磁盘排队字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_unevictable|无法回收的内存(如 mlocked)|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_shmem|共享内存字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_pgsteal_kswapd|kswapd 和 cswapd 回收的内存字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_pgsteal_globalkswapd|由 kswapd 回收的字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_pgsteal_globaldirect|通过页面分配直接回收的内存字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_pgsteal_direct|页分配和 try_charge 期间直接回收的内存字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_pgsteal_cswapd|由 cswapd 回收的字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_pgscan_kswapd|kswapd 和 cswapd 扫描的内存字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_pgscan_globalkswapd|kswapd 扫描的内存字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_pgscan_globaldirect|扫描内存中通过直接回收在页面分配期间的字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_pgscan_direct|扫描内存的字节数,在页面分配和 try_charge 期间通过直接回收的字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_pgscan_cswapd|由 cswapd 扫描内存的字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_pgrefill|内存中扫描的字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_pgdeactivate|内存中未激活的部分被添加到非活动列表中|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_inactive_file|文件内存中不活跃的 LRU 列表的字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_inactive_anon|匿名和交换缓存内存中不活跃的 LRU 列表的字节数|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_dirty|等待写入磁盘的字节|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_active_file|活跃内存中文件内存的大小|字节(Bytes)|容器|memory.stat|
|
||||
|memory|memory_stat_container_active_anon|活跃内存中匿名和交换内存的大小|字节(Bytes)|容器|memory.stat|
|
||||
|memory|mountpoint_perm_ro|挂载点是否为只读|布尔(bool)|宿主|procfs|
|
||||
|memory|vmstat_allocstall_normal|宿主在 normal 域直接回收|计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_allocstall_movable|宿主在 movable 域直接回收|计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_compact_stall|内存压缩计数|计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_active_anon|活跃的匿名页数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_active_file|活跃的文件页数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_boost_pages|kswapd boosting 页数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_dirty|脏页数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_free_pages|释放的页数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_inactive_anon|非活跃的匿名页数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_inactive_file|非活跃的文件页数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_kswapd_boost|kswapd boosting 次数计数|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_mlock|锁定的页面数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_shmem|共享内存页面数|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_slab_reclaimable|可回收的 slab 页数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_slab_unreclaimable|无法回收的 slab 页数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_unevictable|不可驱逐页面数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_nr_writeback|写入页面数|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_numa_pages_migrated|NUMA 迁移中的页面数|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_pgdeactivate|页数被停用进入非活动 LRU|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_pgrefill|扫描的活跃 LRU 页面数|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_pgscan_direct|扫描的页数|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_pgscan_kswapd|扫描的页面数量,由 kswapd 回收的数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_pgsteal_direct|直接回收的页面|页计数|宿主|/proc/vmstat|
|
||||
|memory|vmstat_pgsteal_kswapd|被 kswapd 回收的数量|页计数|宿主|/proc/vmstat|
|
||||
|memory|hungtask_counter|hungtask 事件计数|计数|宿主|BPF 埋点统计|
|
||||
|memory|oom_host_counter|oom 事件计数|计数|宿主|BPF 埋点统计|
|
||||
|memory|oom_container_counter|oom 事件计数|计数|容器|BPF 埋点统计|
|
||||
|memory|softlockup_counter|softlockup 事件计数|计数|宿主|BPF 埋点统计|
|
||||
|memory|memory_free_compaction|内存压缩的速度|纳秒(ns)|宿主|bpf 埋点统计|
|
||||
|memory|memory_free_allocstall|内存中主机直接回收速度|纳秒(ns)|宿主|bpf 埋点统计|
|
||||
|memory|memory_cgroup_container_directstall|cgroup 尝试直接回收的计数|计数|容器|bpf 埋点统计|
|
||||
|IO|iolatency_disk_d2c|磁盘访问时的 io 延迟统计,包括驱动程序和硬件组件消耗的时间|计数|宿主|bpf 埋点统计|
|
||||
|IO|iolatency_disk_q2c|磁盘访问整个 I/O 生命周期时的 I/O 延迟统计|计数|宿主|bpf 埋点统计|
|
||||
|IO|iolatency_container_d2c|磁盘访问时的 I/O 延迟统计,包括驱动程序和硬件组件消耗的时间|计数|容器|bpf 埋点统计|
|
||||
|IO|iolatency_container_q2c|磁盘访问整个 I/O 生命周期时的 I/O 延迟统计|计数|容器|bpf 埋点统计|
|
||||
|IO|iolatency_disk_flush|磁盘 RAID 设备刷新操作延迟统计|计数|宿主|bpf 埋点统计|
|
||||
|IO|iolatency_container_flush|磁盘 RAID 设备上由容器引起的刷新操作延迟统计|计数|容器|bpf 埋点统计|
|
||||
|IO|iolatency_disk_freeze|磁盘 freeze 事件|计数|宿主|bpf 埋点统计|
|
||||
|network|tcp_mem_limit_pages|系统 TCP 总内存大小限制|页计数|系统|procfs|
|
||||
|network|tcp_mem_usage_bytes|系统使用的 TCP 内存总字节数|字节(Bytes)|系统|tcp_mem_usage_pages \* page_size|
|
||||
|network|tcp_mem_usage_pages|系统使用的 TCP 内存总量|页计数|系统|procfs|
|
||||
|network|tcp_mem_usage_percent|系统使用的 TCP 内存百分比(相对 TCP 内存总限制)|%|系统|tcp_mem_usage_pages / tcp_mem_limit_pages|
|
||||
|network|arp_entries|arp 缓存条目数量|计数|宿主,容器|procfs|
|
||||
|network|arp_total|总 arp 缓存条目数|计数|系统|procfs|
|
||||
|network|qdisc_backlog|待发送的字节数|字节(Bytes)|宿主|netlink qdisc 统计|
|
||||
|network|qdisc_bytes_total|已发送的字节数|字节(Bytes)|宿主|netlink qdisc 统计|
|
||||
|network|qdisc_current_queue_length|排队等待发送的包数量|计数|宿主|netlink qdisc 统计|
|
||||
|network|qdisc_drops_total|丢弃的数据包数量|计数|宿主|netlink qdisc 统计|
|
||||
|network|qdisc_overlimits_total|排队数据包里超限的数量|计数|宿主|netlink qdisc 统计|
|
||||
|network|qdisc_packets_total|已发送的包数量|计数|宿主|netlink qdisc 统计|
|
||||
|network|qdisc_requeues_total|重新入队的数量|计数|宿主|netlink qdisc 统计|
|
||||
|network|ethtool_hardware_rx_dropped_errors|接口接收丢包统计|计数|宿主|硬件驱动相关, 如 mlx, ixgbe, bnxt_en, etc.|
|
||||
|network|netdev_receive_bytes_total|接口接收的字节数|字节(Bytes)|宿主,容器|procfs|
|
||||
|network|netdev_receive_compressed_total|接口接收的压缩包数量|计数|宿主,容器|procfs|
|
||||
|network|netdev_receive_dropped_total|接口接收丢弃的包数量|计数|宿主,容器|procfs|
|
||||
|network|netdev_receive_errors_total|接口接收检测到错误的包数量|计数|宿主,容器|procfs|
|
||||
|network|netdev_receive_fifo_total|接口接收 fifo 缓冲区错误数量|计数|宿主,容器|procfs|
|
||||
|network|netdev_receive_frame_total|接口接收帧对齐错误|计数|宿主,容器|procfs|
|
||||
|network|netdev_receive_multicast_total|多播数据包已接收的包数量,对于硬件接口,此统计通常在设备层计算(与 rx_packets 不同),因此可能包括未到达的数据包|计数|宿主,容器|procfs|
|
||||
|network|netdev_receive_packets_total|接口接收到的有效数据包数量|计数|宿主,容器|procfs|
|
||||
|network|netdev_transmit_bytes_total|接口发送的字节数|字节(Bytes)|宿主,容器|procfs|
|
||||
|network|netdev_transmit_carrier_total|接口发送过程中由于载波丢失导致的帧传输错误数量|计数|宿主,容器|procfs|
|
||||
|network|netdev_transmit_colls_total|接口发送碰撞计数|计数|宿主,容器|procfs|
|
||||
|network|netdev_transmit_compressed_total|接口发送压缩数据包数量|计数|宿主,容器|procfs|
|
||||
|network|netdev_transmit_dropped_total|数据包在传输过程中丢失的数量,如资源不足|计数|宿主,容器|procfs|
|
||||
|network|netdev_transmit_errors_total|发送错误计数|计数|宿主,容器|procfs|
|
||||
|network|netdev_transmit_fifo_total|帧传输错误数量|计数|宿主,容器|procfs|
|
||||
|network|netdev_transmit_packets_total|发送数据包计数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_ArpFilter|因 ARP 过滤规则而被拒绝的 ARP 请求/响应包数量|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_BusyPollRxPackets|通过 busy polling 机制接收到的网络数据包数量|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_DelayedACKLocked|由于用户态锁住了sock,而无法发送delayed ack的次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_DelayedACKLost|当收到已确认的包时,它将被更新。延迟 ACK 丢失可能会引起这个问题,但其他原因也可能触发,例如网络中重复的包。|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_DelayedACKs|延迟的 ACK 定时器已过期。TCP 堆栈将发送一个纯 ACK 数据包并退出延迟 ACK 模式|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_EmbryonicRsts|收到初始 SYN_RECV 套接字的重置|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_IPReversePathFilter|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_ListenDrops|当内核收到客户端的 SYN 请求时,如果 TCP 接受队列已满,内核将丢弃 SYN 并将 TcpExtListenOverflows 加 1。同时,内核也会将 TcpExtListenDrops 加 1。当一个 TCP 套接字处于监听状态,且内核需要丢弃一个数据包时,内核会始终将 TcpExtListenDrops 加 1。因此,增加 TcpExtListenOverflows 会导致 TcpExtListenDrops 同时增加,但 TcpExtListenDrops 也会在没有 TcpExtListenOverflows 增加的情况下增加,例如内存分配失败也会导致 TcpExtListenDrops 增加。|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_ListenOverflows|当内核收到客户端的 SYN 请求时,如果 TCP 接受队列已满,内核将丢弃 SYN 并将 TcpExtListenOverflows 加 1。同时,内核也会将 TcpExtListenDrops 加 1。当一个 TCP 套接字处于监听状态,且内核需要丢弃一个数据包时,内核会始终将 TcpExtListenDrops 加 1。因此,增加 TcpExtListenOverflows 会导致 TcpExtListenDrops 同时增加,但 TcpExtListenDrops 也会在没有 TcpExtListenOverflows 增加的情况下增加,例如内存分配失败也会导致 TcpExtListenDrops 增加。|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_LockDroppedIcmps|由于套接字被锁定,ICMP 数据包被丢弃|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_OfoPruned|协议栈尝试在乱序队列中丢弃数据包|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_OutOfWindowIcmps|ICMP 数据包因超出窗口而被丢弃|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_PAWSActive|数据包在 Syn-Sent 状态被 PAWS 丢弃|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_PAWSEstab|数据包在除 Syn-Sent 之外的所有状态下都会被 PAWS 丢弃|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_PFMemallocDrop|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_PruneCalled|协议栈尝试回收套接字内存。更新此计数器后,将尝试合并乱序队列和接收队列。如果内存仍然不足,将尝试丢弃乱序队列中的数据包(并更新 TcpExtOfoPruned 计数器)。|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_RcvPruned|在从顺序错误的队列中‘collapse’和丢弃数据包后,如果实际使用的内存仍然大于最大允许内存,则此计数器将被更新。这意味着‘prune’失败|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_SyncookiesFailed|MSS 从 SYN cookie 解码出来的无效。当这个计数器更新时,接收到的数据包不会被当作 SYN cookie 处理,并且 TcpExtSyncookiesRecv 计数器不会更新|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_SyncookiesRecv|接收了多少个 SYN cookies 的回复数据包|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_SyncookiesSent|发送了多少个 SYN cookies|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPACKSkippedChallenge|ACK 为 challenge ACK 时,将跳过 ACK|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPACKSkippedFinWait2|ACK 在 Fin-Wait-2 状态被跳过,原因可能是 PAWS 检查失败或接收到的序列号超出窗口|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPACKSkippedPAWS|由于 PAWS(保护包装序列号)检查失败,ACK 被跳过|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPACKSkippedSeq|序列号超出窗口范围,时间戳通过 PAWS 检查,TCP 状态不是 Syn-Recv、Fin-Wait-2 和 Time-Wait|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPACKSkippedSynRecv|ACK 在 Syn-Recv 状态中被跳过。Syn-Recv 状态表示协议栈收到一个 SYN 并回复 SYN+ACK|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPACKSkippedTimeWait|ACK 在 Time-Wait 状态中被跳过,原因可能是 PAWS 检查失败或接收到的序列号超出窗口|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPAbortFailed|内核 TCP 层将在满足 RFC2525 2.17 节时发送 RST。如果在处理过程中发生内部错误,TcpExtTCPAbortFailed 将增加|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPAbortOnClose|用户模式程序缓冲区中有数据时关闭的套接字数量|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPAbortOnData|TCP 层有正在传输的数据,但需要关闭连接|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPAbortOnLinger|当 TCP 连接进入 FIN_WAIT_2 状态时,内核不会等待来自另一侧的 fin 包,而是发送 RST 并立即删除套接字|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPAbortOnMemory|当一个应用程序关闭 TCP 连接时,内核仍然需要跟踪该连接,让它完成 TCP 断开过程|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPAbortOnTimeout|此计数器将在任何 TCP 计时器到期时增加。在这种情况下,内核不会发送 RST,而是放弃连接|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPAckCompressed|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPAutoCorking|发送数据包时,TCP 层会尝试将小数据包合并成更大的一个|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPBacklogDrop|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPChallengeACK|challenge ack 发送的数量|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPDSACKIgnoredNoUndo|当 DSACK 块无效时,这两个计数器中的一个将被更新。哪个计数器将被更新取决于 TCP 套接字的 undo_marker 标志|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPDSACKIgnoredOld|当 DSACK 块无效时,这两个计数器中的一个将被更新。哪个计数器将被更新取决于 TCP 套接字的 undo_marker 标志|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPDSACKOfoRecv|收到一个 DSACK,表示收到一个顺序错误的重复数据包|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPDSACKOfoSent|收到一个乱序的重复数据包,因此向发送者发送 DSACK|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPDSACKOldSent|收到一个已确认的重复数据包,因此向发送者发送 DSACK|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPDSACKRecv|收到一个 DSACK,表示收到了一个已确认的重复数据包|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPDSACKUndo|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPDeferAcceptDrop|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPDelivered|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPDeliveredCE|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPFastOpenActive|当 TCP 栈在 SYN-SENT 状态接收到一个 ACK 包,并且 ACK 包确认了 SYN 包中的数据,理解 TFO cookie 已被对方接受,然后它更新这个计数器|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPFastOpenActiveFail|Fast Open 失败|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPFastOpenBlackhole|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPFastOpenCookieReqd|客户端想要请求 TFO cookie 的次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPFastOpenListenOverflow|挂起的 Fast Open 请求数量大于 fastopenq->max_qlen 时,协议栈将拒绝 Fast Open 请求并更新此计数器|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPFastOpenPassive|指示 TCP 堆栈接受 Fast Open 请求的次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPFastOpenPassiveFail|协议栈拒绝 Fast Open 的次数,这是由于 TFO cookie 无效或 在创建套接字过程中发现错误所引起的|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPFastRetrans|快速重传|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPFromZeroWindowAdv|TCP 接收窗口设置为非零值|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPFullUndo|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPHPAcks|如果数据包设置了 ACK 标志且没有数据,则是一个纯 ACK 数据包,如果内核在快速路径中处理它,TcpExtTCPHPAcks 将增加 1|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPHPHits|如果 TCP 数据包包含数据(这意味着它不是一个纯 ACK 数据包),并且此数据包在快速路径中处理,TcpExtTCPHPHits 将增加 1|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPHystartDelayCwnd|CWND 检测到的包延迟总和。将此值除以 TcpExtTCPHystartDelayDetect,即为通过包延迟检测到的平均 CWND|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPHystartDelayDetect|检测到数据包延迟阈值次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPHystartTrainCwnd|TCP Hystart 训练中使用的拥塞窗口大小,将此值除以 TcpExtTCPHystartTrainDetect 得到由 ACK 训练长度检测到的平均 CWND|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPHystartTrainDetect|TCP Hystart 训练检测的次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPKeepAlive|此计数器指示已发送的保活数据包。默认情况下不会启用保活功能。用户空间程序可以通过设置 SO_KEEPALIVE 套接字选项来启用它。|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPLossFailures|丢失数据包而进行恢复失败的次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPLossProbeRecovery|检测到丢失的数据包恢复的次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPLossProbes|TCP 检测到丢失的数据包数量,通常用于检测网络拥塞或丢包|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPLossUndo|TCP重传数据包成功到达目标端口,但之前已经由于超时或拥塞丢失,因此被视为“撤销”丢失的数据包数量|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPLostRetransmit|丢包重传个数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPMD5Failure|校验错误|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPMD5NotFound|校验错误|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPMD5Unexpected|校验错误|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPMTUPFail|使用 DSACK 无需慢启动即可恢复拥塞窗口|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPMTUPSuccess|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPMemoryPressures|到达 tcp 内存压力位 low 的次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPMemoryPressuresChrono|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPMinTTLDrop|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPOFODrop|TCP 层接收到一个乱序的数据包,但内存不足,因此丢弃它。此类数据包不会计入 TcpExtTCPOFOQueue 计数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPOFOMerge|接收到的顺序错误的包与上一个包有重叠。重叠部分将被丢弃。所有 TcpExtTCPOFOMerge 包也将计入 TcpExtTCPOFOQueue|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPOFOQueue|TCP 层接收到一个乱序的数据包,并且有足够的内存来排队它|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPOrigDataSent|发送原始数据(不包括重传但包括 SYN 中的数据)的包数量。此计数器与 TcpOutSegs 不同,因为 TcpOutSegs 还跟踪纯 ACK。TCPOrigDataSent 更有助于跟踪 TCP 重传率|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPPartialUndo|检测到一些错误的重传,在我们快速重传的同时,收到了部分确认,因此能够部分撤销我们的一些 CWND 减少|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPPureAcks|如果数据包设置了 ACK 标志且没有数据,则是一个纯 ACK 数据包,如果内核在快速路径中处理它,TcpExtTCPHPAcks 将增加 1,如果内核在慢速路径中处理它,TcpExtTCPPureAcks 将增加 1|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPRcvCoalesce|当数据包被 TCP 层接收但未被应用程序读取时,TCP 层会尝试合并它们。这个计数器表示在这种情况下合并了多少个数据包。如果启用了 GRO,GRO 会合并大量数据包,这些数据包不会被计算到 TcpExtTCPRcvCoalesce 中|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPRcvCollapsed|在“崩溃”过程中释放了多少个 skbs|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPRenoFailures|TCP_CA_Disorder 阶段进入并经历 RTO 的重传失败次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPRenoRecovery|当拥塞控制进入恢复状态时,如果使用 sack,TcpExtTCPSackRecovery 增加 1,如果不使用 sack,TcpExtTCPRenoRecovery 增加 1。这两个计数器意味着协议栈开始重传丢失的数据包|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPRenoRecoveryFail|进入恢复阶段并 RTO 的连接数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPRenoReorder|重排序数据包被快速恢复检测到。只有在 SACK 被禁用时才会使用|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPReqQFullDoCookies|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPReqQFullDrop|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPRetransFail|尝试将重传数据包发送到下层,但下层返回错误|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSACKDiscard|有多少个 SACK 块无效。如果无效的 SACK 块是由 ACK 记录引起的,tcp 栈只会忽略它,而不会更新此计数器|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSACKReneging|一个数据包被 SACK 确认,但接收方已丢弃此数据包,因此发送方需要重传此数据包|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSACKReorder|SACK 检测到的重排序数据包|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSYNChallenge|响应 SYN 数据包发送的 Challenge ack 数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSackFailures|TCP_CA_Disorder 阶段进入并经历 RTO 的重传失败次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSackMerged|skb 已合并计数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSackRecovery|当拥塞控制进入恢复状态时,如果使用 sack,TcpExtTCPSackRecovery 增加 1,如果不使用 sack,TcpExtTCPRenoRecovery 增加 1。这两个计数器意味着 TCP 栈开始重传丢失的数据包|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSackRecoveryFail|SACK 恢复失败的次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSackShiftFallback|skb 应该被移动或合并,但由于某些原因,TCP 堆栈没有这样做|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSackShifted|skb 被移位|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSlowStartRetrans|重新传输一个数据包,拥塞控制状态为“丢失”|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSpuriousRTOs|虚假重传超时|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSpuriousRtxHostQueues|当 TCP 栈想要重传一个数据包,发现该数据包并未在网络中丢失,但数据包尚未发送,TCP 栈将放弃重传并更新此计数器。这可能会发生在数据包在 qdisc 或驱动程序队列中停留时间过长的情况下|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPSynRetrans|SYN 和 SYN/ACK 重传次数,将重传分解为 SYN、快速重传、超时重传等|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPTSReorder|tcp 栈在接收到时间戳包而进行乱序包阈值调整的次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPTimeWaitOverflow|TIME_WAIT 状态的套接字因超出限制而无法分配的数量|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPTimeouts|TCP 超时事件|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPToZeroWindowAdv|TCP 接收窗口从非零值设置为零|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPWantZeroWindowAdv|根据当前内存使用情况,TCP 栈尝试将接收窗口设置为零。但接收窗口可能仍然是一个非零值|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPWinProbe|定期发送的 ACK 数据包数量,以确保打开窗口的反向 ACK 数据包没有丢失|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TCPWqueueTooBig|\-|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TW|TCP 套接字在快速计时器中完成 time wait 状态|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TWKilled|TCP 套接字在慢速计时器中完成 time wait 状态|计数|宿主,容器|procfs|
|
||||
|network|netstat_TcpExt_TWRecycled|等待套接字通过时间戳回收|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_ActiveOpens|TCP 层发送一个 SYN,进入 SYN-SENT 状态。每当 TcpActiveOpens 增加 1 时,TcpOutSegs 应该始终增加 1|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_AttemptFails|TCP 连接从 SYN-SENT 状态或 SYN-RCVD 状态直接过渡到 CLOSED 状态次数,加上 TCP 连接从 SYN-RCVD 状态直接过渡到 LISTEN 状态次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_CurrEstab|TCP 连接数,当前状态为 ESTABLISHED 或 CLOSE-WAIT|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_EstabResets|TCP 连接从 ESTABLISHED 状态或 CLOSE-WAIT 状态直接过渡到 CLOSED 状态次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_InCsumErrors|TCP 校验和错误|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_InErrs|错误接收到的段总数(例如,错误的 TCP 校验和)|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_InSegs|TCP 层接收到的数据包数量。如 RFC1213 所述,包括接收到的错误数据包,如校验和错误、无效 TCP 头等|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_MaxConn|可以支持的总 TCP 连接数限制,在最大连接数动态的实体中,此对象应包含值-1|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_OutRsts|TCP 段中包含 RST 标志的数量|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_OutSegs|发送的总段数,包括当前连接上的段,但不包括仅包含重传字节的段|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_PassiveOpens|TCP 连接从监听状态直接过渡到 SYN-RCVD 状态的次数|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_RetransSegs|总重传段数 - 即包含一个或多个先前已传输字节的 TCP 段传输的数量|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_RtoAlgorithm|The algorithm used to determine the timeout value used for retransmitting unacknowledged octets|计数|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_RtoMax|TCP 实现允许的重传超时最大值,以毫秒为单位|毫秒|宿主,容器|procfs|
|
||||
|network|netstat_Tcp_RtoMin|TCP 实现允许的重传超时最小值,以毫秒为单位|毫秒|宿主,容器|procfs|
|
||||
|network|sockstat_FRAG_inuse|\-|计数|宿主,容器|procfs|
|
||||
|network|sockstat_FRAG_memory|\-|页计数|宿主,容器|procfs|
|
||||
|network|sockstat_RAW_inuse|使用的 RAW 套接字数量|计数|宿主,容器|procfs|
|
||||
|network|sockstat_TCP_alloc|TCP 已分配的套接字数量|计数|宿主,容器|procfs|
|
||||
|network|sockstat_TCP_inuse|已建立的 TCP 套接字数量|计数|宿主,容器|procfs|
|
||||
|network|sockstat_TCP_mem|系统使用的 TCP 内存总量|页计数|系统|procfs|
|
||||
|network|sockstat_TCP_mem_bytes|系统使用的 TCP 内存总量|字节(Bytes)|系统|sockstat_TCP_mem \* page_size|
|
||||
|network|sockstat_TCP_orphan|TCP 等待关闭的连接数|计数|宿主,容器|procfs|
|
||||
|network|sockstat_TCP_tw|TCP 套接字终止数量|计数|宿主,容器|procfs|
|
||||
|network|sockstat_UDPLITE_inuse|\-|计数|宿主,容器|procfs|
|
||||
|network|sockstat_UDP_inuse|使用的 UDP 套接字数量|计数|宿主,容器|procfs|
|
||||
|network|sockstat_UDP_mem|系统使用的 UDP 内存总量|页计数|系统|procfs|
|
||||
|network|sockstat_UDP_mem_bytes|系统使用的 UDP 内存字节数总和|字节(Bytes)|系统|sockstat_UDP_mem \* page_size|
|
||||
|network|sockstat_sockets_used|系统使用 socket 数量|计数|系统|procfs|
|
|
@ -0,0 +1,270 @@
|
|||
| Subsystem | Metric | Description | Unit | Dimension | Source |
|
||||
| --------- | ------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | -------------- | ------------------------------------------------------------------------------------- |
|
||||
| cpu | cpu_util_sys | Time of running kernel processes percentage of host | % | host | Calculated based on cpuacct.stat and cpuacct.usage |
|
||||
| cpu | cpu_util_usr | Time of running user processes percentage of host | % | host | Calculated based on cpuacct.stat and cpuacct.usage |
|
||||
| cpu | cpu_util_total | Total time of running percentage of host | % | host | Calculated based on cpuacct.stat and cpuacct.usage |
|
||||
| cpu | cpu_util_container_sys | Time of running kernel processes percentage of container | % | container | Calculated based on cpuacct.stat and cpuacct.usage |
|
||||
| cpu | cpu_util_container_usr | Time of running user processes percentage of container | % | container | Calculated based on cpuacct.stat and cpuacct.usage |
|
||||
| cpu | cpu_util_container_total | Total time of running percentage of container | % | container | Calculated based on cpuacct.stat and cpuacct.usage |
|
||||
| cpu | cpu_stat_container_burst_time | Cumulative wall-time (in nanoseconds) that any CPU has used above quota in respective periods | ns | container | cpu.stat |
|
||||
| cpu | cpu_stat_container_nr_bursts | Number of periods burst occurs | count | container | cpu.stat |
|
||||
| cpu | cpu_stat_container_nr_throttled | Number of times the group has been throttled/limited | count | container | cpu.stat |
|
||||
| cpu | cpu_stat_container_exter_wait_rate | Wait rate caused by processes outside the container | % | container | Calculated based on throttled_time/hierarchy_wait_sum/inner_wait_sum read from cpu.stat |
|
||||
| cpu | cpu_stat_container_inner_wait_rate | Wait rate caused by processes inside the container | % | container | Calculated based on throttled_time/hierarchy_wait_sum/inner_wait_sum read from cpu.stat |
|
||||
| cpu | cpu_stat_container_throttle_wait_rate | Wait rate caused by throttle of container | % | container | Calculated based on throttled_time/hierarchy_wait_sum/inner_wait_sum read from cpu.stat |
|
||||
| cpu | cpu_stat_container_wait_rate | Total wait rate: exter_wait_rate + inner_wait_rate + throttle_wait_rate | % | container | Calculated based on throttled_time/hierarchy_wait_sum/inner_wait_sum read from cpu.stat |
|
||||
| cpu | loadavg_container_container_nr_running | The number of running tasks in the container | count | container | get from kernel via netlink |
|
||||
| cpu | loadavg_container_container_nr_uninterruptible | The number of uninterruptible tasks in the container | count | container | get from kernel via netlink |
|
||||
| cpu | loadavg_load1 | System load avg over the last 1 minute | count | host | proc fs |
|
||||
| cpu | loadavg_load5 | System load avg over the last 5 minutes | count | host | proc fs |
|
||||
| cpu | loadavg_load15 | System load avg over the last 15 minutes | count | host | proc fs |
|
||||
| cpu | monsoftirq_latency | The number of NET_RX/NET_TX irq latency happened in the following regions:<br>0~10 us<br>100us ~ 1ms<br>10us ~ 100us<br>1ms ~ inf | count | host | hook the softirq event and do time statistics via bpf |
|
||||
| cpu | runqlat_container_nlat_01 | The number of times when schedule latency of processes in the container is within 0~10ms | count | container | hook the scheduling switch event and do time statistics via bpf |
|
||||
| cpu | runqlat_container_nlat_02 | The number of times when schedule latency of processes in the container is within 10~20ms | count | container | hook the scheduling switch event and do time statistics via bpf |
|
||||
| cpu | runqlat_container_nlat_03 | The number of times when schedule latency of processes in the container is within 20~50ms | count | container | hook the scheduling switch event and do time statistics via bpf |
|
||||
| cpu | runqlat_container_nlat_04 | The number of times when schedule latency of processes in the container is more than 50ms | count | container | hook the scheduling switch event and do time statistics via bpf |
|
||||
| cpu | runqlat_g_nlat_01 | The number of times when schedule latency of processes in the host is within<br>0~10ms | count | host | hook the scheduling switch event and do time statistics via bpf |
|
||||
| cpu | runqlat_g_nlat_02 | The number of times when schedule latency of processes in the host is within 10~20ms | count | host | hook the scheduling switch event and do time statistics via bpf |
|
||||
| cpu | runqlat_g_nlat_03 | The number of times when schedule latency of processes in the host is within 20~50ms | count | host | hook the scheduling switch event and do time statistics via bpf |
|
||||
| cpu | runqlat_g_nlat_04 | The number of times when schedule latency of processes in the host is more than 50ms | count | host | hook the scheduling switch event and do time statistics via bpf |
|
||||
| cpu | reschedipi_oversell_probability | The possibility of cpu overselling exists on the host where the vm is located | 0-1 | host | hook the scheduling ipi event and do time statistics via bpf |
|
||||
| memory | buddyinfo_blocks | Kernel memory allocator information | pages | host | proc fs |
|
||||
| memory | memory_events_container_watermark_inc | Counts of memory allocation watermark increasing | count | container | memory.events |
|
||||
| memory | memory_events_container_watermark_dec | Counts of memory allocation watermark decreasing | count | container | memory.events |
|
||||
| memory | memory_others_container_local_direct_reclaim_time | Time spent in page allocation in memory cgroup | nanosecond | container | memory.local_direct_reclaim_time |
|
||||
| memory | memory_others_container_directstall_time | Memory cgroup's direct reclaim time in try_charge | nanosecond | container | memory.directstall_stat |
|
||||
| memory | memory_others_container_asyncreclaim_time | Memory cgroup's direct reclaim time in cgroup async memory reclaim | nanosecond | container | memory.asynreclaim_stat |
|
||||
| memory | priority_reclaim_kswapd | Kswapd's reclaim stat in priority reclaiming | pages | host | proc fs |
|
||||
| memory | priority_reclaim_direct | Direct reclaim stat in priority reclaiming | pages | host | proc fs |
|
||||
| memory | memory_stat_container_writeback | Bytes of file/anon cache that are queued for syncing to disk | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_unevictable | Bytes of memory that cannot be reclaimed (mlocked etc) | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_shmem | Bytes of shmem memory | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_pgsteal_kswapd | Bytes of reclaimed memory by kswapd and cswapd | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_pgsteal_globalkswapd | Bytes of reclaimed memory by kswapd | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_pgsteal_globaldirect | Bytes of reclaimed memory by direct reclaim during page allocation | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_pgsteal_direct | Bytes of reclaimed memory by direct reclaim during page allocation and try_charge | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_pgsteal_cswapd | Bytes of reclaimed memory by cswapd | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_pgscan_kswapd | Bytes of scanned memory by kswapd and cswapd | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_pgscan_globalkswapd | Bytes of scanned memory by kswapd | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_pgscan_globaldirect | Bytes of scanned memory by direct reclaim during page allocation | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_pgscan_direct | Bytes of scanned memory by direct reclaim during page allocation and try_charge | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_pgscan_cswapd | Bytes of scanned memory by cswapd | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_pgrefill | Bytes of memory that is scanned in active list | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_pgdeactivate | Bytes of memory that is deactivated into inactive list | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_inactive_file | Bytes of file-backed memory on inactive lru list. | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_inactive_anon | Bytes of anonymous and swap cache memory on inactive lru list | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_dirty | Bytes that are waiting to get written back to the disk | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_active_file | Bytes of file-backed memory on active lru list | bytes | container | memory.stat |
|
||||
| memory | memory_stat_container_active_anon | Bytes of anonymous and swap cache memory on active lru list | bytes | container | memory.stat |
|
||||
| memory | mountpoint_perm_ro | Whether mountpoint is readonly or not | bool | host | proc fs |
|
||||
| memory | vmstat_allocstall_normal | Host direct reclaim count on normal zone | count | host | /proc/vmstat |
|
||||
| memory | vmstat_allocstall_movable | Host direct reclaim count on movable zone | count | host | /proc/vmstat |
|
||||
| memory | vmstat_compact_stall | Count of memory compaction | count | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_active_anon | Number of anonymous pages on active lru | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_active_file | Number of file-backed pages on active lru | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_boost_pages | Number of pages in kswapd boosting | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_dirty | Number of dirty pages | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_free_pages | Number of free pages | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_inactive_anon | Number of anonymous pages on inactive lru | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_inactive_file | Number of file-backed pages on inactive lru | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_kswapd_boost | Count of kswapd boosting | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_mlock | Number of locked pages | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_shmem | Number of shmem pages | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_slab_reclaimable | Number of reclaimable slab pages | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_slab_unreclaimable | Number of unreclaimable slab pages | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_unevictable | Number of unevictable pages | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_nr_writeback | Number of pages under writeback | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_numa_pages_migrated | Number of pages in numa migrating | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_pgdeactivate | Number of pages which are deactivated into inactive lru | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_pgrefill | Number of pages which are scanned on active lru | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_pgscan_direct | Number of pages which are scanned in direct reclaim | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_pgscan_kswapd | Number of pages which are scanned in kswapd reclaim | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_pgsteal_direct | Number of pages which are reclaimed in direct reclaim | pages | host | /proc/vmstat |
|
||||
| memory | vmstat_pgsteal_kswapd | Number of pages which are reclaimed in kswapd reclaim | pages | host | /proc/vmstat |
|
||||
| memory | hungtask_happened | Count of hungtask events | count | host | performance and statistics monitoring for BPF Programs |
|
||||
| memory | oom_happened | Count of oom events | count | host,container | performance and statistics monitoring for BPF Programs |
|
||||
| memory | softlockup_happened | Count of softlockup events | count | host | performance and statistics monitoring for BPF Programs |
|
||||
| memory | mmhostbpf_compactionstat | Time spent in memory compaction | nanosecond | host | performance and statistics monitoring for BPF Programs |
|
||||
| memory | mmhostbpf_allocstallstat | Time spent in memory direct reclaim on host | nanosecond | host | performance and statistics monitoring for BPF Programs |
|
||||
| memory | mmcgroupbpf_container_directstallcount | Count of cgroup's try_charge direct reclaim | count | container | performance and statistics monitoring for BPF Programs |
|
||||
| IO | iolatency_disk_d2c | Statistics of io latency when accessing the disk, including the time consumed by the driver and hardware components | count | host | performance and statistics monitoring for BPF Programs |
|
||||
| IO | iolatency_disk_q2c | Statistics of io latency for the entire io lifecycle when accessing the disk | count | host | performance and statistics monitoring for BPF Programs |
|
||||
| IO | iolatency_container_d2c | Statistics of io latency when accessing the disk, including the time consumed by the driver and hardware components | count | container | performance and statistics monitoring for BPF Programs |
|
||||
| IO | iolatency_container_q2c | Statistics of io latency for the entire io lifecycle when accessing the disk | count | container | performance and statistics monitoring for BPF Programs |
|
||||
| IO | iolatency_disk_flush | Statistics of delay for flush operations on disk raid device | count | host | performance and statistics monitoring for BPF Programs |
|
||||
| IO | iolatency_container_flush | Statistics of delay for flush operations on disk raid devices caused by containers | count | container | performance and statistics monitoring for BPF Programs |
|
||||
| IO | iolatency_disk_freeze | Statistics of disk freeze events | count | host | performance and statistics monitoring for BPF Programs |
|
||||
| network | tcp_mem_limit_pages | System TCP total memory size limit | pages | system | proc fs |
|
||||
| network | tcp_mem_usage_bytes | The total number of bytes of TCP memory used by the system | bytes | system | tcp_mem_usage_pages \* page_size |
|
||||
| network | tcp_mem_usage_pages | The total size of TCP memory used by the system | pages | system | proc fs |
|
||||
| network | tcp_mem_usage_percent | The percentage of TCP memory used by the system to the limit size | % | system | tcp_mem_usage_pages / tcp_mem_limit_pages |
|
||||
| network | arp_entries | The number of arp cache entries | count | host,container | proc fs |
|
||||
| network | arp_total | Total number of arp cache entries | count | system | proc fs |
|
||||
| network | qdisc_backlog | The number of bytes queued to be sent | bytes | host | sum of same level(parent major) for a device |
|
||||
| network | qdisc_bytes_total | The number of bytes sent | bytes | host | sum of same level(parent major) for a device |
|
||||
| network | qdisc_current_queue_length | The number of packets queued for sending | count | host | sum of same level(parent major) for a device |
|
||||
| network | qdisc_drops_total | The number of discarded packets | count | host | sum of same level(parent major) for a device |
|
||||
| network | qdisc_overlimits_total | The number of queued packets exceeds the limit | count | host | sum of same level(parent major) for a device |
|
||||
| network | qdisc_packets_total | The number of packets sent | count | host | sum of same level(parent major) for a device |
|
||||
| network | qdisc_requeues_total | The number of packets that were not sent successfully and were requeued | count | host | sum of same level(parent major) for a device |
|
||||
| network | ethtool_hardware_rx_dropped_errors | Statistics of inbound packets dropped or errors of interface | count | host | related to hardware drivers, such as mlx, ixgbe, bnxt_en, etc. |
|
||||
| network | netdev_receive_bytes_total | Number of good received bytes | bytes | host,container | proc fs |
|
||||
| network | netdev_receive_compressed_total | Number of correctly received compressed packets | count | host,container | proc fs |
|
||||
| network | netdev_receive_dropped_total | Number of packets received but not processed | count | host,container | proc fs |
|
||||
| network | netdev_receive_errors_total | Total number of bad packets received on this network device | count | host,container | proc fs |
|
||||
| network | netdev_receive_fifo_total | Receiver FIFO error counter | count | host,container | proc fs |
|
||||
| network | netdev_receive_frame_total | Receiver frame alignment errors | count | host,container | proc fs |
|
||||
| network | netdev_receive_multicast_total | Multicast packets received. For hardware interfaces this statistic is commonly calculated at the device level (unlike rx_packets) and therefore may include packets which did not reach the host | count | host,container | proc fs |
|
||||
| network | netdev_receive_packets_total | Number of good packets received by the interface | count | host,container | proc fs |
|
||||
| network | netdev_transmit_bytes_total | Number of good transmitted bytes, corresponding to tx_packets | bytes | host,container | proc fs |
|
||||
| network | netdev_transmit_carrier_total | Number of frame transmission errors due to loss of carrier during transmission | count | host,container | proc fs |
|
||||
| network | netdev_transmit_colls_total | Number of collisions during packet transmissions | count | host,container | proc fs |
|
||||
| network | netdev_transmit_compressed_total | Number of transmitted compressed packets | count | host,container | proc fs |
|
||||
| network | netdev_transmit_dropped_total | Number of packets dropped on their way to transmission, e.g. due to lack of resources | count | host,container | proc fs |
|
||||
| network | netdev_transmit_errors_total | Total number of transmit problems | count | host,container | proc fs |
|
||||
| network | netdev_transmit_fifo_total | Number of frame transmission errors due to device FIFO underrun / underflow | count | host,container | proc fs |
|
||||
| network | netdev_transmit_packets_total | Number of packets successfully transmitted | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_ArpFilter | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_BusyPollRxPackets | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_DelayedACKLocked | A delayed ACK timer expires, but the TCP stack can’t send an ACK immediately due to the socket is locked by a userspace program. The TCP stack will send a pure ACK later (after the userspace program unlock the socket). When the TCP stack sends the pure ACK later, the TCP stack will also update TcpExtDelayedACKs and exit the delayed ACK mode | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_DelayedACKLost | It will be updated when the TCP stack receives a packet which has been ACKed. A Delayed ACK loss might cause this issue, but it would also be triggered by other reasons, such as a packet is duplicated in the network | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_DelayedACKs | A delayed ACK timer expires. The TCP stack will send a pure ACK packet and exit the delayed ACK mode | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_EmbryonicRsts | resets received for embryonic SYN_RECV sockets | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_IPReversePathFilter | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_ListenDrops | When kernel receives a SYN from a client, and if the TCP accept queue is full, kernel will drop the SYN and add 1 to TcpExtListenOverflows. At the same time kernel will also add 1 to TcpExtListenDrops. When a TCP socket is in LISTEN state, and kernel need to drop a packet, kernel would always add 1 to TcpExtListenDrops. So increase TcpExtListenOverflows would let TcpExtListenDrops increasing at the same time, but TcpExtListenDrops would also increase without TcpExtListenOverflows increasing, e.g. a memory allocation fail would also let TcpExtListenDrops increase | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_ListenOverflows | When kernel receives a SYN from a client, and if the TCP accept queue is full, kernel will drop the SYN and add 1 to TcpExtListenOverflows. At the same time kernel will also add 1 to TcpExtListenDrops. When a TCP socket is in LISTEN state, and kernel need to drop a packet, kernel would always add 1 to TcpExtListenDrops. So increase TcpExtListenOverflows would let TcpExtListenDrops increasing at the same time, but TcpExtListenDrops would also increase without TcpExtListenOverflows increasing, e.g. a memory allocation fail would also let TcpExtListenDrops increase | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_LockDroppedIcmps | ICMP packets dropped because socket was locked | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_OfoPruned | The TCP stack tries to discard packet on the out of order queue | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_OutOfWindowIcmps | ICMP pkts dropped because they were out-of-window | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_PAWSActive | Packets are dropped by PAWS in Syn-Sent status | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_PAWSEstab | Packets are dropped by PAWS in any status other than Syn-Sent | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_PFMemallocDrop | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_PruneCalled | The TCP stack tries to reclaim memory for a socket. After updates this counter, the TCP stack will try to collapse the out of order queue and the receiving queue. If the memory is still not enough, the TCP stack will try to discard packets from the out of order queue (and update the TcpExtOfoPruned counter) | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_RcvPruned | After ‘collapse’ and discard packets from the out of order queue, if the actually used memory is still larger than the max allowed memory, this counter will be updated. It means the ‘prune’ fails | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_SyncookiesFailed | The MSS decoded from the SYN cookie is invalid. When this counter is updated, the received packet won’t be treated as a SYN cookie and the TcpExtSyncookiesRecv counter won’t be updated | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_SyncookiesRecv | How many reply packets of the SYN cookies the TCP stack receives | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_SyncookiesSent | It indicates how many SYN cookies are sent | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPACKSkippedChallenge | The ACK is skipped if the ACK is a challenge ACK | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPACKSkippedFinWait2 | The ACK is skipped in Fin-Wait-2 status, the reason would be either PAWS check fails or the received sequence number is out of window | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPACKSkippedPAWS | The ACK is skipped due to PAWS (Protect Against Wrapped Sequence numbers) check fails | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPACKSkippedSeq | The sequence number is out of window and the timestamp passes the PAWS check and the TCP status is not Syn-Recv, Fin-Wait-2, and Time-Wait | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPACKSkippedSynRecv | The ACK is skipped in Syn-Recv status. The Syn-Recv status means the TCP stack receives a SYN and replies SYN+ACK | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPACKSkippedTimeWait | The ACK is skipped in Time-Wait status, the reason would be either PAWS check failed or the received sequence number is out of window | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPAbortFailed | The kernel TCP layer will send RST if the RFC2525 2.17 section is satisfied. If an internal error occurs during this process, TcpExtTCPAbortFailed will be increased | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPAbortOnClose | Number of sockets closed when the user-mode program has data in the buffer | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPAbortOnData | It means TCP layer has data in flight, but need to close the connection | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPAbortOnLinger | When a TCP connection comes into FIN_WAIT_2 state, instead of waiting for the fin packet from the other side, kernel could send a RST and delete the socket immediately | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPAbortOnMemory | When an application closes a TCP connection, kernel still need to track the connection, let it complete the TCP disconnect process | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPAbortOnTimeout | This counter will increase when any of the TCP timers expire. In such situation, kernel won’t send RST, just give up the connection | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPAckCompressed | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPAutoCorking | When sending packets, the TCP layer will try to merge small packets to a bigger one | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPBacklogDrop | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPChallengeACK | The number of challenge acks sent | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPDSACKIgnoredNoUndo | When a DSACK block is invalid, one of these two counters would be updated. Which counter will be updated depends on the undo_marker flag of the TCP socket | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPDSACKIgnoredOld | When a DSACK block is invalid, one of these two counters would be updated. Which counter will be updated depends on the undo_marker flag of the TCP socket | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPDSACKOfoRecv | The TCP stack receives a DSACK, which indicate an out of order duplicate packet is received | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPDSACKOfoSent | The TCP stack receives an out of order duplicate packet, so it sends a DSACK to the sender | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPDSACKOldSent | The TCP stack receives a duplicate packet which has been acked, so it sends a DSACK to the sender | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPDSACKRecv | The TCP stack receives a DSACK, which indicates an acknowledged duplicate packet is received | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPDSACKUndo | Congestion window recovered without slow start using DSACK | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPDeferAcceptDrop | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPDelivered | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPDeliveredCE | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPFastOpenActive | When the TCP stack receives an ACK packet in the SYN-SENT status, and the ACK packet acknowledges the data in the SYN packet, the TCP stack understand the TFO cookie is accepted by the other side, then it updates this counter | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPFastOpenActiveFail | Fast Open attempts (SYN/data) failed because the remote does not accept it or the attempts timed out | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPFastOpenBlackhole | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPFastOpenCookieReqd | This counter indicates how many times a client wants to request a TFO cookie | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPFastOpenListenOverflow | When the pending fast open request number is larger than fastopenq->max_qlen, the TCP stack will reject the fast open request and update this counter | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPFastOpenPassive | This counter indicates how many times the TCP stack accepts the fast open request | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPFastOpenPassiveFail | This counter indicates how many times the TCP stack rejects the fast open request. It is caused by either the TFO cookie is invalid or the TCP stack finds an error during the socket creating process | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPFastRetrans | The TCP stack wants to retransmit a packet and the congestion control state is not ‘Loss’ | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPFromZeroWindowAdv | The TCP receive window is set to no-zero value from zero | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPFullUndo | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPHPAcks | If a packet set ACK flag and has no data, it is a pure ACK packet, if kernel handles it in the fast path, TcpExtTCPHPAcks will increase 1 | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPHPHits | If a TCP packet has data (which means it is not a pure ACK packet), and this packet is handled in the fast path, TcpExtTCPHPHits will increase 1 | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPHystartDelayCwnd | The sum of CWND detected by packet delay. Dividing this value by TcpExtTCPHystartDelayDetect is the average CWND which detected by the packet delay | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPHystartDelayDetect | How many times the packet delay threshold is detected | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPHystartTrainCwnd | The sum of CWND detected by ACK train length. Dividing this value by TcpExtTCPHystartTrainDetect is the average CWND which detected by the ACK train length | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPHystartTrainDetect | How many times the ACK train length threshold is detected | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPKeepAlive | This counter indicates how many keepalive packets were sent. The keepalive won’t be enabled by default. A userspace program could enable it by setting the SO_KEEPALIVE socket option | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPLossFailures | Number of connections that enter the TCP_CA_Loss phase and then undergo RTO timeout | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPLossProbeRecovery | A packet loss is detected and recovered by TLP | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPLossProbes | A TLP probe packet is sent | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPLossUndo | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPLostRetransmit | A SACK points out that a retransmission packet is lost again | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPMD5Failure | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPMD5NotFound | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPMD5Unexpected | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPMTUPFail | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPMTUPSuccess | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPMemoryPressures | Number of times TCP ran low on memory | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPMemoryPressuresChrono | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPMinTTLDrop | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPOFODrop | The TCP layer receives an out of order packet but doesn’t have enough memory, so drops it. Such packets won’t be counted into TcpExtTCPOFOQueue | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPOFOMerge | The received out of order packet has an overlap with the previous packet. The overlapping part will be dropped. All of TcpExtTCPOFOMerge packets will also be counted into TcpExtTCPOFOQueue | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPOFOQueue | The TCP layer receives an out of order packet and has enough memory to queue it | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPOrigDataSent | Number of outgoing packets with original data (excluding retransmission but including data-in-SYN). This counter is different from TcpOutSegs because TcpOutSegs also tracks pure ACKs. TCPOrigDataSent is more useful to track the TCP retransmission rate | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPPartialUndo | Detected some erroneous retransmits; a partial ACK arrived while we were fast retransmitting, so we were able to partially undo some of our CWND reduction | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPPureAcks | If a packet set ACK flag and has no data, it is a pure ACK packet, if kernel handles it in the fast path, TcpExtTCPHPAcks will increase 1, if kernel handles it in the slow path, TcpExtTCPPureAcks will increase 1 | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPRcvCoalesce | When packets are received by the TCP layer and are not read by the application, the TCP layer will try to merge them. This counter indicates how many packets are merged in such a situation. If GRO is enabled, lots of packets would be merged by GRO; these packets wouldn’t be counted into TcpExtTCPRcvCoalesce | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPRcvCollapsed | This counter indicates how many skbs are freed during ‘collapse’ | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPRenoFailures | Number of failures that enter the TCP_CA_Disorder phase and then undergo RTO | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPRenoRecovery | When the congestion control comes into Recovery state, if sack is used, TcpExtTCPSackRecovery increases 1, if sack is not used, TcpExtTCPRenoRecovery increases 1. These two counters mean the TCP stack begins to retransmit the lost packets | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPRenoRecoveryFail | Number of connections that enter the Recovery phase and then undergo RTO | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPRenoReorder | The reorder packet is detected by fast recovery. It would only be used if SACK is disabled | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPReqQFullDoCookies | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPReqQFullDrop | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPRetransFail | The TCP stack tries to deliver a retransmission packet to lower layers but the lower layers return an error | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSACKDiscard | This counter indicates how many SACK blocks are invalid. If the invalid SACK block is caused by ACK recording, the TCP stack will only ignore it and won’t update this counter | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSACKReneging | A packet was acknowledged by SACK, but the receiver has dropped this packet, so the sender needs to retransmit this packet | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSACKReorder | The reorder packet detected by SACK | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSYNChallenge | The number of challenge acks sent in response to SYN packets | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSackFailures | Number of failures that enter the TCP_CA_Disorder phase and then undergo RTO | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSackMerged | A skb is merged | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSackRecovery | When the congestion control comes into Recovery state, if sack is used, TcpExtTCPSackRecovery increases 1, if sack is not used, TcpExtTCPRenoRecovery increases 1. These two counters mean the TCP stack begins to retransmit the lost packets | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSackRecoveryFail | When the congestion control comes into Recovery state, if sack is used, TcpExtTCPSackRecovery increases 1 | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSackShiftFallback | A skb should be shifted or merged, but the TCP stack doesn’t do it for some reasons | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSackShifted | A skb is shifted | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSlowStartRetrans | The TCP stack wants to retransmit a packet and the congestion control state is ‘Loss’ | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSpuriousRTOs | The spurious retransmission timeout detected by the F-RTO algorithm | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSpuriousRtxHostQueues | When the TCP stack wants to retransmit a packet, and finds that packet is not lost in the network, but the packet is not sent yet, the TCP stack would give up the retransmission and update this counter. It might happen if a packet stays too long time in a qdisc or driver queue | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPSynRetrans | Number of SYN and SYN/ACK retransmits to break down retransmissions into SYN, fast-retransmits, timeout retransmits, etc | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPTSReorder | The reorder packet is detected when a hole is filled | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPTimeWaitOverflow | Number of TIME_WAIT sockets unable to be allocated due to limit exceeding | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPTimeouts | TCP timeout events | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPToZeroWindowAdv | The TCP receive window is set to zero from a non-zero value | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPWantZeroWindowAdv | Depending on current memory usage, the TCP stack tries to set the receive window to zero. But the receive window might still be a non-zero value | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPWinProbe | Number of ACK packets to be sent at regular intervals to make sure a reverse ACK packet opening back a window has not been lost | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TCPWqueueTooBig | \- | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TW | TCP sockets finished time wait in fast timer | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TWKilled | TCP sockets finished time wait in slow timer | count | host,container | proc fs |
|
||||
| network | netstat_TcpExt_TWRecycled | Time wait sockets recycled by time stamp | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_ActiveOpens | It means the TCP layer sends a SYN and comes into the SYN-SENT state. Every time TcpActiveOpens increases 1, TcpOutSegs should always increase 1 | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_AttemptFails | The number of times TCP connections have made a direct transition to the CLOSED state from either the SYN-SENT state or the SYN-RCVD state, plus the number of times TCP connections have made a direct transition to the LISTEN state from the SYN-RCVD state | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_CurrEstab | The number of TCP connections for which the current state is either ESTABLISHED or CLOSE-WAIT | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_EstabResets | The number of times TCP connections have made a direct transition to the CLOSED state from either the ESTABLISHED state or the CLOSE-WAIT state | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_InCsumErrors | Incremented when a TCP checksum failure is detected | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_InErrs | The total number of segments received in error (e.g., bad TCP checksums) | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_InSegs | The number of packets received by the TCP layer. As mentioned in RFC1213, it includes the packets received in error, such as checksum error, invalid TCP header and so on | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_MaxConn | The limit on the total number of TCP connections the entity can support. In entities where the maximum number of connections is dynamic, this object should contain the value -1 | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_OutRsts | The number of TCP segments sent containing the RST flag | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_OutSegs | The total number of segments sent, including those on current connections but excluding those containing only retransmitted octets | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_PassiveOpens | The number of times TCP connections have made a direct transition to the SYN-RCVD state from the LISTEN state | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_RetransSegs | The total number of segments retransmitted - that is, the number of TCP segments transmitted containing one or more previously transmitted octets | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_RtoAlgorithm | The algorithm used to determine the timeout value used for retransmitting unacknowledged octets | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_RtoMax | The maximum value permitted by a TCP implementation for the retransmission timeout, measured in milliseconds. More refined semantics for objects of this type depend upon the algorithm used to determine the retransmission timeout | count | host,container | proc fs |
|
||||
| network | netstat_Tcp_RtoMin | The minimum value permitted by a TCP implementation for the retransmission timeout, measured in milliseconds. More refined semantics for objects of this type depend upon the algorithm used to determine the retransmission timeout | count | host,container | proc fs |
|
||||
| network | sockstat_FRAG_inuse | \- | count | host,container | proc fs |
|
||||
| network | sockstat_FRAG_memory | \- | pages | host,container | proc fs |
|
||||
| network | sockstat_RAW_inuse | Number of RAW socket used | count | host,container | proc fs |
|
||||
| network | sockstat_TCP_alloc | The number of TCP sockets that have been allocated | count | host,container | proc fs |
|
||||
| network | sockstat_TCP_inuse | Established TCP socket number | count | host,container | proc fs |
|
||||
| network | sockstat_TCP_mem | The total size of TCP memory used by the system | pages | system | proc fs |
|
||||
| network | sockstat_TCP_mem_bytes | The total size of TCP memory used by the system | bytes | system | sockstat_TCP_mem \* page_size |
|
||||
| network | sockstat_TCP_orphan | Number of TCP connections waiting to be closed | count | host,container | proc fs |
|
||||
| network | sockstat_TCP_tw | Number of TCP sockets to be terminated | count | host,container | proc fs |
|
||||
| network | sockstat_UDPLITE_inuse | \- | count | host,container | proc fs |
|
||||
| network | sockstat_UDP_inuse | Number of UDP socket used | count | host,container | proc fs |
|
||||
| network | sockstat_UDP_mem | The total size of udp memory used by the system | pages | system | proc fs |
|
||||
| network | sockstat_UDP_mem_bytes | The total number of bytes of udp memory used by the system | bytes | system | sockstat_UDP_mem \* page_size |
|
||||
| network | sockstat_sockets_used | The number of sockets used by the system | count | system | proc fs |
|
After Width: | Height: | Size: 111 KiB |
After Width: | Height: | Size: 111 KiB |
|
@ -0,0 +1,143 @@
|
|||
module huatuo-bamai
|
||||
|
||||
go 1.22.4
|
||||
|
||||
require (
|
||||
github.com/cilium/ebpf v0.16.0
|
||||
github.com/containerd/cgroups/v3 v3.0.3
|
||||
github.com/deckarep/golang-set v1.8.0
|
||||
github.com/docker/docker v27.2.0+incompatible
|
||||
github.com/elastic/go-elasticsearch/v7 v7.17.10
|
||||
github.com/ema/qdisc v1.0.0
|
||||
github.com/gin-contrib/pprof v1.5.1
|
||||
github.com/gin-gonic/gin v1.10.0
|
||||
github.com/go-playground/validator/v10 v10.22.1
|
||||
github.com/google/cadvisor v0.50.0
|
||||
github.com/grafana/grafana-plugin-sdk-go v0.251.0
|
||||
github.com/jsimonetti/rtnetlink v1.4.2
|
||||
github.com/mdlayher/netlink v1.7.2
|
||||
github.com/opencontainers/runtime-spec v1.2.0
|
||||
github.com/pelletier/go-toml v1.9.5
|
||||
github.com/pkg/errors v0.9.1
|
||||
github.com/prometheus/client_golang v1.20.3
|
||||
github.com/prometheus/procfs v0.15.1
|
||||
github.com/shirou/gopsutil v2.21.11+incompatible
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
github.com/tklauser/numcpus v0.6.1
|
||||
github.com/urfave/cli/v2 v2.27.4
|
||||
github.com/vishvananda/netlink v1.3.0
|
||||
golang.org/x/sys v0.27.0
|
||||
golang.org/x/time v0.6.0
|
||||
gopkg.in/natefinch/lumberjack.v2 v2.2.1
|
||||
k8s.io/api v0.31.3
|
||||
k8s.io/cri-client v0.31.3
|
||||
k8s.io/kubelet v0.29.0
|
||||
sigs.k8s.io/yaml v1.5.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect
|
||||
github.com/Microsoft/go-winio v0.6.2 // indirect
|
||||
github.com/apache/arrow/go/v15 v15.0.2 // indirect
|
||||
github.com/beorn7/perks v1.0.1 // indirect
|
||||
github.com/blang/semver/v4 v4.0.0 // indirect
|
||||
github.com/bytedance/sonic v1.12.4 // indirect
|
||||
github.com/bytedance/sonic/loader v0.2.1 // indirect
|
||||
github.com/cenkalti/backoff/v4 v4.3.0 // indirect
|
||||
github.com/cespare/xxhash/v2 v2.3.0 // indirect
|
||||
github.com/cheekybits/genny v1.0.0 // indirect
|
||||
github.com/cloudwego/base64x v0.1.4 // indirect
|
||||
github.com/cloudwego/iasm v0.2.0 // indirect
|
||||
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.4 // indirect
|
||||
github.com/distribution/reference v0.6.0 // indirect
|
||||
github.com/docker/go-connections v0.5.0 // indirect
|
||||
github.com/docker/go-units v0.5.0 // indirect
|
||||
github.com/felixge/httpsnoop v1.0.4 // indirect
|
||||
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
|
||||
github.com/gabriel-vasile/mimetype v1.4.6 // indirect
|
||||
github.com/gin-contrib/sse v0.1.0 // indirect
|
||||
github.com/go-logr/logr v1.4.2 // indirect
|
||||
github.com/go-logr/stdr v1.2.2 // indirect
|
||||
github.com/go-ole/go-ole v1.2.6 // indirect
|
||||
github.com/go-openapi/swag v0.22.9 // indirect
|
||||
github.com/go-playground/locales v0.14.1 // indirect
|
||||
github.com/go-playground/universal-translator v0.18.1 // indirect
|
||||
github.com/goccy/go-json v0.10.3 // indirect
|
||||
github.com/godbus/dbus/v5 v5.0.6 // indirect
|
||||
github.com/gogo/protobuf v1.3.2 // indirect
|
||||
github.com/google/flatbuffers v23.5.26+incompatible // indirect
|
||||
github.com/google/go-cmp v0.6.0 // indirect
|
||||
github.com/google/gofuzz v1.2.0 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect
|
||||
github.com/inconshreveable/mousetrap v1.1.0 // indirect
|
||||
github.com/josharian/native v1.1.0 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/klauspost/compress v1.17.9 // indirect
|
||||
github.com/klauspost/cpuid/v2 v2.2.9 // indirect
|
||||
github.com/leodido/go-urn v1.4.0 // indirect
|
||||
github.com/mattetti/filebuffer v1.0.1 // indirect
|
||||
github.com/mattn/go-isatty v0.0.20 // indirect
|
||||
github.com/mattn/go-runewidth v0.0.14 // indirect
|
||||
github.com/mdlayher/socket v0.4.1 // indirect
|
||||
github.com/moby/docker-image-spec v1.3.1 // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
|
||||
github.com/olekukonko/tablewriter v0.0.5 // indirect
|
||||
github.com/opencontainers/go-digest v1.0.0 // indirect
|
||||
github.com/opencontainers/image-spec v1.1.0 // indirect
|
||||
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
|
||||
github.com/pierrec/lz4/v4 v4.1.18 // indirect
|
||||
github.com/prometheus/client_model v0.6.1 // indirect
|
||||
github.com/prometheus/common v0.55.0 // indirect
|
||||
github.com/rivo/uniseg v0.4.3 // indirect
|
||||
github.com/russross/blackfriday/v2 v2.1.0 // indirect
|
||||
github.com/spf13/cobra v1.8.1 // indirect
|
||||
github.com/spf13/pflag v1.0.5 // indirect
|
||||
github.com/tklauser/go-sysconf v0.3.12 // indirect
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
|
||||
github.com/ugorji/go/codec v1.2.12 // indirect
|
||||
github.com/vishvananda/netns v0.0.4 // indirect
|
||||
github.com/x448/float16 v0.8.4 // indirect
|
||||
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
|
||||
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
||||
github.com/zeebo/xxh3 v1.0.2 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.53.0 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 // indirect
|
||||
go.opentelemetry.io/otel v1.29.0 // indirect
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0 // indirect
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.28.0 // indirect
|
||||
go.opentelemetry.io/otel/metric v1.29.0 // indirect
|
||||
go.opentelemetry.io/otel/sdk v1.29.0 // indirect
|
||||
go.opentelemetry.io/otel/trace v1.29.0 // indirect
|
||||
go.opentelemetry.io/proto/otlp v1.3.1 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.2 // indirect
|
||||
golang.org/x/arch v0.12.0 // indirect
|
||||
golang.org/x/crypto v0.29.0 // indirect
|
||||
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948 // indirect
|
||||
golang.org/x/mod v0.20.0 // indirect
|
||||
golang.org/x/net v0.31.0 // indirect
|
||||
golang.org/x/oauth2 v0.23.0 // indirect
|
||||
golang.org/x/sync v0.9.0 // indirect
|
||||
golang.org/x/text v0.20.0 // indirect
|
||||
golang.org/x/tools v0.24.0 // indirect
|
||||
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
|
||||
google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd // indirect
|
||||
google.golang.org/genproto/googleapis/rpc v0.0.0-20240814211410-ddb44dafa142 // indirect
|
||||
google.golang.org/grpc v1.66.0 // indirect
|
||||
google.golang.org/protobuf v1.35.2 // indirect
|
||||
gopkg.in/inf.v0 v0.9.1 // indirect
|
||||
gopkg.in/yaml.v2 v2.4.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
gotest.tools/v3 v3.5.1 // indirect
|
||||
k8s.io/apimachinery v0.31.3 // indirect
|
||||
k8s.io/client-go v0.31.3 // indirect
|
||||
k8s.io/component-base v0.31.3 // indirect
|
||||
k8s.io/cri-api v0.31.3 // indirect
|
||||
k8s.io/klog/v2 v2.130.1 // indirect
|
||||
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 // indirect
|
||||
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
|
||||
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
|
||||
)
|
|
@ -0,0 +1,432 @@
|
|||
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0=
|
||||
github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E=
|
||||
github.com/BurntSushi/toml v1.3.2 h1:o7IhLm0Msx3BaB+n3Ag7L8EVlByGnpq14C4YWiu/gL8=
|
||||
github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
|
||||
github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY=
|
||||
github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU=
|
||||
github.com/apache/arrow/go/v15 v15.0.2 h1:60IliRbiyTWCWjERBCkO1W4Qun9svcYoZrSLcyOsMLE=
|
||||
github.com/apache/arrow/go/v15 v15.0.2/go.mod h1:DGXsR3ajT524njufqf95822i+KTh+yea1jass9YXgjA=
|
||||
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
|
||||
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
|
||||
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
|
||||
github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
|
||||
github.com/bytedance/sonic v1.12.4 h1:9Csb3c9ZJhfUWeMtpCDCq6BUoH5ogfDFLUgQ/jG+R0k=
|
||||
github.com/bytedance/sonic v1.12.4/go.mod h1:B8Gt/XvtZ3Fqj+iSKMypzymZxw/FVwgIGKzMzT9r/rk=
|
||||
github.com/bytedance/sonic/loader v0.1.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
|
||||
github.com/bytedance/sonic/loader v0.2.1 h1:1GgorWTqf12TA8mma4DDSbaQigE2wOgQo7iCjjJv3+E=
|
||||
github.com/bytedance/sonic/loader v0.2.1/go.mod h1:ncP89zfokxS5LZrJxl5z0UJcsk4M4yY2JpfqGeCtNLU=
|
||||
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
|
||||
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
|
||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||
github.com/cheekybits/genny v1.0.0 h1:uGGa4nei+j20rOSeDeP5Of12XVm7TGUd4dJA9RDitfE=
|
||||
github.com/cheekybits/genny v1.0.0/go.mod h1:+tQajlRqAUrPI7DOSpB0XAqZYtQakVtB7wXkRAgjxjQ=
|
||||
github.com/chromedp/cdproto v0.0.0-20220208224320-6efb837e6bc2 h1:XCdvHbz3LhewBHN7+mQPx0sg/Hxil/1USnBmxkjHcmY=
|
||||
github.com/chromedp/cdproto v0.0.0-20220208224320-6efb837e6bc2/go.mod h1:At5TxYYdxkbQL0TSefRjhLE3Q0lgvqKKMSFUglJ7i1U=
|
||||
github.com/cilium/ebpf v0.16.0 h1:+BiEnHL6Z7lXnlGUsXQPPAE7+kenAd4ES8MQ5min0Ok=
|
||||
github.com/cilium/ebpf v0.16.0/go.mod h1:L7u2Blt2jMM/vLAVgjxluxtBKlz3/GWjB0dMOEngfwE=
|
||||
github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/0Y=
|
||||
github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w=
|
||||
github.com/cloudwego/iasm v0.2.0 h1:1KNIy1I1H9hNNFEEH3DVnI4UujN+1zjpuk6gwHLTssg=
|
||||
github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY=
|
||||
github.com/containerd/cgroups/v3 v3.0.3 h1:S5ByHZ/h9PMe5IOQoN7E+nMc2UcLEM/V48DGDJ9kip0=
|
||||
github.com/containerd/cgroups/v3 v3.0.3/go.mod h1:8HBe7V3aWGLFPd/k03swSIsGjZhHI2WzJmticMgVuz0=
|
||||
github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I=
|
||||
github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo=
|
||||
github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs=
|
||||
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.4 h1:wfIWP927BUkWJb2NmU/kNDYIBTh/ziUX91+lVfRxZq4=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
|
||||
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/deckarep/golang-set v1.8.0 h1:sk9/l/KqpunDwP7pSjUg0keiOOLEnOBHzykLrsPppp4=
|
||||
github.com/deckarep/golang-set v1.8.0/go.mod h1:5nI87KwE7wgsBU1F4GKAw2Qod7p5kyS383rP6+o6qqo=
|
||||
github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk=
|
||||
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
|
||||
github.com/docker/docker v27.2.0+incompatible h1:Rk9nIVdfH3+Vz4cyI/uhbINhEZ/oLmc+CBXmH6fbNk4=
|
||||
github.com/docker/docker v27.2.0+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk=
|
||||
github.com/docker/go-connections v0.5.0 h1:USnMq7hx7gwdVZq1L49hLXaFtUdTADjXGp+uj1Br63c=
|
||||
github.com/docker/go-connections v0.5.0/go.mod h1:ov60Kzw0kKElRwhNs9UlUHAE/F9Fe6GLaXnqyDdmEXc=
|
||||
github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
|
||||
github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
|
||||
github.com/elastic/go-elasticsearch/v7 v7.17.10 h1:TCQ8i4PmIJuBunvBS6bwT2ybzVFxxUhhltAs3Gyu1yo=
|
||||
github.com/elastic/go-elasticsearch/v7 v7.17.10/go.mod h1:OJ4wdbtDNk5g503kvlHLyErCgQwwzmDtaFC4XyOxXA4=
|
||||
github.com/elazarl/goproxy v0.0.0-20230731152917-f99041a5c027 h1:1L0aalTpPz7YlMxETKpmQoWMBkeiuorElZIXoNmgiPE=
|
||||
github.com/elazarl/goproxy v0.0.0-20230731152917-f99041a5c027/go.mod h1:Ro8st/ElPeALwNFlcTpWmkr6IoMFfkjXAvTHpevnDsM=
|
||||
github.com/ema/qdisc v1.0.0 h1:EHLG08FVRbWLg8uRICa3xzC9Zm0m7HyMHfXobWFnXYg=
|
||||
github.com/ema/qdisc v1.0.0/go.mod h1:FhIc0fLYi7f+lK5maMsesDqwYojIOh3VfRs8EVd5YJQ=
|
||||
github.com/fatih/color v1.15.0 h1:kOqh6YHBtK8aywxGerMG2Eq3H6Qgoqeo13Bk2Mv/nBs=
|
||||
github.com/fatih/color v1.15.0/go.mod h1:0h5ZqXfHYED7Bhv2ZJamyIOUej9KtShiJESRwBDUSsw=
|
||||
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
|
||||
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
|
||||
github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E=
|
||||
github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ=
|
||||
github.com/gabriel-vasile/mimetype v1.4.6 h1:3+PzJTKLkvgjeTbts6msPJt4DixhT4YtFNf1gtGe3zc=
|
||||
github.com/gabriel-vasile/mimetype v1.4.6/go.mod h1:JX1qVKqZd40hUPpAfiNTe0Sne7hdfKSbOqqmkq8GCXc=
|
||||
github.com/getkin/kin-openapi v0.124.0 h1:VSFNMB9C9rTKBnQ/fpyDU8ytMTr4dWI9QovSKj9kz/M=
|
||||
github.com/getkin/kin-openapi v0.124.0/go.mod h1:wb1aSZA/iWmorQP9KTAS/phLj/t17B5jT7+fS8ed9NM=
|
||||
github.com/gin-contrib/pprof v1.5.1 h1:Mzy+3HHtHbtwr4VewBTXZp/hR7pS6ZuZkueBIrQiLL4=
|
||||
github.com/gin-contrib/pprof v1.5.1/go.mod h1:uwzoF6FxdzJJGyMdcZB+VSuVjOBe1kSH+KMIvKGwvCQ=
|
||||
github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
|
||||
github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
|
||||
github.com/gin-gonic/gin v1.10.0 h1:nTuyha1TYqgedzytsKYqna+DfLos46nTv2ygFy86HFU=
|
||||
github.com/gin-gonic/gin v1.10.0/go.mod h1:4PMNQiOhvDRa013RKVbsiNwoyezlm2rm0uX/T7kzp5Y=
|
||||
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
|
||||
github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
|
||||
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
|
||||
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
|
||||
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
|
||||
github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY=
|
||||
github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0=
|
||||
github.com/go-openapi/jsonpointer v0.20.2 h1:mQc3nmndL8ZBzStEo3JYF8wzmeWffDH4VbXz58sAx6Q=
|
||||
github.com/go-openapi/jsonpointer v0.20.2/go.mod h1:bHen+N0u1KEO3YlmqOjTT9Adn1RfD91Ar825/PuiRVs=
|
||||
github.com/go-openapi/swag v0.22.9 h1:XX2DssF+mQKM2DHsbgZK74y/zj4mo9I99+89xUmuZCE=
|
||||
github.com/go-openapi/swag v0.22.9/go.mod h1:3/OXnFfnMAwBD099SwYRk7GD3xOrr1iL7d/XNLXVVwE=
|
||||
github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
|
||||
github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
|
||||
github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
|
||||
github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
|
||||
github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
|
||||
github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
|
||||
github.com/go-playground/validator/v10 v10.22.1 h1:40JcKH+bBNGFczGuoBYgX4I6m/i27HYW8P9FDk5PbgA=
|
||||
github.com/go-playground/validator/v10 v10.22.1/go.mod h1:dbuPbCMFw/DrkbEynArYaCwl3amGuJotoKCe95atGMM=
|
||||
github.com/go-quicktest/qt v1.101.0 h1:O1K29Txy5P2OK0dGo59b7b0LR6wKfIhttaAhHUyn7eI=
|
||||
github.com/go-quicktest/qt v1.101.0/go.mod h1:14Bz/f7NwaXPtdYEgzsx46kqSxVwTbzVZsDC26tQJow=
|
||||
github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA=
|
||||
github.com/goccy/go-json v0.10.3/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
|
||||
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
|
||||
github.com/godbus/dbus/v5 v5.0.6 h1:mkgN1ofwASrYnJ5W6U/BxG15eXXXjirgZc7CLqkcaro=
|
||||
github.com/godbus/dbus/v5 v5.0.6/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
|
||||
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
|
||||
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
|
||||
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
|
||||
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
|
||||
github.com/google/cadvisor v0.50.0 h1:7w/hKIbJKBWqQsRTy+Hpj2vj+fnxrLXcEXFy+LW0Bsg=
|
||||
github.com/google/cadvisor v0.50.0/go.mod h1:VxCDwZalpFyENvmfabFqaIGsqNKLtDzE62a19rfVTB8=
|
||||
github.com/google/flatbuffers v23.5.26+incompatible h1:M9dgRyhJemaM4Sw8+66GHBu8ioaQmyPLg1b8VwK5WJg=
|
||||
github.com/google/flatbuffers v23.5.26+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
|
||||
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
|
||||
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0=
|
||||
github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY=
|
||||
github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ=
|
||||
github.com/grafana/grafana-plugin-sdk-go v0.251.0 h1:gnOtxrC/1rqFvpSbQYyoZqkr47oWDlz4Q2L6Ozmsi3w=
|
||||
github.com/grafana/grafana-plugin-sdk-go v0.251.0/go.mod h1:gCGN9kHY3KeX4qyni3+Kead38Q+85pYOrsDcxZp6AIk=
|
||||
github.com/grafana/otel-profiling-go v0.5.1 h1:stVPKAFZSa7eGiqbYuG25VcqYksR6iWvF3YH66t4qL8=
|
||||
github.com/grafana/otel-profiling-go v0.5.1/go.mod h1:ftN/t5A/4gQI19/8MoWurBEtC6gFw8Dns1sJZ9W4Tls=
|
||||
github.com/grafana/pyroscope-go/godeltaprof v0.1.8 h1:iwOtYXeeVSAeYefJNaxDytgjKtUuKQbJqgAIjlnicKg=
|
||||
github.com/grafana/pyroscope-go/godeltaprof v0.1.8/go.mod h1:2+l7K7twW49Ct4wFluZD3tZ6e0SjanjcUUBPVD/UuGU=
|
||||
github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1 h1:qnpSQwGEnkcRpTqNOIR6bJbR0gAorgP9CSALpRcKoAA=
|
||||
github.com/grpc-ecosystem/go-grpc-middleware/providers/prometheus v1.0.1/go.mod h1:lXGCsh6c22WGtjr+qGHj1otzZpV/1kwTMAqkwZsnWRU=
|
||||
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0 h1:pRhl55Yx1eC7BZ1N+BBWwnKaMyD8uC+34TLdndZMAKk=
|
||||
github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.1.0/go.mod h1:XKMd7iuf/RGPSMJ/U4HP0zS2Z9Fh8Ps9a+6X26m/tmI=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0=
|
||||
github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k=
|
||||
github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB11/k=
|
||||
github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M=
|
||||
github.com/hashicorp/go-plugin v1.6.1 h1:P7MR2UP6gNKGPp+y7EZw2kOiq4IR9WiqLvp0XOsVdwI=
|
||||
github.com/hashicorp/go-plugin v1.6.1/go.mod h1:XPHFku2tFo3o3QKFgSYo+cghcUhw1NA1hZyMK0PWAw0=
|
||||
github.com/hashicorp/yamux v0.1.1 h1:yrQxtgseBDrq9Y652vSRDvsKCJKOUD+GzTS4Y0Y8pvE=
|
||||
github.com/hashicorp/yamux v0.1.1/go.mod h1:CtWFDAQgb7dxtzFs4tWbplKIe2jSi3+5vKbgIO0SLnQ=
|
||||
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
|
||||
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
|
||||
github.com/invopop/yaml v0.2.0 h1:7zky/qH+O0DwAyoobXUqvVBwgBFRxKoQ/3FjcVpjTMY=
|
||||
github.com/invopop/yaml v0.2.0/go.mod h1:2XuRLgs/ouIrW3XNzuNj7J3Nvu/Dig5MXvbCEdiBN3Q=
|
||||
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
|
||||
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
|
||||
github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA=
|
||||
github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w=
|
||||
github.com/jsimonetti/rtnetlink v1.4.2 h1:Df9w9TZ3npHTyDn0Ev9e1uzmN2odmXd0QX+J5GTEn90=
|
||||
github.com/jsimonetti/rtnetlink v1.4.2/go.mod h1:92s6LJdE+1iOrw+F2/RO7LYI2Qd8pPpFNNUYW06gcoM=
|
||||
github.com/jsimonetti/rtnetlink/v2 v2.0.1 h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM=
|
||||
github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE=
|
||||
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
|
||||
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
|
||||
github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8=
|
||||
github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
|
||||
github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA=
|
||||
github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
|
||||
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
|
||||
github.com/klauspost/cpuid/v2 v2.2.9 h1:66ze0taIn2H33fBvCkXuv9BmCwDfafmiIVpKV9kKGuY=
|
||||
github.com/klauspost/cpuid/v2 v2.2.9/go.mod h1:rqkxqrZ1EhYM9G+hXH7YdowN5R5RGN6NK4QwQ3WMXF8=
|
||||
github.com/knz/go-libedit v1.10.1/go.mod h1:MZTVkCWyz0oBc7JOWP3wNAzd002ZbM/5hgShxwh4x8M=
|
||||
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
|
||||
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
|
||||
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
|
||||
github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
|
||||
github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
|
||||
github.com/magefile/mage v1.15.0 h1:BvGheCMAsG3bWUDbZ8AyXXpCNwU9u5CB6sM+HNb9HYg=
|
||||
github.com/magefile/mage v1.15.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A=
|
||||
github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
|
||||
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
|
||||
github.com/mattetti/filebuffer v1.0.1 h1:gG7pyfnSIZCxdoKq+cPa8T0hhYtD9NxCdI4D7PTjRLM=
|
||||
github.com/mattetti/filebuffer v1.0.1/go.mod h1:YdMURNDOttIiruleeVr6f56OrMc+MydEnTcXwtkxNVs=
|
||||
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
|
||||
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
|
||||
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
|
||||
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI=
|
||||
github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU=
|
||||
github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
|
||||
github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g=
|
||||
github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw=
|
||||
github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U=
|
||||
github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA=
|
||||
github.com/mitchellh/go-testing-interface v1.14.1 h1:jrgshOhYAUVNMAJiKbEu7EqAwgJJ2JqpQmpLJOu07cU=
|
||||
github.com/mitchellh/go-testing-interface v1.14.1/go.mod h1:gfgS7OtZj6MA4U1UrDRp04twqAjfvlZyCfX3sDjEym8=
|
||||
github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0=
|
||||
github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo=
|
||||
github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0=
|
||||
github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y=
|
||||
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
|
||||
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
|
||||
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826 h1:RWengNIwukTxcDr9M+97sNutRR1RKhG96O6jWumTTnw=
|
||||
github.com/mohae/deepcopy v0.0.0-20170929034955-c48cc78d4826/go.mod h1:TaXosZuwdSHYgviHp1DAtfrULt5eUgsSMsZf+YrPgl8=
|
||||
github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A=
|
||||
github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
|
||||
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
|
||||
github.com/oklog/run v1.1.0 h1:GEenZ1cK0+q0+wsJew9qUg/DyD8k3JzYsZAi5gYi2mA=
|
||||
github.com/oklog/run v1.1.0/go.mod h1:sVPdnTZT1zYwAJeCMu2Th4T21pA3FPOQRfWjQlk7DVU=
|
||||
github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N7AbDhec=
|
||||
github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY=
|
||||
github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U=
|
||||
github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM=
|
||||
github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug=
|
||||
github.com/opencontainers/image-spec v1.1.0/go.mod h1:W4s4sFTMaBeK1BQLXbG4AdM2szdn85PY75RI83NrTrM=
|
||||
github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk=
|
||||
github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
|
||||
github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8=
|
||||
github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
|
||||
github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M=
|
||||
github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc=
|
||||
github.com/perimeterx/marshmallow v1.1.5 h1:a2LALqQ1BlHM8PZblsDdidgv1mWi1DgC2UmX50IvK2s=
|
||||
github.com/perimeterx/marshmallow v1.1.5/go.mod h1:dsXbUu8CRzfYP5a87xpp0xq9S3u0Vchtcl8we9tYaXw=
|
||||
github.com/pierrec/lz4/v4 v4.1.18 h1:xaKrnTkyoqfh1YItXl56+6KJNVYWlEEPuAQW9xsplYQ=
|
||||
github.com/pierrec/lz4/v4 v4.1.18/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
|
||||
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
|
||||
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/prometheus/client_golang v1.20.3 h1:oPksm4K8B+Vt35tUhw6GbSNSgVlVSBH0qELP/7u83l4=
|
||||
github.com/prometheus/client_golang v1.20.3/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
|
||||
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
|
||||
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
|
||||
github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
|
||||
github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
|
||||
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
|
||||
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
|
||||
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
|
||||
github.com/rivo/uniseg v0.4.3 h1:utMvzDsuh3suAEnhH0RdHmoPbU648o6CvXxTx4SBMOw=
|
||||
github.com/rivo/uniseg v0.4.3/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||
github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8=
|
||||
github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4=
|
||||
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
|
||||
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
|
||||
github.com/shirou/gopsutil v2.21.11+incompatible h1:lOGOyCG67a5dv2hq5Z1BLDUqqKp3HkbjPcz5j6XMS0U=
|
||||
github.com/shirou/gopsutil v2.21.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
|
||||
github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ=
|
||||
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
|
||||
github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
|
||||
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
|
||||
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
|
||||
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU=
|
||||
github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI=
|
||||
github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk=
|
||||
github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY=
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
|
||||
github.com/ugorji/go/codec v1.2.12 h1:9LC83zGrHhuUA9l16C9AHXAqEV/2wBQ4nkvumAE65EE=
|
||||
github.com/ugorji/go/codec v1.2.12/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
|
||||
github.com/unknwon/bra v0.0.0-20200517080246-1e3013ecaff8 h1:aVGB3YnaS/JNfOW3tiHIlmNmTDg618va+eT0mVomgyI=
|
||||
github.com/unknwon/bra v0.0.0-20200517080246-1e3013ecaff8/go.mod h1:fVle4kNr08ydeohzYafr20oZzbAkhQT39gKK/pFQ5M4=
|
||||
github.com/unknwon/com v1.0.1 h1:3d1LTxD+Lnf3soQiD4Cp/0BRB+Rsa/+RTvz8GMMzIXs=
|
||||
github.com/unknwon/com v1.0.1/go.mod h1:tOOxU81rwgoCLoOVVPHb6T/wt8HZygqH5id+GNnlCXM=
|
||||
github.com/unknwon/log v0.0.0-20150304194804-e617c87089d3 h1:4EYQaWAatQokdji3zqZloVIW/Ke1RQjYw2zHULyrHJg=
|
||||
github.com/unknwon/log v0.0.0-20150304194804-e617c87089d3/go.mod h1:1xEUf2abjfP92w2GZTV+GgaRxXErwRXcClbUwrNJffU=
|
||||
github.com/urfave/cli v1.22.15 h1:nuqt+pdC/KqswQKhETJjo7pvn/k4xMUxgW6liI7XpnM=
|
||||
github.com/urfave/cli v1.22.15/go.mod h1:wSan1hmo5zeyLGBjRJbzRTNk8gwoYa2B9n4q9dmRIc0=
|
||||
github.com/urfave/cli/v2 v2.27.4 h1:o1owoI+02Eb+K107p27wEX9Bb8eqIoZCfLXloLUSWJ8=
|
||||
github.com/urfave/cli/v2 v2.27.4/go.mod h1:m4QzxcD2qpra4z7WhzEGn74WZLViBnMpb1ToCAKdGRQ=
|
||||
github.com/vishvananda/netlink v1.3.0 h1:X7l42GfcV4S6E4vHTsw48qbrV+9PVojNfIhZcwQdrZk=
|
||||
github.com/vishvananda/netlink v1.3.0/go.mod h1:i6NetklAujEcC6fK0JPjT8qSwWyO0HLn4UKG+hGqeJs=
|
||||
github.com/vishvananda/netns v0.0.4 h1:Oeaw1EM2JMxD51g9uhtC0D7erkIjgmj8+JZc26m1YX8=
|
||||
github.com/vishvananda/netns v0.0.4/go.mod h1:SpkAiCQRtJ6TvvxPnOSyH3BMl6unz3xZlaprSwhNNJM=
|
||||
github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
|
||||
github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
|
||||
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4=
|
||||
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
|
||||
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0=
|
||||
github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0=
|
||||
github.com/zeebo/assert v1.3.0 h1:g7C04CbJuIDKNPFHmsk4hwZDO5O+kntRxzaUoNXj+IQ=
|
||||
github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN/wJ0=
|
||||
github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0=
|
||||
github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA=
|
||||
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.53.0 h1:9G6E0TXzGFVfTnawRzrPl83iHOAV7L8NJiR8RSGYV1g=
|
||||
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.53.0/go.mod h1:azvtTADFQJA8mX80jIH/akaE7h+dbm/sVuaHqN13w74=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.53.0 h1:IVtyPth4Rs5P8wIf0mP2KVKFNTJ4paX9qQ4Hkh5gFdc=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.53.0/go.mod h1:ImRBLMJv177/pwiLZ7tU7HDGNdBv7rS0HQ99eN/zBl8=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA=
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg=
|
||||
go.opentelemetry.io/contrib/propagators/jaeger v1.29.0 h1:+YPiqF5rR6PqHBlmEFLPumbSP0gY0WmCGFayXRcCLvs=
|
||||
go.opentelemetry.io/contrib/propagators/jaeger v1.29.0/go.mod h1:6PD7q7qquWSp3Z4HeM3e/2ipRubaY1rXZO8NIHVDZjs=
|
||||
go.opentelemetry.io/contrib/samplers/jaegerremote v0.23.0 h1:qKi9ntCcronqWqfuKxqrxZlZd82jXJEgGiAWH1+phxo=
|
||||
go.opentelemetry.io/contrib/samplers/jaegerremote v0.23.0/go.mod h1:1kbAgQa5lgYC3rC6cE3jSxQ/Q13l33wv/WI8U+htwag=
|
||||
go.opentelemetry.io/otel v1.29.0 h1:PdomN/Al4q/lN6iBJEN3AwPvUiHPMlt93c8bqTG5Llw=
|
||||
go.opentelemetry.io/otel v1.29.0/go.mod h1:N/WtXPs1CNCUEx+Agz5uouwCba+i+bJGFicT8SR4NP8=
|
||||
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0 h1:dIIDULZJpgdiHz5tXrTgKIMLkus6jEFa7x5SOKcyR7E=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.29.0/go.mod h1:jlRVBe7+Z1wyxFSUs48L6OBQZ5JwH2Hg/Vbl+t9rAgI=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.28.0 h1:R3X6ZXmNPRR8ul6i3WgFURCHzaXjHdm0karRG/+dj3s=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.28.0/go.mod h1:QWFXnDavXWwMx2EEcZsf3yxgEKAqsxQ+Syjp+seyInw=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.27.0 h1:QY7/0NeRPKlzusf40ZE4t1VlMKbqSNT7cJRYzWuja0s=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.27.0/go.mod h1:HVkSiDhTM9BoUJU8qE6j2eSWLLXvi1USXjyd2BXT8PY=
go.opentelemetry.io/otel/metric v1.29.0 h1:vPf/HFWTNkPu1aYeIsc98l4ktOQaL6LeSoeV2g+8YLc=
go.opentelemetry.io/otel/metric v1.29.0/go.mod h1:auu/QWieFVWx+DmQOUMgj0F8LHWdgalxXqvp7BII/W8=
go.opentelemetry.io/otel/sdk v1.29.0 h1:vkqKjk7gwhS8VaWb0POZKmIEDimRCMsopNYnriHyryo=
go.opentelemetry.io/otel/sdk v1.29.0/go.mod h1:pM8Dx5WKnvxLCb+8lG1PRNIDxu9g9b9g59Qr7hfAAok=
go.opentelemetry.io/otel/trace v1.29.0 h1:J/8ZNK4XgR7a21DZUAsbF8pZ5Jcw1VhACmnYt39JTi4=
go.opentelemetry.io/otel/trace v1.29.0/go.mod h1:eHl3w0sp3paPkYstJOmAimxhiFXPg+MMTlEh3nsQgWQ=
go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0=
go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8=
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI=
go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU=
go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE=
go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI=
golang.org/x/arch v0.12.0 h1:UsYJhbzPYGsT0HbEdmYcqtCv8UNGvnaL561NnIUvaKg=
golang.org/x/arch v0.12.0/go.mod h1:FEVrYAQjsQXMVJ1nsMoVVXPZg6p2JE2mx8psSWTDQys=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ=
golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg=
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948 h1:kx6Ds3MlpiUHKj7syVnbp57++8WpuKPcR5yjLBjvLEA=
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948/go.mod h1:akd2r19cwCdwSwWeIdzYQGa/EZZyqcOdwWiwj5L5eKQ=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0=
golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo=
golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM=
golang.org/x/oauth2 v0.23.0 h1:PbgcYx2W7i4LvjJWEbf0ngHV6qJYr86PkAV3bXdLEbs=
golang.org/x/oauth2 v0.23.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ=
golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s=
golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U=
golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24=
golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk=
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8=
gonum.org/v1/gonum v0.12.0 h1:xKuo6hzt+gMav00meVPUlXwSdoEJP46BR+wdxQEFK2o=
gonum.org/v1/gonum v0.12.0/go.mod h1:73TDxJfAAHeA8Mk9mf8NlIppyhQNo5GLTcYeqgo2lvY=
google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd h1:BBOTEWLuuEGQy9n1y9MhVJ9Qt0BDu21X8qZs71/uPZo=
google.golang.org/genproto/googleapis/api v0.0.0-20240822170219-fc7c04adadcd/go.mod h1:fO8wJzT2zbQbAjbIoos1285VfEIYKDDY+Dt+WpTkh6g=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240814211410-ddb44dafa142 h1:e7S5W7MGGLaSu8j3YjdezkZ+m1/Nm0uRVRMEMGk26Xs=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240814211410-ddb44dafa142/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU=
google.golang.org/grpc v1.66.0 h1:DibZuoBznOxbDQxRINckZcUvnCEvrW9pcWIE2yF9r1c=
google.golang.org/grpc v1.66.0/go.mod h1:s3/l6xSSCURdVfAnL+TqCNMyTDAGN6+lZeVxnZR128Y=
google.golang.org/protobuf v1.35.2 h1:8Ar7bF+apOIoThw1EdZl0p1oWvMqTHmpA2fRTyZO8io=
google.golang.org/protobuf v1.35.2/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/fsnotify/fsnotify.v1 v1.4.7 h1:XNNYLJHt73EyYiCZi6+xjupS9CpvmiDgjPTAjrBlQbo=
gopkg.in/fsnotify/fsnotify.v1 v1.4.7/go.mod h1:Fyux9zXlo4rWoMSIzpn9fDAYjalPqJ/K1qJ27s+7ltE=
gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc=
gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU=
gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU=
k8s.io/api v0.31.3 h1:umzm5o8lFbdN/hIXbrK9oRpOproJO62CV1zqxXrLgk8=
k8s.io/api v0.31.3/go.mod h1:UJrkIp9pnMOI9K2nlL6vwpxRzzEX5sWgn8kGQe92kCE=
k8s.io/apimachinery v0.31.3 h1:6l0WhcYgasZ/wk9ktLq5vLaoXJJr5ts6lkaQzgeYPq4=
k8s.io/apimachinery v0.31.3/go.mod h1:rsPdaZJfTfLsNJSQzNHQvYoTmxhoOEofxtOsF3rtsMo=
k8s.io/client-go v0.31.3 h1:CAlZuM+PH2cm+86LOBemaJI/lQ5linJ6UFxKX/SoG+4=
k8s.io/client-go v0.31.3/go.mod h1:2CgjPUTpv3fE5dNygAr2NcM8nhHzXvxB8KL5gYc3kJs=
k8s.io/component-base v0.31.3 h1:DMCXXVx546Rfvhj+3cOm2EUxhS+EyztH423j+8sOwhQ=
k8s.io/component-base v0.31.3/go.mod h1:xME6BHfUOafRgT0rGVBGl7TuSg8Z9/deT7qq6w7qjIU=
k8s.io/cri-api v0.31.3 h1:dsZXzrGrCEwHjsTDlAV7rutEplpMLY8bfNRMIqrtXjo=
k8s.io/cri-api v0.31.3/go.mod h1:Po3TMAYH/+KrZabi7QiwQI4a692oZcUOUThd/rqwxrI=
k8s.io/cri-client v0.31.3 h1:9ZwddaNJomqkTBYQqSmB+Ccns3beY4HyYDwmRtWTCJM=
k8s.io/cri-client v0.31.3/go.mod h1:klbWiYkOatOQOkXOYZMZMGSTM8q9eC/efsYGuXcgPes=
k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kubelet v0.29.0 h1:SX5hlznTBcGIrS1scaf8r8p6m3e475KMifwt9i12iOk=
k8s.io/kubelet v0.29.0/go.mod h1:kvKS2+Bz2tgDOG1S1q0TH2z1DasNuVF+8p6Aw7xvKkI=
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8 h1:pUdcCO1Lk/tbT5ztQWOBi5HBgbBP1J8+AsQnQCKsi8A=
k8s.io/utils v0.0.0-20240711033017-18e509b52bc8/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
nullprogram.com/x/optparse v1.0.0/go.mod h1:KdyPE+Igbe0jQUrVfMqDMeJQIJZEuyV7pjYmp6pbG50=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo=
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0=
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4=
sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08=
sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ=
sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4=