diff --git a/.cirrus.yml b/.cirrus.yml new file mode 100644 index 0000000..77ce98d --- /dev/null +++ b/.cirrus.yml @@ -0,0 +1,158 @@ +--- +# We use Cirrus for Vagrant tests and native CentOS 7 and 8, because macOS +# instances of GHA are too slow and flaky, and Linux instances of GHA do not +# support KVM. + +# NOTE Cirrus execution environments lack a terminal, needed for +# some integration tests. So we use `ssh -tt` command to fake a terminal. + +task: + timeout_in: 30m + + env: + DEBIAN_FRONTEND: noninteractive + HOME: /root + # yamllint disable rule:key-duplicates + matrix: + DISTRO: fedora + + name: vagrant DISTRO:$DISTRO + + compute_engine_instance: + image_project: cirrus-images + image: family/docker-kvm + platform: linux + nested_virtualization: true + # CPU limit: `16 / NTASK`: see https://cirrus-ci.org/faq/#are-there-any-limits + cpu: 8 + # Memory limit: `4GB * NCPU` + memory: 32G + + host_info_script: | + uname -a + echo "-----" + cat /etc/os-release + echo "-----" + cat /proc/cpuinfo + echo "-----" + df -T + install_libvirt_vagrant_script: | + apt-get update + apt-get install -y libvirt-daemon libvirt-daemon-system vagrant vagrant-libvirt + systemctl enable --now libvirtd + vagrant_cache: + fingerprint_script: uname -s ; cat Vagrantfile.$DISTRO + folder: /root/.vagrant.d + vagrant_up_script: | + ln -sf Vagrantfile.$DISTRO Vagrantfile + # Retry if it fails (download.fedoraproject.org returns 404 sometimes) + vagrant up --no-tty || vagrant up --no-tty + mkdir -p -m 0700 /root/.ssh + vagrant ssh-config >> /root/.ssh/config + guest_info_script: | + ssh default 'sh -exc "uname -a && systemctl --version && df -T && cat /etc/os-release"' + unit_tests_script: | + ssh default 'sudo -i make -C /vagrant localunittest' + integration_systemd_script: | + ssh -tt default "sudo -i make -C /vagrant localintegration RUNC_USE_SYSTEMD=yes" + integration_fs_script: | + ssh -tt default "sudo -i make -C /vagrant localintegration" + integration_systemd_rootless_script: | + 
ssh -tt default "sudo -i make -C /vagrant localrootlessintegration RUNC_USE_SYSTEMD=yes" + integration_fs_rootless_script: | + ssh -tt default "sudo -i make -C /vagrant localrootlessintegration" + +task: + timeout_in: 30m + + env: + HOME: /root + CIRRUS_WORKING_DIR: /home/runc + GO_VERSION: "1.17.3" + BATS_VERSION: "v1.3.0" + # yamllint disable rule:key-duplicates + matrix: + DISTRO: centos-7 + DISTRO: centos-stream-8 + + name: ci / $DISTRO + + compute_engine_instance: + image_project: centos-cloud + image: family/$DISTRO + platform: linux + cpu: 4 + memory: 8G + + install_dependencies_script: | + case $DISTRO in + centos-7) + (cd /etc/yum.repos.d && curl -O https://copr.fedorainfracloud.org/coprs/adrian/criu-el7/repo/epel-7/adrian-criu-el7-epel-7.repo) + # sysctl + echo "user.max_user_namespaces=15076" > /etc/sysctl.d/userns.conf + sysctl --system + ;; + centos-stream-8) + yum config-manager --set-enabled powertools # for glibc-static + ;; + esac + # Work around dnf mirror failures by retrying a few times. + for i in $(seq 0 2); do + sleep $i + yum install -y -q gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs && break + done + [ $? 
-eq 0 ] # fail if yum failed + # install Go + curl -fsSL "https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz" | tar Cxz /usr/local + # install bats + cd /tmp + git clone https://github.com/bats-core/bats-core + cd bats-core + git checkout $BATS_VERSION + ./install.sh /usr/local + cd - + # Add a user for rootless tests + useradd -u2000 -m -d/home/rootless -s/bin/bash rootless + # Allow root and rootless itself to execute `ssh rootless@localhost` in tests/rootless.sh + ssh-keygen -t ecdsa -N "" -f /root/rootless.key + mkdir -m 0700 -p /home/rootless/.ssh + cp /root/rootless.key /home/rootless/.ssh/id_ecdsa + cat /root/rootless.key.pub >> /home/rootless/.ssh/authorized_keys + chown -R rootless.rootless /home/rootless + # set PATH + echo 'export PATH=/usr/local/go/bin:/usr/local/bin:$PATH' >> /root/.bashrc + # Setup ssh localhost for terminal emulation (script -e did not work) + ssh-keygen -t ed25519 -f /root/.ssh/id_ed25519 -N "" + cat /root/.ssh/id_ed25519.pub >> /root/.ssh/authorized_keys + chmod 400 /root/.ssh/authorized_keys + ssh-keyscan localhost >> /root/.ssh/known_hosts + echo -e "Host localhost\n\tStrictHostKeyChecking no\t\nIdentityFile /root/.ssh/id_ed25519\n" >> /root/.ssh/config + sed -e "s,PermitRootLogin.*,PermitRootLogin prohibit-password,g" -i /etc/ssh/sshd_config + systemctl restart sshd + host_info_script: | + uname -a + echo "-----" + cat /etc/os-release + echo "-----" + cat /proc/cpuinfo + echo "-----" + df -T + echo "-----" + systemctl --version + unit_tests_script: | + ssh -tt localhost "make -C /home/runc localunittest" + integration_systemd_script: | + ssh -tt localhost "make -C /home/runc localintegration RUNC_USE_SYSTEMD=yes" + integration_fs_script: | + ssh -tt localhost "make -C /home/runc localintegration" + integration_systemd_rootless_script: | + echo "SKIP: integration_systemd_rootless_script requires cgroup v2" + integration_fs_rootless_script: | + case $DISTRO in + centos-7) + echo "SKIP: FIXME: 
integration_fs_rootless_script is skipped because of EPERM on writing cgroup.procs" + ;; + centos-stream-8) + ssh -tt localhost "make -C /home/runc localrootlessintegration" + ;; + esac diff --git a/.codespellrc b/.codespellrc new file mode 100644 index 0000000..c626cb1 --- /dev/null +++ b/.codespellrc @@ -0,0 +1,3 @@ +[codespell] +skip = ./vendor,./.git +ignore-words-list = clos,creat diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..4bd9208 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,25 @@ +# Please see the documentation for all configuration options: +# https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates + +version: 2 +updates: + # Dependencies listed in go.mod + - package-ecosystem: "gomod" + directory: "/" # Location of package manifests + schedule: + interval: "daily" + ignore: + # a regression in v1.22.2, see https://github.com/urfave/cli/issues/1092 + - dependency-name: "github.com/urfave/cli" + + # Dependencies listed in .github/workflows/*.yml + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" + + # Dependencies listed in Dockerfile + - package-ecosystem: "docker" + directory: "/" + schedule: + interval: "daily" diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..72f6eb3 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,129 @@ +# NOTE Github Actions execution environments lack a terminal, needed for +# some integration tests. So we use `script` command to fake a terminal. + +name: ci +on: + push: + tags: + - v* + branches: + - master + - release-* + pull_request: + +env: + # Don't ignore C warnings. Note that the output of "go env CGO_CFLAGS" by default is "-g -O2", so we keep them. 
+ CGO_CFLAGS: -g -O2 -Werror + +jobs: + test: + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + go-version: [1.16.x, 1.17.x] + rootless: ["rootless", ""] + race: ["-race", ""] + criu: [""] + include: + # Also test against latest criu-dev + - go-version: 1.17.x + rootless: "" + race: "" + criu: "criu-dev" + + steps: + + - name: checkout + uses: actions/checkout@v2 + + - name: install deps + if: matrix.criu == '' + env: + REPO: https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_20.04 + run: | + # criu repo + curl -fSsl $REPO/Release.key | sudo apt-key add - + echo "deb $REPO/ /" | sudo tee /etc/apt/sources.list.d/criu.list + sudo apt update + sudo apt install libseccomp-dev criu sshfs + + - name: install deps (criu ${{ matrix.criu }}) + if: matrix.criu != '' + run: | + sudo apt -q update + sudo apt -q install libseccomp-dev sshfs \ + libcap-dev libnet1-dev libnl-3-dev \ + libprotobuf-c-dev libprotobuf-dev protobuf-c-compiler protobuf-compiler + git clone https://github.com/checkpoint-restore/criu.git ~/criu + (cd ~/criu && git checkout ${{ matrix.criu }} && sudo make install-criu) + rm -rf ~/criu + + - name: install go ${{ matrix.go-version }} + uses: actions/setup-go@v2 + with: + stable: '!contains(${{ matrix.go-version }}, "beta") && !contains(${{ matrix.go-version }}, "rc")' + go-version: ${{ matrix.go-version }} + + - name: build + run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all + + - name: install bats + uses: mig4/setup-bats@v1 + with: + bats-version: 1.3.0 + + - name: unit test + if: matrix.rootless != 'rootless' + run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest + + - name: add rootless user + if: matrix.rootless == 'rootless' + run: | + sudo useradd -u2000 -m -d/home/rootless -s/bin/bash rootless + # Allow root and rootless itself to execute `ssh rootless@localhost` in tests/rootless.sh + ssh-keygen -t ecdsa -N "" -f $HOME/rootless.key + sudo mkdir -m 0700 -p 
/home/rootless/.ssh + sudo cp $HOME/rootless.key /home/rootless/.ssh/id_ecdsa + sudo cp $HOME/rootless.key.pub /home/rootless/.ssh/authorized_keys + sudo chown -R rootless.rootless /home/rootless + + - name: integration test (fs driver) + run: sudo -E PATH="$PATH" script -e -c 'make local${{ matrix.rootless }}integration' + + - name: integration test (systemd driver) + # can't use systemd driver with cgroupv1 + if: matrix.rootless != 'rootless' + run: sudo -E PATH="$PATH" script -e -c 'make RUNC_USE_SYSTEMD=yes local${{ matrix.rootless }}integration' + + # We need to continue support for 32-bit ARM. + # However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff. + # We are not interested in providing official support for i386. + cross-i386: + runs-on: ubuntu-20.04 + + steps: + + - name: checkout + uses: actions/checkout@v2 + + - name: install deps + run: | + sudo dpkg --add-architecture i386 + # add criu repo + sudo add-apt-repository -y ppa:criu/ppa + # apt-add-repository runs apt update so we don't have to. 
+ + # Due to a bug in apt, we have to update it first + # (see https://bugs.launchpad.net/ubuntu-cdimage/+bug/1871268) + sudo apt -q install apt + sudo apt -q install libseccomp-dev libseccomp-dev:i386 gcc-multilib criu + + - name: install go + uses: actions/setup-go@v2 + with: + go-version: 1.x # Latest stable + + - name: unit test + # cgo is disabled by default when cross-compiling + run: sudo -E PATH="$PATH" -- make GOARCH=386 CGO_ENABLED=1 localunittest diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml new file mode 100644 index 0000000..7194c64 --- /dev/null +++ b/.github/workflows/validate.yml @@ -0,0 +1,198 @@ +name: validate +on: + push: + tags: + - v* + branches: + - master + - release-* + pull_request: + +jobs: + + lint: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: install deps + run: | + sudo apt -q update + sudo apt -q install libseccomp-dev + - uses: golangci/golangci-lint-action@v2 + with: + # must be specified without patch version + version: v1.42 + + lint-extra: + # Extra linters, only checking new code from pull requests. + if: github.event_name == 'pull_request' + runs-on: ubuntu-20.04 + permissions: + contents: read + steps: + - uses: actions/checkout@v2 + - name: install deps + run: | + sudo apt -q update + sudo apt -q install libseccomp-dev + - uses: golangci/golangci-lint-action@v2 + with: + only-new-issues: true + args: --config .golangci-extra.yml + # must be specified without patch version + version: v1.43 + + + compile-buildtags: + runs-on: ubuntu-20.04 + env: + # Don't ignore C warnings. Note that the output of "go env CGO_CFLAGS" by default is "-g -O2", so we keep them. 
+ CGO_CFLAGS: -g -O2 -Werror + steps: + - uses: actions/checkout@v2 + - name: install go + uses: actions/setup-go@v2 + with: + go-version: 1.x # Latest stable + - name: compile with no build tags + run: make BUILDTAGS="" + + codespell: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: install deps + # Version of codespell bundled with Ubuntu is way old, so use pip. + run: pip install codespell + - name: run codespell + run: codespell + + shfmt: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: vars + run: | + echo "VERSION=3.3.1" >> $GITHUB_ENV + echo "$(go env GOPATH)/bin" >> $GITHUB_PATH + - name: cache go mod and $GOCACHE + uses: actions/cache@v2 + with: + path: | + ~/go/pkg/mod + ~/.cache/go-build + key: ${{ runner.os }}-shfmt-${{ env.VERSION }} + restore-keys: ${{ runner.os }}-shfmt- + - name: install shfmt + run: | + command -v shfmt || \ + (cd ~ && GO111MODULE=on time go get mvdan.cc/sh/v3/cmd/shfmt@v$VERSION) + - name: shfmt + run: make shfmt + + shellcheck: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: vars + run: | + echo 'VERSION=v0.7.2' >> $GITHUB_ENV + echo 'BASEURL=https://github.com/koalaman/shellcheck/releases/download' >> $GITHUB_ENV + echo 'SHA256SUM=12ee2e0b90a3d1e9cae24ac9b2838be66b48573cb2c8e8f3c566b959df6f050c' >> $GITHUB_ENV + echo ~/bin >> $GITHUB_PATH + - name: install shellcheck + run: | + mkdir ~/bin + curl -sSfL --retry 5 $BASEURL/$VERSION/shellcheck-$VERSION.linux.x86_64.tar.xz | + tar xfJ - -C ~/bin --strip 1 shellcheck-$VERSION/shellcheck + sha256sum ~/bin/shellcheck | grep -q $SHA256SUM + # make sure to remove the old version + sudo rm -f /usr/bin/shellcheck + - uses: lumaxis/shellcheck-problem-matchers@v1 + - name: shellcheck + run: | + make shellcheck + + deps: + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v2 + - name: install go + uses: actions/setup-go@v2 + with: + go-version: 1.x # Latest stable + - name: cache go mod and $GOCACHE + uses: 
actions/cache@v2 + with: + path: | + ~/go/pkg/mod + ~/.cache/go-build + key: ${{ runner.os }}-go.sum-${{ hashFiles('**/go.sum') }} + restore-keys: ${{ runner.os }}-go.sum- + - name: verify deps + run: make verify-dependencies + + + commit: + runs-on: ubuntu-20.04 + # Only check commits on pull requests. + if: github.event_name == 'pull_request' + steps: + - name: get pr commits + id: 'get-pr-commits' + uses: tim-actions/get-pr-commits@v1.1.0 + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: check subject line length + uses: tim-actions/commit-message-checker-with-regex@v0.3.1 + with: + commits: ${{ steps.get-pr-commits.outputs.commits }} + pattern: '^.{0,72}(\n.*)*$' + error: 'Subject too long (max 72)' + + + cfmt: + runs-on: ubuntu-20.04 + steps: + - name: checkout + uses: actions/checkout@v2 + with: + fetch-depth: 0 + - name: install deps + run: | + sudo apt -qq update + sudo apt -qq install indent + - name: cfmt + run: | + make cfmt + git diff --exit-code + + + release: + runs-on: ubuntu-20.04 + steps: + - name: checkout + uses: actions/checkout@v2 + with: + fetch-depth: 0 + # We have to run this under Docker as Ubuntu (host) does not support all + # the architectures we want to compile test against, and Dockerfile uses + # Debian (which does). + # + # XXX: as currently this is the only job that is using Docker, we are + # building and using the runcimage locally. In case more jobs running + # under Docker will emerge, it will be good to have a separate make + # runcimage job and share its result (the docker image) with whoever + # needs it. 
+ - uses: satackey/action-docker-layer-caching@v0.0.11 + continue-on-error: true + - name: build docker image + run: make runcimage + - name: make releaseall + run: make releaseall + - name: upload artifacts + uses: actions/upload-artifact@v2 + with: + name: release-${{ github.run_id }} + path: release/* diff --git a/.gitignore b/.gitignore index 282e34e..76aefa1 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,9 @@ vendor/pkg /runc /runc-* contrib/cmd/recvtty/recvtty +contrib/cmd/sd-helper/sd-helper +contrib/cmd/seccompagent/seccompagent man/man8 release +Vagrantfile +.vagrant diff --git a/.golangci-extra.yml b/.golangci-extra.yml new file mode 100644 index 0000000..1c160e6 --- /dev/null +++ b/.golangci-extra.yml @@ -0,0 +1,15 @@ +# This is golangci-lint config file which is used to check new code in +# github PRs only (see lint-extra job in .github/workflows/validate.yml). +# +# For the default linter config, see .golangci.yml. This config should +# only enable additional linters not enabled in the default config. 
+ +run: + build-tags: + - seccomp + +linters: + disable-all: true + enable: + - godot + - revive diff --git a/.golangci.yml b/.golangci.yml new file mode 100644 index 0000000..96b3210 --- /dev/null +++ b/.golangci.yml @@ -0,0 +1,12 @@ +# For documentation, see https://golangci-lint.run/usage/configuration/ + +run: + build-tags: + - seccomp + +linters: + enable: + - gofumpt + - errorlint + - unconvert + - unparam diff --git a/.pullapprove.yml b/.pullapprove.yml deleted file mode 100644 index fc8c5d3..0000000 --- a/.pullapprove.yml +++ /dev/null @@ -1,10 +0,0 @@ -approve_by_comment: true -approve_regex: ^LGTM -reject_regex: ^Rejected -reset_on_push: true -author_approval: ignored -reviewers: - teams: - - runc-maintainers - name: default - required: 2 diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 5c2928d..0000000 --- a/.travis.yml +++ /dev/null @@ -1,54 +0,0 @@ -dist: bionic -language: go -go: - - 1.11.x - - 1.12.x - - tip - -matrix: - include: - - go: 1.12.x - env: - - RUNC_USE_SYSTEMD=1 - script: - - make BUILDTAGS="${BUILDTAGS}" all - - sudo PATH="$PATH" make localintegration RUNC_USE_SYSTEMD=1 - - go: 1.12.x - env: - - VIRTUALBOX_VERSION=6.0 - - VAGRANT_VERSION=2.2.6 - - FEDORA_VERSION=31 - before_install: - - cat /proc/cpuinfo - - wget -q https://www.virtualbox.org/download/oracle_vbox_2016.asc -O- | sudo apt-key add - && sudo sh -c "echo deb https://download.virtualbox.org/virtualbox/debian $(lsb_release -cs) contrib >> /etc/apt/sources.list" && sudo apt-get update && sudo apt-get install -yq build-essential gcc make linux-headers-$(uname -r) virtualbox-${VIRTUALBOX_VERSION} && sudo usermod -aG vboxusers $(whoami) - - wget https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}_$(uname -m).deb && sudo dpkg -i vagrant_${VAGRANT_VERSION}_$(uname -m).deb - - vagrant init bento/fedora-${FEDORA_VERSION} && vagrant up && mkdir -p ~/.ssh && vagrant ssh-config >> ~/.ssh/config - - ssh default sudo dnf install -y 
podman - script: - - ssh default sudo podman build -t test /vagrant - - ssh default sudo podman run --privileged --cgroupns=private test make localunittest - allow_failures: - - go: tip - -go_import_path: github.com/opencontainers/runc - -# `make ci` uses Docker. -sudo: required -services: - - docker - -env: - global: - - BUILDTAGS="seccomp apparmor selinux ambient" - -before_install: - - sudo apt-get -qq update - - sudo apt-get install -y libseccomp-dev - - go get -u golang.org/x/lint/golint - - go get -u github.com/vbatts/git-validation - - env | grep TRAVIS_ - -script: - - git-validation -run DCO,short-subject -v - - make BUILDTAGS="${BUILDTAGS}" - - make BUILDTAGS="${BUILDTAGS}" clean ci cross diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..f10951b --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,248 @@ +# Changelog/ +This file documents all notable changes made to this project since runc 1.0. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +## [1.1.0] - 2022-01-14 + +> A plan depends as much upon execution as it does upon concept. + +## Changed + * libcontainer will now refuse to build without the nsenter package being + correctly compiled (specifically this requires CGO to be enabled). This + should avoid folks accidentally creating broken runc binaries (and + incorrectly importing our internal libraries into their projects). (#3331) + +## [1.1.0-rc.1] - 2021-12-14 + +> He who controls the spice controls the universe. + +### Deprecated + * runc run/start now warns if a new container cgroup is non-empty or frozen; + this warning will become an error in runc 1.2. (#3132, #3223) + * runc can only be built with Go 1.16 or later from this release onwards. 
+ (#3100, #3245, #3325) + +### Removed + * `cgroup.GetHugePageSizes` has been removed entirely, and been replaced with + `cgroup.HugePageSizes` which is more efficient. (#3234) + * `intelrdt.GetIntelRdtPath` has been removed. Users who were using this + function to get the intelrdt root should use the new `intelrdt.Root` + instead. (#2920, #3239) + +### Added + * Add support for RDMA cgroup added in Linux 4.11. (#2883) + * runc exec now produces exit code of 255 when the exec failed. + This may help in distinguishing between runc exec failures + (such as invalid options, non-running container or non-existent + binary etc.) and failures of the command being executed. (#3073) + * runc run: new `--keep` option to skip removal exited containers artefacts. + This might be useful to check the state (e.g. of cgroup controllers) after + the container hasexited. (#2817, #2825) + * seccomp: add support for `SCMP_ACT_KILL_PROCESS` and `SCMP_ACT_KILL_THREAD` + (the latter is just an alias for `SCMP_ACT_KILL`). (#3204) + * seccomp: add support for `SCMP_ACT_NOTIFY` (seccomp actions). This allows + users to create sophisticated seccomp filters where syscalls can be + efficiently emulated by privileged processes on the host. (#2682) + * checkpoint/restore: add an option (`--lsm-mount-context`) to set + a different LSM mount context on restore. (#3068) + * runc releases are now cross-compiled for several architectures. Static + builds for said architectures will be available for all future releases. + (#3197) + * intelrdt: support ClosID parameter. (#2920) + * runc exec --cgroup: an option to specify a (non-top) in-container cgroup + to use for the process being executed. (#3040, #3059) + * cgroup v1 controllers now support hybrid hierarchy (i.e. when on a cgroup v1 + machine a cgroup2 filesystem is mounted to /sys/fs/cgroup/unified, runc + run/exec now adds the container to the appropriate cgroup under it). 
(#2087, + #3059) + * sysctl: allow slashes in sysctl names, to better match `sysctl(8)`'s + behaviour. (#3254, #3257) + * mounts: add support for bind-mounts which are inaccessible after switching + the user namespace. Note that this does not permit the container any + additional access to the host filesystem, it simply allows containers to + have bind-mounts configured for paths the user can access but have + restrictive access control settings for other users. (#2576) + * Add support for recursive mount attributes using `mount_setattr(2)`. These + have the same names as the proposed `mount(8)` options -- just prepend `r` + to the option name (such as `rro`). (#3272) + * Add `runc features` subcommand to allow runc users to detect what features + runc has been built with. This includes critical information such as + supported mount flags, hook names, and so on. Note that the output of this + command is subject to change and will not be considered stable until runc + 1.2 at the earliest. The runtime-spec specification for this feature is + being developed in [opencontainers/runtime-spec#1130]. (#3296) + +[opencontainers/runtime-spec#1130]: https://github.com/opencontainers/runtime-spec/pull/1130 + +### Changed + * system: improve performance of `/proc/$pid/stat` parsing. (#2696) + * cgroup2: when `/sys/fs/cgroup` is configured as a read-write mount, change + the ownership of certain cgroup control files (as per + `/sys/kernel/cgroup/delegate`) to allow for proper deferral to the container + process. (#3057) + * docs: series of improvements to man pages to make them easier to read and + use. (#3032) + +#### libcontainer API + * internal api: remove internal error types and handling system, switch to Go + wrapped errors. (#3033) + * New configs.Cgroup structure fields (#3177): + * Systemd (whether to use systemd cgroup manager); and + * Rootless (whether to use rootless cgroups). + * New cgroups/manager package aiming to simplify cgroup manager instantiation. 
+ (#3177) + * All cgroup managers' instantiation methods now initialize cgroup paths and + can return errors. This allows to use any cgroup manager method (e.g. + Exists, Destroy, Set, GetStats) right after instantiation, which was not + possible before (as paths were initialized in Apply only). (#3178) + +### Fixed + * nsenter: do not try to close already-closed fds during container setup and + bail on close(2) failures. (#3058) + * runc checkpoint/restore: fixed for containers with an external bind mount + which destination is a symlink. (#3047). + * cgroup: improve openat2 handling for cgroup directory handle hardening. + (#3030) + * `runc delete -f` now succeeds (rather than timing out) on a paused + container. (#3134) + * runc run/start/exec now refuses a frozen cgroup (paused container in case of + exec). Users can disable this using `--ignore-paused`. (#3132, #3223) + * config: do not permit null bytes in mount fields. (#3287) + + +## [1.0.3] - 2021-12-06 + +> If you were waiting for the opportune moment, that was it. + +### Security + * A potential vulnerability was discovered in runc (related to an internal + usage of netlink), however upon further investigation we discovered that + while this bug was exploitable on the master branch of runc, no released + version of runc could be exploited using this bug. The exploit required being + able to create a netlink attribute with a length that would overflow a uint16 + but this was not possible in any released version of runc. For more + information, see [GHSA-v95c-p5hm-xq8f][] and CVE-2021-43784. + +### Fixed + * Fixed inability to start a container with read-write bind mount of a + read-only fuse host mount. (#3283, #3292) + * Fixed inability to start when read-only /dev in set in spec (#3276, #3277) + * Fixed not removing sub-cgroups upon container delete, when rootless cgroup v2 + is used with older systemd. 
(#3226, #3297) + * Fixed returning error from GetStats when hugetlb is unsupported (which causes + excessive logging for Kubernetes). (#3233, #3295) + * Improved an error message when dbus-user-session is not installed and + rootless + cgroup2 + systemd are used (#3212) + +[GHSA-v95c-p5hm-xq8f]: https://github.com/opencontainers/runc/security/advisories/GHSA-v95c-p5hm-xq8f + + +## [1.0.2] - 2021-07-16 + +> Given the right lever, you can move a planet. + +### Changed + * Made release builds reproducible from now on. (#3099, #3142) + +### Fixed + * Fixed a failure to set CPU quota period in some cases on cgroup v1. (#3090 + #3115) + * Fixed the inability to start a container with the "adding seccomp filter + rule for syscall ..." error, caused by redundant seccomp rules (i.e. those + that has action equal to the default one). Such redundant rules are now + skipped. (#3109, #3129) + * Fixed a rare debug log race in runc init, which can result in occasional + harmful "failed to decode ..." errors from runc run or exec. (#3120, #3130) + * Fixed the check in cgroup v1 systemd manager if a container needs to be + frozen before Set, and add a setting to skip such freeze unconditionally. + The previous fix for that issue, done in runc 1.0.1, was not working. + (#3166, #3167) + + +## [1.0.1] - 2021-07-16 + +> If in doubt, Meriadoc, always follow your nose. + +### Fixed + * Fixed occasional runc exec/run failure ("interrupted system call") on an + Azure volume. (#3045, #3074) + * Fixed "unable to find groups ... token too long" error with /etc/group + containing lines longer than 64K characters. (#3062, #3079) + * cgroup/systemd/v1: fix leaving cgroup frozen after Set if a parent cgroup is + frozen. This is a regression in 1.0.0, not affecting runc itself but some + of libcontainer users (e.g Kubernetes). (#3081, #3085) + * cgroupv2: bpf: Ignore inaccessible existing programs in case of + permission error when handling replacement of existing bpf cgroup + programs. 
This fixes a regression in 1.0.0, where some SELinux + policies would block runc from being able to run entirely. (#3055, #3087) + * cgroup/systemd/v2: don't freeze cgroup on Set. (#3067, #3092) + * cgroup/systemd/v1: avoid unnecessary freeze on Set. (#3082, #3093) + + +## [1.0.0] - 2021-06-22 + +> A wizard is never late, nor is he early, he arrives precisely when he means +> to. + +As runc follows Semantic Versioning, we will endeavour to not make any +breaking changes without bumping the major version number of runc. +However, it should be noted that Go API usage of runc's internal +implementation (libcontainer) is *not* covered by this policy. + +### Removed + * Removed libcontainer/configs.Device* identifiers (deprecated since rc94, + use libcontainer/devices). (#2999) + * Removed libcontainer/system.RunningInUserNS function (deprecated since + rc94, use libcontainer/userns). (#2999) + +### Deprecated + * The usage of relative paths for mountpoints will now produce a warning + (such configurations are outside of the spec, and in future runc will + produce an error when given such configurations). (#2917, #3004) + +### Fixed + * cgroupv2: devices: rework the filter generation to produce consistent + results with cgroupv1, and always clobber any existing eBPF + program(s) to fix `runc update` and avoid leaking eBPF programs + (resulting in errors when managing containers). (#2951) + * cgroupv2: correctly convert "number of IOs" statistics in a + cgroupv1-compatible way. (#2965, #2967, #2968, #2964) + * cgroupv2: support larger than 32-bit IO statistics on 32-bit architectures. + * cgroupv2: wait for freeze to finish before returning from the freezing + code, optimize the method for checking whether a cgroup is frozen. 
(#2955) + * cgroups/systemd: fixed "retry on dbus disconnect" logic introduced in rc94 + * cgroups/systemd: fixed returning "unit already exists" error from a systemd + cgroup manager (regression in rc94) (#2997, #2996) + +### Added + * cgroupv2: support SkipDevices with systemd driver. (#2958, #3019) + * cgroup1: blkio: support BFQ weights. (#3010) + * cgroupv2: set per-device io weights if BFQ IO scheduler is available. + (#3022) + +### Changed + * cgroup/systemd: return, not ignore, stop unit error from Destroy (#2946) + * Fix all golangci-lint failures. (#2781, #2962) + * Make `runc --version` output sane even when built with `go get` or + otherwise outside of our build scripts. (#2962) + * cgroups: set SkipDevices during runc update (so we don't modify + cgroups at all during `runc update`). (#2994) + + +[Unreleased]: https://github.com/opencontainers/runc/compare/v1.1.0...HEAD +[1.1.0]: https://github.com/opencontainers/runc/compare/v1.1.0-rc.1...v1.1.0 +[1.0.0]: https://github.com/opencontainers/runc/releases/tag/v1.0.0 + + +[Unreleased 1.0.z]: https://github.com/opencontainers/runc/compare/v1.0.3...release-1.0 +[1.0.3]: https://github.com/opencontainers/runc/compare/v1.0.2...v1.0.3 +[1.0.2]: https://github.com/opencontainers/runc/compare/v1.0.1...v1.0.2 +[1.0.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.0.1 + + +[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.0...release-1.1 +[1.1.0-rc.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.1.0-rc.1 diff --git a/Dockerfile b/Dockerfile index 5c65470..d6680cc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,34 +1,41 @@ -FROM golang:1.12-stretch +ARG GO_VERSION=1.17 +ARG BATS_VERSION=v1.3.0 +ARG LIBSECCOMP_VERSION=2.5.3 -RUN dpkg --add-architecture armel \ +FROM golang:${GO_VERSION}-bullseye +ARG DEBIAN_FRONTEND=noninteractive +ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debian_11 + +RUN 
KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \ + wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \ + && echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \ + && dpkg --add-architecture armel \ && dpkg --add-architecture armhf \ && dpkg --add-architecture arm64 \ && dpkg --add-architecture ppc64el \ - && apt-get update && apt-get install -y \ - build-essential \ - curl \ - sudo \ - gawk \ - iptables \ - jq \ - pkg-config \ - libaio-dev \ - libcap-dev \ - libprotobuf-dev \ - libprotobuf-c0-dev \ - libnl-3-dev \ - libnet-dev \ - libseccomp2 \ - libseccomp-dev \ - protobuf-c-compiler \ - protobuf-compiler \ - python-minimal \ - uidmap \ - kmod \ - crossbuild-essential-armel crossbuild-essential-armhf crossbuild-essential-arm64 crossbuild-essential-ppc64el \ - libseccomp-dev:armel libseccomp-dev:armhf libseccomp-dev:arm64 libseccomp-dev:ppc64el \ - --no-install-recommends \ - && apt-get clean + && apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + criu \ + crossbuild-essential-arm64 \ + crossbuild-essential-armel \ + crossbuild-essential-armhf \ + crossbuild-essential-ppc64el \ + crossbuild-essential-s390x \ + curl \ + gawk \ + gcc \ + gperf \ + iptables \ + jq \ + kmod \ + pkg-config \ + python3-minimal \ + sshfs \ + sudo \ + uidmap \ + && apt-get clean \ + && rm -rf /var/cache/apt /var/lib/apt/lists/* /etc/apt/sources.list.d/*.list # Add a dummy user for the rootless integration tests. 
While runC does # not require an entry in /etc/passwd to operate, one of the tests uses @@ -37,30 +44,21 @@ RUN dpkg --add-architecture armel \ RUN useradd -u1000 -m -d/home/rootless -s/bin/bash rootless # install bats +ARG BATS_VERSION RUN cd /tmp \ - && git clone https://github.com/sstephenson/bats.git \ - && cd bats \ - && git reset --hard 03608115df2071fff4eaaff1605768c275e5f81f \ + && git clone https://github.com/bats-core/bats-core.git \ + && cd bats-core \ + && git reset --hard "${BATS_VERSION}" \ && ./install.sh /usr/local \ - && rm -rf /tmp/bats + && rm -rf /tmp/bats-core -# install criu -ENV CRIU_VERSION v3.12 -RUN mkdir -p /usr/src/criu \ - && curl -sSL https://github.com/checkpoint-restore/criu/archive/${CRIU_VERSION}.tar.gz | tar -v -C /usr/src/criu/ -xz --strip-components=1 \ - && cd /usr/src/criu \ - && make install-criu \ - && rm -rf /usr/src/criu +# install libseccomp +ARG LIBSECCOMP_VERSION +COPY script/* /tmp/script/ +RUN mkdir -p /opt/libseccomp \ + && /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le s390x +ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION +ENV LD_LIBRARY_PATH=/opt/libseccomp/lib +ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig -# setup a playground for us to spawn containers in -ENV ROOTFS /busybox -RUN mkdir -p ${ROOTFS} - -COPY script/tmpmount / WORKDIR /go/src/github.com/opencontainers/runc -ENTRYPOINT ["/tmpmount"] - -ADD . /go/src/github.com/opencontainers/runc - -RUN . 
tests/integration/multi-arch.bash \ - && curl -o- -sSL `get_busybox` | tar xfJC - ${ROOTFS} diff --git a/EMERITUS.md b/EMERITUS.md new file mode 100644 index 0000000..4d4cd37 --- /dev/null +++ b/EMERITUS.md @@ -0,0 +1,11 @@ +## Emeritus ## + +We would like to acknowledge previous runc maintainers and their huge +contributions to our collective success: + + * Alexander Morozov (@lk4d4) + * Andrei Vagin (@avagin) + * Rohit Jnagal (@rjnagal) + * Victor Marmol (@vmarmol) + +We thank these members for their service to the OCI community. diff --git a/MAINTAINERS b/MAINTAINERS index 9fe08d3..e7fa530 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1,5 +1,8 @@ -Michael Crosby (@crosbymichael) +Michael Crosby (@crosbymichael) Mrunal Patel (@mrunalp) Daniel, Dao Quang Minh (@dqminh) Qiang Huang (@hqhq) -Aleksa Sarai (@cyphar) +Aleksa Sarai (@cyphar) +Akihiro Suda (@AkihiroSuda) +Kir Kolyshkin (@kolyshkin) +Sebastiaan van Stijn (@thaJeztah) diff --git a/Makefile b/Makefile index 81db9d9..f9045df 100644 --- a/Makefile +++ b/Makefile @@ -1,133 +1,158 @@ -.PHONY: all shell dbuild man release \ - localtest localunittest localintegration \ - test unittest integration \ - cross localcross - CONTAINER_ENGINE := docker -GO := go +GO ?= go -SOURCES := $(shell find . 
2>&1 | grep -E '.*\.(c|h|go)$$') -PREFIX := $(DESTDIR)/usr/local +PREFIX ?= /usr/local BINDIR := $(PREFIX)/sbin +MANDIR := $(PREFIX)/share/man + GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g") RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN)) PROJECT := github.com/opencontainers/runc BUILDTAGS ?= seccomp -COMMIT_NO := $(shell git rev-parse HEAD 2> /dev/null || true) -COMMIT ?= $(if $(shell git status --porcelain --untracked-files=no),"${COMMIT_NO}-dirty","${COMMIT_NO}") +COMMIT ?= $(shell git describe --dirty --long --always) +VERSION := $(shell cat ./VERSION) -MAN_DIR := $(CURDIR)/man/man8 -MAN_PAGES = $(shell ls $(MAN_DIR)/*.8) -MAN_PAGES_BASE = $(notdir $(MAN_PAGES)) -MAN_INSTALL_PATH := ${PREFIX}/share/man/man8/ +ifeq ($(shell $(GO) env GOOS),linux) + ifeq (,$(filter $(shell $(GO) env GOARCH),mips mipsle mips64 mips64le ppc64)) + ifeq (,$(findstring -race,$(EXTRA_FLAGS))) + GO_BUILDMODE := "-buildmode=pie" + endif + endif +endif +GO_BUILD := $(GO) build -trimpath $(GO_BUILDMODE) $(EXTRA_FLAGS) -tags "$(BUILDTAGS)" \ + -ldflags "-X main.gitCommit=$(COMMIT) -X main.version=$(VERSION) $(EXTRA_LDFLAGS)" +GO_BUILD_STATIC := CGO_ENABLED=1 $(GO) build -trimpath $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" \ + -ldflags "-extldflags -static -X main.gitCommit=$(COMMIT) -X main.version=$(VERSION) $(EXTRA_LDFLAGS)" -RELEASE_DIR := $(CURDIR)/release - -VERSION := ${shell cat ./VERSION} - -SHELL := $(shell command -v bash 2>/dev/null) +GPG_KEYID ?= asarai@suse.de .DEFAULT: runc -runc: $(SOURCES) - $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc . +runc: + $(GO_BUILD) -o runc . 
-all: runc recvtty +all: runc recvtty sd-helper seccompagent -recvtty: contrib/cmd/recvtty/recvtty +recvtty sd-helper seccompagent: + $(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@ -contrib/cmd/recvtty/recvtty: $(SOURCES) - $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty +static: + $(GO_BUILD_STATIC) -o runc . -static: $(SOURCES) - CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o runc . - CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty +releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a s390x" +releaseall: release -release: - script/release.sh -r release/$(VERSION) -v $(VERSION) +release: runcimage + $(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \ + --rm -v $(CURDIR):/go/src/$(PROJECT) \ + -e RELEASE_ARGS=$(RELEASE_ARGS) \ + $(RUNC_IMAGE) make localrelease + script/release_sign.sh -S $(GPG_KEYID) -r release/$(VERSION) -v $(VERSION) + +localrelease: + script/release_build.sh -r release/$(VERSION) -v $(VERSION) $(RELEASE_ARGS) dbuild: runcimage - $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} --rm -v $(CURDIR):/go/src/$(PROJECT) --privileged $(RUNC_IMAGE) make clean all + $(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \ + --privileged --rm \ + -v $(CURDIR):/go/src/$(PROJECT) \ + $(RUNC_IMAGE) make clean all lint: - $(GO) vet $(allpackages) - $(GO) fmt $(allpackages) + golangci-lint run ./... man: man/md2man-all.sh runcimage: - $(CONTAINER_ENGINE) build ${CONTAINER_ENGINE_BUILD_FLAGS} -t $(RUNC_IMAGE) . 
+ $(CONTAINER_ENGINE) build $(CONTAINER_ENGINE_BUILD_FLAGS) -t $(RUNC_IMAGE) . -test: - make unittest integration rootlessintegration +test: unittest integration rootlessintegration -localtest: - make localunittest localintegration localrootlessintegration +localtest: localunittest localintegration localrootlessintegration unittest: runcimage - $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localunittest TESTFLAGS=${TESTFLAGS} + $(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \ + -t --privileged --rm \ + -v /lib/modules:/lib/modules:ro \ + -v $(CURDIR):/go/src/$(PROJECT) \ + $(RUNC_IMAGE) make localunittest TESTFLAGS=$(TESTFLAGS) localunittest: all - $(GO) test -timeout 3m -tags "$(BUILDTAGS)" ${TESTFLAGS} -v $(allpackages) + $(GO) test -timeout 3m -tags "$(BUILDTAGS)" $(TESTFLAGS) -v ./... integration: runcimage - $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localintegration TESTPATH=${TESTPATH} + $(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \ + -t --privileged --rm \ + -v /lib/modules:/lib/modules:ro \ + -v $(CURDIR):/go/src/$(PROJECT) \ + $(RUNC_IMAGE) make localintegration TESTPATH=$(TESTPATH) localintegration: all - bats -t tests/integration${TESTPATH} + bats -t tests/integration$(TESTPATH) rootlessintegration: runcimage - $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localrootlessintegration + $(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \ + -t --privileged --rm \ + -v $(CURDIR):/go/src/$(PROJECT) \ + -e ROOTLESS_TESTPATH \ + $(RUNC_IMAGE) make localrootlessintegration localrootlessintegration: all tests/rootless.sh shell: runcimage - $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -ti --privileged --rm -v 
$(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) bash + $(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \ + -ti --privileged --rm \ + -v $(CURDIR):/go/src/$(PROJECT) \ + $(RUNC_IMAGE) bash install: - install -D -m0755 runc $(BINDIR)/runc + install -D -m0755 runc $(DESTDIR)$(BINDIR)/runc install-bash: - install -D -m0644 contrib/completions/bash/runc $(PREFIX)/share/bash-completion/completions/runc + install -D -m0644 contrib/completions/bash/runc $(DESTDIR)$(PREFIX)/share/bash-completion/completions/runc -install-man: - install -d -m 755 $(MAN_INSTALL_PATH) - install -m 644 $(MAN_PAGES) $(MAN_INSTALL_PATH) - -uninstall: - rm -f $(BINDIR)/runc - -uninstall-bash: - rm -f $(PREFIX)/share/bash-completion/completions/runc - -uninstall-man: - rm -f $(addprefix $(MAN_INSTALL_PATH),$(MAN_PAGES_BASE)) +install-man: man + install -d -m 755 $(DESTDIR)$(MANDIR)/man8 + install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8 clean: rm -f runc runc-* rm -f contrib/cmd/recvtty/recvtty - rm -rf $(RELEASE_DIR) - rm -rf $(MAN_DIR) + rm -f contrib/cmd/sd-helper/sd-helper + rm -f contrib/cmd/seccompagent/seccompagent + rm -rf release + rm -rf man/man8 -validate: - script/validate-gofmt - script/validate-c - $(GO) vet $(allpackages) +cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/') +cfmt: + indent -linux -l120 -il0 -ppi2 -cp1 -T size_t -T jmp_buf $(C_SRC) -ci: validate test release +shellcheck: + shellcheck tests/integration/*.bats tests/integration/*.sh \ + tests/integration/*.bash tests/*.sh \ + script/release_*.sh script/seccomp.sh script/lib.sh + # TODO: add shellcheck for more sh files -cross: runcimage - $(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -e BUILDTAGS="$(BUILDTAGS)" --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localcross +shfmt: + shfmt -ln bats -d -w tests/integration/*.bats + shfmt -ln bash -d -w man/*.sh script/* tests/*.sh tests/integration/*.bash -localcross: - CGO_ENABLED=1 GOARCH=arm GOARM=6 CC=arm-linux-gnueabi-gcc $(GO) build 
-buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armel . - CGO_ENABLED=1 GOARCH=arm GOARM=7 CC=arm-linux-gnueabihf-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armhf . - CGO_ENABLED=1 GOARCH=arm64 CC=aarch64-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-arm64 . - CGO_ENABLED=1 GOARCH=ppc64le CC=powerpc64le-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-ppc64le . +vendor: + $(GO) mod tidy + $(GO) mod vendor + $(GO) mod verify -# memoize allpackages, so that it's executed only once and only if used -_allpackages = $(shell $(GO) list ./... | grep -v vendor) -allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages) +verify-dependencies: vendor + @test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \ + || (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \ + && echo "all vendor files are up to date." 
+ +.PHONY: runc all recvtty sd-helper seccompagent static releaseall release \ + localrelease dbuild lint man runcimage \ + test localtest unittest localunittest integration localintegration \ + rootlessintegration localrootlessintegration shell install install-bash \ + install-man clean cfmt shfmt shellcheck \ + vendor verify-dependencies diff --git a/README.md b/README.md index a806f27..e2a7b44 100644 --- a/README.md +++ b/README.md @@ -1,39 +1,33 @@ # runc -[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc) [![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc) [![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc) +[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588) +[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate) +[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci) ## Introduction -`runc` is a CLI tool for spawning and running containers according to the OCI specification. +`runc` is a CLI tool for spawning and running containers on Linux according to the OCI specification. ## Releases -`runc` depends on and tracks the [runtime-spec](https://github.com/opencontainers/runtime-spec) repository. -We will try to make sure that `runc` and the OCI specification major versions stay in lockstep. -This means that `runc` 1.0.0 should implement the 1.0 version of the specification. - You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page. 
-Currently, the following features are not considered to be production-ready: - -* Support for cgroup v2 - ## Security -The reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/). +The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md). + +### Security Audit +A third party security audit was performed by Cure53, you can see the full report [here](https://github.com/opencontainers/runc/blob/master/docs/Security-Audit.pdf). ## Building -`runc` currently supports the Linux platform with various architecture support. -It must be built with Go version 1.6 or higher in order for some features to function properly. +`runc` only supports Linux. It must be built with Go version 1.16 or higher. In order to enable seccomp support you will need to install `libseccomp` on your platform. > e.g. `libseccomp-devel` for CentOS, or `libseccomp-dev` for Ubuntu -Otherwise, if you do not want to build `runc` with seccomp support you can add `BUILDTAGS=""` when running make. - ```bash # create a 'github.com/opencontainers' in your GOPATH/src cd github.com/opencontainers @@ -58,21 +52,24 @@ sudo make install #### Build Tags -`runc` supports optional build tags for compiling support of various features. -To add build tags to the make option the `BUILDTAGS` variable must be set. +`runc` supports optional build tags for compiling support of various features, +with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`). + +To change build tags from the default, set the `BUILDTAGS` variable for make, +e.g. 
to disable seccomp: ```bash -make BUILDTAGS='seccomp apparmor' +make BUILDTAGS="" ``` -| Build Tag | Feature | Dependency | -|-----------|------------------------------------|-------------| -| seccomp | Syscall filtering | libseccomp | -| selinux | selinux process and mount labeling | | -| apparmor | apparmor profile support | | -| ambient | ambient capability support | kernel 4.3 | -| nokmem | disable kernel memory account | | +| Build Tag | Feature | Enabled by default | Dependency | +|-----------|------------------------------------|--------------------|------------| +| seccomp | Syscall filtering | yes | libseccomp | +The following build tags were used earlier, but are now obsoleted: + - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored) + - **apparmor** (since runc v1.0.0-rc93 the feature is always enabled) + - **selinux** (since runc v1.0.0-rc93 the feature is always enabled) ### Running the test suite @@ -97,20 +94,41 @@ You can run a specific integration test by setting the `TESTPATH` variable. # make test TESTPATH="/checkpoint.bats" ``` -You can run a test in your proxy environment by setting `DOCKER_BUILD_PROXY` and `DOCKER_RUN_PROXY` variables. +You can run a specific rootless integration test by setting the `ROOTLESS_TESTPATH` variable. ```bash -# make test DOCKER_BUILD_PROXY="--build-arg HTTP_PROXY=http://yourproxy/" DOCKER_RUN_PROXY="-e HTTP_PROXY=http://yourproxy/" +# make test ROOTLESS_TESTPATH="/checkpoint.bats" +``` + +You can run a test using your container engine's flags by setting `CONTAINER_ENGINE_BUILD_FLAGS` and `CONTAINER_ENGINE_RUN_FLAGS` variables. + +```bash +# make test CONTAINER_ENGINE_BUILD_FLAGS="--build-arg http_proxy=http://yourproxy/" CONTAINER_ENGINE_RUN_FLAGS="-e http_proxy=http://yourproxy/" ``` ### Dependencies Management -`runc` uses [vndr](https://github.com/LK4D4/vndr) for dependencies management. 
-Please refer to [vndr](https://github.com/LK4D4/vndr) for how to add or update +`runc` uses [Go Modules](https://github.com/golang/go/wiki/Modules) for dependencies management. +Please refer to [Go Modules](https://github.com/golang/go/wiki/Modules) for how to add or update new dependencies. +``` +# Update vendored dependencies +make vendor +# Verify all dependencies +make verify-dependencies +``` + ## Using runc +Please note that runc is a low level tool not designed with an end user +in mind. It is mostly employed by other higher level container software. + +Therefore, unless there is some specific use case that prevents the use +of tools like Docker or Podman, it is not recommended to use runc directly. + +If you still want to use runc, here's how. + ### Creating an OCI Bundle In order to use runc you must have your container in the format of an OCI bundle. @@ -152,7 +170,9 @@ If you used the unmodified `runc spec` template this should give you a `sh` sess The second way to start a container is using the specs lifecycle operations. This gives you more power over how the container is created and managed while it is running. -This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here. +This will also launch the container in the background so you will have to edit +the `config.json` to remove the `terminal` setting for the simple examples +below (see more details about [runc terminal handling](docs/terminals.md)). Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`. 
@@ -275,6 +295,14 @@ PIDFile=/run/mycontainerid.pid WantedBy=multi-user.target ``` +## More documentation + +* [cgroup v2](./docs/cgroup-v2.md) +* [Checkpoint and restore](./docs/checkpoint-restore.md) +* [systemd cgroup driver](./docs/systemd.md) +* [Terminals and standard IO](./docs/terminals.md) +* [Experimental features](./docs/experimental.md) + ## License The code and docs are released under the [Apache 2.0 license](LICENSE). diff --git a/SECURITY.md b/SECURITY.md index 63a7438..61e37bc 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,3 +1,3 @@ # Security -The reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/). +The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md). diff --git a/VERSION b/VERSION index 950f8ca..9084fa2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.0.0-rc10 +1.1.0 diff --git a/Vagrantfile.fedora b/Vagrantfile.fedora new file mode 100644 index 0000000..fc96d7f --- /dev/null +++ b/Vagrantfile.fedora @@ -0,0 +1,52 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +Vagrant.configure("2") do |config| +# Fedora box is used for testing cgroup v2 support + config.vm.box = "fedora/35-cloud-base" + config.vm.provider :virtualbox do |v| + v.memory = 2048 + v.cpus = 2 + end + config.vm.provider :libvirt do |v| + v.memory = 2048 + v.cpus = 2 + end + config.vm.provision "shell", inline: <<-SHELL + set -e -u -o pipefail + # Work around dnf mirror failures by retrying a few times + for i in $(seq 0 2); do + sleep $i + # "config exclude" dnf shell command is not working in Fedora 35 + # (see https://bugzilla.redhat.com/show_bug.cgi?id=2022571); + # the workaround is to specify it as an option. 
+ cat << EOF | dnf -y --exclude=kernel,kernel-core shell && break +config install_weak_deps false +update +install iptables gcc make golang-go glibc-static libseccomp-devel bats jq git-core criu fuse-sshfs +ts run +EOF + done + dnf clean all + + # Add a user for rootless tests + useradd -u2000 -m -d/home/rootless -s/bin/bash rootless + + # Allow root and rootless itself to execute `ssh rootless@localhost` in tests/rootless.sh + ssh-keygen -t ecdsa -N "" -f /root/rootless.key + mkdir -m 0700 -p /home/rootless/.ssh + cp /root/rootless.key /home/rootless/.ssh/id_ecdsa + cat /root/rootless.key.pub >> /home/rootless/.ssh/authorized_keys + chown -R rootless.rootless /home/rootless + + # Delegate cgroup v2 controllers to rootless user via --systemd-cgroup + mkdir -p /etc/systemd/system/user@.service.d + cat > /etc/systemd/system/user@.service.d/delegate.conf << EOF +[Service] +# default: Delegate=pids memory +# NOTE: delegation of cpuset requires systemd >= 244 (Fedora >= 32, Ubuntu >= 20.04). 
+Delegate=yes +EOF + systemctl daemon-reload + SHELL +end diff --git a/checkpoint.go b/checkpoint.go index ae01ea3..32a62a8 100644 --- a/checkpoint.go +++ b/checkpoint.go @@ -1,19 +1,19 @@ -// +build linux - package main import ( + "errors" "fmt" + "net" "os" + "path/filepath" "strconv" - "strings" + criu "github.com/checkpoint-restore/go-criu/v5/rpc" "github.com/opencontainers/runc/libcontainer" - "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "github.com/urfave/cli" - "golang.org/x/sys/unix" ) @@ -34,7 +34,7 @@ checkpointed.`, cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external unix sockets"}, cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"}, cli.BoolFlag{Name: "lazy-pages", Usage: "use userfaultfd to lazily restore memory pages"}, - cli.StringFlag{Name: "status-fd", Value: "", Usage: "criu writes \\0 to this FD once lazy-pages is ready"}, + cli.IntFlag{Name: "status-fd", Value: -1, Usage: "criu writes \\0 to this FD once lazy-pages is ready"}, cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"}, cli.BoolFlag{Name: "file-locks", Usage: "handle file locks, for safety"}, cli.BoolFlag{Name: "pre-dump", Usage: "dump container's memory information only, leave the container running after this"}, @@ -47,7 +47,7 @@ checkpointed.`, return err } // XXX: Currently this is untested with rootless containers. 
- if os.Geteuid() != 0 || system.RunningInUserNS() { + if os.Geteuid() != 0 || userns.RunningInUserNS() { logrus.Warn("runc checkpoint is untested with rootless containers") } @@ -60,10 +60,13 @@ checkpointed.`, return err } if status == libcontainer.Created || status == libcontainer.Stopped { - fatalf("Container cannot be checkpointed in %s state", status.String()) + fatal(fmt.Errorf("Container cannot be checkpointed in %s state", status.String())) } - defer destroy(container) options := criuOptions(context) + if !(options.LeaveRunning || options.PreDump) { + // destroy container unless we tell CRIU to keep it + defer destroy(container) + } // these are the mandatory criu options for a container setPageServer(context, options) setManageCgroupsMode(context, options) @@ -74,28 +77,53 @@ checkpointed.`, }, } -func getCheckpointImagePath(context *cli.Context) string { +func prepareImagePaths(context *cli.Context) (string, string, error) { imagePath := context.String("image-path") if imagePath == "" { - imagePath = getDefaultImagePath(context) + imagePath = getDefaultImagePath() } - return imagePath + + if err := os.MkdirAll(imagePath, 0o600); err != nil { + return "", "", err + } + + parentPath := context.String("parent-path") + if parentPath == "" { + return imagePath, parentPath, nil + } + + if filepath.IsAbs(parentPath) { + return "", "", errors.New("--parent-path must be relative") + } + + realParent := filepath.Join(imagePath, parentPath) + fi, err := os.Stat(realParent) + if err == nil && !fi.IsDir() { + err = &os.PathError{Path: realParent, Err: unix.ENOTDIR} + } + + if err != nil { + return "", "", fmt.Errorf("invalid --parent-path: %w", err) + } + + return imagePath, parentPath, nil } func setPageServer(context *cli.Context, options *libcontainer.CriuOpts) { // xxx following criu opts are optional // The dump image can be sent to a criu page server if psOpt := context.String("page-server"); psOpt != "" { - addressPort := strings.Split(psOpt, ":") - if 
len(addressPort) != 2 { - fatal(fmt.Errorf("Use --page-server ADDRESS:PORT to specify page server")) + address, port, err := net.SplitHostPort(psOpt) + + if err != nil || address == "" || port == "" { + fatal(errors.New("Use --page-server ADDRESS:PORT to specify page server")) } - portInt, err := strconv.Atoi(addressPort[1]) + portInt, err := strconv.Atoi(port) if err != nil { - fatal(fmt.Errorf("Invalid port number")) + fatal(errors.New("Invalid port number")) } options.PageServer = libcontainer.CriuPageServerInfo{ - Address: addressPort[0], + Address: address, Port: int32(portInt), } } @@ -105,13 +133,13 @@ func setManageCgroupsMode(context *cli.Context, options *libcontainer.CriuOpts) if cgOpt := context.String("manage-cgroups-mode"); cgOpt != "" { switch cgOpt { case "soft": - options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_SOFT + options.ManageCgroupsMode = criu.CriuCgMode_SOFT case "full": - options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_FULL + options.ManageCgroupsMode = criu.CriuCgMode_FULL case "strict": - options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_STRICT + options.ManageCgroupsMode = criu.CriuCgMode_STRICT default: - fatal(fmt.Errorf("Invalid manage cgroups mode")) + fatal(errors.New("Invalid manage cgroups mode")) } } } diff --git a/contrib/cmd/recvtty/recvtty.go b/contrib/cmd/recvtty/recvtty.go index a658b8d..35c293d 100644 --- a/contrib/cmd/recvtty/recvtty.go +++ b/contrib/cmd/recvtty/recvtty.go @@ -17,12 +17,13 @@ package main import ( + "errors" "fmt" "io" - "io/ioutil" "net" "os" "strings" + "sync" "github.com/containerd/console" "github.com/opencontainers/runc/libcontainer/utils" @@ -65,7 +66,7 @@ func bail(err error) { os.Exit(1) } -func handleSingle(path string) error { +func handleSingle(path string, noStdin bool) error { // Open a socket. ln, err := net.Listen("unix", path) if err != nil { @@ -87,7 +88,7 @@ func handleSingle(path string) error { // Get the fd of the connection. 
unixconn, ok := conn.(*net.UnixConn) if !ok { - return fmt.Errorf("failed to cast to unixconn") + return errors.New("failed to cast to unixconn") } socket, err := unixconn.File() @@ -105,23 +106,37 @@ func handleSingle(path string) error { if err != nil { return err } - console.ClearONLCR(c.Fd()) + if err := console.ClearONLCR(c.Fd()); err != nil { + return err + } // Copy from our stdio to the master fd. - quitChan := make(chan struct{}) + var ( + wg sync.WaitGroup + inErr, outErr error + ) + wg.Add(1) go func() { - io.Copy(os.Stdout, c) - quitChan <- struct{}{} - }() - go func() { - io.Copy(c, os.Stdin) - quitChan <- struct{}{} + _, outErr = io.Copy(os.Stdout, c) + wg.Done() }() + if !noStdin { + wg.Add(1) + go func() { + _, inErr = io.Copy(c, os.Stdin) + wg.Done() + }() + } // Only close the master fd once we've stopped copying. - <-quitChan + wg.Wait() c.Close() - return nil + + if outErr != nil { + return outErr + } + + return inErr } func handleNull(path string) error { @@ -161,15 +176,7 @@ func handleNull(path string) error { return } - // Just do a dumb copy to /dev/null. - devnull, err := os.OpenFile("/dev/null", os.O_RDWR, 0) - if err != nil { - // TODO: Handle this nicely. 
- return - } - - io.Copy(devnull, master) - devnull.Close() + _, _ = io.Copy(io.Discard, master) }(conn) } } @@ -185,7 +192,7 @@ func main() { v = append(v, version) } if gitCommit != "" { - v = append(v, fmt.Sprintf("commit: %s", gitCommit)) + v = append(v, "commit: "+gitCommit) } app.Version = strings.Join(v, "\n") @@ -201,26 +208,31 @@ func main() { Value: "", Usage: "Path to write daemon process ID to", }, + cli.BoolFlag{ + Name: "no-stdin", + Usage: "Disable stdin handling (no-op for null mode)", + }, } app.Action = func(ctx *cli.Context) error { args := ctx.Args() if len(args) != 1 { - return fmt.Errorf("need to specify a single socket path") + return errors.New("need to specify a single socket path") } path := ctx.Args()[0] pidPath := ctx.String("pid-file") if pidPath != "" { pid := fmt.Sprintf("%d\n", os.Getpid()) - if err := ioutil.WriteFile(pidPath, []byte(pid), 0644); err != nil { + if err := os.WriteFile(pidPath, []byte(pid), 0o644); err != nil { return err } } + noStdin := ctx.Bool("no-stdin") switch ctx.String("mode") { case "single": - if err := handleSingle(path); err != nil { + if err := handleSingle(path, noStdin); err != nil { return err } case "null": diff --git a/contrib/cmd/sd-helper/helper.go b/contrib/cmd/sd-helper/helper.go new file mode 100644 index 0000000..fc2bf38 --- /dev/null +++ b/contrib/cmd/sd-helper/helper.go @@ -0,0 +1,86 @@ +package main + +import ( + "flag" + "fmt" + "os" + + "github.com/sirupsen/logrus" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func usage() { + fmt.Print(`Open Container Initiative contrib/cmd/sd-helper + +sd-helper is a tool that uses runc/libcontainer/cgroups/systemd package +functionality to communicate to systemd in order to perform various operations. +Currently this is limited to starting and stopping systemd transient slice +units. 
+ +Usage: + sd-helper [-debug] [-parent ] {start|stop} + +Example: + sd-helper -parent system.slice start system-pod123.slice +`) + os.Exit(1) +} + +var ( + debug = flag.Bool("debug", false, "enable debug output") + parent = flag.String("parent", "", "parent unit name") +) + +func main() { + if !systemd.IsRunningSystemd() { + logrus.Fatal("systemd is required") + } + + // Set the flags. + flag.Parse() + if *debug { + logrus.SetLevel(logrus.DebugLevel) + } + if flag.NArg() != 2 { + usage() + } + + cmd := flag.Arg(0) + unit := flag.Arg(1) + + err := unitCommand(cmd, unit, *parent) + if err != nil { + logrus.Fatal(err) + } +} + +func newManager(config *configs.Cgroup) (cgroups.Manager, error) { + if cgroups.IsCgroup2UnifiedMode() { + return systemd.NewUnifiedManager(config, "") + } + return systemd.NewLegacyManager(config, nil) +} + +func unitCommand(cmd, name, parent string) error { + podConfig := &configs.Cgroup{ + Name: name, + Parent: parent, + Resources: &configs.Resources{}, + } + pm, err := newManager(podConfig) + if err != nil { + return err + } + + switch cmd { + case "start": + return pm.Apply(-1) + case "stop": + return pm.Destroy() + } + + return fmt.Errorf("unknown command: %s", cmd) +} diff --git a/contrib/cmd/seccompagent/README.md b/contrib/cmd/seccompagent/README.md new file mode 100644 index 0000000..d42d4bd --- /dev/null +++ b/contrib/cmd/seccompagent/README.md @@ -0,0 +1,70 @@ +# Seccomp Agent + +## Warning + +Please note this is an example agent, as such it is possible that specially +crafted messages can produce bad behaviour. Please use it as an example only. + +Also, this agent is used for integration tests. Be aware that changing the +behaviour can break the integration tests. 
+ +## Get started + +Compile runc and seccompagent: +```bash +make all +``` + +Run the seccomp agent in the background: +```bash +sudo ./contrib/cmd/seccompagent/seccompagent & +``` + +Prepare a container: +```bash +mkdir container-seccomp-notify +cd container-seccomp-notify +mkdir rootfs +docker export $(docker create busybox) | tar -C rootfs -xvf - +``` + +Then, generate a config.json by running the script gen-seccomp-example-cfg.sh +from the directory where this README.md is in the container directory you +prepared earlier (`container-seccomp-notify`). + +Then start the container: +```bash +runc run mycontainerid +``` + +The container will output something like this: +```bash ++ cd /dev/shm ++ mkdir test-dir ++ touch test-file ++ chmod 777 test-file +chmod: changing permissions of 'test-file': No medium found ++ stat /dev/shm/test-dir-foo + File: /dev/shm/test-dir-foo + Size: 40 Blocks: 0 IO Block: 4096 directory +Device: 3eh/62d Inode: 2 Links: 2 +Access: (0755/drwxr-xr-x) Uid: ( 0/ root) Gid: ( 0/ root) +Access: 2021-09-09 15:03:13.043716040 +0000 +Modify: 2021-09-09 15:03:13.043716040 +0000 +Change: 2021-09-09 15:03:13.043716040 +0000 + Birth: - ++ ls -l /dev/shm +total 0 +drwxr-xr-x 2 root root 40 Sep 9 15:03 test-dir-foo +-rw-r--r-- 1 root root 0 Sep 9 15:03 test-file ++ echo Note the agent added a suffix for the directory name and chmod fails +Note the agent added a suffix for the directory name and chmod fails +``` + +This shows a simple example that runs in /dev/shm just because it is a tmpfs in +the example config.json. + +The agent makes all chmod calls fail with ENOMEDIUM, as the example output shows. + +For mkdir, the agent adds a "-foo" suffix: the container runs "mkdir test-dir" +but the directory created is "test-dir-foo". 
diff --git a/contrib/cmd/seccompagent/gen-seccomp-example-cfg.sh b/contrib/cmd/seccompagent/gen-seccomp-example-cfg.sh new file mode 100755 index 0000000..bd4e209 --- /dev/null +++ b/contrib/cmd/seccompagent/gen-seccomp-example-cfg.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Detect if we are running inside bats (i.e. inside integration tests) or just +# called by an end-user +# bats-core v1.2.1 defines BATS_RUN_TMPDIR +if [ -z "$BATS_RUN_TMPDIR" ]; then + # When not running in bats, we create the config.json + set -e + runc spec +fi + +# We can't source $(dirname $0)/../../../tests/integration/helpers.bash as that +# exits when not running inside bats. We can do hacks, but just to redefine +# update_config() seems clearer. We don't even really need to keep them in sync. +function update_config() { + jq "$1" "./config.json" | awk 'BEGIN{RS="";getline<"-";print>ARGV[1]}' "./config.json" +} + +update_config '.linux.seccomp = { + "defaultAction": "SCMP_ACT_ALLOW", + "listenerPath": "/run/seccomp-agent.socket", + "listenerMetadata": "foo", + "architectures": [ "SCMP_ARCH_X86", "SCMP_ARCH_X32", "SCMP_ARCH_X86_64" ], + "syscalls": [ + { + "names": [ "chmod", "fchmod", "fchmodat", "mkdir" ], + "action": "SCMP_ACT_NOTIFY" + } + ] + }' + +update_config '.process.args = [ + "sh", + "-c", + "set -x; cd /dev/shm; mkdir test-dir; touch test-file; chmod 777 test-file; stat /dev/shm/test-dir-foo && ls -l /dev/shm && echo \"Note the agent added a suffix for the directory name and chmod fails\" " + ]' diff --git a/contrib/cmd/seccompagent/seccompagent.go b/contrib/cmd/seccompagent/seccompagent.go new file mode 100644 index 0000000..9a6abc8 --- /dev/null +++ b/contrib/cmd/seccompagent/seccompagent.go @@ -0,0 +1,291 @@ +//go:build linux && seccomp +// +build linux,seccomp + +package main + +import ( + "bytes" + "encoding/json" + "errors" + "flag" + "fmt" + "net" + "os" + "path/filepath" + "strings" + + securejoin "github.com/cyphar/filepath-securejoin" + 
"github.com/opencontainers/runtime-spec/specs-go" + libseccomp "github.com/seccomp/libseccomp-golang" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +var ( + socketFile string + pidFile string +) + +func closeStateFds(recvFds []int) { + for i := range recvFds { + unix.Close(i) + } +} + +// parseStateFds returns the seccomp-fd and closes the rest of the fds in recvFds. +// In case of error, no fd is closed. +// StateFds is assumed to be formatted as specs.ContainerProcessState.Fds and +// recvFds the corresponding list of received fds in the same SCM_RIGHT message. +func parseStateFds(stateFds []string, recvFds []int) (uintptr, error) { + // Let's find the index in stateFds of the seccomp-fd. + idx := -1 + err := false + + for i, name := range stateFds { + if name == specs.SeccompFdName && idx == -1 { + idx = i + continue + } + + // We found the seccompFdName twice. Error out! + if name == specs.SeccompFdName && idx != -1 { + err = true + } + } + + if idx == -1 || err { + return 0, errors.New("seccomp fd not found or malformed containerProcessState.Fds") + } + + if idx >= len(recvFds) || idx < 0 { + return 0, errors.New("seccomp fd index out of range") + } + + fd := uintptr(recvFds[idx]) + + for i := range recvFds { + if i == idx { + continue + } + + unix.Close(recvFds[i]) + } + + return fd, nil +} + +func handleNewMessage(sockfd int) (uintptr, string, error) { + const maxNameLen = 4096 + stateBuf := make([]byte, maxNameLen) + oobSpace := unix.CmsgSpace(4) + oob := make([]byte, oobSpace) + + n, oobn, _, _, err := unix.Recvmsg(sockfd, stateBuf, oob, 0) + if err != nil { + return 0, "", err + } + if n >= maxNameLen || oobn != oobSpace { + return 0, "", fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) + } + + // Truncate. 
+ stateBuf = stateBuf[:n] + oob = oob[:oobn] + + scms, err := unix.ParseSocketControlMessage(oob) + if err != nil { + return 0, "", err + } + if len(scms) != 1 { + return 0, "", fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) + } + scm := scms[0] + + fds, err := unix.ParseUnixRights(&scm) + if err != nil { + return 0, "", err + } + + containerProcessState := &specs.ContainerProcessState{} + err = json.Unmarshal(stateBuf, containerProcessState) + if err != nil { + closeStateFds(fds) + return 0, "", fmt.Errorf("cannot parse OCI state: %w", err) + } + + fd, err := parseStateFds(containerProcessState.Fds, fds) + if err != nil { + closeStateFds(fds) + return 0, "", err + } + + return fd, containerProcessState.Metadata, nil +} + +func readArgString(pid uint32, offset int64) (string, error) { + buffer := make([]byte, 4096) // PATH_MAX + + memfd, err := unix.Open(fmt.Sprintf("/proc/%d/mem", pid), unix.O_RDONLY, 0o777) + if err != nil { + return "", err + } + defer unix.Close(memfd) + + _, err = unix.Pread(memfd, buffer, offset) + if err != nil { + return "", err + } + + buffer[len(buffer)-1] = 0 + s := buffer[:bytes.IndexByte(buffer, 0)] + return string(s), nil +} + +func runMkdirForContainer(pid uint32, fileName string, mode uint32, metadata string) error { + // We validated before that metadata is not a string that can make + // newFile a file in a different location other than root. 
+ newFile := fmt.Sprintf("%s-%s", fileName, metadata) + root := fmt.Sprintf("/proc/%d/cwd/", pid) + + if strings.HasPrefix(fileName, "/") { + // If it starts with /, use the rootfs as base + root = fmt.Sprintf("/proc/%d/root/", pid) + } + + path, err := securejoin.SecureJoin(root, newFile) + if err != nil { + return err + } + + return unix.Mkdir(path, mode) +} + +// notifHandler handles seccomp notifications and responses +func notifHandler(fd libseccomp.ScmpFd, metadata string) { + defer unix.Close(int(fd)) + for { + req, err := libseccomp.NotifReceive(fd) + if err != nil { + logrus.Errorf("Error in NotifReceive(): %s", err) + continue + } + syscallName, err := req.Data.Syscall.GetName() + if err != nil { + logrus.Errorf("Error decoding syscall %v(): %s", req.Data.Syscall, err) + continue + } + logrus.Debugf("Received syscall %q, pid %v, arch %q, args %+v", syscallName, req.Pid, req.Data.Arch, req.Data.Args) + + resp := &libseccomp.ScmpNotifResp{ + ID: req.ID, + Error: 0, + Val: 0, + Flags: libseccomp.NotifRespFlagContinue, + } + + // TOCTOU check + if err := libseccomp.NotifIDValid(fd, req.ID); err != nil { + logrus.Errorf("TOCTOU check failed: req.ID is no longer valid: %s", err) + continue + } + + switch syscallName { + case "mkdir": + fileName, err := readArgString(req.Pid, int64(req.Data.Args[0])) + if err != nil { + logrus.Errorf("Cannot read argument: %s", err) + resp.Error = int32(unix.ENOSYS) + resp.Val = ^uint64(0) // -1 + goto sendResponse + } + + logrus.Debugf("mkdir: %q", fileName) + + // TOCTOU check + if err := libseccomp.NotifIDValid(fd, req.ID); err != nil { + logrus.Errorf("TOCTOU check failed: req.ID is no longer valid: %s", err) + continue + } + + err = runMkdirForContainer(req.Pid, fileName, uint32(req.Data.Args[1]), metadata) + if err != nil { + resp.Error = int32(unix.ENOSYS) + resp.Val = ^uint64(0) // -1 + } + resp.Flags = 0 + case "chmod", "fchmod", "fchmodat": + resp.Error = int32(unix.ENOMEDIUM) + resp.Val = ^uint64(0) // -1 + resp.Flags 
= 0 + } + + sendResponse: + if err = libseccomp.NotifRespond(fd, resp); err != nil { + logrus.Errorf("Error in notification response: %s", err) + continue + } + } +} + +func main() { + flag.StringVar(&socketFile, "socketfile", "/run/seccomp-agent.socket", "Socket file") + flag.StringVar(&pidFile, "pid-file", "", "Pid file") + logrus.SetLevel(logrus.DebugLevel) + + // Parse arguments + flag.Parse() + if flag.NArg() > 0 { + flag.PrintDefaults() + logrus.Fatal("Invalid command") + } + + if err := os.Remove(socketFile); err != nil && !errors.Is(err, os.ErrNotExist) { + logrus.Fatalf("Cannot cleanup socket file: %v", err) + } + + if pidFile != "" { + pid := fmt.Sprintf("%d", os.Getpid()) + if err := os.WriteFile(pidFile, []byte(pid), 0o644); err != nil { + logrus.Fatalf("Cannot write pid file: %v", err) + } + } + + logrus.Info("Waiting for seccomp file descriptors") + l, err := net.Listen("unix", socketFile) + if err != nil { + logrus.Fatalf("Cannot listen: %s", err) + } + defer l.Close() + + for { + conn, err := l.Accept() + if err != nil { + logrus.Errorf("Cannot accept connection: %s", err) + continue + } + socket, err := conn.(*net.UnixConn).File() + conn.Close() + if err != nil { + logrus.Errorf("Cannot get socket: %v", err) + continue + } + newFd, metadata, err := handleNewMessage(int(socket.Fd())) + socket.Close() + if err != nil { + logrus.Errorf("Error receiving seccomp file descriptor: %v", err) + continue + } + + // Make sure we don't allow strings like "/../p", as that means + // a file in a different location than expected. We just want + // safe things to use as a suffix for a file name. + metadata = filepath.Base(metadata) + if strings.Contains(metadata, "/") { + // Fallback to a safe string. 
+ metadata = "agent-generated-suffix" + } + + logrus.Infof("Received new seccomp fd: %v", newFd) + go notifHandler(libseccomp.ScmpFd(newFd), metadata) + } +} diff --git a/contrib/cmd/seccompagent/unsupported.go b/contrib/cmd/seccompagent/unsupported.go new file mode 100644 index 0000000..773eeaf --- /dev/null +++ b/contrib/cmd/seccompagent/unsupported.go @@ -0,0 +1,10 @@ +//go:build !linux || !seccomp +// +build !linux !seccomp + +package main + +import "fmt" + +func main() { + fmt.Println("Not supported, to use this compile with build tag: seccomp.") +} diff --git a/contrib/completions/bash/runc b/contrib/completions/bash/runc index 9517a5b..a4cd899 100644 --- a/contrib/completions/bash/runc +++ b/contrib/completions/bash/runc @@ -113,6 +113,8 @@ __runc_complete_capabilities() { AUDIT_WRITE AUDIT_READ BLOCK_SUSPEND + BPF + CHECKPOINT_RESTORE CHOWN DAC_OVERRIDE DAC_READ_SEARCH @@ -130,6 +132,7 @@ __runc_complete_capabilities() { NET_BIND_SERVICE NET_BROADCAST NET_RAW + PERFMON SETFCAP SETGID SETPCAP @@ -170,6 +173,7 @@ _runc_exec() { --apparmor --cap, -c --preserve-fds + --ignore-paused " local all_options="$options_with_args $boolean_options" @@ -221,6 +225,7 @@ _runc_runc() { --help --version -v --debug + --systemd-cgroup " local options_with_args=" --log @@ -733,8 +738,6 @@ _runc_update() { --cpu-share --cpuset-cpus --cpuset-mems - --kernel-memory - --kernel-memory-tcp --memory --memory-reservation --memory-swap @@ -769,7 +772,6 @@ _runc() { delete events exec - init kill list pause diff --git a/create.go b/create.go index 5f3ac60..97854b8 100644 --- a/create.go +++ b/create.go @@ -1,6 +1,7 @@ package main import ( + "fmt" "os" "github.com/urfave/cli" @@ -55,20 +56,12 @@ command(s) that get executed on start, edit the args parameter of the spec. 
See if err := checkArgs(context, 1, exactArgs); err != nil { return err } - if err := revisePidFile(context); err != nil { - return err + status, err := startContainer(context, CT_ACT_CREATE, nil) + if err == nil { + // exit with the container's exit status so any external supervisor + // is notified of the exit with the correct exit status. + os.Exit(status) } - spec, err := setupSpec(context) - if err != nil { - return err - } - status, err := startContainer(context, spec, CT_ACT_CREATE, nil) - if err != nil { - return err - } - // exit with the container's exit status so any external supervisor is - // notified of the exit with the correct exit status. - os.Exit(status) - return nil + return fmt.Errorf("runc create failed: %w", err) }, } diff --git a/debian/changelog b/debian/changelog index 4fcb02f..3a69717 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +runc (1.1.0-ok1) yangtze; urgency=medium + + * Merge new upstream version 1.1.0 + + -- Luoyaoming Fri, 30 Dec 2022 11:11:29 +0800 + runc (1.0.0~rc10-ok2) yangtze; urgency=medium * Update version. 
diff --git a/debian/patches/test--fix_TestGetAdditionalGroups.patch b/debian/patches/test--fix_TestGetAdditionalGroups.patch new file mode 100644 index 0000000..6f32e6b --- /dev/null +++ b/debian/patches/test--fix_TestGetAdditionalGroups.patch @@ -0,0 +1,39 @@ +From: Dmitry Smirnov +Date: Thu, 28 Jul 2022 16:28:22 +0800 +Subject: fix FTBFS on i686 + +src/github.com/opencontainers/runc/libcontainer/user/user_test.go:448:36: constant 2147483648 overflows int +Last-Update: 2018-06-16 +Forwarded: https://github.com/opencontainers/runc/pull/1821 +Bug-Upstream: https://github.com/opencontainers/runc/issues/941 +--- + libcontainer/user/user.go | 2 +- + libcontainer/user/user_test.go | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/libcontainer/user/user.go b/libcontainer/user/user.go +index 7b912bb..38caded 100644 +--- a/libcontainer/user/user.go ++++ b/libcontainer/user/user.go +@@ -473,7 +473,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err + return nil, fmt.Errorf("Unable to find group %s", ag) + } + // Ensure gid is inside gid range. 
+- if gid < minId || gid > maxId { ++ if gid < minId || gid >= maxId { + return nil, ErrRange + } + gidMap[gid] = struct{}{} +diff --git a/libcontainer/user/user_test.go b/libcontainer/user/user_test.go +index 24ee559..a4aabdc 100644 +--- a/libcontainer/user/user_test.go ++++ b/libcontainer/user/user_test.go +@@ -445,7 +445,7 @@ this is just some garbage data + if utils.GetIntSize() > 4 { + tests = append(tests, foo{ + // groups with too large id +- groups: []string{strconv.Itoa(1 << 31)}, ++ groups: []string{strconv.Itoa( 1<<31 -1 )}, + expected: nil, + hasError: true, + }) diff --git a/debian/patches/test--skip-Hugetlb.patch b/debian/patches/test--skip-Hugetlb.patch new file mode 100644 index 0000000..a1fef73 --- /dev/null +++ b/debian/patches/test--skip-Hugetlb.patch @@ -0,0 +1,48 @@ +From: Dmitry Smirnov +Date: Thu, 28 Jul 2022 16:28:22 +0800 +Subject: disabled unreliable tests due to random failures on [ppc64el, + s390x]. + +Last-Update: 2018-09-27 +Forwarded: not-needed +Bug-Upstream: https://github.com/opencontainers/runc/issues/1822 +--- + libcontainer/cgroups/fs/hugetlb_test.go | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/libcontainer/cgroups/fs/hugetlb_test.go b/libcontainer/cgroups/fs/hugetlb_test.go +index 9ddacfe..9b60650 100644 +--- a/libcontainer/cgroups/fs/hugetlb_test.go ++++ b/libcontainer/cgroups/fs/hugetlb_test.go +@@ -89,6 +89,7 @@ func TestHugetlbStats(t *testing.T) { + } + + func TestHugetlbStatsNoUsageFile(t *testing.T) { ++t.Skip("Disabled unreliable test") + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ +@@ -104,6 +105,7 @@ func TestHugetlbStatsNoUsageFile(t *testing.T) { + } + + func TestHugetlbStatsNoMaxUsageFile(t *testing.T) { ++t.Skip("Disabled unreliable test") + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + for _, pageSize := range HugePageSizes { +@@ -121,6 +123,7 @@ func TestHugetlbStatsNoMaxUsageFile(t *testing.T) { + } + + 
func TestHugetlbStatsBadUsageFile(t *testing.T) { ++t.Skip("Disabled unreliable test") + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + for _, pageSize := range HugePageSizes { +@@ -139,6 +142,7 @@ func TestHugetlbStatsBadUsageFile(t *testing.T) { + } + + func TestHugetlbStatsBadMaxUsageFile(t *testing.T) { ++t.Skip("Disabled unreliable test") + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ diff --git a/debian/patches/test--skip_TestFactoryNewTmpfs.patch b/debian/patches/test--skip_TestFactoryNewTmpfs.patch new file mode 100644 index 0000000..88e0f83 --- /dev/null +++ b/debian/patches/test--skip_TestFactoryNewTmpfs.patch @@ -0,0 +1,22 @@ +From: Dmitry Smirnov +Date: Thu, 28 Jul 2022 16:28:22 +0800 +Subject: disable test (requires root) + +Last-Update: 2018-06-15 +Forwarded: not-needed +--- + libcontainer/factory_linux_test.go | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/libcontainer/factory_linux_test.go b/libcontainer/factory_linux_test.go +index 8d0ca8a..1dc0180 100644 +--- a/libcontainer/factory_linux_test.go ++++ b/libcontainer/factory_linux_test.go +@@ -78,6 +78,7 @@ func TestFactoryNewIntelRdt(t *testing.T) { + } + + func TestFactoryNewTmpfs(t *testing.T) { ++t.Skip("DM - skipping privileged test") + root, rerr := newTestRoot() + if rerr != nil { + t.Fatal(rerr) diff --git a/delete.go b/delete.go index fb6f38e..746b0df 100644 --- a/delete.go +++ b/delete.go @@ -1,12 +1,10 @@ -// +build !solaris - package main import ( + "errors" "fmt" "os" "path/filepath" - "syscall" "time" "github.com/opencontainers/runc/libcontainer" @@ -19,12 +17,12 @@ func killContainer(container libcontainer.Container) error { _ = container.Signal(unix.SIGKILL, false) for i := 0; i < 100; i++ { time.Sleep(100 * time.Millisecond) - if err := container.Signal(syscall.Signal(0), false); err != nil { + if err := container.Signal(unix.Signal(0), false); err != nil { destroy(container) return nil 
} } - return fmt.Errorf("container init still running") + return errors.New("container init still running") } var deleteCommand = cli.Command{ @@ -55,7 +53,7 @@ status of "ubuntu01" as "stopped" the following will delete resources held for force := context.Bool("force") container, err := getContainer(context) if err != nil { - if lerr, ok := err.(libcontainer.Error); ok && lerr.Code() == libcontainer.ContainerNotExists { + if errors.Is(err, libcontainer.ErrNotExist) { // if there was an aborted start or something of the sort then the container's directory could exist but // libcontainer does not see it because the state.json file inside that directory was never created. path := filepath.Join(context.GlobalString("root"), id) @@ -81,7 +79,7 @@ status of "ubuntu01" as "stopped" the following will delete resources held for if force { return killContainer(container) } - return fmt.Errorf("cannot delete container %s that is not stopped: %s\n", id, s) + return fmt.Errorf("cannot delete container %s that is not stopped: %s", id, s) } return nil diff --git a/docs/Security-Audit.pdf b/docs/Security-Audit.pdf new file mode 100644 index 0000000..c41af42 Binary files /dev/null and b/docs/Security-Audit.pdf differ diff --git a/docs/cgroup-v2.md b/docs/cgroup-v2.md new file mode 100644 index 0000000..3d573d5 --- /dev/null +++ b/docs/cgroup-v2.md @@ -0,0 +1,62 @@ +# cgroup v2 + +runc fully supports cgroup v2 (unified mode) since v1.0.0-rc93. + +To use cgroup v2, you might need to change the configuration of the host init system. +Fedora (>= 31) uses cgroup v2 by default and no extra configuration is required. +On other systemd-based distros, cgroup v2 can be enabled by adding `systemd.unified_cgroup_hierarchy=1` to the kernel cmdline. + +## Am I using cgroup v2? + +Yes if `/sys/fs/cgroup/cgroup.controllers` is present. 
+ +## Host Requirements +### Kernel +* Recommended version: 5.2 or later +* Minimum version: 4.15 + +Kernel older than 5.2 is not recommended due to lack of freezer. + +Notably, kernel older than 4.15 MUST NOT be used (unless you are running containers with user namespaces), as it lacks support for controlling permissions of devices. + +### Systemd +On cgroup v2 hosts, it is highly recommended to run runc with the systemd cgroup driver (`runc --systemd-cgroup`), though not mandatory. + +The recommended systemd version is 244 or later. Older systemd does not support delegation of `cpuset` controller. + +Make sure you also have the `dbus-user-session` (Debian/Ubuntu) or `dbus-daemon` (CentOS/Fedora) package installed, and that `dbus` is running. On Debian-flavored distros, this can be accomplished like so: + +```console +$ sudo apt install -y dbus-user-session +$ systemctl --user start dbus +``` + +## Rootless +On cgroup v2 hosts, rootless runc can talk to systemd to get cgroup permissions to be delegated. + +```console +$ runc spec --rootless +$ jq '.linux.cgroupsPath="user.slice:runc:foo"' config.json | sponge config.json +$ runc --systemd-cgroup run foo +``` + +The container processes are executed in a cgroup like `/user.slice/user-$(id -u).slice/user@$(id -u).service/user.slice/runc-foo.scope`. + +### Configuring delegation +Typically, only `memory` and `pids` controllers are delegated to non-root users by default. 
+ +```console +$ cat /sys/fs/cgroup/user.slice/user-$(id -u).slice/user@$(id -u).service/cgroup.controllers +memory pids +``` + +To allow delegation of other controllers, you need to change the systemd configuration as follows: + +```console +# mkdir -p /etc/systemd/system/user@.service.d +# cat > /etc/systemd/system/user@.service.d/delegate.conf << EOF +[Service] +Delegate=cpu cpuset io memory pids +EOF +# systemctl daemon-reload +``` diff --git a/docs/experimental.md b/docs/experimental.md new file mode 100644 index 0000000..a68dd06 --- /dev/null +++ b/docs/experimental.md @@ -0,0 +1,11 @@ +# Experimental features + +The following features are experimental and subject to change: + +- The `runc features` command (since runc v1.1.0) + +The following features were experimental in the past: + +Feature | Experimental release | Graduation release +---------------------------------------- | -------------------- | ------------------ +cgroup v2 | v1.0.0-rc91 | v1.0.0-rc93 diff --git a/docs/systemd.md b/docs/systemd.md new file mode 100644 index 0000000..c74e2e2 --- /dev/null +++ b/docs/systemd.md @@ -0,0 +1,130 @@ +## systemd cgroup driver + +By default, runc creates cgroups and sets cgroup limits on its own (this mode +is known as fs cgroup driver). When `--systemd-cgroup` global option is given +(as in e.g. `runc --systemd-cgroup run ...`), runc switches to systemd cgroup +driver. This document describes its features and peculiarities. + +### systemd unit name and placement + +When creating a container, runc requests systemd (over dbus) to create +a transient unit for the container, and place it into a specified slice. + +The name of the unit and the containing slice is derived from the container +runtime spec in the following way: + +1. If `Linux.CgroupsPath` is set, it is expected to be in the form + `[slice]:[prefix]:[name]`. + + Here `slice` is a systemd slice under which the container is placed. 
+ If empty, it defaults to `system.slice`, except when cgroup v2 is + used and rootless container is created, in which case it defaults + to `user.slice`. + + Note that `slice` can contain dashes to denote a sub-slice + (e.g. `user-1000.slice` is a correct notation, meaning a subslice + of `user.slice`), but it must not contain slashes (e.g. + `user.slice/user-1000.slice` is invalid). + + A `slice` of `-` represents a root slice. + + Next, `prefix` and `name` are used to compose the unit name, which + is `-.scope`, unless `name` has `.slice` suffix, in + which case `prefix` is ignored and the `name` is used as is. + +2. If `Linux.CgroupsPath` is not set or empty, it works the same way as if it + would be set to `:runc:`. See the description above to see + what it transforms to. + +As described above, a unit being created can either be a scope or a slice. +For a scope, runc specifies its parent slice via a _Slice=_ systemd property, +and also sets _Delegate=true_. For a slice, runc specifies a weak dependency on +the parent slice via a _Wants=_ property. + +### Resource limits + +runc always enables accounting for all controllers, regardless of any limits +being set. This means it unconditionally sets the following properties for the +systemd unit being created: + + * _CPUAccounting=true_ + * _IOAccounting=true_ (_BlockIOAccounting_ for cgroup v1) + * _MemoryAccounting=true_ + * _TasksAccounting=true_ + +The resource limits of the systemd unit are set by runc by translating the +runtime spec resources to systemd unit properties. + +Such translation is by no means complete, as there are some cgroup properties +that can not be set via systemd. Therefore, runc systemd cgroup driver is +backed by fs driver (in other words, cgroup limits are first set via systemd +unit properties, and when by writing to cgroupfs files). 
+ +The set of runtime spec resources which is translated by runc to systemd unit +properties depends on kernel cgroup version being used (v1 or v2), and on the +systemd version being run. If an older systemd version (which does not support +some resources) is used, runc do not set those resources. + +The following tables summarize which properties are translated. + +#### cgroup v1 + +| runtime spec resource | systemd property name | min systemd version | +|-----------------------|-----------------------|---------------------| +| memory.limit | MemoryLimit | | +| cpu.shares | CPUShares | | +| blockIO.weight | BlockIOWeight | | +| pids.limit | TasksMax | | +| cpu.cpus | AllowedCPUs | v244 | +| cpu.mems | AllowedMemoryNodes | v244 | + +#### cgroup v2 + +| runtime spec resource | systemd property name | min systemd version | +|-------------------------|-----------------------|---------------------| +| memory.limit | MemoryMax | | +| memory.reservation | MemoryLow | | +| memory.swap | MemorySwapMax | | +| cpu.shares | CPUWeight | | +| pids.limit | TasksMax | | +| cpu.cpus | AllowedCPUs | v244 | +| cpu.mems | AllowedMemoryNodes | v244 | +| unified.cpu.max | CPUQuota, CPUQuotaPeriodSec | v242 | +| unified.cpu.weight | CPUWeight | | +| unified.cpuset.cpus | AllowedCPUs | v244 | +| unified.cpuset.mems | AllowedMemoryNodes | v244 | +| unified.memory.high | MemoryHigh | | +| unified.memory.low | MemoryLow | | +| unified.memory.min | MemoryMin | | +| unified.memory.max | MemoryMax | | +| unified.memory.swap.max | MemorySwapMax | | +| unified.pids.max | TasksMax | | + +For documentation on systemd unit resource properties, see +`systemd.resource-control(5)` man page. + +### Auxiliary properties + +Auxiliary properties of a systemd unit (as shown by `systemctl show +` after the container is created) can be set (or overwritten) by +adding annotations to the container runtime spec (`config.json`). 
+ +For example: + +```json + "annotations": { + "org.systemd.property.TimeoutStopUSec": "uint64 123456789", + "org.systemd.property.CollectMode":"'inactive-or-failed'" + }, +``` + +The above will set the following properties: + +* `TimeoutStopSec` to 2 minutes and 3 seconds; +* `CollectMode` to "inactive-or-failed". + +The values must be in the gvariant format (for details, see +[gvariant documentation](https://developer.gnome.org/glib/stable/gvariant-text.html)). + +To find out which type systemd expects for a particular parameter, please +consult systemd sources. diff --git a/docs/terminals.md b/docs/terminals.md index fc000e1..aa9f71e 100644 --- a/docs/terminals.md +++ b/docs/terminals.md @@ -113,6 +113,33 @@ interact with pseudo-terminal `stdio`][tty_ioctl(4)]. > means that it is not really possible to uniquely distinguish between `stdout` > and `stderr` from the caller's perspective. +#### Issues + +If you see an error like + +``` +open /dev/tty: no such device or address +``` + +from runc, it means it can't open a terminal (because there isn't one). This +can happen when stdin (and possibly also stdout and stderr) are redirected, +or in some environments that lack a tty (such as GitHub Actions runners). + +The solution to this is to *not* use a terminal for the container, i.e. have +`terminal: false` in `config.json`. If the container really needs a terminal +(some programs require one), you can provide one, using one of the following +methods. + +One way is to use `ssh` with the `-tt` flag. The second `t` forces a terminal +allocation even if there's no local one -- and so it is required when stdin is +not a terminal (some `ssh` implementations only look for a terminal on stdin). + +Another way is to run runc under the `script` utility, like this + +```console +$ script -e -c 'runc run ' +``` + [tty_ioctl(4)]: https://linux.die.net/man/4/tty_ioctl ### Pass-Through ### @@ -124,7 +151,7 @@ passing of file descriptors -- [details below](#runc-modes)). 
As an example (assuming that `terminal: false` is set in `config.json`): ``` -% echo input | runc run some_container > /tmp/log.out 2>& /tmp/log.err +% echo input | runc run some_container > /tmp/log.out 2> /tmp/log.err ``` Here the container's various `stdio` file descriptors will be substituted with @@ -228,6 +255,19 @@ Unfortunately using detached mode is a bit more complicated and requires more care than the foreground mode -- mainly because it is now up to the caller to handle the `stdio` of the container. +Another complication is that the parent process is responsible for acting as +the subreaper for the container. In short, you need to call +`prctl(PR_SET_CHILD_SUBREAPER, 1, ...)` in the parent process and correctly +handle the implications of being a subreaper. Failing to do so may result in +zombie processes being accumulated on your host. + +These tasks are usually performed by a dedicated (and minimal) monitor process +per-container. For the sake of comparison, other runtimes such as LXC do not +have an equivalent detached mode and instead integrate this monitor process +into the container runtime itself -- this has several tradeoffs, and runc has +opted to support delegating the monitoring responsibility to the parent process +through this detached mode. 
+ #### Detached Pass-Through #### In detached mode, pass-through actually does what it says on the tin -- the diff --git a/events.go b/events.go index fb3f630..6cdc01c 100644 --- a/events.go +++ b/events.go @@ -1,9 +1,8 @@ -// +build linux - package main import ( "encoding/json" + "errors" "fmt" "os" "sync" @@ -40,7 +39,7 @@ information is displayed once every 5 seconds.`, } duration := context.Duration("interval") if duration <= 0 { - return fmt.Errorf("duration interval must be greater than 0") + return errors.New("duration interval must be greater than 0") } status, err := container.Status() if err != nil { @@ -125,10 +124,14 @@ func convertLibcontainerStats(ls *libcontainer.Stats) *types.Stats { s.CPU.Usage.User = cg.CpuStats.CpuUsage.UsageInUsermode s.CPU.Usage.Total = cg.CpuStats.CpuUsage.TotalUsage s.CPU.Usage.Percpu = cg.CpuStats.CpuUsage.PercpuUsage + s.CPU.Usage.PercpuKernel = cg.CpuStats.CpuUsage.PercpuUsageInKernelmode + s.CPU.Usage.PercpuUser = cg.CpuStats.CpuUsage.PercpuUsageInUsermode s.CPU.Throttling.Periods = cg.CpuStats.ThrottlingData.Periods s.CPU.Throttling.ThrottledPeriods = cg.CpuStats.ThrottlingData.ThrottledPeriods s.CPU.Throttling.ThrottledTime = cg.CpuStats.ThrottlingData.ThrottledTime + s.CPUSet = types.CPUSet(cg.CPUSetStats) + s.Memory.Cache = cg.MemoryStats.Cache s.Memory.Kernel = convertMemoryEntry(cg.MemoryStats.KernelUsage) s.Memory.KernelTCP = convertMemoryEntry(cg.MemoryStats.KernelTCPUsage) @@ -151,16 +154,22 @@ func convertLibcontainerStats(ls *libcontainer.Stats) *types.Stats { } if is := ls.IntelRdtStats; is != nil { - if intelrdt.IsCatEnabled() { + if intelrdt.IsCATEnabled() { s.IntelRdt.L3CacheInfo = convertL3CacheInfo(is.L3CacheInfo) s.IntelRdt.L3CacheSchemaRoot = is.L3CacheSchemaRoot s.IntelRdt.L3CacheSchema = is.L3CacheSchema } - if intelrdt.IsMbaEnabled() { + if intelrdt.IsMBAEnabled() { s.IntelRdt.MemBwInfo = convertMemBwInfo(is.MemBwInfo) s.IntelRdt.MemBwSchemaRoot = is.MemBwSchemaRoot s.IntelRdt.MemBwSchema = 
is.MemBwSchema } + if intelrdt.IsMBMEnabled() { + s.IntelRdt.MBMStats = is.MBMStats + } + if intelrdt.IsCMTEnabled() { + s.IntelRdt.CMTStats = is.CMTStats + } } s.NetworkInterfaces = ls.Interfaces @@ -187,29 +196,17 @@ func convertMemoryEntry(c cgroups.MemoryData) types.MemoryEntry { func convertBlkioEntry(c []cgroups.BlkioStatEntry) []types.BlkioEntry { var out []types.BlkioEntry for _, e := range c { - out = append(out, types.BlkioEntry{ - Major: e.Major, - Minor: e.Minor, - Op: e.Op, - Value: e.Value, - }) + out = append(out, types.BlkioEntry(e)) } return out } func convertL3CacheInfo(i *intelrdt.L3CacheInfo) *types.L3CacheInfo { - return &types.L3CacheInfo{ - CbmMask: i.CbmMask, - MinCbmBits: i.MinCbmBits, - NumClosids: i.NumClosids, - } + ci := types.L3CacheInfo(*i) + return &ci } func convertMemBwInfo(i *intelrdt.MemBwInfo) *types.MemBwInfo { - return &types.MemBwInfo{ - BandwidthGran: i.BandwidthGran, - DelayLinear: i.DelayLinear, - MinBandwidth: i.MinBandwidth, - NumClosids: i.NumClosids, - } + mi := types.MemBwInfo(*i) + return &mi } diff --git a/exec.go b/exec.go index b963d68..18c6bff 100644 --- a/exec.go +++ b/exec.go @@ -1,9 +1,8 @@ -// +build linux - package main import ( "encoding/json" + "errors" "fmt" "os" "strconv" @@ -84,15 +83,18 @@ following will output a list of processes running in the container: Value: &cli.StringSlice{}, Usage: "add a capability to the bounding set for the process", }, - cli.BoolFlag{ - Name: "no-subreaper", - Usage: "disable the use of the subreaper used to reap reparented processes", - Hidden: true, - }, cli.IntFlag{ Name: "preserve-fds", Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)", }, + cli.StringSliceFlag{ + Name: "cgroup", + Usage: "run the process in an (existing) sub-cgroup(s). 
Format is [:].", + }, + cli.BoolFlag{ + Name: "ignore-paused", + Usage: "allow exec in a paused container", + }, }, Action: func(context *cli.Context) error { if err := checkArgs(context, 1, minArgs); err != nil { @@ -105,11 +107,38 @@ following will output a list of processes running in the container: if err == nil { os.Exit(status) } - return fmt.Errorf("exec failed: %v", err) + fatalWithCode(fmt.Errorf("exec failed: %w", err), 255) + return nil // to satisfy the linter }, SkipArgReorder: true, } +func getSubCgroupPaths(args []string) (map[string]string, error) { + if len(args) == 0 { + return nil, nil + } + paths := make(map[string]string, len(args)) + for _, c := range args { + // Split into controller:path. + cs := strings.SplitN(c, ":", 3) + if len(cs) > 2 { + return nil, fmt.Errorf("invalid --cgroup argument: %s", c) + } + if len(cs) == 1 { // no controller: prefix + if len(args) != 1 { + return nil, fmt.Errorf("invalid --cgroup argument: %s (missing : prefix)", c) + } + paths[""] = c + } else { + // There may be a few comma-separated controllers. 
+ for _, ctrl := range strings.Split(cs[0], ",") { + paths[ctrl] = cs[1] + } + } + } + return paths, nil +} + func execProcess(context *cli.Context) (int, error) { container, err := getContainer(context) if err != nil { @@ -120,13 +149,15 @@ func execProcess(context *cli.Context) (int, error) { return -1, err } if status == libcontainer.Stopped { - return -1, fmt.Errorf("cannot exec a container that has stopped") + return -1, errors.New("cannot exec in a stopped container") + } + if status == libcontainer.Paused && !context.Bool("ignore-paused") { + return -1, errors.New("cannot exec in a paused container (use --ignore-paused to override)") } path := context.String("process") if path == "" && len(context.Args()) == 1 { - return -1, fmt.Errorf("process args cannot be empty") + return -1, errors.New("process args cannot be empty") } - detach := context.Bool("detach") state, err := container.State() if err != nil { return -1, err @@ -137,9 +168,9 @@ func execProcess(context *cli.Context) (int, error) { return -1, err } - logLevel := "info" - if context.GlobalBool("debug") { - logLevel = "debug" + cgPaths, err := getSubCgroupPaths(context.StringSlice("cgroup")) + if err != nil { + return -1, err } r := &runner{ @@ -147,12 +178,12 @@ func execProcess(context *cli.Context) (int, error) { shouldDestroy: false, container: container, consoleSocket: context.String("console-socket"), - detach: detach, + detach: context.Bool("detach"), pidFile: context.String("pid-file"), action: CT_ACT_RUN, init: false, preserveFDs: context.Int("preserve-fds"), - logLevel: logLevel, + subCgroupPaths: cgPaths, } return r.run(p) } @@ -203,6 +234,7 @@ func getProcess(context *cli.Context, bundle string) (*specs.Process, error) { p.Env = append(p.Env, context.StringSlice("env")...) 
// set the tty + p.Terminal = false if context.IsSet("tty") { p.Terminal = context.Bool("tty") } @@ -215,13 +247,13 @@ func getProcess(context *cli.Context, bundle string) (*specs.Process, error) { if len(u) > 1 { gid, err := strconv.Atoi(u[1]) if err != nil { - return nil, fmt.Errorf("parsing %s as int for gid failed: %v", u[1], err) + return nil, fmt.Errorf("parsing %s as int for gid failed: %w", u[1], err) } p.User.GID = uint32(gid) } uid, err := strconv.Atoi(u[0]) if err != nil { - return nil, fmt.Errorf("parsing %s as int for uid failed: %v", u[0], err) + return nil, fmt.Errorf("parsing %s as int for uid failed: %w", u[0], err) } p.User.UID = uint32(uid) } diff --git a/features.go b/features.go new file mode 100644 index 0000000..c9cd15c --- /dev/null +++ b/features.go @@ -0,0 +1,75 @@ +package main + +import ( + "encoding/json" + "fmt" + + "github.com/opencontainers/runc/libcontainer/capabilities" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/specconv" + "github.com/opencontainers/runc/types/features" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/urfave/cli" +) + +var featuresCommand = cli.Command{ + Name: "features", + Usage: "show the enabled features", + ArgsUsage: "", + Description: `Show the enabled features. + The result is parsable as a JSON. + See https://pkg.go.dev/github.com/opencontainers/runc/types/features for the type definition. + The types are experimental and subject to change. 
+`, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 0, exactArgs); err != nil { + return err + } + + tru := true + + feat := features.Features{ + OCIVersionMin: "1.0.0", + OCIVersionMax: specs.Version, + Annotations: map[string]string{ + features.AnnotationRuncVersion: version, + features.AnnotationRuncCommit: gitCommit, + features.AnnotationRuncCheckpointEnabled: "true", + }, + Hooks: configs.KnownHookNames(), + MountOptions: specconv.KnownMountOptions(), + Linux: &features.Linux{ + Namespaces: specconv.KnownNamespaces(), + Capabilities: capabilities.KnownCapabilities(), + Cgroup: &features.Cgroup{ + V1: &tru, + V2: &tru, + Systemd: &tru, + SystemdUser: &tru, + }, + Apparmor: &features.Apparmor{ + Enabled: &tru, + }, + Selinux: &features.Selinux{ + Enabled: &tru, + }, + }, + } + + if seccomp.Enabled { + feat.Linux.Seccomp = &features.Seccomp{ + Enabled: &tru, + Actions: seccomp.KnownActions(), + Operators: seccomp.KnownOperators(), + Archs: seccomp.KnownArchs(), + } + major, minor, patch := seccomp.Version() + feat.Annotations[features.AnnotationLibseccompVersion] = fmt.Sprintf("%d.%d.%d", major, minor, patch) + } + + enc := json.NewEncoder(context.App.Writer) + enc.SetIndent("", " ") + return enc.Encode(feat) + }, +} diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..f7a4e55 --- /dev/null +++ b/go.mod @@ -0,0 +1,26 @@ +module github.com/opencontainers/runc + +go 1.16 + +require ( + github.com/checkpoint-restore/go-criu/v5 v5.3.0 + github.com/cilium/ebpf v0.7.0 + github.com/containerd/console v1.0.3 + github.com/coreos/go-systemd/v22 v22.3.2 + github.com/cyphar/filepath-securejoin v0.2.3 + github.com/docker/go-units v0.4.0 + github.com/godbus/dbus/v5 v5.0.6 + github.com/moby/sys/mountinfo v0.5.0 + github.com/mrunalp/fileutils v0.5.0 + github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 + github.com/opencontainers/selinux v1.10.0 + github.com/seccomp/libseccomp-golang 
v0.9.2-0.20210429002308-3879420cc921 + github.com/sirupsen/logrus v1.8.1 + github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 + // NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092 + github.com/urfave/cli v1.22.1 + github.com/vishvananda/netlink v1.1.0 + golang.org/x/net v0.0.0-20201224014010-6772e930b67b + golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c + google.golang.org/protobuf v1.27.1 +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..7c2a60b --- /dev/null +++ b/go.sum @@ -0,0 +1,80 @@ +github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/checkpoint-restore/go-criu/v5 v5.3.0 h1:wpFFOoomK3389ue2lAb0Boag6XPht5QYpipxmSNL4d8= +github.com/checkpoint-restore/go-criu/v5 v5.3.0/go.mod h1:E/eQpaFtUKGOOSEBZgmKAcn+zUUwWxqcaKZlF54wK8E= +github.com/cilium/ebpf v0.7.0 h1:1k/q3ATgxSXRdrmPfH8d7YK0GfqVsEKZAX9dQZvs56k= +github.com/cilium/ebpf v0.7.0/go.mod h1:/oI2+1shJiTGAMgl6/RgJr36Eo1jzrRcAWbcXO2usCA= +github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw= +github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U= +github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= +github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= +github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY= +github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= +github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI= +github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod 
h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= +github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY= +github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= +github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/godbus/dbus/v5 v5.0.6 h1:mkgN1ofwASrYnJ5W6U/BxG15eXXXjirgZc7CLqkcaro= +github.com/godbus/dbus/v5 v5.0.6/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9ObI= +github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU= +github.com/mrunalp/fileutils v0.5.0 h1:NKzVxiH7eSk+OQ4M+ZYW1K6h27RUV3MI6NUTsHhU6Z4= +github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= +github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 h1:3snG66yBm59tKhhSPQrQ/0bCrv1LQbKt40LnUPiUxdc= +github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod 
h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU= +github.com/opencontainers/selinux v1.10.0/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q= +github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921 h1:58EBmR2dMNL2n/FnbQewK3D14nXr0V9CObDSvMJLq+Y= +github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= +github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo= +github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= +github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE= +github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= +github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI= +github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww= +github.com/urfave/cli v1.22.1 h1:+mkCCcOFKPnCmVYVcURKps1Xe+3zP90gSYGNfRkjoIY= +github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= +github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJH8j0= +github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE= 
+github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k= +github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU= +golang.org/x/net v0.0.0-20201224014010-6772e930b67b h1:iFwSg7t5GZmB/Q5TjiEAsdoLDrdJRC1RiF2WhuV29Qw= +golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c h1:DHcbWVXeY+0Y8HHKR+rbLwnoh2F4tNCY7rTiHJ30RmA= +golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.26.0-rc.1/go.mod 
h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.27.1 h1:SnqbnDw1V7RiZcXPx5MEeqPv2s79L9i7BJUlG/+RurQ= +google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/init.go b/init.go index 08351fd..bddc237 100644 --- a/init.go +++ b/init.go @@ -1,44 +1,37 @@ package main import ( - "fmt" "os" "runtime" + "strconv" "github.com/opencontainers/runc/libcontainer" - "github.com/opencontainers/runc/libcontainer/logs" _ "github.com/opencontainers/runc/libcontainer/nsenter" "github.com/sirupsen/logrus" - "github.com/urfave/cli" ) func init() { if len(os.Args) > 1 && os.Args[1] == "init" { + // This is the golang entry point for runc init, executed + // before main() but after libcontainer/nsenter's nsexec(). runtime.GOMAXPROCS(1) runtime.LockOSThread() - level := os.Getenv("_LIBCONTAINER_LOGLEVEL") - logLevel, err := logrus.ParseLevel(level) + level, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGLEVEL")) if err != nil { - panic(fmt.Sprintf("libcontainer: failed to parse log level: %q: %v", level, err)) + panic(err) } - err = logs.ConfigureLogging(logs.Config{ - LogPipeFd: os.Getenv("_LIBCONTAINER_LOGPIPE"), - LogFormat: "json", - LogLevel: logLevel, - }) + logPipeFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGPIPE")) if err != nil { - panic(fmt.Sprintf("libcontainer: failed to configure logging: %v", err)) + panic(err) } + + logrus.SetLevel(logrus.Level(level)) + logrus.SetOutput(os.NewFile(uintptr(logPipeFd), "logpipe")) + logrus.SetFormatter(new(logrus.JSONFormatter)) logrus.Debug("child process in init()") - } -} -var initCommand = cli.Command{ - Name: "init", - Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`, - Action: func(context *cli.Context) error { factory, _ 
:= libcontainer.New("") if err := factory.StartInitialization(); err != nil { // as the error is sent back to the parent there is no need to log @@ -46,5 +39,5 @@ var initCommand = cli.Command{ os.Exit(1) } panic("libcontainer: container init failed to exec") - }, + } } diff --git a/kill.go b/kill.go index c2d7929..e5b13b1 100644 --- a/kill.go +++ b/kill.go @@ -1,14 +1,12 @@ -// +build linux - package main import ( "fmt" "strconv" "strings" - "syscall" "github.com/urfave/cli" + "golang.org/x/sys/unix" ) var killCommand = cli.Command{ @@ -22,7 +20,7 @@ Where "" is the name for the instance of the container and EXAMPLE: For example, if the container id is "ubuntu01" the following will send a "KILL" signal to the init process of the "ubuntu01" container: - + # runc kill ubuntu01 KILL`, Flags: []cli.Flag{ cli.BoolFlag{ @@ -55,13 +53,17 @@ signal to the init process of the "ubuntu01" container: }, } -func parseSignal(rawSignal string) (syscall.Signal, error) { +func parseSignal(rawSignal string) (unix.Signal, error) { s, err := strconv.Atoi(rawSignal) if err == nil { - return syscall.Signal(s), nil + return unix.Signal(s), nil } - signal, ok := signalMap[strings.TrimPrefix(strings.ToUpper(rawSignal), "SIG")] - if !ok { + sig := strings.ToUpper(rawSignal) + if !strings.HasPrefix(sig, "SIG") { + sig = "SIG" + sig + } + signal := unix.SignalNum(sig) + if signal == 0 { return -1, fmt.Errorf("unknown signal %q", rawSignal) } return signal, nil diff --git a/libcontainer/README.md b/libcontainer/README.md index a791ca2..13eee49 100644 --- a/libcontainer/README.md +++ b/libcontainer/README.md @@ -57,90 +57,94 @@ struct describing how the container is to be created. 
A sample would look simila ```go defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV +var devices []*configs.DeviceRule +for _, device := range specconv.AllowedDevices { + devices = append(devices, &device.Rule) +} config := &configs.Config{ Rootfs: "/your/path/to/rootfs", Capabilities: &configs.Capabilities{ - Bounding: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Effective: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Inheritable: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Permitted: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Ambient: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - }, + Bounding: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Effective: 
[]string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Inheritable: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Permitted: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Ambient: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + }, Namespaces: configs.Namespaces([]configs.Namespace{ {Type: configs.NEWNS}, {Type: configs.NEWUTS}, @@ -155,8 +159,7 @@ config := &configs.Config{ Parent: "system", Resources: &configs.Resources{ MemorySwappiness: nil, - AllowAllDevices: nil, - AllowedDevices: configs.DefaultAllowedDevices, + Devices: devices, }, }, MaskPaths: []string{ @@ -166,7 +169,7 @@ config := &configs.Config{ ReadonlyPaths: []string{ "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", }, - Devices: configs.DefaultAutoCreatedDevices, + Devices: specconv.AllowedDevices, Hostname: "testing", Mounts: []*configs.Mount{ { @@ -314,7 +317,7 @@ state, err := container.State() #### Checkpoint & Restore libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers. 
-This let's you save the state of a process running inside a container to disk, and then restore +This lets you save the state of a process running inside a container to disk, and then restore that state into a new process, on the same machine or on another machine. `criu` version 1.5.2 or higher is required to use checkpoint and restore. diff --git a/libcontainer/apparmor/apparmor.go b/libcontainer/apparmor/apparmor.go index debfc1e..4b03d4c 100644 --- a/libcontainer/apparmor/apparmor.go +++ b/libcontainer/apparmor/apparmor.go @@ -1,60 +1,16 @@ -// +build apparmor,linux - package apparmor -import ( - "fmt" - "io/ioutil" - "os" +import "errors" - "github.com/opencontainers/runc/libcontainer/utils" +var ( + // IsEnabled returns true if apparmor is enabled for the host. + IsEnabled = isEnabled + + // ApplyProfile will apply the profile with the specified name to the process after + // the next exec. It is only supported on Linux and produces an ErrApparmorNotEnabled + // on other platforms. + ApplyProfile = applyProfile + + // ErrApparmorNotEnabled indicates that AppArmor is not enabled or not supported. + ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported") ) - -// IsEnabled returns true if apparmor is enabled for the host. 
-func IsEnabled() bool { - if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" { - if _, err = os.Stat("/sbin/apparmor_parser"); err == nil { - buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled") - return err == nil && len(buf) > 1 && buf[0] == 'Y' - } - } - return false -} - -func setProcAttr(attr, value string) error { - // Under AppArmor you can only change your own attr, so use /proc/self/ - // instead of /proc// like libapparmor does - path := fmt.Sprintf("/proc/self/attr/%s", attr) - - f, err := os.OpenFile(path, os.O_WRONLY, 0) - if err != nil { - return err - } - defer f.Close() - - if err := utils.EnsureProcHandle(f); err != nil { - return err - } - - _, err = fmt.Fprintf(f, "%s", value) - return err -} - -// changeOnExec reimplements aa_change_onexec from libapparmor in Go -func changeOnExec(name string) error { - value := "exec " + name - if err := setProcAttr("exec", value); err != nil { - return fmt.Errorf("apparmor failed to apply profile: %s", err) - } - return nil -} - -// ApplyProfile will apply the profile with the specified name to the process after -// the next exec. 
-func ApplyProfile(name string) error { - if name == "" { - return nil - } - - return changeOnExec(name) -} diff --git a/libcontainer/apparmor/apparmor_disabled.go b/libcontainer/apparmor/apparmor_disabled.go deleted file mode 100644 index d4110cf..0000000 --- a/libcontainer/apparmor/apparmor_disabled.go +++ /dev/null @@ -1,20 +0,0 @@ -// +build !apparmor !linux - -package apparmor - -import ( - "errors" -) - -var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported") - -func IsEnabled() bool { - return false -} - -func ApplyProfile(name string) error { - if name != "" { - return ErrApparmorNotEnabled - } - return nil -} diff --git a/libcontainer/apparmor/apparmor_linux.go b/libcontainer/apparmor/apparmor_linux.go new file mode 100644 index 0000000..8b1483c --- /dev/null +++ b/libcontainer/apparmor/apparmor_linux.go @@ -0,0 +1,68 @@ +package apparmor + +import ( + "errors" + "fmt" + "os" + "sync" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +var ( + appArmorEnabled bool + checkAppArmor sync.Once +) + +// isEnabled returns true if apparmor is enabled for the host. 
+func isEnabled() bool { + checkAppArmor.Do(func() { + if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil { + buf, err := os.ReadFile("/sys/module/apparmor/parameters/enabled") + appArmorEnabled = err == nil && len(buf) > 1 && buf[0] == 'Y' + } + }) + return appArmorEnabled +} + +func setProcAttr(attr, value string) error { + // Under AppArmor you can only change your own attr, so use /proc/self/ + // instead of /proc// like libapparmor does + attrPath := "/proc/self/attr/apparmor/" + attr + if _, err := os.Stat(attrPath); errors.Is(err, os.ErrNotExist) { + // fall back to the old convention + attrPath = "/proc/self/attr/" + attr + } + + f, err := os.OpenFile(attrPath, os.O_WRONLY, 0) + if err != nil { + return err + } + defer f.Close() + + if err := utils.EnsureProcHandle(f); err != nil { + return err + } + + _, err = f.WriteString(value) + return err +} + +// changeOnExec reimplements aa_change_onexec from libapparmor in Go +func changeOnExec(name string) error { + if err := setProcAttr("exec", "exec "+name); err != nil { + return fmt.Errorf("apparmor failed to apply profile: %w", err) + } + return nil +} + +// applyProfile will apply the profile with the specified name to the process after +// the next exec. It is only supported on Linux and produces an error on other +// platforms. 
+func applyProfile(name string) error { + if name == "" { + return nil + } + + return changeOnExec(name) +} diff --git a/libcontainer/apparmor/apparmor_unsupported.go b/libcontainer/apparmor/apparmor_unsupported.go new file mode 100644 index 0000000..684248f --- /dev/null +++ b/libcontainer/apparmor/apparmor_unsupported.go @@ -0,0 +1,15 @@ +//go:build !linux +// +build !linux + +package apparmor + +func isEnabled() bool { + return false +} + +func applyProfile(name string) error { + if name != "" { + return ErrApparmorNotEnabled + } + return nil +} diff --git a/libcontainer/capabilities/capabilities.go b/libcontainer/capabilities/capabilities.go new file mode 100644 index 0000000..d38b8a7 --- /dev/null +++ b/libcontainer/capabilities/capabilities.go @@ -0,0 +1,123 @@ +//go:build linux +// +build linux + +package capabilities + +import ( + "sort" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" + "github.com/syndtr/gocapability/capability" +) + +const allCapabilityTypes = capability.CAPS | capability.BOUNDING | capability.AMBIENT + +var ( + capabilityMap map[string]capability.Cap + capTypes = []capability.CapType{ + capability.BOUNDING, + capability.PERMITTED, + capability.INHERITABLE, + capability.EFFECTIVE, + capability.AMBIENT, + } +) + +func init() { + capabilityMap = make(map[string]capability.Cap, capability.CAP_LAST_CAP+1) + for _, c := range capability.List() { + if c > capability.CAP_LAST_CAP { + continue + } + capabilityMap["CAP_"+strings.ToUpper(c.String())] = c + } +} + +// KnownCapabilities returns the list of the known capabilities. +// Used by `runc features`. +func KnownCapabilities() []string { + list := capability.List() + res := make([]string, len(list)) + for i, c := range list { + res[i] = "CAP_" + strings.ToUpper(c.String()) + } + return res +} + +// New creates a new Caps from the given Capabilities config. 
Unknown Capabilities +// or Capabilities that are unavailable in the current environment are ignored, +// printing a warning instead. +func New(capConfig *configs.Capabilities) (*Caps, error) { + var ( + err error + c Caps + ) + + unknownCaps := make(map[string]struct{}) + c.caps = map[capability.CapType][]capability.Cap{ + capability.BOUNDING: capSlice(capConfig.Bounding, unknownCaps), + capability.EFFECTIVE: capSlice(capConfig.Effective, unknownCaps), + capability.INHERITABLE: capSlice(capConfig.Inheritable, unknownCaps), + capability.PERMITTED: capSlice(capConfig.Permitted, unknownCaps), + capability.AMBIENT: capSlice(capConfig.Ambient, unknownCaps), + } + if c.pid, err = capability.NewPid2(0); err != nil { + return nil, err + } + if err = c.pid.Load(); err != nil { + return nil, err + } + if len(unknownCaps) > 0 { + logrus.Warn("ignoring unknown or unavailable capabilities: ", mapKeys(unknownCaps)) + } + return &c, nil +} + +// capSlice converts the slice of capability names in caps, to their numeric +// equivalent, and returns them as a slice. Unknown or unavailable capabilities +// are not returned, but appended to unknownCaps. +func capSlice(caps []string, unknownCaps map[string]struct{}) []capability.Cap { + var out []capability.Cap + for _, c := range caps { + if v, ok := capabilityMap[c]; !ok { + unknownCaps[c] = struct{}{} + } else { + out = append(out, v) + } + } + return out +} + +// mapKeys returns the keys of input in sorted order +func mapKeys(input map[string]struct{}) []string { + var keys []string + for c := range input { + keys = append(keys, c) + } + sort.Strings(keys) + return keys +} + +// Caps holds the capabilities for a container. +type Caps struct { + pid capability.Capabilities + caps map[capability.CapType][]capability.Cap +} + +// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist. 
+func (c *Caps) ApplyBoundingSet() error { + c.pid.Clear(capability.BOUNDING) + c.pid.Set(capability.BOUNDING, c.caps[capability.BOUNDING]...) + return c.pid.Apply(capability.BOUNDING) +} + +// Apply sets all the capabilities for the current process in the config. +func (c *Caps) ApplyCaps() error { + c.pid.Clear(allCapabilityTypes) + for _, g := range capTypes { + c.pid.Set(g, c.caps[g]...) + } + return c.pid.Apply(allCapabilityTypes) +} diff --git a/libcontainer/capabilities/capabilities_linux_test.go b/libcontainer/capabilities/capabilities_linux_test.go new file mode 100644 index 0000000..dfbb44b --- /dev/null +++ b/libcontainer/capabilities/capabilities_linux_test.go @@ -0,0 +1,71 @@ +package capabilities + +import ( + "io" + "os" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" + "github.com/sirupsen/logrus/hooks/test" + "github.com/syndtr/gocapability/capability" +) + +func TestNew(t *testing.T) { + cs := []string{"CAP_CHOWN", "CAP_UNKNOWN", "CAP_UNKNOWN2"} + conf := configs.Capabilities{ + Bounding: cs, + Effective: cs, + Inheritable: cs, + Permitted: cs, + Ambient: cs, + } + + hook := test.NewGlobal() + defer hook.Reset() + + logrus.SetOutput(io.Discard) + caps, err := New(&conf) + logrus.SetOutput(os.Stderr) + + if err != nil { + t.Error(err) + } + e := hook.AllEntries() + if len(e) != 1 { + t.Errorf("expected 1 warning, got %d", len(e)) + } + + expectedLogs := logrus.Entry{ + Level: logrus.WarnLevel, + Message: "ignoring unknown or unavailable capabilities: [CAP_UNKNOWN CAP_UNKNOWN2]", + } + + l := hook.LastEntry() + if l == nil { + t.Fatal("expected a warning, but got none") + } + if l.Level != expectedLogs.Level { + t.Errorf("expected %q, got %q", expectedLogs.Level, l.Level) + } + if l.Message != expectedLogs.Message { + t.Errorf("expected %q, got %q", expectedLogs.Message, l.Message) + } + + if len(caps.caps) != len(capTypes) { + t.Errorf("expected %d capability types, got %d: %v", len(capTypes), 
len(caps.caps), caps.caps) + } + + for _, cType := range capTypes { + if i := len(caps.caps[cType]); i != 1 { + t.Errorf("expected 1 capability for %s, got %d: %v", cType, i, caps.caps[cType]) + continue + } + if caps.caps[cType][0] != capability.CAP_CHOWN { + t.Errorf("expected CAP_CHOWN, got %s: ", caps.caps[cType][0]) + continue + } + } + + hook.Reset() +} diff --git a/libcontainer/capabilities/capabilities_unsupported.go b/libcontainer/capabilities/capabilities_unsupported.go new file mode 100644 index 0000000..0eafa4f --- /dev/null +++ b/libcontainer/capabilities/capabilities_unsupported.go @@ -0,0 +1,4 @@ +//go:build !linux +// +build !linux + +package capabilities diff --git a/libcontainer/capabilities_linux.go b/libcontainer/capabilities_linux.go deleted file mode 100644 index 9daef29..0000000 --- a/libcontainer/capabilities_linux.go +++ /dev/null @@ -1,117 +0,0 @@ -// +build linux - -package libcontainer - -import ( - "fmt" - "strings" - - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/syndtr/gocapability/capability" -) - -const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS - -var capabilityMap map[string]capability.Cap - -func init() { - capabilityMap = make(map[string]capability.Cap) - last := capability.CAP_LAST_CAP - // workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap - if last == capability.Cap(63) { - last = capability.CAP_BLOCK_SUSPEND - } - for _, cap := range capability.List() { - if cap > last { - continue - } - capKey := fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String())) - capabilityMap[capKey] = cap - } -} - -func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) { - bounding := []capability.Cap{} - for _, c := range capConfig.Bounding { - v, ok := capabilityMap[c] - if !ok { - return nil, fmt.Errorf("unknown capability %q", c) - } - bounding = append(bounding, v) - } - effective := []capability.Cap{} - for _, c := range capConfig.Effective { - 
v, ok := capabilityMap[c] - if !ok { - return nil, fmt.Errorf("unknown capability %q", c) - } - effective = append(effective, v) - } - inheritable := []capability.Cap{} - for _, c := range capConfig.Inheritable { - v, ok := capabilityMap[c] - if !ok { - return nil, fmt.Errorf("unknown capability %q", c) - } - inheritable = append(inheritable, v) - } - permitted := []capability.Cap{} - for _, c := range capConfig.Permitted { - v, ok := capabilityMap[c] - if !ok { - return nil, fmt.Errorf("unknown capability %q", c) - } - permitted = append(permitted, v) - } - ambient := []capability.Cap{} - for _, c := range capConfig.Ambient { - v, ok := capabilityMap[c] - if !ok { - return nil, fmt.Errorf("unknown capability %q", c) - } - ambient = append(ambient, v) - } - pid, err := capability.NewPid2(0) - if err != nil { - return nil, err - } - err = pid.Load() - if err != nil { - return nil, err - } - return &containerCapabilities{ - bounding: bounding, - effective: effective, - inheritable: inheritable, - permitted: permitted, - ambient: ambient, - pid: pid, - }, nil -} - -type containerCapabilities struct { - pid capability.Capabilities - bounding []capability.Cap - effective []capability.Cap - inheritable []capability.Cap - permitted []capability.Cap - ambient []capability.Cap -} - -// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist. -func (c *containerCapabilities) ApplyBoundingSet() error { - c.pid.Clear(capability.BOUNDS) - c.pid.Set(capability.BOUNDS, c.bounding...) - return c.pid.Apply(capability.BOUNDS) -} - -// Apply sets all the capabilities for the current process in the config. -func (c *containerCapabilities) ApplyCaps() error { - c.pid.Clear(allCapabilityTypes) - c.pid.Set(capability.BOUNDS, c.bounding...) - c.pid.Set(capability.PERMITTED, c.permitted...) - c.pid.Set(capability.INHERITABLE, c.inheritable...) - c.pid.Set(capability.EFFECTIVE, c.effective...) - c.pid.Set(capability.AMBIENT, c.ambient...) 
- return c.pid.Apply(allCapabilityTypes) -} diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go index c0a9659..ba2b226 100644 --- a/libcontainer/cgroups/cgroups.go +++ b/libcontainer/cgroups/cgroups.go @@ -1,74 +1,59 @@ -// +build linux - package cgroups import ( - "fmt" - "github.com/opencontainers/runc/libcontainer/configs" ) type Manager interface { - // Applies cgroup configuration to the process with the specified pid + // Apply creates a cgroup, if not yet created, and adds a process + // with the specified pid into that cgroup. A special value of -1 + // can be used to merely create a cgroup. Apply(pid int) error - // Returns the PIDs inside the cgroup set + // GetPids returns the PIDs of all processes inside the cgroup. GetPids() ([]int, error) - // Returns the PIDs inside the cgroup set & all sub-cgroups + // GetAllPids returns the PIDs of all processes inside the cgroup + // any all its sub-cgroups. GetAllPids() ([]int, error) - // Returns statistics for the cgroup set + // GetStats returns cgroups statistics. GetStats() (*Stats, error) - // Toggles the freezer cgroup according with specified state + // Freeze sets the freezer cgroup to the specified state. Freeze(state configs.FreezerState) error - // Destroys the cgroup set + // Destroy removes cgroup. Destroy() error - // The option func SystemdCgroups() and Cgroupfs() require following attributes: - // Paths map[string]string - // Cgroups *configs.Cgroup - // Paths maps cgroup subsystem to path at which it is mounted. - // Cgroups specifies specific cgroup settings for the various subsystems + // Path returns a cgroup path to the specified controller/subsystem. + // For cgroupv2, the argument is unused and can be empty. + Path(string) string - // Returns cgroup paths to save in a state file and to be able to - // restore the object later. + // Set sets cgroup resources parameters/limits. 
If the argument is nil, + // the resources specified during Manager creation (or the previous call + // to Set) are used. + Set(r *configs.Resources) error + + // GetPaths returns cgroup path(s) to save in a state file in order to + // restore later. + // + // For cgroup v1, a key is cgroup subsystem name, and the value is the + // path to the cgroup for this subsystem. + // + // For cgroup v2 unified hierarchy, a key is "", and the value is the + // unified path. GetPaths() map[string]string - // GetUnifiedPath returns the unified path when running in unified mode. - // The value corresponds to the all values of GetPaths() map. - // - // GetUnifiedPath returns error when running in hybrid mode as well as - // in legacy mode. - GetUnifiedPath() (string, error) - - // Sets the cgroup as configured. - Set(container *configs.Config) error - - // Gets the cgroup as configured. + // GetCgroups returns the cgroup data as configured. GetCgroups() (*configs.Cgroup, error) -} -type NotFoundError struct { - Subsystem string -} + // GetFreezerState retrieves the current FreezerState of the cgroup. + GetFreezerState() (configs.FreezerState, error) -func (e *NotFoundError) Error() string { - return fmt.Sprintf("mountpoint for %s not found", e.Subsystem) -} + // Exists returns whether the cgroup path exists or not. + Exists() bool -func NewNotFoundError(sub string) error { - return &NotFoundError{ - Subsystem: sub, - } -} - -func IsNotFound(err error) bool { - if err == nil { - return false - } - _, ok := err.(*NotFoundError) - return ok + // OOMKillCount reports OOM kill count for the cgroup. 
+ OOMKillCount() (uint64, error) } diff --git a/libcontainer/cgroups/cgroups_test.go b/libcontainer/cgroups/cgroups_test.go index 9efb83e..b31412f 100644 --- a/libcontainer/cgroups/cgroups_test.go +++ b/libcontainer/cgroups/cgroups_test.go @@ -1,5 +1,3 @@ -// +build linux - package cgroups import ( diff --git a/libcontainer/cgroups/cgroups_unsupported.go b/libcontainer/cgroups/cgroups_unsupported.go deleted file mode 100644 index 278d507..0000000 --- a/libcontainer/cgroups/cgroups_unsupported.go +++ /dev/null @@ -1,3 +0,0 @@ -// +build !linux - -package cgroups diff --git a/libcontainer/cgroups/devices/devices_emulator.go b/libcontainer/cgroups/devices/devices_emulator.go new file mode 100644 index 0000000..6c61ee4 --- /dev/null +++ b/libcontainer/cgroups/devices/devices_emulator.go @@ -0,0 +1,386 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright (C) 2020 Aleksa Sarai + * Copyright (C) 2020 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package devices + +import ( + "bufio" + "fmt" + "io" + "sort" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/devices" +) + +// deviceMeta is a Rule without the Allow or Permissions fields, and no +// wildcard-type support. It's effectively the "match" portion of a metadata +// rule, for the purposes of our emulation. +type deviceMeta struct { + node devices.Type + major int64 + minor int64 +} + +// deviceRule is effectively the tuple (deviceMeta, Permissions). 
+type deviceRule struct { + meta deviceMeta + perms devices.Permissions +} + +// deviceRules is a mapping of device metadata rules to the associated +// permissions in the ruleset. +type deviceRules map[deviceMeta]devices.Permissions + +func (r deviceRules) orderedEntries() []deviceRule { + var rules []deviceRule + for meta, perms := range r { + rules = append(rules, deviceRule{meta: meta, perms: perms}) + } + sort.Slice(rules, func(i, j int) bool { + // Sort by (major, minor, type). + a, b := rules[i].meta, rules[j].meta + return a.major < b.major || + (a.major == b.major && a.minor < b.minor) || + (a.major == b.major && a.minor == b.minor && a.node < b.node) + }) + return rules +} + +type Emulator struct { + defaultAllow bool + rules deviceRules +} + +func (e *Emulator) IsBlacklist() bool { + return e.defaultAllow +} + +func (e *Emulator) IsAllowAll() bool { + return e.IsBlacklist() && len(e.rules) == 0 +} + +func parseLine(line string) (*deviceRule, error) { + // Input: node major:minor perms. + fields := strings.FieldsFunc(line, func(r rune) bool { + return r == ' ' || r == ':' + }) + if len(fields) != 4 { + return nil, fmt.Errorf("malformed devices.list rule %s", line) + } + + var ( + rule deviceRule + node = fields[0] + major = fields[1] + minor = fields[2] + perms = fields[3] + ) + + // Parse the node type. + switch node { + case "a": + // Super-special case -- "a" always means every device with every + // access mode. In fact, for devices.list this actually indicates that + // the cgroup is in black-list mode. + // TODO: Double-check that the entire file is "a *:* rwm". + return nil, nil + case "b": + rule.meta.node = devices.BlockDevice + case "c": + rule.meta.node = devices.CharDevice + default: + return nil, fmt.Errorf("unknown device type %q", node) + } + + // Parse the major number. 
+ if major == "*" { + rule.meta.major = devices.Wildcard + } else { + val, err := strconv.ParseUint(major, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid major number: %w", err) + } + rule.meta.major = int64(val) + } + + // Parse the minor number. + if minor == "*" { + rule.meta.minor = devices.Wildcard + } else { + val, err := strconv.ParseUint(minor, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid minor number: %w", err) + } + rule.meta.minor = int64(val) + } + + // Parse the access permissions. + rule.perms = devices.Permissions(perms) + if !rule.perms.IsValid() || rule.perms.IsEmpty() { + return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms) + } + return &rule, nil +} + +func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam + if e.rules == nil { + e.rules = make(map[deviceMeta]devices.Permissions) + } + + // Merge with any pre-existing permissions. + oldPerms := e.rules[rule.meta] + newPerms := rule.perms.Union(oldPerms) + e.rules[rule.meta] = newPerms + return nil +} + +func (e *Emulator) rmRule(rule deviceRule) error { + // Give an error if any of the permissions requested to be removed are + // present in a partially-matching wildcard rule, because such rules will + // be ignored by cgroupv1. + // + // This is a diversion from cgroupv1, but is necessary to avoid leading + // users into a false sense of security. cgroupv1 will silently(!) ignore + // requests to remove partial exceptions, but we really shouldn't do that. + // + // It may seem like we could just "split" wildcard rules which hit this + // issue, but unfortunately there are 2^32 possible major and minor + // numbers, which would exhaust kernel memory quickly if we did this. Not + // to mention it'd be really slow (the kernel side is implemented as a + // linked-list of exceptions). 
+ for _, partialMeta := range []deviceMeta{ + {node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor}, + {node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard}, + {node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard}, + } { + // This wildcard rule is equivalent to the requested rule, so skip it. + if rule.meta == partialMeta { + continue + } + // Only give an error if the set of permissions overlap. + partialPerms := e.rules[partialMeta] + if !partialPerms.Intersection(rule.perms).IsEmpty() { + return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms) + } + } + + // Subtract all of the permissions listed from the full match rule. If the + // rule didn't exist, all of this is a no-op. + newPerms := e.rules[rule.meta].Difference(rule.perms) + if newPerms.IsEmpty() { + delete(e.rules, rule.meta) + } else { + e.rules[rule.meta] = newPerms + } + // TODO: The actual cgroup code doesn't care if an exception didn't exist + // during removal, so not erroring out here is /accurate/ but quite + // worrying. Maybe we should do additional validation, but again we + // have to worry about backwards-compatibility. + return nil +} + +func (e *Emulator) allow(rule *deviceRule) error { + // This cgroup is configured as a black-list. Reset the entire emulator, + // and put is into black-list mode. + if rule == nil || rule.meta.node == devices.WildcardDevice { + *e = Emulator{ + defaultAllow: true, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception") + } else { + err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception") + } + return err +} + +func (e *Emulator) deny(rule *deviceRule) error { + // This cgroup is configured as a white-list. Reset the entire emulator, + // and put is into white-list mode. 
+ if rule == nil || rule.meta.node == devices.WildcardDevice { + *e = Emulator{ + defaultAllow: false, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception") + } else { + err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception") + } + return err +} + +func (e *Emulator) Apply(rule devices.Rule) error { + if !rule.Type.CanCgroup() { + return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type) + } + + innerRule := &deviceRule{ + meta: deviceMeta{ + node: rule.Type, + major: rule.Major, + minor: rule.Minor, + }, + perms: rule.Permissions, + } + if innerRule.meta.node == devices.WildcardDevice { + innerRule = nil + } + + if rule.Allow { + return e.allow(innerRule) + } + + return e.deny(innerRule) +} + +// EmulatorFromList takes a reader to a "devices.list"-like source, and returns +// a new Emulator that represents the state of the devices cgroup. Note that +// black-list devices cgroups cannot be fully reconstructed, due to limitations +// in the devices cgroup API. Instead, such cgroups are always treated as +// "allow all" cgroups. +func EmulatorFromList(list io.Reader) (*Emulator, error) { + // Normally cgroups are in black-list mode by default, but the way we + // figure out the current mode is whether or not devices.list has an + // allow-all rule. So we default to a white-list, and the existence of an + // "a *:* rwm" entry will tell us otherwise. + e := &Emulator{ + defaultAllow: false, + } + + // Parse the "devices.list". + s := bufio.NewScanner(list) + for s.Scan() { + line := s.Text() + deviceRule, err := parseLine(line) + if err != nil { + return nil, fmt.Errorf("error parsing line %q: %w", line, err) + } + // "devices.list" is an allow list. Note that this means that in + // black-list mode, we have no idea what rules are in play. As a + // result, we need to be very careful in Transition(). 
+ if err := e.allow(deviceRule); err != nil { + return nil, fmt.Errorf("error adding devices.list rule: %w", err) + } + } + if err := s.Err(); err != nil { + return nil, fmt.Errorf("error reading devices.list lines: %w", err) + } + return e, nil +} + +// Transition calculates what is the minimally-disruptive set of rules need to +// be applied to a devices cgroup in order to transition to the given target. +// This means that any already-existing rules will not be applied, and +// disruptive rules (like denying all device access) will only be applied if +// necessary. +// +// This function is the sole reason for all of Emulator -- to allow us +// to figure out how to update a containers' cgroups without causing spurious +// device errors (if possible). +func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) { + var transitionRules []*devices.Rule + oldRules := source.rules + + // If the default policy doesn't match, we need to include a "disruptive" + // rule (either allow-all or deny-all) in order to switch the cgroup to the + // correct default policy. + // + // However, due to a limitation in "devices.list" we cannot be sure what + // deny rules are in place in a black-list cgroup. Thus if the source is a + // black-list we also have to include a disruptive rule. + if source.IsBlacklist() || source.defaultAllow != target.defaultAllow { + transitionRules = append(transitionRules, &devices.Rule{ + Type: 'a', + Major: -1, + Minor: -1, + Permissions: devices.Permissions("rwm"), + Allow: target.defaultAllow, + }) + // The old rules are only relevant if we aren't starting out with a + // disruptive rule. + oldRules = nil + } + + // NOTE: We traverse through the rules in a sorted order so we always write + // the same set of rules (this is to aid testing). + + // First, we create inverse rules for any old rules not in the new set. + // This includes partial-inverse rules for specific permissions. 
This is a + // no-op if we added a disruptive rule, since oldRules will be empty. + for _, rule := range oldRules.orderedEntries() { + meta, oldPerms := rule.meta, rule.perms + newPerms := target.rules[meta] + droppedPerms := oldPerms.Difference(newPerms) + if !droppedPerms.IsEmpty() { + transitionRules = append(transitionRules, &devices.Rule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: droppedPerms, + Allow: target.defaultAllow, + }) + } + } + + // Add any additional rules which weren't in the old set. We happen to + // filter out rules which are present in both sets, though this isn't + // strictly necessary. + for _, rule := range target.rules.orderedEntries() { + meta, newPerms := rule.meta, rule.perms + oldPerms := oldRules[meta] + gainedPerms := newPerms.Difference(oldPerms) + if !gainedPerms.IsEmpty() { + transitionRules = append(transitionRules, &devices.Rule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: gainedPerms, + Allow: !target.defaultAllow, + }) + } + } + return transitionRules, nil +} + +// Rules returns the minimum set of rules necessary to convert a *deny-all* +// cgroup to the emulated filter state (note that this is not the same as a +// default cgroupv1 cgroup -- which is allow-all). This is effectively just a +// wrapper around Transition() with the source emulator being an empty cgroup. 
+func (e *Emulator) Rules() ([]*devices.Rule, error) { + defaultCgroup := &Emulator{defaultAllow: false} + return defaultCgroup.Transition(e) +} + +func wrapErr(err error, text string) error { + if err == nil { + return nil + } + return fmt.Errorf(text+": %w", err) +} diff --git a/libcontainer/cgroups/devices/devices_emulator_test.go b/libcontainer/cgroups/devices/devices_emulator_test.go new file mode 100644 index 0000000..4dac242 --- /dev/null +++ b/libcontainer/cgroups/devices/devices_emulator_test.go @@ -0,0 +1,1144 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright (C) 2020 Aleksa Sarai + * Copyright (C) 2020 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package devices + +import ( + "bufio" + "bytes" + "reflect" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/devices" +) + +func TestDeviceEmulatorLoad(t *testing.T) { + tests := []struct { + name, list string + expected *Emulator + }{ + { + name: "BlacklistMode", + list: `a *:* rwm`, + expected: &Emulator{ + defaultAllow: true, + }, + }, + { + name: "WhitelistBasic", + list: `c 4:2 rw`, + expected: &Emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 4, + minor: 2, + }: devices.Permissions("rw"), + }, + }, + }, + { + name: "WhitelistWildcard", + list: `b 0:* m`, + expected: &Emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 0, + minor: devices.Wildcard, + }: devices.Permissions("m"), + }, + }, + }, + { + name: "WhitelistDuplicate", + list: `c *:* rwm +c 1:1 r`, + expected: &Emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + // To match the kernel, we allow redundant rules. 
+ { + node: devices.CharDevice, + major: 1, + minor: 1, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "WhitelistComplicated", + list: `c *:* m +b *:* m +c 1:3 rwm +c 1:5 rwm +c 1:7 rwm +c 1:8 rwm +c 1:9 rwm +c 5:0 rwm +c 5:2 rwm +c 136:* rwm +c 10:200 rwm`, + expected: &Emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("m"), + { + node: devices.BlockDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("m"), + { + node: devices.CharDevice, + major: 1, + minor: 3, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 5, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 7, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 8, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 9, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 5, + minor: 0, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 5, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 136, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 10, + minor: 200, + }: devices.Permissions("rwm"), + }, + }, + }, + // Some invalid lists. 
+ { + name: "InvalidFieldNumber", + list: `b 1:0`, + expected: nil, + }, + { + name: "InvalidDeviceType", + list: `p *:* rwm`, + expected: nil, + }, + { + name: "InvalidMajorNumber1", + list: `p -1:3 rwm`, + expected: nil, + }, + { + name: "InvalidMajorNumber2", + list: `c foo:27 rwm`, + expected: nil, + }, + { + name: "InvalidMinorNumber1", + list: `b 1:-4 rwm`, + expected: nil, + }, + { + name: "InvalidMinorNumber2", + list: `b 1:foo rwm`, + expected: nil, + }, + { + name: "InvalidPermissions", + list: `b 1:7 rwk`, + expected: nil, + }, + } + + for _, test := range tests { + test := test // capture range variable + t.Run(test.name, func(t *testing.T) { + list := bytes.NewBufferString(test.list) + emu, err := EmulatorFromList(list) + if err != nil && test.expected != nil { + t.Fatalf("unexpected failure when creating emulator: %v", err) + } else if err == nil && test.expected == nil { + t.Fatalf("unexpected success when creating emulator: %#v", emu) + } + + if !reflect.DeepEqual(emu, test.expected) { + t.Errorf("final emulator state mismatch: %#v != %#v", emu, test.expected) + } + }) + } +} + +func testDeviceEmulatorApply(t *testing.T, baseDefaultAllow bool) { + tests := []struct { + name string + rule devices.Rule + base, expected *Emulator + }{ + // Switch between default modes. 
+ { + name: "SwitchToOtherMode", + rule: devices.Rule{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 1, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: !baseDefaultAllow, + rules: nil, + }, + }, + { + name: "SwitchToSameModeNoop", + rule: devices.Rule{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + }, + { + name: "SwitchToSameMode", + rule: devices.Rule{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 1, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + }, + // Rule addition logic. 
+ { + name: "RuleAdditionBasic", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + }, + { + name: "RuleAdditionBasicDuplicate", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + // To match the kernel, we allow redundant rules. 
+ { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + }, + { + name: "RuleAdditionBasicDuplicateNoop", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + }, + { + name: "RuleAdditionMerge", + rule: devices.Rule{ + Type: devices.BlockDevice, + Major: 5, + Minor: 12, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rw"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rwm"), + }, + }, + }, + { + name: "RuleAdditionMergeWildcard", + rule: devices.Rule{ + Type: devices.BlockDevice, + Major: 5, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: devices.Wildcard, + }: devices.Permissions("rw"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + 
minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + }, + }, + }, + { + name: "RuleAdditionMergeNoop", + rule: devices.Rule{ + Type: devices.BlockDevice, + Major: 5, + Minor: 12, + Permissions: devices.Permissions("r"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rw"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rw"), + }, + }, + }, + // Rule removal logic. + { + name: "RuleRemovalBasic", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "RuleRemovalNonexistent", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 4, + Minor: 1, + Permissions: devices.Permissions("rw"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: 
devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "RuleRemovalFull", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rw"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("w"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "RuleRemovalPartial", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("r"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("m"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + // Check our non-canonical behaviour when it comes to try to "punch + // out" holes in a wildcard rule. 
+ { + name: "RuleRemovalWildcardPunchoutImpossible", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("r"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("rm"), + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("r"), + }, + }, + expected: nil, + }, + { + name: "RuleRemovalWildcardPunchoutPossible", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("r"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + }, + } + + for _, test := range tests { + test := test + t.Run(test.name, func(t *testing.T) { + err := test.base.Apply(test.rule) + if err != nil && test.expected != nil { + t.Fatalf("unexpected failure when applying apply rule: %v", err) + } else if err == nil && test.expected == nil { + t.Fatalf("unexpected success when applying apply rule: %#v", test.base) + } + + if test.expected != nil && !reflect.DeepEqual(test.base, test.expected) { + t.Errorf("final emulator state mismatch: %#v != %#v", test.base, test.expected) + } + }) + } +} + +func TestDeviceEmulatorWhitelistApply(t *testing.T) { + testDeviceEmulatorApply(t, false) +} + +func TestDeviceEmulatorBlacklistApply(t *testing.T) { + testDeviceEmulatorApply(t, true) +} + +func testDeviceEmulatorTransition(t *testing.T, sourceDefaultAllow bool) { + 
tests := []struct { + name string + source, target *Emulator + expected []*devices.Rule + }{ + // No-op changes. + { + name: "Noop", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + // Identical white-lists produce no extra rules. + expected: nil, + }, + // Switching modes. + { + name: "SwitchToOtherMode", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + target: &Emulator{ + defaultAllow: !sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + expected: []*devices.Rule{ + // Clear-all rule. + { + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: !sourceDefaultAllow, + }, + // The actual rule-set. + { + Type: devices.BlockDevice, + Major: 42, + Minor: devices.Wildcard, + Permissions: devices.Permissions("wm"), + Allow: sourceDefaultAllow, + }, + }, + }, + // Rule changes. 
+ { + name: "RuleAddition", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.BlockDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rwm"), + Allow: !sourceDefaultAllow, + }, + }, + }, + { + name: "RuleRemoval", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rwm"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.BlockDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rwm"), + Allow: sourceDefaultAllow, + }, + }, + }, + { + name: "RuleMultipleAdditionRemoval", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 3, + minor: 9, + }: devices.Permissions("rw"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.BlockDevice, + Major: 3, + Minor: 9, + Permissions: devices.Permissions("rw"), + Allow: sourceDefaultAllow, + }, + }, + }, + // Modifying the access 
permissions. + { + name: "RulePartialAddition", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("r"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("wm"), + Allow: !sourceDefaultAllow, + }, + }, + }, + { + name: "RulePartialRemoval", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rw"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("w"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("r"), + Allow: sourceDefaultAllow, + }, + }, + }, + { + name: "RulePartialBoth", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rw"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("w"), + Allow: sourceDefaultAllow, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("m"), + Allow: !sourceDefaultAllow, + }, + }, + }, + } + + for _, test := range tests { + test := test + t.Run(test.name, func(t *testing.T) { + // If we are in black-list mode, we need to prepend the relevant + // clear-all rule 
(the expected rule lists are written with + // white-list mode in mind), and then make a full copy of the + // target rules. + if sourceDefaultAllow && test.source.defaultAllow == test.target.defaultAllow { + test.expected = []*devices.Rule{{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: test.target.defaultAllow, + }} + for _, rule := range test.target.rules.orderedEntries() { + test.expected = append(test.expected, &devices.Rule{ + Type: rule.meta.node, + Major: rule.meta.major, + Minor: rule.meta.minor, + Permissions: rule.perms, + Allow: !test.target.defaultAllow, + }) + } + } + + rules, err := test.source.Transition(test.target) + if err != nil { + t.Fatalf("unexpected error while calculating transition rules: %#v", err) + } + + if !reflect.DeepEqual(rules, test.expected) { + t.Errorf("rules don't match expected set: %#v != %#v", rules, test.expected) + } + + // Apply the rules to the source to see if it actually transitions + // correctly. This is all emulated but it's a good thing to + // double-check. 
+ for _, rule := range rules { + if err := test.source.Apply(*rule); err != nil { + t.Fatalf("error while applying transition rule [%#v]: %v", rule, err) + } + } + if !reflect.DeepEqual(test.source, test.target) { + t.Errorf("transition incomplete after applying all rules: %#v != %#v", test.source, test.target) + } + }) + } +} + +func TestDeviceEmulatorTransitionFromBlacklist(t *testing.T) { + testDeviceEmulatorTransition(t, true) +} + +func TestDeviceEmulatorTransitionFromWhitelist(t *testing.T) { + testDeviceEmulatorTransition(t, false) +} + +func BenchmarkParseLine(b *testing.B) { + list := `c *:* m +b *:* m +c 1:3 rwm +c 1:5 rwm +c 1:7 rwm +c 1:8 rwm +c 1:9 rwm +c 5:0 rwm +c 5:2 rwm +c 136:* rwm +c 10:200 rwm` + + var r *deviceRule + var err error + for i := 0; i < b.N; i++ { + s := bufio.NewScanner(strings.NewReader(list)) + for s.Scan() { + line := s.Text() + r, err = parseLine(line) + } + if err := s.Err(); err != nil { + b.Fatal(err) + } + } + b.Logf("rule: %v, err: %v", r, err) +} diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go index 847ce8e..4e69b35 100644 --- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go +++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go @@ -1,4 +1,4 @@ -// Package devicefilter containes eBPF device filter program +// Package devicefilter contains eBPF device filter program // // The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c // @@ -7,12 +7,14 @@ package devicefilter import ( + "errors" "fmt" "math" + "strconv" "github.com/cilium/ebpf/asm" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/pkg/errors" + devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices" + "github.com/opencontainers/runc/libcontainer/devices" "golang.org/x/sys/unix" ) @@ -22,22 +24,54 @@ const ( ) // DeviceFilter returns eBPF device filter program and its license string -func 
DeviceFilter(devices []*configs.Device) (asm.Instructions, string, error) { - p := &program{} - p.init() - for i := len(devices) - 1; i >= 0; i-- { - if err := p.appendDevice(devices[i]); err != nil { +func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) { + // Generate the minimum ruleset for the device rules we are given. While we + // don't care about minimum transitions in cgroupv2, using the emulator + // gives us a guarantee that the behaviour of devices filtering is the same + // as cgroupv1, including security hardenings to avoid misconfiguration + // (such as punching holes in wildcard rules). + emu := new(devicesemulator.Emulator) + for _, rule := range rules { + if err := emu.Apply(*rule); err != nil { return nil, "", err } } - insts, err := p.finalize() - return insts, license, err + cleanRules, err := emu.Rules() + if err != nil { + return nil, "", err + } + + p := &program{ + defaultAllow: emu.IsBlacklist(), + } + p.init() + + for idx, rule := range cleanRules { + if rule.Type == devices.WildcardDevice { + // We can safely skip over wildcard entries because there should + // only be one (at most) at the very start to instruct cgroupv1 to + // go into allow-list mode. However we do double-check this here. + if idx != 0 || rule.Allow != emu.IsBlacklist() { + return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString()) + } + continue + } + if rule.Allow == p.defaultAllow { + // There should be no rules which have an action equal to the + // default action, the emulator removes those. 
+ return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString()) + } + if err := p.appendRule(rule); err != nil { + return nil, "", err + } + } + return p.finalize(), license, nil } type program struct { - insts asm.Instructions - hasWildCard bool - blockID int + insts asm.Instructions + defaultAllow bool + blockID int } func (p *program) init() { @@ -49,7 +83,8 @@ func (p *program) init() { */ // R2 <- type (lower 16 bit of u32 access_type at R1[0]) p.insts = append(p.insts, - asm.LoadMem(asm.R2, asm.R1, 0, asm.Half)) + asm.LoadMem(asm.R2, asm.R1, 0, asm.Word), + asm.And.Imm32(asm.R2, 0xFFFF)) // R3 <- access (upper 16 bit of u32 access_type at R1[0]) p.insts = append(p.insts, @@ -66,39 +101,35 @@ func (p *program) init() { asm.LoadMem(asm.R5, asm.R1, 8, asm.Word)) } -// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element. -func (p *program) appendDevice(dev *configs.Device) error { +// appendRule rule converts an OCI rule to the relevant eBPF block and adds it +// to the in-progress filter program. In order to operate properly, it must be +// called with a "clean" rule list (generated by devices.Emulator.Rules() -- +// with any "a" rules removed). 
+func (p *program) appendRule(rule *devices.Rule) error { if p.blockID < 0 { return errors.New("the program is finalized") } - if p.hasWildCard { - // All entries after wildcard entry are ignored - return nil - } - bpfType := int32(-1) - hasType := true - switch dev.Type { - case 'c': + var bpfType int32 + switch rule.Type { + case devices.CharDevice: bpfType = int32(unix.BPF_DEVCG_DEV_CHAR) - case 'b': + case devices.BlockDevice: bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK) - case 'a': - hasType = false default: - // if not specified in OCI json, typ is set to DeviceTypeAll - return errors.Errorf("invalid DeviceType %q", string(dev.Type)) + // We do not permit 'a', nor any other types we don't know about. + return fmt.Errorf("invalid type %q", string(rule.Type)) } - if dev.Major > math.MaxUint32 { - return errors.Errorf("invalid major %d", dev.Major) + if rule.Major > math.MaxUint32 { + return fmt.Errorf("invalid major %d", rule.Major) } - if dev.Minor > math.MaxUint32 { - return errors.Errorf("invalid minor %d", dev.Major) + if rule.Minor > math.MaxUint32 { + return fmt.Errorf("invalid minor %d", rule.Major) } - hasMajor := dev.Major >= 0 // if not specified in OCI json, major is set to -1 - hasMinor := dev.Minor >= 0 + hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1 + hasMinor := rule.Minor >= 0 bpfAccess := int32(0) - for _, r := range dev.Permissions { + for _, r := range rule.Permissions { switch r { case 'r': bpfAccess |= unix.BPF_DEVCG_ACC_READ @@ -107,68 +138,65 @@ func (p *program) appendDevice(dev *configs.Device) error { case 'm': bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD default: - return errors.Errorf("unknown device access %v", r) + return fmt.Errorf("unknown device access %v", r) } } // If the access is rwm, skip the check. 
hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD) - blockSym := fmt.Sprintf("block-%d", p.blockID) - nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1) - prevBlockLastIdx := len(p.insts) - 1 - if hasType { - p.insts = append(p.insts, - // if (R2 != bpfType) goto next - asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), - ) - } + var ( + blockSym = "block-" + strconv.Itoa(p.blockID) + nextBlockSym = "block-" + strconv.Itoa(p.blockID+1) + prevBlockLastIdx = len(p.insts) - 1 + ) + p.insts = append(p.insts, + // if (R2 != bpfType) goto next + asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), + ) if hasAccess { p.insts = append(p.insts, - // if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next + // if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next asm.Mov.Reg32(asm.R1, asm.R3), asm.And.Imm32(asm.R1, bpfAccess), - asm.JEq.Imm(asm.R1, 0, nextBlockSym), + asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym), ) } if hasMajor { p.insts = append(p.insts, // if (R4 != major) goto next - asm.JNE.Imm(asm.R4, int32(dev.Major), nextBlockSym), + asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym), ) } if hasMinor { p.insts = append(p.insts, // if (R5 != minor) goto next - asm.JNE.Imm(asm.R5, int32(dev.Minor), nextBlockSym), + asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym), ) } - if !hasType && !hasAccess && !hasMajor && !hasMinor { - p.hasWildCard = true - } - p.insts = append(p.insts, acceptBlock(dev.Allow)...) + p.insts = append(p.insts, acceptBlock(rule.Allow)...) 
// set blockSym to the first instruction we added in this iteration p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym) p.blockID++ return nil } -func (p *program) finalize() (asm.Instructions, error) { - if p.hasWildCard { - // acceptBlock with asm.Return() is already inserted - return p.insts, nil +func (p *program) finalize() asm.Instructions { + var v int32 + if p.defaultAllow { + v = 1 } - blockSym := fmt.Sprintf("block-%d", p.blockID) + blockSym := "block-" + strconv.Itoa(p.blockID) p.insts = append(p.insts, - // R0 <- 0 - asm.Mov.Imm32(asm.R0, 0).Sym(blockSym), + // R0 <- v + asm.Mov.Imm32(asm.R0, v).Sym(blockSym), asm.Return(), ) p.blockID = -1 - return p.insts, nil + return p.insts } func acceptBlock(accept bool) asm.Instructions { - v := int32(0) + var v int32 if accept { v = 1 } diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go index 59ff4b4..a8fc562 100644 --- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go +++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go @@ -4,7 +4,7 @@ import ( "strings" "testing" - "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runc/libcontainer/specconv" ) @@ -20,13 +20,12 @@ func hash(s, comm string) string { return strings.Join(res, "\n") } -func testDeviceFilter(t testing.TB, devices []*configs.Device, expectedStr string) { +func testDeviceFilter(t testing.TB, devices []*devices.Rule, expectedStr string) { insts, _, err := DeviceFilter(devices) if err != nil { t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices) } s := insts.String() - t.Logf("%s: devices: %+v\n%s", t.Name(), devices, s) if expectedStr != "" { hashed := hash(s, "//") expectedHashed := hash(expectedStr, "//") @@ -39,15 +38,16 @@ func testDeviceFilter(t testing.TB, devices []*configs.Device, expectedStr strin func TestDeviceFilter_Nil(t 
*testing.T) { expected := ` // load parameters into registers - 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 - 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 - 2: RSh32Imm dst: r3 imm: 16 - 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 - 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: And32Imm dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RSh32Imm dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 block-0: // return 0 (reject) - 5: Mov32Imm dst: r0 imm: 0 - 6: Exit + 6: Mov32Imm dst: r0 imm: 0 + 7: Exit ` testDeviceFilter(t, nil, expected) } @@ -55,97 +55,96 @@ block-0: func TestDeviceFilter_BuiltInAllowList(t *testing.T) { expected := ` // load parameters into registers - 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 - 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 - 2: RSh32Imm dst: r3 imm: 16 - 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 - 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: And32Imm dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RSh32Imm dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 block-0: -// tuntap (c, 10, 200, rwm, allow) - 5: JNEImm dst: r2 off: -1 imm: 2 - 6: JNEImm dst: r4 off: -1 imm: 10 - 7: JNEImm dst: r5 off: -1 imm: 200 - 8: Mov32Imm dst: r0 imm: 1 - 9: Exit -block-1: - 10: JNEImm dst: r2 off: -1 imm: 2 - 11: JNEImm dst: r4 off: -1 imm: 5 - 12: JNEImm dst: r5 off: -1 imm: 2 - 13: Mov32Imm dst: r0 imm: 1 - 14: Exit -block-2: -// /dev/pts (c, 136, wildcard, rwm, true) - 15: JNEImm dst: r2 off: -1 imm: 2 - 16: JNEImm dst: r4 off: -1 imm: 136 - 17: Mov32Imm dst: r0 imm: 1 - 18: Exit -block-3: - 19: JNEImm dst: r2 off: -1 imm: 2 - 20: JNEImm dst: r4 off: -1 imm: 5 - 21: JNEImm dst: r5 off: -1 imm: 1 - 22: Mov32Imm dst: r0 imm: 1 - 23: Exit -block-4: - 24: JNEImm dst: r2 off: -1 imm: 2 - 25: JNEImm dst: r4 off: -1 imm: 1 - 26: JNEImm dst: r5 
off: -1 imm: 9 - 27: Mov32Imm dst: r0 imm: 1 - 28: Exit -block-5: - 29: JNEImm dst: r2 off: -1 imm: 2 - 30: JNEImm dst: r4 off: -1 imm: 1 - 31: JNEImm dst: r5 off: -1 imm: 5 - 32: Mov32Imm dst: r0 imm: 1 - 33: Exit -block-6: - 34: JNEImm dst: r2 off: -1 imm: 2 - 35: JNEImm dst: r4 off: -1 imm: 5 - 36: JNEImm dst: r5 off: -1 imm: 0 - 37: Mov32Imm dst: r0 imm: 1 - 38: Exit -block-7: - 39: JNEImm dst: r2 off: -1 imm: 2 - 40: JNEImm dst: r4 off: -1 imm: 1 - 41: JNEImm dst: r5 off: -1 imm: 7 - 42: Mov32Imm dst: r0 imm: 1 - 43: Exit -block-8: - 44: JNEImm dst: r2 off: -1 imm: 2 - 45: JNEImm dst: r4 off: -1 imm: 1 - 46: JNEImm dst: r5 off: -1 imm: 8 - 47: Mov32Imm dst: r0 imm: 1 - 48: Exit -block-9: - 49: JNEImm dst: r2 off: -1 imm: 2 - 50: JNEImm dst: r4 off: -1 imm: 1 - 51: JNEImm dst: r5 off: -1 imm: 3 - 52: Mov32Imm dst: r0 imm: 1 - 53: Exit -block-10: // (b, wildcard, wildcard, m, true) - 54: JNEImm dst: r2 off: -1 imm: 1 - 55: Mov32Reg dst: r1 src: r3 - 56: And32Imm dst: r1 imm: 1 - 57: JEqImm dst: r1 off: -1 imm: 0 - 58: Mov32Imm dst: r0 imm: 1 - 59: Exit -block-11: + 6: JNEImm dst: r2 off: -1 imm: 1 + 7: Mov32Reg dst: r1 src: r3 + 8: And32Imm dst: r1 imm: 1 + 9: JNEReg dst: r1 off: -1 src: r3 + 10: Mov32Imm dst: r0 imm: 1 + 11: Exit +block-1: // (c, wildcard, wildcard, m, true) - 60: JNEImm dst: r2 off: -1 imm: 2 - 61: Mov32Reg dst: r1 src: r3 - 62: And32Imm dst: r1 imm: 1 - 63: JEqImm dst: r1 off: -1 imm: 0 - 64: Mov32Imm dst: r0 imm: 1 - 65: Exit -block-12: - 66: Mov32Imm dst: r0 imm: 0 - 67: Exit + 12: JNEImm dst: r2 off: -1 imm: 2 + 13: Mov32Reg dst: r1 src: r3 + 14: And32Imm dst: r1 imm: 1 + 15: JNEReg dst: r1 off: -1 src: r3 + 16: Mov32Imm dst: r0 imm: 1 + 17: Exit +block-2: + 18: JNEImm dst: r2 off: -1 imm: 2 + 19: JNEImm dst: r4 off: -1 imm: 1 + 20: JNEImm dst: r5 off: -1 imm: 3 + 21: Mov32Imm dst: r0 imm: 1 + 22: Exit +block-3: + 23: JNEImm dst: r2 off: -1 imm: 2 + 24: JNEImm dst: r4 off: -1 imm: 1 + 25: JNEImm dst: r5 off: -1 imm: 5 + 26: Mov32Imm dst: 
r0 imm: 1 + 27: Exit +block-4: + 28: JNEImm dst: r2 off: -1 imm: 2 + 29: JNEImm dst: r4 off: -1 imm: 1 + 30: JNEImm dst: r5 off: -1 imm: 7 + 31: Mov32Imm dst: r0 imm: 1 + 32: Exit +block-5: + 33: JNEImm dst: r2 off: -1 imm: 2 + 34: JNEImm dst: r4 off: -1 imm: 1 + 35: JNEImm dst: r5 off: -1 imm: 8 + 36: Mov32Imm dst: r0 imm: 1 + 37: Exit +block-6: + 38: JNEImm dst: r2 off: -1 imm: 2 + 39: JNEImm dst: r4 off: -1 imm: 1 + 40: JNEImm dst: r5 off: -1 imm: 9 + 41: Mov32Imm dst: r0 imm: 1 + 42: Exit +block-7: + 43: JNEImm dst: r2 off: -1 imm: 2 + 44: JNEImm dst: r4 off: -1 imm: 5 + 45: JNEImm dst: r5 off: -1 imm: 0 + 46: Mov32Imm dst: r0 imm: 1 + 47: Exit +block-8: + 48: JNEImm dst: r2 off: -1 imm: 2 + 49: JNEImm dst: r4 off: -1 imm: 5 + 50: JNEImm dst: r5 off: -1 imm: 2 + 51: Mov32Imm dst: r0 imm: 1 + 52: Exit +block-9: +// tuntap (c, 10, 200, rwm, allow) + 53: JNEImm dst: r2 off: -1 imm: 2 + 54: JNEImm dst: r4 off: -1 imm: 10 + 55: JNEImm dst: r5 off: -1 imm: 200 + 56: Mov32Imm dst: r0 imm: 1 + 57: Exit +block-10: +// /dev/pts (c, 136, wildcard, rwm, true) + 58: JNEImm dst: r2 off: -1 imm: 2 + 59: JNEImm dst: r4 off: -1 imm: 136 + 60: Mov32Imm dst: r0 imm: 1 + 61: Exit +block-11: + 62: Mov32Imm dst: r0 imm: 0 + 63: Exit ` - testDeviceFilter(t, specconv.AllowedDevices, expected) + var devices []*devices.Rule + for _, device := range specconv.AllowedDevices { + devices = append(devices, &device.Rule) + } + testDeviceFilter(t, devices, expected) } func TestDeviceFilter_Privileged(t *testing.T) { - devices := []*configs.Device{ + devices := []*devices.Rule{ { Type: 'a', Major: -1, @@ -157,21 +156,22 @@ func TestDeviceFilter_Privileged(t *testing.T) { expected := ` // load parameters into registers - 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 - 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 - 2: RSh32Imm dst: r3 imm: 16 - 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 - 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: And32Imm dst: r2 imm: 65535 + 
2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RSh32Imm dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 block-0: // return 1 (accept) - 5: Mov32Imm dst: r0 imm: 1 - 6: Exit + 6: Mov32Imm dst: r0 imm: 1 + 7: Exit ` testDeviceFilter(t, devices, expected) } func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) { - devices := []*configs.Device{ + devices := []*devices.Rule{ { Type: 'a', Major: -1, @@ -189,28 +189,29 @@ func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) { } expected := ` // load parameters into registers - 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 - 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 - 2: RSh32Imm dst: r3 imm: 16 - 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 - 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: And32Imm dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RSh32Imm dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 block-0: // return 0 (reject) if type==b && major == 8 && minor == 0 - 5: JNEImm dst: r2 off: -1 imm: 1 - 6: JNEImm dst: r4 off: -1 imm: 8 - 7: JNEImm dst: r5 off: -1 imm: 0 - 8: Mov32Imm dst: r0 imm: 0 - 9: Exit + 6: JNEImm dst: r2 off: -1 imm: 1 + 7: JNEImm dst: r4 off: -1 imm: 8 + 8: JNEImm dst: r5 off: -1 imm: 0 + 9: Mov32Imm dst: r0 imm: 0 + 10: Exit block-1: // return 1 (accept) - 10: Mov32Imm dst: r0 imm: 1 - 11: Exit + 11: Mov32Imm dst: r0 imm: 1 + 12: Exit ` testDeviceFilter(t, devices, expected) } func TestDeviceFilter_Weird(t *testing.T) { - devices := []*configs.Device{ + devices := []*devices.Rule{ { Type: 'b', Major: 8, @@ -237,22 +238,23 @@ func TestDeviceFilter_Weird(t *testing.T) { // This conforms to runc v1.0.0-rc.9 (cgroup1) behavior. 
expected := ` // load parameters into registers - 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 - 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 - 2: RSh32Imm dst: r3 imm: 16 - 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 - 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: And32Imm dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RSh32Imm dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 block-0: // return 0 (reject) if type==b && major == 8 && minor == 2 - 5: JNEImm dst: r2 off: -1 imm: 1 - 6: JNEImm dst: r4 off: -1 imm: 8 - 7: JNEImm dst: r5 off: -1 imm: 2 - 8: Mov32Imm dst: r0 imm: 0 - 9: Exit + 6: JNEImm dst: r2 off: -1 imm: 1 + 7: JNEImm dst: r4 off: -1 imm: 8 + 8: JNEImm dst: r5 off: -1 imm: 2 + 9: Mov32Imm dst: r0 imm: 0 + 10: Exit block-1: // return 1 (accept) - 10: Mov32Imm dst: r0 imm: 1 - 11: Exit + 11: Mov32Imm dst: r0 imm: 1 + 12: Exit ` testDeviceFilter(t, devices, expected) } diff --git a/libcontainer/cgroups/ebpf/ebpf.go b/libcontainer/cgroups/ebpf/ebpf.go deleted file mode 100644 index 4795e0a..0000000 --- a/libcontainer/cgroups/ebpf/ebpf.go +++ /dev/null @@ -1,45 +0,0 @@ -package ebpf - -import ( - "github.com/cilium/ebpf" - "github.com/cilium/ebpf/asm" - "github.com/pkg/errors" - "golang.org/x/sys/unix" -) - -// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. -// -// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . -// -// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 -func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) { - nilCloser := func() error { - return nil - } - // Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167). - // This limit is not inherited into the container. 
- memlockLimit := &unix.Rlimit{ - Cur: unix.RLIM_INFINITY, - Max: unix.RLIM_INFINITY, - } - _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) - spec := &ebpf.ProgramSpec{ - Type: ebpf.CGroupDevice, - Instructions: insts, - License: license, - } - prog, err := ebpf.NewProgram(spec) - if err != nil { - return nilCloser, err - } - if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil { - return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)") - } - closer := func() error { - if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil { - return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)") - } - return nil - } - return closer, nil -} diff --git a/libcontainer/cgroups/ebpf/ebpf_linux.go b/libcontainer/cgroups/ebpf/ebpf_linux.go new file mode 100644 index 0000000..104c74a --- /dev/null +++ b/libcontainer/cgroups/ebpf/ebpf_linux.go @@ -0,0 +1,253 @@ +package ebpf + +import ( + "errors" + "fmt" + "os" + "runtime" + "sync" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/link" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +func nilCloser() error { + return nil +} + +func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) { + type bpfAttrQuery struct { + TargetFd uint32 + AttachType uint32 + QueryType uint32 + AttachFlags uint32 + ProgIds uint64 // __aligned_u64 + ProgCnt uint32 + } + + // Currently you can only have 64 eBPF programs attached to a cgroup. + size := 64 + retries := 0 + for retries < 10 { + progIds := make([]uint32, size) + query := bpfAttrQuery{ + TargetFd: uint32(dirFd), + AttachType: uint32(unix.BPF_CGROUP_DEVICE), + ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))), + ProgCnt: uint32(len(progIds)), + } + + // Fetch the list of program ids. 
+ _, _, errno := unix.Syscall(unix.SYS_BPF, + uintptr(unix.BPF_PROG_QUERY), + uintptr(unsafe.Pointer(&query)), + unsafe.Sizeof(query)) + size = int(query.ProgCnt) + runtime.KeepAlive(query) + if errno != 0 { + // On ENOSPC we get the correct number of programs. + if errno == unix.ENOSPC { + retries++ + continue + } + return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno) + } + + // Convert the ids to program handles. + progIds = progIds[:size] + programs := make([]*ebpf.Program, 0, len(progIds)) + for _, progId := range progIds { + program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId)) + if err != nil { + // We skip over programs that give us -EACCES or -EPERM. This + // is necessary because there may be BPF programs that have + // been attached (such as with --systemd-cgroup) which have an + // LSM label that blocks us from interacting with the program. + // + // Because additional BPF_CGROUP_DEVICE programs only can add + // restrictions, there's no real issue with just ignoring these + // programs (and stops runc from breaking on distributions with + // very strict SELinux policies). + if errors.Is(err, os.ErrPermission) { + logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err) + continue + } + return nil, fmt.Errorf("cannot fetch program from id: %w", err) + } + programs = append(programs, program) + } + runtime.KeepAlive(progIds) + return programs, nil + } + + return nil, errors.New("could not get complete list of CGROUP_DEVICE programs") +} + +var ( + haveBpfProgReplaceBool bool + haveBpfProgReplaceOnce sync.Once +) + +// Loosely based on the BPF_F_REPLACE support check in +// . 
+// +// TODO: move this logic to cilium/ebpf +func haveBpfProgReplace() bool { + haveBpfProgReplaceOnce.Do(func() { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + License: "MIT", + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + }) + if err != nil { + logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err) + return + } + defer prog.Close() + + devnull, err := os.Open("/dev/null") + if err != nil { + logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err) + return + } + defer devnull.Close() + + // We know that we have BPF_PROG_ATTACH since we can load + // BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL + // we know that the feature isn't present. + err = link.RawAttachProgram(link.RawAttachProgramOptions{ + // We rely on this fd being checked after attachFlags. + Target: int(devnull.Fd()), + // Attempt to "replace" bad fds with this program. + Program: prog, + Attach: ebpf.AttachCGroupDevice, + Flags: unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE, + }) + if errors.Is(err, unix.EINVAL) { + // not supported + return + } + // attach_flags test succeeded. + if !errors.Is(err, unix.EBADF) { + logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err) + } + haveBpfProgReplaceBool = true + }) + return haveBpfProgReplaceBool +} + +// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. +// +// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . +// +// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 +func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) { + // Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167). + // This limit is not inherited into the container. 
+ memlockLimit := &unix.Rlimit{ + Cur: unix.RLIM_INFINITY, + Max: unix.RLIM_INFINITY, + } + _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) + + // Get the list of existing programs. + oldProgs, err := findAttachedCgroupDeviceFilters(dirFd) + if err != nil { + return nilCloser, err + } + useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1 + + // Generate new program. + spec := &ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + Instructions: insts, + License: license, + } + prog, err := ebpf.NewProgram(spec) + if err != nil { + return nilCloser, err + } + + // If there is only one old program, we can just replace it directly. + var ( + replaceProg *ebpf.Program + attachFlags uint32 = unix.BPF_F_ALLOW_MULTI + ) + if useReplaceProg { + replaceProg = oldProgs[0] + attachFlags |= unix.BPF_F_REPLACE + } + err = link.RawAttachProgram(link.RawAttachProgramOptions{ + Target: dirFd, + Program: prog, + Replace: replaceProg, + Attach: ebpf.AttachCGroupDevice, + Flags: attachFlags, + }) + if err != nil { + return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err) + } + closer := func() error { + err = link.RawDetachProgram(link.RawDetachProgramOptions{ + Target: dirFd, + Program: prog, + Attach: ebpf.AttachCGroupDevice, + }) + if err != nil { + return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err) + } + // TODO: Should we attach the old filters back in this case? Otherwise + // we fail-open on a security feature, which is a bit scary. + return nil + } + if !useReplaceProg { + logLevel := logrus.DebugLevel + // If there was more than one old program, give a warning (since this + // really shouldn't happen with runc-managed cgroups) and then detach + // all the old programs. 
+ if len(oldProgs) > 1 { + // NOTE: Ideally this should be a warning but it turns out that + // systemd-managed cgroups trigger this warning (apparently + // systemd doesn't delete old non-systemd programs when + // setting properties). + logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs)) + logLevel = logrus.InfoLevel + } + for idx, oldProg := range oldProgs { + // Output some extra debug info. + if info, err := oldProg.Info(); err == nil { + fields := logrus.Fields{ + "type": info.Type.String(), + "tag": info.Tag, + "name": info.Name, + } + if id, ok := info.ID(); ok { + fields["id"] = id + } + if runCount, ok := info.RunCount(); ok { + fields["run_count"] = runCount + } + if runtime, ok := info.Runtime(); ok { + fields["runtime"] = runtime.String() + } + logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx) + } + err = link.RawDetachProgram(link.RawDetachProgramOptions{ + Target: dirFd, + Program: oldProg, + Attach: ebpf.AttachCGroupDevice, + }) + if err != nil { + return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err) + } + } + } + return closer, nil +} diff --git a/libcontainer/cgroups/file.go b/libcontainer/cgroups/file.go new file mode 100644 index 0000000..0cdaf74 --- /dev/null +++ b/libcontainer/cgroups/file.go @@ -0,0 +1,190 @@ +package cgroups + +import ( + "bytes" + "errors" + "fmt" + "os" + "path" + "strconv" + "strings" + "sync" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// OpenFile opens a cgroup file in a given dir with given flags. +// It is supposed to be used for cgroup files only, and returns +// an error if the file is not a cgroup file. +// +// Arguments dir and file are joined together to form an absolute path +// to a file being opened. 
+func OpenFile(dir, file string, flags int) (*os.File, error) { + if dir == "" { + return nil, fmt.Errorf("no directory specified for %s", file) + } + return openFile(dir, file, flags) +} + +// ReadFile reads data from a cgroup file in dir. +// It is supposed to be used for cgroup files only. +func ReadFile(dir, file string) (string, error) { + fd, err := OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return "", err + } + defer fd.Close() + var buf bytes.Buffer + + _, err = buf.ReadFrom(fd) + return buf.String(), err +} + +// WriteFile writes data to a cgroup file in dir. +// It is supposed to be used for cgroup files only. +func WriteFile(dir, file, data string) error { + fd, err := OpenFile(dir, file, unix.O_WRONLY) + if err != nil { + return err + } + defer fd.Close() + if err := retryingWriteFile(fd, data); err != nil { + // Having data in the error message helps in debugging. + return fmt.Errorf("failed to write %q: %w", data, err) + } + return nil +} + +func retryingWriteFile(fd *os.File, data string) error { + for { + _, err := fd.Write([]byte(data)) + if errors.Is(err, unix.EINTR) { + logrus.Infof("interrupted while writing %s to %s", data, fd.Name()) + continue + } + return err + } +} + +const ( + cgroupfsDir = "/sys/fs/cgroup" + cgroupfsPrefix = cgroupfsDir + "/" +) + +var ( + // TestMode is set to true by unit tests that need "fake" cgroupfs. 
+ TestMode bool + + cgroupFd int = -1 + prepOnce sync.Once + prepErr error + resolveFlags uint64 +) + +func prepareOpenat2() error { + prepOnce.Do(func() { + fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{ + Flags: unix.O_DIRECTORY | unix.O_PATH, + }) + if err != nil { + prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} + if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare + logrus.Warnf("falling back to securejoin: %s", prepErr) + } else { + logrus.Debug("openat2 not available, falling back to securejoin") + } + return + } + var st unix.Statfs_t + if err = unix.Fstatfs(fd, &st); err != nil { + prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} + logrus.Warnf("falling back to securejoin: %s", prepErr) + return + } + + cgroupFd = fd + + resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS + if st.Type == unix.CGROUP2_SUPER_MAGIC { + // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks + resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS + } + }) + + return prepErr +} + +func openFile(dir, file string, flags int) (*os.File, error) { + mode := os.FileMode(0) + if TestMode && flags&os.O_WRONLY != 0 { + // "emulate" cgroup fs for unit tests + flags |= os.O_TRUNC | os.O_CREATE + mode = 0o600 + } + path := path.Join(dir, file) + if prepareOpenat2() != nil { + return openFallback(path, flags, mode) + } + relPath := strings.TrimPrefix(path, cgroupfsPrefix) + if len(relPath) == len(path) { // non-standard path, old system? 
+ return openFallback(path, flags, mode) + } + + fd, err := unix.Openat2(cgroupFd, relPath, + &unix.OpenHow{ + Resolve: resolveFlags, + Flags: uint64(flags) | unix.O_CLOEXEC, + Mode: uint64(mode), + }) + if err != nil { + err = &os.PathError{Op: "openat2", Path: path, Err: err} + // Check if cgroupFd is still opened to cgroupfsDir + // (happens when this package is incorrectly used + // across the chroot/pivot_root/mntns boundary, or + // when /sys/fs/cgroup is remounted). + // + // TODO: if such usage will ever be common, amend this + // to reopen cgroupFd and retry openat2. + fdStr := strconv.Itoa(cgroupFd) + fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr) + if fdDest != cgroupfsDir { + // Wrap the error so it is clear that cgroupFd + // is opened to an unexpected/wrong directory. + err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w", + fdStr, fdDest, cgroupfsDir, err) + } + return nil, err + } + + return os.NewFile(uintptr(fd), path), nil +} + +var errNotCgroupfs = errors.New("not a cgroup file") + +// Can be changed by unit tests. +var openFallback = openAndCheck + +// openAndCheck is used when openat2(2) is not available. It checks the opened +// file is on cgroupfs, returning an error otherwise. +func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) { + fd, err := os.OpenFile(path, flags, mode) + if err != nil { + return nil, err + } + if TestMode { + return fd, nil + } + // Check this is a cgroupfs file. 
+ var st unix.Statfs_t + if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil { + _ = fd.Close() + return nil, &os.PathError{Op: "statfs", Path: path, Err: err} + } + if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC { + _ = fd.Close() + return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs} + } + + return fd, nil +} diff --git a/libcontainer/cgroups/file_test.go b/libcontainer/cgroups/file_test.go new file mode 100644 index 0000000..dc2b063 --- /dev/null +++ b/libcontainer/cgroups/file_test.go @@ -0,0 +1,73 @@ +package cgroups + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "testing" + "time" +) + +func TestWriteCgroupFileHandlesInterrupt(t *testing.T) { + const ( + memoryCgroupMount = "/sys/fs/cgroup/memory" + memoryLimit = "memory.limit_in_bytes" + ) + if _, err := os.Stat(memoryCgroupMount); err != nil { + // most probably cgroupv2 + t.Skip(err) + } + + cgroupName := fmt.Sprintf("test-eint-%d", time.Now().Nanosecond()) + cgroupPath := filepath.Join(memoryCgroupMount, cgroupName) + if err := os.MkdirAll(cgroupPath, 0o755); err != nil { + t.Fatal(err) + } + defer os.RemoveAll(cgroupPath) + + if _, err := os.Stat(filepath.Join(cgroupPath, memoryLimit)); err != nil { + // either cgroupv2, or memory controller is not available + t.Skip(err) + } + + for i := 0; i < 100000; i++ { + limit := 1024*1024 + i + if err := WriteFile(cgroupPath, memoryLimit, strconv.Itoa(limit)); err != nil { + t.Fatalf("Failed to write %d on attempt %d: %+v", limit, i, err) + } + } +} + +func TestOpenat2(t *testing.T) { + if !IsCgroup2UnifiedMode() { + // The reason is many test cases below test opening files from + // the top-level directory, where cgroup v1 has no files. + t.Skip("test requires cgroup v2") + } + + // Make sure we test openat2, not its fallback. 
+ openFallback = func(_ string, _ int, _ os.FileMode) (*os.File, error) { + return nil, errors.New("fallback") + } + defer func() { openFallback = openAndCheck }() + + for _, tc := range []struct{ dir, file string }{ + {"/sys/fs/cgroup", "cgroup.controllers"}, + {"/sys/fs/cgroup", "/cgroup.controllers"}, + {"/sys/fs/cgroup/", "cgroup.controllers"}, + {"/sys/fs/cgroup/", "/cgroup.controllers"}, + {"/sys/fs/cgroup/user.slice", "cgroup.controllers"}, + {"/sys/fs/cgroup/user.slice/", "/cgroup.controllers"}, + {"/", "/sys/fs/cgroup/cgroup.controllers"}, + {"/", "sys/fs/cgroup/cgroup.controllers"}, + {"/sys/fs/cgroup/cgroup.controllers", ""}, + } { + fd, err := OpenFile(tc.dir, tc.file, os.O_RDONLY) + if err != nil { + t.Errorf("case %+v: %v", tc, err) + } + fd.Close() + } +} diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go deleted file mode 100644 index ec148b4..0000000 --- a/libcontainer/cgroups/fs/apply_raw.go +++ /dev/null @@ -1,411 +0,0 @@ -// +build linux - -package fs - -import ( - "fmt" - "io" - "os" - "path/filepath" - "sync" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" - libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" - "github.com/pkg/errors" - "golang.org/x/sys/unix" -) - -var ( - subsystemsLegacy = subsystemSet{ - &CpusetGroup{}, - &DevicesGroup{}, - &MemoryGroup{}, - &CpuGroup{}, - &CpuacctGroup{}, - &PidsGroup{}, - &BlkioGroup{}, - &HugetlbGroup{}, - &NetClsGroup{}, - &NetPrioGroup{}, - &PerfEventGroup{}, - &FreezerGroup{}, - &NameGroup{GroupName: "name=systemd", Join: true}, - } - HugePageSizes, _ = cgroups.GetHugePageSize() -) - -var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist") - -type subsystemSet []subsystem - -func (s subsystemSet) Get(name string) (subsystem, error) { - for _, ss := range s { - if ss.Name() == name { - return ss, nil - } - } - return nil, errSubsystemDoesNotExist -} - -type 
subsystem interface { - // Name returns the name of the subsystem. - Name() string - // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. - GetStats(path string, stats *cgroups.Stats) error - // Removes the cgroup represented by 'cgroupData'. - Remove(*cgroupData) error - // Creates and joins the cgroup represented by 'cgroupData'. - Apply(*cgroupData) error - // Set the cgroup represented by cgroup. - Set(path string, cgroup *configs.Cgroup) error -} - -type Manager struct { - mu sync.Mutex - Cgroups *configs.Cgroup - Rootless bool // ignore permission-related errors - Paths map[string]string -} - -// The absolute path to the root of the cgroup hierarchies. -var cgroupRootLock sync.Mutex -var cgroupRoot string - -// Gets the cgroupRoot. -func getCgroupRoot() (string, error) { - cgroupRootLock.Lock() - defer cgroupRootLock.Unlock() - - if cgroupRoot != "" { - return cgroupRoot, nil - } - - root, err := cgroups.FindCgroupMountpointDir() - if err != nil { - return "", err - } - - if _, err := os.Stat(root); err != nil { - return "", err - } - - cgroupRoot = root - return cgroupRoot, nil -} - -type cgroupData struct { - root string - innerPath string - config *configs.Cgroup - pid int -} - -// isIgnorableError returns whether err is a permission error (in the loose -// sense of the word). This includes EROFS (which for an unprivileged user is -// basically a permission error) and EACCES (for similar reasons) as well as -// the normal EPERM. -func isIgnorableError(rootless bool, err error) bool { - // We do not ignore errors if we are root. - if !rootless { - return false - } - // Is it an ordinary EPERM? - if os.IsPermission(errors.Cause(err)) { - return true - } - - // Try to handle other errnos. 
- var errno error - switch err := errors.Cause(err).(type) { - case *os.PathError: - errno = err.Err - case *os.LinkError: - errno = err.Err - case *os.SyscallError: - errno = err.Err - } - return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES -} - -func (m *Manager) getSubsystems() subsystemSet { - return subsystemsLegacy -} - -func (m *Manager) Apply(pid int) (err error) { - if m.Cgroups == nil { - return nil - } - m.mu.Lock() - defer m.mu.Unlock() - - var c = m.Cgroups - - d, err := getCgroupData(m.Cgroups, pid) - if err != nil { - return err - } - - m.Paths = make(map[string]string) - if c.Paths != nil { - for name, path := range c.Paths { - _, err := d.path(name) - if err != nil { - if cgroups.IsNotFound(err) { - continue - } - return err - } - m.Paths[name] = path - } - return cgroups.EnterPid(m.Paths, pid) - } - - for _, sys := range m.getSubsystems() { - // TODO: Apply should, ideally, be reentrant or be broken up into a separate - // create and join phase so that the cgroup hierarchy for a container can be - // created then join consists of writing the process pids to cgroup.procs - p, err := d.path(sys.Name()) - if err != nil { - // The non-presence of the devices subsystem is - // considered fatal for security reasons. - if cgroups.IsNotFound(err) && sys.Name() != "devices" { - continue - } - return err - } - m.Paths[sys.Name()] = p - - if err := sys.Apply(d); err != nil { - // In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't - // been set, we don't bail on error in case of permission problems. - // Cases where limits have been set (and we couldn't create our own - // cgroup) are handled by Set. 
- if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" { - delete(m.Paths, sys.Name()) - continue - } - return err - } - - } - return nil -} - -func (m *Manager) Destroy() error { - if m.Cgroups == nil || m.Cgroups.Paths != nil { - return nil - } - m.mu.Lock() - defer m.mu.Unlock() - if err := cgroups.RemovePaths(m.Paths); err != nil { - return err - } - m.Paths = make(map[string]string) - return nil -} - -func (m *Manager) GetPaths() map[string]string { - m.mu.Lock() - paths := m.Paths - m.mu.Unlock() - return paths -} - -func (m *Manager) GetUnifiedPath() (string, error) { - return "", errors.New("unified path is only supported when running in unified mode") -} - -func (m *Manager) GetStats() (*cgroups.Stats, error) { - m.mu.Lock() - defer m.mu.Unlock() - stats := cgroups.NewStats() - for name, path := range m.Paths { - sys, err := m.getSubsystems().Get(name) - if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { - continue - } - if err := sys.GetStats(path, stats); err != nil { - return nil, err - } - } - return stats, nil -} - -func (m *Manager) Set(container *configs.Config) error { - if container.Cgroups == nil { - return nil - } - - // If Paths are set, then we are just joining cgroups paths - // and there is no need to set any values. - if m.Cgroups != nil && m.Cgroups.Paths != nil { - return nil - } - - paths := m.GetPaths() - for _, sys := range m.getSubsystems() { - path := paths[sys.Name()] - if err := sys.Set(path, container.Cgroups); err != nil { - if m.Rootless && sys.Name() == "devices" { - continue - } - // When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work. - // However, errors from other subsystems are not ignored. - // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" - if path == "" { - // We never created a path for this cgroup, so we cannot set - // limits for it (though we have already tried at this point). 
- return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) - } - return err - } - } - - if m.Paths["cpu"] != "" { - if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { - return err - } - } - return nil -} - -// Freeze toggles the container's freezer cgroup depending on the state -// provided -func (m *Manager) Freeze(state configs.FreezerState) error { - if m.Cgroups == nil { - return errors.New("cannot toggle freezer: cgroups not configured for container") - } - - paths := m.GetPaths() - dir := paths["freezer"] - prevState := m.Cgroups.Resources.Freezer - m.Cgroups.Resources.Freezer = state - freezer, err := m.getSubsystems().Get("freezer") - if err != nil { - return err - } - err = freezer.Set(dir, m.Cgroups) - if err != nil { - m.Cgroups.Resources.Freezer = prevState - return err - } - return nil -} - -func (m *Manager) GetPids() ([]int, error) { - paths := m.GetPaths() - return cgroups.GetPids(paths["devices"]) -} - -func (m *Manager) GetAllPids() ([]int, error) { - paths := m.GetPaths() - return cgroups.GetAllPids(paths["devices"]) -} - -func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { - root, err := getCgroupRoot() - if err != nil { - return nil, err - } - - if (c.Name != "" || c.Parent != "") && c.Path != "" { - return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used") - } - - // XXX: Do not remove this code. Path safety is important! 
-- cyphar - cgPath := libcontainerUtils.CleanPath(c.Path) - cgParent := libcontainerUtils.CleanPath(c.Parent) - cgName := libcontainerUtils.CleanPath(c.Name) - - innerPath := cgPath - if innerPath == "" { - innerPath = filepath.Join(cgParent, cgName) - } - - return &cgroupData{ - root: root, - innerPath: innerPath, - config: c, - pid: pid, - }, nil -} - -func (raw *cgroupData) path(subsystem string) (string, error) { - mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem) - // If we didn't mount the subsystem, there is no point we make the path. - if err != nil { - return "", err - } - - // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. - if filepath.IsAbs(raw.innerPath) { - // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. - return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil - } - - // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating - // process could in container and shared pid namespace with host, and - // /proc/1/cgroup could point to whole other world of cgroups. 
- parentPath, err := cgroups.GetOwnCgroupPath(subsystem) - if err != nil { - return "", err - } - - return filepath.Join(parentPath, raw.innerPath), nil -} - -func (raw *cgroupData) join(subsystem string) (string, error) { - path, err := raw.path(subsystem) - if err != nil { - return "", err - } - if err := os.MkdirAll(path, 0755); err != nil { - return "", err - } - if err := cgroups.WriteCgroupProc(path, raw.pid); err != nil { - return "", err - } - return path, nil -} - -func removePath(p string, err error) error { - if err != nil { - return err - } - if p != "" { - return os.RemoveAll(p) - } - return nil -} - -func CheckCpushares(path string, c uint64) error { - var cpuShares uint64 - - if c == 0 { - return nil - } - - fd, err := os.Open(filepath.Join(path, "cpu.shares")) - if err != nil { - return err - } - defer fd.Close() - - _, err = fmt.Fscanf(fd, "%d", &cpuShares) - if err != nil && err != io.EOF { - return err - } - - if c > cpuShares { - return fmt.Errorf("The maximum allowed cpu-shares is %d", cpuShares) - } else if c < cpuShares { - return fmt.Errorf("The minimum allowed cpu-shares is %d", cpuShares) - } - - return nil -} - -func (m *Manager) GetCgroups() (*configs.Cgroup, error) { - return m.Cgroups, nil -} diff --git a/libcontainer/cgroups/fs/apply_raw_test.go b/libcontainer/cgroups/fs/apply_raw_test.go deleted file mode 100644 index f3b6556..0000000 --- a/libcontainer/cgroups/fs/apply_raw_test.go +++ /dev/null @@ -1,297 +0,0 @@ -// +build linux - -package fs - -import ( - "path/filepath" - "strings" - "testing" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" -) - -func TestInvalidCgroupPath(t *testing.T) { - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") - } - root, err := getCgroupRoot() - if err != nil { - t.Errorf("couldn't get cgroup root: %v", err) - } - - config := &configs.Cgroup{ - Path: "../../../../../../../../../../some/path", - } - - data, err 
:= getCgroupData(config, 0) - if err != nil { - t.Errorf("couldn't get cgroup data: %v", err) - } - - // Make sure the final innerPath doesn't go outside the cgroup mountpoint. - if strings.HasPrefix(data.innerPath, "..") { - t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") - } - - // Double-check, using an actual cgroup. - deviceRoot := filepath.Join(root, "devices") - devicePath, err := data.path("devices") - if err != nil { - t.Errorf("couldn't get cgroup path: %v", err) - } - if !strings.HasPrefix(devicePath, deviceRoot) { - t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") - } -} - -func TestInvalidAbsoluteCgroupPath(t *testing.T) { - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") - } - root, err := getCgroupRoot() - if err != nil { - t.Errorf("couldn't get cgroup root: %v", err) - } - - config := &configs.Cgroup{ - Path: "/../../../../../../../../../../some/path", - } - - data, err := getCgroupData(config, 0) - if err != nil { - t.Errorf("couldn't get cgroup data: %v", err) - } - - // Make sure the final innerPath doesn't go outside the cgroup mountpoint. - if strings.HasPrefix(data.innerPath, "..") { - t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") - } - - // Double-check, using an actual cgroup. - deviceRoot := filepath.Join(root, "devices") - devicePath, err := data.path("devices") - if err != nil { - t.Errorf("couldn't get cgroup path: %v", err) - } - if !strings.HasPrefix(devicePath, deviceRoot) { - t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") - } -} - -// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent. 
-func TestInvalidCgroupParent(t *testing.T) { - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") - } - root, err := getCgroupRoot() - if err != nil { - t.Errorf("couldn't get cgroup root: %v", err) - } - - config := &configs.Cgroup{ - Parent: "../../../../../../../../../../some/path", - Name: "name", - } - - data, err := getCgroupData(config, 0) - if err != nil { - t.Errorf("couldn't get cgroup data: %v", err) - } - - // Make sure the final innerPath doesn't go outside the cgroup mountpoint. - if strings.HasPrefix(data.innerPath, "..") { - t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") - } - - // Double-check, using an actual cgroup. - deviceRoot := filepath.Join(root, "devices") - devicePath, err := data.path("devices") - if err != nil { - t.Errorf("couldn't get cgroup path: %v", err) - } - if !strings.HasPrefix(devicePath, deviceRoot) { - t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") - } -} - -// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent. -func TestInvalidAbsoluteCgroupParent(t *testing.T) { - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") - } - root, err := getCgroupRoot() - if err != nil { - t.Errorf("couldn't get cgroup root: %v", err) - } - - config := &configs.Cgroup{ - Parent: "/../../../../../../../../../../some/path", - Name: "name", - } - - data, err := getCgroupData(config, 0) - if err != nil { - t.Errorf("couldn't get cgroup data: %v", err) - } - - // Make sure the final innerPath doesn't go outside the cgroup mountpoint. - if strings.HasPrefix(data.innerPath, "..") { - t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") - } - - // Double-check, using an actual cgroup. 
- deviceRoot := filepath.Join(root, "devices") - devicePath, err := data.path("devices") - if err != nil { - t.Errorf("couldn't get cgroup path: %v", err) - } - if !strings.HasPrefix(devicePath, deviceRoot) { - t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") - } -} - -// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent. -func TestInvalidCgroupName(t *testing.T) { - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") - } - root, err := getCgroupRoot() - if err != nil { - t.Errorf("couldn't get cgroup root: %v", err) - } - - config := &configs.Cgroup{ - Parent: "parent", - Name: "../../../../../../../../../../some/path", - } - - data, err := getCgroupData(config, 0) - if err != nil { - t.Errorf("couldn't get cgroup data: %v", err) - } - - // Make sure the final innerPath doesn't go outside the cgroup mountpoint. - if strings.HasPrefix(data.innerPath, "..") { - t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") - } - - // Double-check, using an actual cgroup. - deviceRoot := filepath.Join(root, "devices") - devicePath, err := data.path("devices") - if err != nil { - t.Errorf("couldn't get cgroup path: %v", err) - } - if !strings.HasPrefix(devicePath, deviceRoot) { - t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") - } - -} - -// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent. -func TestInvalidAbsoluteCgroupName(t *testing.T) { - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") - } - root, err := getCgroupRoot() - if err != nil { - t.Errorf("couldn't get cgroup root: %v", err) - } - - config := &configs.Cgroup{ - Parent: "parent", - Name: "/../../../../../../../../../../some/path", - } - - data, err := getCgroupData(config, 0) - if err != nil { - t.Errorf("couldn't get cgroup data: %v", err) - } - - // Make sure the final innerPath doesn't go outside the cgroup mountpoint. 
- if strings.HasPrefix(data.innerPath, "..") { - t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") - } - - // Double-check, using an actual cgroup. - deviceRoot := filepath.Join(root, "devices") - devicePath, err := data.path("devices") - if err != nil { - t.Errorf("couldn't get cgroup path: %v", err) - } - if !strings.HasPrefix(devicePath, deviceRoot) { - t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") - } -} - -// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent. -func TestInvalidCgroupNameAndParent(t *testing.T) { - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") - } - root, err := getCgroupRoot() - if err != nil { - t.Errorf("couldn't get cgroup root: %v", err) - } - - config := &configs.Cgroup{ - Parent: "../../../../../../../../../../some/path", - Name: "../../../../../../../../../../some/path", - } - - data, err := getCgroupData(config, 0) - if err != nil { - t.Errorf("couldn't get cgroup data: %v", err) - } - - // Make sure the final innerPath doesn't go outside the cgroup mountpoint. - if strings.HasPrefix(data.innerPath, "..") { - t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") - } - - // Double-check, using an actual cgroup. - deviceRoot := filepath.Join(root, "devices") - devicePath, err := data.path("devices") - if err != nil { - t.Errorf("couldn't get cgroup path: %v", err) - } - if !strings.HasPrefix(devicePath, deviceRoot) { - t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") - } -} - -// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent. 
-func TestInvalidAbsoluteCgroupNameAndParent(t *testing.T) { - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") - } - root, err := getCgroupRoot() - if err != nil { - t.Errorf("couldn't get cgroup root: %v", err) - } - - config := &configs.Cgroup{ - Parent: "/../../../../../../../../../../some/path", - Name: "/../../../../../../../../../../some/path", - } - - data, err := getCgroupData(config, 0) - if err != nil { - t.Errorf("couldn't get cgroup data: %v", err) - } - - // Make sure the final innerPath doesn't go outside the cgroup mountpoint. - if strings.HasPrefix(data.innerPath, "..") { - t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") - } - - // Double-check, using an actual cgroup. - deviceRoot := filepath.Join(root, "devices") - devicePath, err := data.path("devices") - if err != nil { - t.Errorf("couldn't get cgroup path: %v", err) - } - if !strings.HasPrefix(devicePath, deviceRoot) { - t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") - } -} diff --git a/libcontainer/cgroups/fs/blkio.go b/libcontainer/cgroups/fs/blkio.go index 52c118d..c81b656 100644 --- a/libcontainer/cgroups/fs/blkio.go +++ b/libcontainer/cgroups/fs/blkio.go @@ -1,72 +1,71 @@ -// +build linux - package fs import ( "bufio" - "fmt" "os" "path/filepath" "strconv" "strings" "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" ) type BlkioGroup struct { + weightFilename string + weightDeviceFilename string } func (s *BlkioGroup) Name() string { return "blkio" } -func (s *BlkioGroup) Apply(d *cgroupData) error { - _, err := d.join("blkio") - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil +func (s *BlkioGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) } -func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error { - if 
cgroup.Resources.BlkioWeight != 0 { - if err := fscommon.WriteFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil { +func (s *BlkioGroup) Set(path string, r *configs.Resources) error { + s.detectWeightFilenames(path) + if r.BlkioWeight != 0 { + if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { return err } } - if cgroup.Resources.BlkioLeafWeight != 0 { - if err := fscommon.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil { + if r.BlkioLeafWeight != 0 { + if err := cgroups.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil { return err } } - for _, wd := range cgroup.Resources.BlkioWeightDevice { - if err := fscommon.WriteFile(path, "blkio.weight_device", wd.WeightString()); err != nil { - return err + for _, wd := range r.BlkioWeightDevice { + if wd.Weight != 0 { + if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil { + return err + } } - if err := fscommon.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { + if wd.LeafWeight != 0 { + if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { + return err + } + } + } + for _, td := range r.BlkioThrottleReadBpsDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { return err } } - for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice { - if err := fscommon.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { + for _, td := range r.BlkioThrottleWriteBpsDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { return err } } - for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice { - if err := fscommon.WriteFile(path, 
"blkio.throttle.write_bps_device", td.String()); err != nil { + for _, td := range r.BlkioThrottleReadIOPSDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { return err } } - for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice { - if err := fscommon.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { - return err - } - } - for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice { - if err := fscommon.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { + for _, td := range r.BlkioThrottleWriteIOPSDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { return err } } @@ -74,10 +73,6 @@ func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error { return nil } -func (s *BlkioGroup) Remove(d *cgroupData) error { - return removePath(d.path("blkio")) -} - /* examples: @@ -113,9 +108,9 @@ func splitBlkioStatLine(r rune) bool { return r == ' ' || r == ':' } -func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) { +func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) { var blkioStats []cgroups.BlkioStatEntry - f, err := os.Open(path) + f, err := cgroups.OpenFile(dir, file, os.O_RDONLY) if err != nil { if os.IsNotExist(err) { return blkioStats, nil @@ -133,19 +128,19 @@ func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) { // skip total line continue } else { - return nil, fmt.Errorf("Invalid line found while parsing %s: %s", path, sc.Text()) + return nil, malformedLine(dir, file, sc.Text()) } } v, err := strconv.ParseUint(fields[0], 10, 64) if err != nil { - return nil, err + return nil, &parseError{Path: dir, File: file, Err: err} } major := v v, err = strconv.ParseUint(fields[1], 10, 64) if err != nil { - return nil, err + return nil, &parseError{Path: dir, File: file, Err: err} } minor := v @@ -157,82 +152,160 @@ func getBlkioStat(path string) 
([]cgroups.BlkioStatEntry, error) { } v, err = strconv.ParseUint(fields[valueField], 10, 64) if err != nil { - return nil, err + return nil, &parseError{Path: dir, File: file, Err: err} } blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v}) } + if err := sc.Err(); err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } return blkioStats, nil } func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error { - // Try to read CFQ stats available on all CFQ enabled kernels first - if blkioStats, err := getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err == nil && blkioStats != nil { - return getCFQStats(path, stats) + type blkioStatInfo struct { + filename string + blkioStatEntriesPtr *[]cgroups.BlkioStatEntry + } + bfqDebugStats := []blkioStatInfo{ + { + filename: "blkio.bfq.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.bfq.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.bfq.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.bfq.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.bfq.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.bfq.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + bfqStats := []blkioStatInfo{ + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.bfq.io_service_bytes_recursive", + 
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + cfqStats := []blkioStatInfo{ + { + filename: "blkio.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + throttleRecursiveStats := []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + baseStats := []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + orderedStats := [][]blkioStatInfo{ + bfqDebugStats, + bfqStats, + cfqStats, + throttleRecursiveStats, + baseStats, } - return getStats(path, stats) // Use generic stats as fallback -} -func getCFQStats(path string, stats *cgroups.Stats) error { var blkioStats []cgroups.BlkioStatEntry var err error - if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.sectors_recursive")); err != nil { - return err + 
for _, statGroup := range orderedStats { + for i, statInfo := range statGroup { + if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil { + // if error occurs on first file, move to next group + if i == 0 { + break + } + return err + } + *statInfo.blkioStatEntriesPtr = blkioStats + // finish if all stats are gathered + if i == len(statGroup)-1 { + return nil + } + } } - stats.BlkioStats.SectorsRecursive = blkioStats - - if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_bytes_recursive")); err != nil { - return err - } - stats.BlkioStats.IoServiceBytesRecursive = blkioStats - - if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err != nil { - return err - } - stats.BlkioStats.IoServicedRecursive = blkioStats - - if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_queued_recursive")); err != nil { - return err - } - stats.BlkioStats.IoQueuedRecursive = blkioStats - - if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_time_recursive")); err != nil { - return err - } - stats.BlkioStats.IoServiceTimeRecursive = blkioStats - - if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_wait_time_recursive")); err != nil { - return err - } - stats.BlkioStats.IoWaitTimeRecursive = blkioStats - - if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_merged_recursive")); err != nil { - return err - } - stats.BlkioStats.IoMergedRecursive = blkioStats - - if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.time_recursive")); err != nil { - return err - } - stats.BlkioStats.IoTimeRecursive = blkioStats - return nil } -func getStats(path string, stats *cgroups.Stats) error { - var blkioStats []cgroups.BlkioStatEntry - var err error - - if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_service_bytes")); err != nil { - return err +func (s *BlkioGroup) detectWeightFilenames(path string) { + if s.weightFilename 
!= "" { + // Already detected. + return } - stats.BlkioStats.IoServiceBytesRecursive = blkioStats - - if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_serviced")); err != nil { - return err + if cgroups.PathExists(filepath.Join(path, "blkio.weight")) { + s.weightFilename = "blkio.weight" + s.weightDeviceFilename = "blkio.weight_device" + } else { + s.weightFilename = "blkio.bfq.weight" + s.weightDeviceFilename = "blkio.bfq.weight_device" } - stats.BlkioStats.IoServicedRecursive = blkioStats - - return nil } diff --git a/libcontainer/cgroups/fs/blkio_test.go b/libcontainer/cgroups/fs/blkio_test.go index 5ba60fa..09abd71 100644 --- a/libcontainer/cgroups/fs/blkio_test.go +++ b/libcontainer/cgroups/fs/blkio_test.go @@ -1,8 +1,7 @@ -// +build linux - package fs import ( + "fmt" "strconv" "testing" @@ -13,43 +12,81 @@ import ( const ( sectorsRecursiveContents = `8:0 1024` + sectorsRecursiveContentsBFQ = `8:0 2048` serviceBytesRecursiveContents = `8:0 Read 100 8:0 Write 200 8:0 Sync 300 8:0 Async 500 8:0 Total 500 Total 500` + + serviceBytesRecursiveContentsBFQ = `8:0 Read 1100 +8:0 Write 1200 +8:0 Sync 1300 +8:0 Async 1500 +8:0 Total 1500 +Total 1500` servicedRecursiveContents = `8:0 Read 10 8:0 Write 40 8:0 Sync 20 8:0 Async 30 8:0 Total 50 Total 50` + servicedRecursiveContentsBFQ = `8:0 Read 11 +8:0 Write 41 +8:0 Sync 21 +8:0 Async 31 +8:0 Total 51 +Total 51` queuedRecursiveContents = `8:0 Read 1 8:0 Write 4 8:0 Sync 2 8:0 Async 3 8:0 Total 5 Total 5` + queuedRecursiveContentsBFQ = `8:0 Read 2 +8:0 Write 3 +8:0 Sync 4 +8:0 Async 5 +8:0 Total 6 +Total 6` serviceTimeRecursiveContents = `8:0 Read 173959 8:0 Write 0 8:0 Sync 0 8:0 Async 173959 8:0 Total 17395 Total 17395` + serviceTimeRecursiveContentsBFQ = `8:0 Read 173959 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 173 +8:0 Total 174 +Total 174` waitTimeRecursiveContents = `8:0 Read 15571 8:0 Write 0 8:0 Sync 0 8:0 Async 15571 8:0 Total 15571` + waitTimeRecursiveContentsBFQ = `8:0 Read 1557 +8:0 Write 0 
+8:0 Sync 0 +8:0 Async 1557 +8:0 Total 1557` mergedRecursiveContents = `8:0 Read 5 8:0 Write 10 8:0 Sync 0 8:0 Async 0 8:0 Total 15 Total 15` - timeRecursiveContents = `8:0 8` - throttleServiceBytes = `8:0 Read 11030528 + mergedRecursiveContentsBFQ = `8:0 Read 51 +8:0 Write 101 +8:0 Sync 0 +8:0 Async 0 +8:0 Total 151 +Total 151` + timeRecursiveContents = `8:0 8` + timeRecursiveContentsBFQ = `8:0 16` + throttleServiceBytes = `8:0 Read 11030528 8:0 Write 23 8:0 Sync 42 8:0 Async 11030528 @@ -60,6 +97,17 @@ Total 15` 252:0 Async 11030528 252:0 Total 11030528 Total 22061056` + throttleServiceBytesRecursive = `8:0 Read 110305281 +8:0 Write 231 +8:0 Sync 421 +8:0 Async 110305281 +8:0 Total 110305281 +252:0 Read 110305281 +252:0 Write 231 +252:0 Sync 421 +252:0 Async 110305281 +252:0 Total 110305281 +Total 220610561` throttleServiced = `8:0 Read 164 8:0 Write 23 8:0 Sync 42 @@ -71,76 +119,138 @@ Total 22061056` 252:0 Async 164 252:0 Total 164 Total 328` + throttleServicedRecursive = `8:0 Read 1641 +8:0 Write 231 +8:0 Sync 421 +8:0 Async 1641 +8:0 Total 1641 +252:0 Read 1641 +252:0 Write 231 +252:0 Sync 421 +252:0 Async 1641 +252:0 Total 1641 +Total 3281` ) -func appendBlkioStatEntry(blkioStatEntries *[]cgroups.BlkioStatEntry, major, minor, value uint64, op string) { +var blkioBFQDebugStatsTestFiles = map[string]string{ + "blkio.bfq.io_service_bytes_recursive": serviceBytesRecursiveContentsBFQ, + "blkio.bfq.io_serviced_recursive": servicedRecursiveContentsBFQ, + "blkio.bfq.io_queued_recursive": queuedRecursiveContentsBFQ, + "blkio.bfq.io_service_time_recursive": serviceTimeRecursiveContentsBFQ, + "blkio.bfq.io_wait_time_recursive": waitTimeRecursiveContentsBFQ, + "blkio.bfq.io_merged_recursive": mergedRecursiveContentsBFQ, + "blkio.bfq.time_recursive": timeRecursiveContentsBFQ, + "blkio.bfq.sectors_recursive": sectorsRecursiveContentsBFQ, +} + +var blkioBFQStatsTestFiles = map[string]string{ + "blkio.bfq.io_service_bytes_recursive": serviceBytesRecursiveContentsBFQ, + 
"blkio.bfq.io_serviced_recursive": servicedRecursiveContentsBFQ, +} + +var blkioCFQStatsTestFiles = map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, +} + +type blkioStatFailureTestCase struct { + desc string + filename string +} + +func appendBlkioStatEntry(blkioStatEntries *[]cgroups.BlkioStatEntry, major, minor, value uint64, op string) { //nolint:unparam *blkioStatEntries = append(*blkioStatEntries, cgroups.BlkioStatEntry{Major: major, Minor: minor, Value: value, Op: op}) } func TestBlkioSetWeight(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - const ( weightBefore = 100 weightAfter = 200 ) - helper.writeFileContents(map[string]string{ - "blkio.weight": strconv.Itoa(weightBefore), - }) - - helper.CgroupData.config.Resources.BlkioWeight = weightAfter - blkio := &BlkioGroup{} - if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { - t.Fatal(err) - } - - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "blkio.weight") - if err != nil { - t.Fatalf("Failed to parse blkio.weight - %s", err) - } - - if value != weightAfter { - t.Fatal("Got the wrong value, set blkio.weight failed.") + for _, legacyIOScheduler := range []bool{false, true} { + // Populate cgroup + path := tempDir(t, "blkio") + weightFilename := "blkio.bfq.weight" + if legacyIOScheduler { + weightFilename = "blkio.weight" + } + writeFileContents(t, path, map[string]string{ + weightFilename: strconv.Itoa(weightBefore), + }) + // Apply new configuration + r := &configs.Resources{ + BlkioWeight: 
weightAfter, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, r); err != nil { + t.Fatal(err) + } + // Verify results + if weightFilename != blkio.weightFilename { + t.Fatalf("weight filename detection failed: expected %q, detected %q", weightFilename, blkio.weightFilename) + } + value, err := fscommon.GetCgroupParamUint(path, weightFilename) + if err != nil { + t.Fatal(err) + } + if value != weightAfter { + t.Fatalf("Got the wrong value, set %s failed.", weightFilename) + } } } func TestBlkioSetWeightDevice(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - const ( weightDeviceBefore = "8:0 400" ) - wd := configs.NewWeightDevice(8, 0, 500, 0) - weightDeviceAfter := wd.WeightString() - - helper.writeFileContents(map[string]string{ - "blkio.weight_device": weightDeviceBefore, - }) - - helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{wd} - blkio := &BlkioGroup{} - if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { - t.Fatal(err) - } - - value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.weight_device") - if err != nil { - t.Fatalf("Failed to parse blkio.weight_device - %s", err) - } - - if value != weightDeviceAfter { - t.Fatal("Got the wrong value, set blkio.weight_device failed.") + for _, legacyIOScheduler := range []bool{false, true} { + // Populate cgroup + path := tempDir(t, "blkio") + weightFilename := "blkio.bfq.weight" + weightDeviceFilename := "blkio.bfq.weight_device" + if legacyIOScheduler { + weightFilename = "blkio.weight" + weightDeviceFilename = "blkio.weight_device" + } + writeFileContents(t, path, map[string]string{ + weightFilename: "", + weightDeviceFilename: weightDeviceBefore, + }) + // Apply new configuration + wd := configs.NewWeightDevice(8, 0, 500, 0) + weightDeviceAfter := wd.WeightString() + r := &configs.Resources{ + BlkioWeightDevice: []*configs.WeightDevice{wd}, + } + blkio := &BlkioGroup{} + if err := blkio.Set(path, 
r); err != nil { + t.Fatal(err) + } + // Verify results + if weightDeviceFilename != blkio.weightDeviceFilename { + t.Fatalf("weight_device filename detection failed: expected %q, detected %q", weightDeviceFilename, blkio.weightDeviceFilename) + } + value, err := fscommon.GetCgroupParamString(path, weightDeviceFilename) + if err != nil { + t.Fatal(err) + } + if value != weightDeviceAfter { + t.Fatalf("Got the wrong value, set %s failed.", weightDeviceFilename) + } } } // regression #274 func TestBlkioSetMultipleWeightDevice(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() + path := tempDir(t, "blkio") const ( weightDeviceBefore = "8:0 400" @@ -148,49 +258,238 @@ func TestBlkioSetMultipleWeightDevice(t *testing.T) { wd1 := configs.NewWeightDevice(8, 0, 500, 0) wd2 := configs.NewWeightDevice(8, 16, 500, 0) - // we cannot actually set and check both because normal ioutil.WriteFile + // we cannot actually set and check both because normal os.WriteFile // when writing to cgroup file will overwrite the whole file content instead // of updating it as the kernel is doing. Just check the second device // is present will suffice for the test to ensure multiple writes are done. 
weightDeviceAfter := wd2.WeightString() - helper.writeFileContents(map[string]string{ - "blkio.weight_device": weightDeviceBefore, + blkio := &BlkioGroup{} + blkio.detectWeightFilenames(path) + if blkio.weightDeviceFilename != "blkio.bfq.weight_device" { + t.Fatalf("when blkio controller is unavailable, expected to use \"blkio.bfq.weight_device\", tried to use %q", blkio.weightDeviceFilename) + } + writeFileContents(t, path, map[string]string{ + blkio.weightDeviceFilename: weightDeviceBefore, }) - helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{wd1, wd2} - blkio := &BlkioGroup{} - if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + r := &configs.Resources{ + BlkioWeightDevice: []*configs.WeightDevice{wd1, wd2}, + } + if err := blkio.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.weight_device") + value, err := fscommon.GetCgroupParamString(path, blkio.weightDeviceFilename) if err != nil { - t.Fatalf("Failed to parse blkio.weight_device - %s", err) + t.Fatal(err) } - if value != weightDeviceAfter { - t.Fatal("Got the wrong value, set blkio.weight_device failed.") + t.Fatalf("Got the wrong value, set %s failed.", blkio.weightDeviceFilename) } } -func TestBlkioStats(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ - "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, - "blkio.io_serviced_recursive": servicedRecursiveContents, - "blkio.io_queued_recursive": queuedRecursiveContents, - "blkio.io_service_time_recursive": serviceTimeRecursiveContents, - "blkio.io_wait_time_recursive": waitTimeRecursiveContents, - "blkio.io_merged_recursive": mergedRecursiveContents, - "blkio.time_recursive": timeRecursiveContents, - "blkio.sectors_recursive": sectorsRecursiveContents, - }) +func TestBlkioBFQDebugStats(t *testing.T) { + path := tempDir(t, "blkio") + 
writeFileContents(t, path, blkioBFQDebugStatsTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 2048, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 6, "Total") + + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 174, "Total") + + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Read") + 
appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Async") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Total") + + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 51, "Read") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 101, "Write") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 151, "Total") + + appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 16, "") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioMultipleStatsFiles(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, blkioBFQDebugStatsTestFiles) + writeFileContents(t, path, blkioCFQStatsTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 2048, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 
0, 31, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 6, "Total") + + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 174, "Total") + + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Read") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Async") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Total") + + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 51, "Read") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 101, "Write") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 151, "Total") + + appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 16, "") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioBFQStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, blkioBFQStatsTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := 
blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioStatsNoFilesBFQDebug(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + testCases := []blkioStatFailureTestCase{ + { + desc: "missing blkio.bfq.io_service_bytes_recursive file", + filename: "blkio.bfq.io_service_bytes_recursive", + }, + { + desc: "missing blkio.bfq.io_serviced_recursive file", + filename: "blkio.bfq.io_serviced_recursive", + }, + { + desc: "missing blkio.bfq.io_queued_recursive file", + filename: "blkio.bfq.io_queued_recursive", + }, + { + desc: "missing blkio.bfq.sectors_recursive file", + filename: "blkio.bfq.sectors_recursive", + }, + { + desc: "missing blkio.bfq.io_service_time_recursive file", + filename: "blkio.bfq.io_service_time_recursive", + }, + { + desc: "missing blkio.bfq.io_wait_time_recursive file", + filename: "blkio.bfq.io_wait_time_recursive", + }, + { + desc: "missing blkio.bfq.io_merged_recursive file", + filename: "blkio.bfq.io_merged_recursive", + }, + { + desc: "missing blkio.bfq.time_recursive 
file", + filename: "blkio.bfq.time_recursive", + }, + } + + for _, testCase := range testCases { + path := tempDir(t, "cpuset") + + tempBlkioTestFiles := map[string]string{} + for i, v := range blkioBFQDebugStatsTestFiles { + tempBlkioTestFiles[i] = v + } + delete(tempBlkioTestFiles, testCase.filename) + + writeFileContents(t, path, tempBlkioTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(path, &actualStats) + if err != nil { + t.Errorf(fmt.Sprintf("test case '%s' failed unexpectedly: %s", testCase.desc, err)) + } + } +} + +func TestBlkioCFQStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, blkioCFQStatsTestFiles) blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() - err := blkio.GetStats(helper.CgroupPath, &actualStats) + err := blkio.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } @@ -240,190 +539,67 @@ func TestBlkioStats(t *testing.T) { expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) } -func TestBlkioStatsNoSectorsFile(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ - "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, - "blkio.io_serviced_recursive": servicedRecursiveContents, - "blkio.io_queued_recursive": queuedRecursiveContents, - "blkio.io_service_time_recursive": serviceTimeRecursiveContents, - "blkio.io_wait_time_recursive": waitTimeRecursiveContents, - "blkio.io_merged_recursive": mergedRecursiveContents, - "blkio.time_recursive": timeRecursiveContents, - }) - - blkio := &BlkioGroup{} - actualStats := *cgroups.NewStats() - err := blkio.GetStats(helper.CgroupPath, &actualStats) - if err != nil { - t.Fatalf("Failed unexpectedly: %s", err) - } -} - -func TestBlkioStatsNoServiceBytesFile(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ - "blkio.io_serviced_recursive": 
servicedRecursiveContents, - "blkio.io_queued_recursive": queuedRecursiveContents, - "blkio.sectors_recursive": sectorsRecursiveContents, - "blkio.io_service_time_recursive": serviceTimeRecursiveContents, - "blkio.io_wait_time_recursive": waitTimeRecursiveContents, - "blkio.io_merged_recursive": mergedRecursiveContents, - "blkio.time_recursive": timeRecursiveContents, - }) - - blkio := &BlkioGroup{} - actualStats := *cgroups.NewStats() - err := blkio.GetStats(helper.CgroupPath, &actualStats) - if err != nil { - t.Fatalf("Failed unexpectedly: %s", err) - } -} - -func TestBlkioStatsNoServicedFile(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ - "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, - "blkio.io_queued_recursive": queuedRecursiveContents, - "blkio.sectors_recursive": sectorsRecursiveContents, - "blkio.io_service_time_recursive": serviceTimeRecursiveContents, - "blkio.io_wait_time_recursive": waitTimeRecursiveContents, - "blkio.io_merged_recursive": mergedRecursiveContents, - "blkio.time_recursive": timeRecursiveContents, - }) - - blkio := &BlkioGroup{} - actualStats := *cgroups.NewStats() - err := blkio.GetStats(helper.CgroupPath, &actualStats) - if err != nil { - t.Fatalf("Failed unexpectedly: %s", err) - } -} - -func TestBlkioStatsNoQueuedFile(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ - "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, - "blkio.io_serviced_recursive": servicedRecursiveContents, - "blkio.sectors_recursive": sectorsRecursiveContents, - "blkio.io_service_time_recursive": serviceTimeRecursiveContents, - "blkio.io_wait_time_recursive": waitTimeRecursiveContents, - "blkio.io_merged_recursive": mergedRecursiveContents, - "blkio.time_recursive": timeRecursiveContents, - }) - - blkio := &BlkioGroup{} - actualStats := *cgroups.NewStats() - err := 
blkio.GetStats(helper.CgroupPath, &actualStats) - if err != nil { - t.Fatalf("Failed unexpectedly: %s", err) - } -} - -func TestBlkioStatsNoServiceTimeFile(t *testing.T) { +func TestBlkioStatsNoFilesCFQ(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") } - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ - "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, - "blkio.io_serviced_recursive": servicedRecursiveContents, - "blkio.io_queued_recursive": queuedRecursiveContents, - "blkio.io_wait_time_recursive": waitTimeRecursiveContents, - "blkio.io_merged_recursive": mergedRecursiveContents, - "blkio.time_recursive": timeRecursiveContents, - "blkio.sectors_recursive": sectorsRecursiveContents, - }) - - blkio := &BlkioGroup{} - actualStats := *cgroups.NewStats() - err := blkio.GetStats(helper.CgroupPath, &actualStats) - if err != nil { - t.Fatalf("Failed unexpectedly: %s", err) + testCases := []blkioStatFailureTestCase{ + { + desc: "missing blkio.io_service_bytes_recursive file", + filename: "blkio.io_service_bytes_recursive", + }, + { + desc: "missing blkio.io_serviced_recursive file", + filename: "blkio.io_serviced_recursive", + }, + { + desc: "missing blkio.io_queued_recursive file", + filename: "blkio.io_queued_recursive", + }, + { + desc: "missing blkio.sectors_recursive file", + filename: "blkio.sectors_recursive", + }, + { + desc: "missing blkio.io_service_time_recursive file", + filename: "blkio.io_service_time_recursive", + }, + { + desc: "missing blkio.io_wait_time_recursive file", + filename: "blkio.io_wait_time_recursive", + }, + { + desc: "missing blkio.io_merged_recursive file", + filename: "blkio.io_merged_recursive", + }, + { + desc: "missing blkio.time_recursive file", + filename: "blkio.time_recursive", + }, } -} -func TestBlkioStatsNoWaitTimeFile(t *testing.T) { - if testing.Short() { - t.Skip("skipping test in short mode.") - } - helper := 
NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ - "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, - "blkio.io_serviced_recursive": servicedRecursiveContents, - "blkio.io_queued_recursive": queuedRecursiveContents, - "blkio.io_service_time_recursive": serviceTimeRecursiveContents, - "blkio.io_merged_recursive": mergedRecursiveContents, - "blkio.time_recursive": timeRecursiveContents, - "blkio.sectors_recursive": sectorsRecursiveContents, - }) + for _, testCase := range testCases { + path := tempDir(t, "cpuset") - blkio := &BlkioGroup{} - actualStats := *cgroups.NewStats() - err := blkio.GetStats(helper.CgroupPath, &actualStats) - if err != nil { - t.Fatalf("Failed unexpectedly: %s", err) - } -} + tempBlkioTestFiles := map[string]string{} + for i, v := range blkioCFQStatsTestFiles { + tempBlkioTestFiles[i] = v + } + delete(tempBlkioTestFiles, testCase.filename) -func TestBlkioStatsNoMergedFile(t *testing.T) { - if testing.Short() { - t.Skip("skipping test in short mode.") - } - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ - "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, - "blkio.io_serviced_recursive": servicedRecursiveContents, - "blkio.io_queued_recursive": queuedRecursiveContents, - "blkio.io_service_time_recursive": serviceTimeRecursiveContents, - "blkio.io_wait_time_recursive": waitTimeRecursiveContents, - "blkio.time_recursive": timeRecursiveContents, - "blkio.sectors_recursive": sectorsRecursiveContents, - }) - - blkio := &BlkioGroup{} - actualStats := *cgroups.NewStats() - err := blkio.GetStats(helper.CgroupPath, &actualStats) - if err != nil { - t.Fatalf("Failed unexpectedly: %s", err) - } -} - -func TestBlkioStatsNoTimeFile(t *testing.T) { - if testing.Short() { - t.Skip("skipping test in short mode.") - } - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - 
helper.writeFileContents(map[string]string{ - "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, - "blkio.io_serviced_recursive": servicedRecursiveContents, - "blkio.io_queued_recursive": queuedRecursiveContents, - "blkio.io_service_time_recursive": serviceTimeRecursiveContents, - "blkio.io_wait_time_recursive": waitTimeRecursiveContents, - "blkio.io_merged_recursive": mergedRecursiveContents, - "blkio.sectors_recursive": sectorsRecursiveContents, - }) - - blkio := &BlkioGroup{} - actualStats := *cgroups.NewStats() - err := blkio.GetStats(helper.CgroupPath, &actualStats) - if err != nil { - t.Fatalf("Failed unexpectedly: %s", err) + writeFileContents(t, path, tempBlkioTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Errorf("test case '%s' failed unexpectedly: %s", testCase.desc, err) + } } } func TestBlkioStatsUnexpectedNumberOfFields(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "blkio") + writeFileContents(t, path, map[string]string{ "blkio.io_service_bytes_recursive": "8:0 Read 100 100", "blkio.io_serviced_recursive": servicedRecursiveContents, "blkio.io_queued_recursive": queuedRecursiveContents, @@ -436,16 +612,15 @@ func TestBlkioStatsUnexpectedNumberOfFields(t *testing.T) { blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() - err := blkio.GetStats(helper.CgroupPath, &actualStats) + err := blkio.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected to fail, but did not") } } func TestBlkioStatsUnexpectedFieldType(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "blkio") + writeFileContents(t, path, map[string]string{ "blkio.io_service_bytes_recursive": "8:0 Read Write", "blkio.io_serviced_recursive": 
servicedRecursiveContents, "blkio.io_queued_recursive": queuedRecursiveContents, @@ -458,16 +633,65 @@ func TestBlkioStatsUnexpectedFieldType(t *testing.T) { blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() - err := blkio.GetStats(helper.CgroupPath, &actualStats) + err := blkio.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected to fail, but did not") } } -func TestNonCFQBlkioStats(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ +func TestThrottleRecursiveBlkioStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, map[string]string{ + "blkio.io_service_bytes_recursive": "", + "blkio.io_serviced_recursive": "", + "blkio.io_queued_recursive": "", + "blkio.sectors_recursive": "", + "blkio.io_service_time_recursive": "", + "blkio.io_wait_time_recursive": "", + "blkio.io_merged_recursive": "", + "blkio.time_recursive": "", + "blkio.throttle.io_service_bytes_recursive": throttleServiceBytesRecursive, + "blkio.throttle.io_serviced_recursive": throttleServicedRecursive, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. 
+ expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Total") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Total") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Total") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestThrottleBlkioStats(t *testing.T) { + path := tempDir(t, "blkio") + writeFileContents(t, path, map[string]string{ "blkio.io_service_bytes_recursive": "", "blkio.io_serviced_recursive": "", 
"blkio.io_queued_recursive": "", @@ -482,7 +706,7 @@ func TestNonCFQBlkioStats(t *testing.T) { blkio := &BlkioGroup{} actualStats := *cgroups.NewStats() - err := blkio.GetStats(helper.CgroupPath, &actualStats) + err := blkio.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } @@ -516,8 +740,7 @@ func TestNonCFQBlkioStats(t *testing.T) { } func TestBlkioSetThrottleReadBpsDevice(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() + path := tempDir(t, "blkio") const ( throttleBefore = `8:0 1024` @@ -526,28 +749,29 @@ func TestBlkioSetThrottleReadBpsDevice(t *testing.T) { td := configs.NewThrottleDevice(8, 0, 2048) throttleAfter := td.String() - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "blkio.throttle.read_bps_device": throttleBefore, }) - helper.CgroupData.config.Resources.BlkioThrottleReadBpsDevice = []*configs.ThrottleDevice{td} + r := &configs.Resources{ + BlkioThrottleReadBpsDevice: []*configs.ThrottleDevice{td}, + } blkio := &BlkioGroup{} - if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := blkio.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.read_bps_device") + value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.read_bps_device") if err != nil { - t.Fatalf("Failed to parse blkio.throttle.read_bps_device - %s", err) + t.Fatal(err) } - if value != throttleAfter { t.Fatal("Got the wrong value, set blkio.throttle.read_bps_device failed.") } } + func TestBlkioSetThrottleWriteBpsDevice(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() + path := tempDir(t, "blkio") const ( throttleBefore = `8:0 1024` @@ -556,28 +780,29 @@ func TestBlkioSetThrottleWriteBpsDevice(t *testing.T) { td := configs.NewThrottleDevice(8, 0, 2048) throttleAfter := td.String() - helper.writeFileContents(map[string]string{ + writeFileContents(t, 
path, map[string]string{ "blkio.throttle.write_bps_device": throttleBefore, }) - helper.CgroupData.config.Resources.BlkioThrottleWriteBpsDevice = []*configs.ThrottleDevice{td} + r := &configs.Resources{ + BlkioThrottleWriteBpsDevice: []*configs.ThrottleDevice{td}, + } blkio := &BlkioGroup{} - if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := blkio.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.write_bps_device") + value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.write_bps_device") if err != nil { - t.Fatalf("Failed to parse blkio.throttle.write_bps_device - %s", err) + t.Fatal(err) } - if value != throttleAfter { t.Fatal("Got the wrong value, set blkio.throttle.write_bps_device failed.") } } + func TestBlkioSetThrottleReadIOpsDevice(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() + path := tempDir(t, "blkio") const ( throttleBefore = `8:0 1024` @@ -586,28 +811,29 @@ func TestBlkioSetThrottleReadIOpsDevice(t *testing.T) { td := configs.NewThrottleDevice(8, 0, 2048) throttleAfter := td.String() - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "blkio.throttle.read_iops_device": throttleBefore, }) - helper.CgroupData.config.Resources.BlkioThrottleReadIOPSDevice = []*configs.ThrottleDevice{td} + r := &configs.Resources{ + BlkioThrottleReadIOPSDevice: []*configs.ThrottleDevice{td}, + } blkio := &BlkioGroup{} - if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := blkio.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.read_iops_device") + value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.read_iops_device") if err != nil { - t.Fatalf("Failed to parse blkio.throttle.read_iops_device - %s", err) + t.Fatal(err) } - if value != throttleAfter 
{ t.Fatal("Got the wrong value, set blkio.throttle.read_iops_device failed.") } } + func TestBlkioSetThrottleWriteIOpsDevice(t *testing.T) { - helper := NewCgroupTestUtil("blkio", t) - defer helper.cleanup() + path := tempDir(t, "blkio") const ( throttleBefore = `8:0 1024` @@ -616,21 +842,22 @@ func TestBlkioSetThrottleWriteIOpsDevice(t *testing.T) { td := configs.NewThrottleDevice(8, 0, 2048) throttleAfter := td.String() - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "blkio.throttle.write_iops_device": throttleBefore, }) - helper.CgroupData.config.Resources.BlkioThrottleWriteIOPSDevice = []*configs.ThrottleDevice{td} + r := &configs.Resources{ + BlkioThrottleWriteIOPSDevice: []*configs.ThrottleDevice{td}, + } blkio := &BlkioGroup{} - if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := blkio.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.write_iops_device") + value, err := fscommon.GetCgroupParamString(path, "blkio.throttle.write_iops_device") if err != nil { - t.Fatalf("Failed to parse blkio.throttle.write_iops_device - %s", err) + t.Fatal(err) } - if value != throttleAfter { t.Fatal("Got the wrong value, set blkio.throttle.write_iops_device failed.") } diff --git a/libcontainer/cgroups/fs/cpu.go b/libcontainer/cgroups/fs/cpu.go index 4db7b64..6c79f89 100644 --- a/libcontainer/cgroups/fs/cpu.go +++ b/libcontainer/cgroups/fs/cpu.go @@ -1,94 +1,105 @@ -// +build linux - package fs import ( "bufio" + "errors" + "fmt" "os" - "path/filepath" "strconv" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" ) -type CpuGroup struct { -} +type CpuGroup struct{} func (s *CpuGroup) Name() string { return "cpu" } -func (s *CpuGroup) Apply(d *cgroupData) error { - // We always 
want to join the cpu group, to allow fair cpu scheduling - // on a container basis - path, err := d.path("cpu") - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return s.ApplyDir(path, d.config, d.pid) -} - -func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error { - // This might happen if we have no cpu cgroup mounted. - // Just do nothing and don't fail. - if path == "" { - return nil - } - if err := os.MkdirAll(path, 0755); err != nil { +func (s *CpuGroup) Apply(path string, r *configs.Resources, pid int) error { + if err := os.MkdirAll(path, 0o755); err != nil { return err } // We should set the real-Time group scheduling settings before moving // in the process because if the process is already in SCHED_RR mode // and no RT bandwidth is set, adding it will fail. - if err := s.SetRtSched(path, cgroup); err != nil { + if err := s.SetRtSched(path, r); err != nil { return err } - // because we are not using d.join we need to place the pid into the procs file - // unlike the other subsystems + // Since we are not using apply(), we need to place the pid + // into the procs file. 
return cgroups.WriteCgroupProc(path, pid) } -func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error { - if cgroup.Resources.CpuRtPeriod != 0 { - if err := fscommon.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil { +func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error { + if r.CpuRtPeriod != 0 { + if err := cgroups.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil { return err } } - if cgroup.Resources.CpuRtRuntime != 0 { - if err := fscommon.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil { + if r.CpuRtRuntime != 0 { + if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil { return err } } return nil } -func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error { - if cgroup.Resources.CpuShares != 0 { - if err := fscommon.WriteFile(path, "cpu.shares", strconv.FormatUint(cgroup.Resources.CpuShares, 10)); err != nil { +func (s *CpuGroup) Set(path string, r *configs.Resources) error { + if r.CpuShares != 0 { + shares := r.CpuShares + if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil { return err } - } - if cgroup.Resources.CpuPeriod != 0 { - if err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil { + // read it back + sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares") + if err != nil { return err } - } - if cgroup.Resources.CpuQuota != 0 { - if err := fscommon.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil { - return err + // ... 
and check + if shares > sharesRead { + return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead) + } else if shares < sharesRead { + return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead) } } - return s.SetRtSched(path, cgroup) -} -func (s *CpuGroup) Remove(d *cgroupData) error { - return removePath(d.path("cpu")) + var period string + if r.CpuPeriod != 0 { + period = strconv.FormatUint(r.CpuPeriod, 10) + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + // Sometimes when the period to be set is smaller + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_period exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. + if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } else { + period = "" + } + } + if r.CpuQuota != 0 { + if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil { + return err + } + if period != "" { + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + return err + } + } + } + return s.SetRtSched(path, r) } func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { - f, err := os.Open(filepath.Join(path, "cpu.stat")) + const file = "cpu.stat" + f, err := cgroups.OpenFile(path, file, os.O_RDONLY) if err != nil { if os.IsNotExist(err) { return nil @@ -99,9 +110,9 @@ func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { sc := bufio.NewScanner(f) for sc.Scan() { - t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + t, v, err := fscommon.ParseKeyValue(sc.Text()) if err != nil { - return err + return &parseError{Path: path, File: file, Err: err} } switch t { case "nr_periods": diff --git a/libcontainer/cgroups/fs/cpu_test.go b/libcontainer/cgroups/fs/cpu_test.go index 2eeb489..bbdd45a 100644 --- a/libcontainer/cgroups/fs/cpu_test.go 
+++ b/libcontainer/cgroups/fs/cpu_test.go @@ -1,5 +1,3 @@ -// +build linux - package fs import ( @@ -9,40 +7,40 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" ) func TestCpuSetShares(t *testing.T) { - helper := NewCgroupTestUtil("cpu", t) - defer helper.cleanup() + path := tempDir(t, "cpu") const ( sharesBefore = 1024 sharesAfter = 512 ) - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "cpu.shares": strconv.Itoa(sharesBefore), }) - helper.CgroupData.config.Resources.CpuShares = sharesAfter + r := &configs.Resources{ + CpuShares: sharesAfter, + } cpu := &CpuGroup{} - if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := cpu.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.shares") + value, err := fscommon.GetCgroupParamUint(path, "cpu.shares") if err != nil { - t.Fatalf("Failed to parse cpu.shares - %s", err) + t.Fatal(err) } - if value != sharesAfter { t.Fatal("Got the wrong value, set cpu.shares failed.") } } func TestCpuSetBandWidth(t *testing.T) { - helper := NewCgroupTestUtil("cpu", t) - defer helper.cleanup() + path := tempDir(t, "cpu") const ( quotaBefore = 8000 @@ -55,47 +53,51 @@ func TestCpuSetBandWidth(t *testing.T) { rtPeriodAfter = 7000 ) - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "cpu.cfs_quota_us": strconv.Itoa(quotaBefore), "cpu.cfs_period_us": strconv.Itoa(periodBefore), "cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore), "cpu.rt_period_us": strconv.Itoa(rtPeriodBefore), }) - helper.CgroupData.config.Resources.CpuQuota = quotaAfter - helper.CgroupData.config.Resources.CpuPeriod = periodAfter - helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter - helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter + r 
:= &configs.Resources{ + CpuQuota: quotaAfter, + CpuPeriod: periodAfter, + CpuRtRuntime: rtRuntimeAfter, + CpuRtPeriod: rtPeriodAfter, + } cpu := &CpuGroup{} - if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := cpu.Set(path, r); err != nil { t.Fatal(err) } - quota, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.cfs_quota_us") + quota, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_quota_us") if err != nil { - t.Fatalf("Failed to parse cpu.cfs_quota_us - %s", err) + t.Fatal(err) } if quota != quotaAfter { t.Fatal("Got the wrong value, set cpu.cfs_quota_us failed.") } - period, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.cfs_period_us") + period, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_period_us") if err != nil { - t.Fatalf("Failed to parse cpu.cfs_period_us - %s", err) + t.Fatal(err) } if period != periodAfter { t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.") } - rtRuntime, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us") + + rtRuntime, err := fscommon.GetCgroupParamUint(path, "cpu.rt_runtime_us") if err != nil { - t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err) + t.Fatal(err) } if rtRuntime != rtRuntimeAfter { t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.") } - rtPeriod, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us") + + rtPeriod, err := fscommon.GetCgroupParamUint(path, "cpu.rt_period_us") if err != nil { - t.Fatalf("Failed to parse cpu.rt_period_us - %s", err) + t.Fatal(err) } if rtPeriod != rtPeriodAfter { t.Fatal("Got the wrong value, set cpu.rt_period_us failed.") @@ -103,8 +105,7 @@ func TestCpuSetBandWidth(t *testing.T) { } func TestCpuStats(t *testing.T) { - helper := NewCgroupTestUtil("cpu", t) - defer helper.cleanup() + path := tempDir(t, "cpu") const ( nrPeriods = 2000 @@ -112,15 +113,15 @@ func TestCpuStats(t *testing.T) { throttledTime = uint64(18446744073709551615) ) - cpuStatContent := 
fmt.Sprintf("nr_periods %d\n nr_throttled %d\n throttled_time %d\n", + cpuStatContent := fmt.Sprintf("nr_periods %d\nnr_throttled %d\nthrottled_time %d\n", nrPeriods, nrThrottled, throttledTime) - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "cpu.stat": cpuStatContent, }) cpu := &CpuGroup{} actualStats := *cgroups.NewStats() - err := cpu.GetStats(helper.CgroupPath, &actualStats) + err := cpu.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } @@ -128,44 +129,43 @@ func TestCpuStats(t *testing.T) { expectedStats := cgroups.ThrottlingData{ Periods: nrPeriods, ThrottledPeriods: nrThrottled, - ThrottledTime: throttledTime} + ThrottledTime: throttledTime, + } expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData) } func TestNoCpuStatFile(t *testing.T) { - helper := NewCgroupTestUtil("cpu", t) - defer helper.cleanup() + path := tempDir(t, "cpu") cpu := &CpuGroup{} actualStats := *cgroups.NewStats() - err := cpu.GetStats(helper.CgroupPath, &actualStats) + err := cpu.GetStats(path, &actualStats) if err != nil { t.Fatal("Expected not to fail, but did") } } func TestInvalidCpuStat(t *testing.T) { - helper := NewCgroupTestUtil("cpu", t) - defer helper.cleanup() + path := tempDir(t, "cpu") + cpuStatContent := `nr_periods 2000 nr_throttled 200 throttled_time fortytwo` - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "cpu.stat": cpuStatContent, }) cpu := &CpuGroup{} actualStats := *cgroups.NewStats() - err := cpu.GetStats(helper.CgroupPath, &actualStats) + err := cpu.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failed stat parsing.") } } func TestCpuSetRtSchedAtApply(t *testing.T) { - helper := NewCgroupTestUtil("cpu", t) - defer helper.cleanup() + path := tempDir(t, "cpu") const ( rtRuntimeBefore = 0 @@ -174,35 +174,40 @@ func TestCpuSetRtSchedAtApply(t *testing.T) { rtPeriodAfter = 7000 ) - helper.writeFileContents(map[string]string{ 
+ writeFileContents(t, path, map[string]string{ "cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore), "cpu.rt_period_us": strconv.Itoa(rtPeriodBefore), }) - helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter - helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter + r := &configs.Resources{ + CpuRtRuntime: rtRuntimeAfter, + CpuRtPeriod: rtPeriodAfter, + } cpu := &CpuGroup{} - if err := cpu.ApplyDir(helper.CgroupPath, helper.CgroupData.config, 1234); err != nil { + + if err := cpu.Apply(path, r, 1234); err != nil { t.Fatal(err) } - rtRuntime, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us") + rtRuntime, err := fscommon.GetCgroupParamUint(path, "cpu.rt_runtime_us") if err != nil { - t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err) + t.Fatal(err) } if rtRuntime != rtRuntimeAfter { t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.") } - rtPeriod, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us") + + rtPeriod, err := fscommon.GetCgroupParamUint(path, "cpu.rt_period_us") if err != nil { - t.Fatalf("Failed to parse cpu.rt_period_us - %s", err) + t.Fatal(err) } if rtPeriod != rtPeriodAfter { t.Fatal("Got the wrong value, set cpu.rt_period_us failed.") } - pid, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cgroup.procs") + + pid, err := fscommon.GetCgroupParamUint(path, "cgroup.procs") if err != nil { - t.Fatalf("Failed to parse cgroup.procs - %s", err) + t.Fatal(err) } if pid != 1234 { t.Fatal("Got the wrong value, set cgroup.procs failed.") diff --git a/libcontainer/cgroups/fs/cpuacct.go b/libcontainer/cgroups/fs/cpuacct.go index 95dc9a1..d3bd7e1 100644 --- a/libcontainer/cgroups/fs/cpuacct.go +++ b/libcontainer/cgroups/fs/cpuacct.go @@ -1,52 +1,51 @@ -// +build linux - package fs import ( - "fmt" - "io/ioutil" - "path/filepath" + "bufio" + "os" "strconv" "strings" "github.com/opencontainers/runc/libcontainer/cgroups" 
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/system" ) const ( - cgroupCpuacctStat = "cpuacct.stat" + cgroupCpuacctStat = "cpuacct.stat" + cgroupCpuacctUsageAll = "cpuacct.usage_all" + nanosecondsInSecond = 1000000000 + + userModeColumn = 1 + kernelModeColumn = 2 + cuacctUsageAllColumnsNumber = 3 + + // The value comes from `C.sysconf(C._SC_CLK_TCK)`, and + // on Linux it's a constant which is safe to be hard coded, + // so we can avoid using cgo here. For details, see: + // https://github.com/containerd/cgroups/pull/12 + clockTicks uint64 = 100 ) -var clockTicks = uint64(system.GetClockTicks()) - -type CpuacctGroup struct { -} +type CpuacctGroup struct{} func (s *CpuacctGroup) Name() string { return "cpuacct" } -func (s *CpuacctGroup) Apply(d *cgroupData) error { - // we just want to join this group even though we don't set anything - if _, err := d.join("cpuacct"); err != nil && !cgroups.IsNotFound(err) { - return err - } - - return nil +func (s *CpuacctGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) } -func (s *CpuacctGroup) Set(path string, cgroup *configs.Cgroup) error { +func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error { return nil } -func (s *CpuacctGroup) Remove(d *cgroupData) error { - return removePath(d.path("cpuacct")) -} - func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path) if err != nil { return err @@ -62,8 +61,15 @@ func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { return err } + percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path) + if err != nil { + return err + } + stats.CpuStats.CpuUsage.TotalUsage = totalUsage stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage + 
stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode + stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage return nil @@ -71,52 +77,90 @@ func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { // Returns user and kernel usage breakdown in nanoseconds. func getCpuUsageBreakdown(path string) (uint64, uint64, error) { - userModeUsage := uint64(0) - kernelModeUsage := uint64(0) + var userModeUsage, kernelModeUsage uint64 const ( userField = "user" systemField = "system" + file = cgroupCpuacctStat ) // Expected format: // user // system - data, err := ioutil.ReadFile(filepath.Join(path, cgroupCpuacctStat)) + data, err := cgroups.ReadFile(path, file) if err != nil { return 0, 0, err } - fields := strings.Fields(string(data)) - if len(fields) < 4 { - return 0, 0, fmt.Errorf("failure - %s is expected to have at least 4 fields", filepath.Join(path, cgroupCpuacctStat)) - } - if fields[0] != userField { - return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField) - } - if fields[2] != systemField { - return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField) + // TODO: use strings.SplitN instead. 
+ fields := strings.Fields(data) + if len(fields) < 4 || fields[0] != userField || fields[2] != systemField { + return 0, 0, malformedLine(path, file, data) } if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil { - return 0, 0, err + return 0, 0, &parseError{Path: path, File: file, Err: err} } if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil { - return 0, 0, err + return 0, 0, &parseError{Path: path, File: file, Err: err} } return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil } func getPercpuUsage(path string) ([]uint64, error) { + const file = "cpuacct.usage_percpu" percpuUsage := []uint64{} - data, err := ioutil.ReadFile(filepath.Join(path, "cpuacct.usage_percpu")) + data, err := cgroups.ReadFile(path, file) if err != nil { return percpuUsage, err } - for _, value := range strings.Fields(string(data)) { + // TODO: use strings.SplitN instead. + for _, value := range strings.Fields(data) { value, err := strconv.ParseUint(value, 10, 64) if err != nil { - return percpuUsage, fmt.Errorf("Unable to convert param value to uint64: %s", err) + return percpuUsage, &parseError{Path: path, File: file, Err: err} } percpuUsage = append(percpuUsage, value) } return percpuUsage, nil } + +func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) { + usageKernelMode := []uint64{} + usageUserMode := []uint64{} + const file = cgroupCpuacctUsageAll + + fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if os.IsNotExist(err) { + return usageKernelMode, usageUserMode, nil + } else if err != nil { + return nil, nil, err + } + defer fd.Close() + + scanner := bufio.NewScanner(fd) + scanner.Scan() // skipping header line + + for scanner.Scan() { + lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1) + if len(lineFields) != cuacctUsageAllColumnsNumber { + continue + } + + usageInKernelMode, err := 
strconv.ParseUint(lineFields[kernelModeColumn], 10, 64) + if err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + usageKernelMode = append(usageKernelMode, usageInKernelMode) + + usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64) + if err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + usageUserMode = append(usageUserMode, usageInUserMode) + } + if err := scanner.Err(); err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + + return usageKernelMode, usageUserMode, nil +} diff --git a/libcontainer/cgroups/fs/cpuacct_test.go b/libcontainer/cgroups/fs/cpuacct_test.go new file mode 100644 index 0000000..70b237a --- /dev/null +++ b/libcontainer/cgroups/fs/cpuacct_test.go @@ -0,0 +1,97 @@ +package fs + +import ( + "reflect" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +const ( + cpuAcctUsageContents = "12262454190222160" + cpuAcctUsagePerCPUContents = "1564936537989058 1583937096487821 1604195415465681 1596445226820187 1481069084155629 1478735613864327 1477610593414743 1476362015778086" + cpuAcctStatContents = "user 452278264\nsystem 291429664" + cpuAcctUsageAll = `cpu user system + 0 962250696038415 637727786389114 + 1 981956408513304 638197595421064 + 2 1002658817529022 638956774598358 + 3 994937703492523 637985531181620 + 4 874843781648690 638837766495476 + 5 872544369885276 638763309884944 + 6 870104915696359 640081778921247 + 7 870202363887496 638716766259495 + ` +) + +func TestCpuacctStats(t *testing.T) { + path := tempDir(t, "cpuacct") + writeFileContents(t, path, map[string]string{ + "cpuacct.usage": cpuAcctUsageContents, + "cpuacct.usage_percpu": cpuAcctUsagePerCPUContents, + "cpuacct.stat": cpuAcctStatContents, + "cpuacct.usage_all": cpuAcctUsageAll, + }) + + cpuacct := &CpuacctGroup{} + actualStats := *cgroups.NewStats() + err := cpuacct.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + 
expectedStats := cgroups.CpuUsage{ + TotalUsage: uint64(12262454190222160), + PercpuUsage: []uint64{ + 1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187, + 1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086, + }, + PercpuUsageInKernelmode: []uint64{ + 637727786389114, 638197595421064, 638956774598358, 637985531181620, + 638837766495476, 638763309884944, 640081778921247, 638716766259495, + }, + PercpuUsageInUsermode: []uint64{ + 962250696038415, 981956408513304, 1002658817529022, 994937703492523, + 874843781648690, 872544369885276, 870104915696359, 870202363887496, + }, + UsageInKernelmode: (uint64(291429664) * nanosecondsInSecond) / clockTicks, + UsageInUsermode: (uint64(452278264) * nanosecondsInSecond) / clockTicks, + } + + if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) { + t.Errorf("Expected CPU usage %#v but found %#v\n", + expectedStats, actualStats.CpuStats.CpuUsage) + } +} + +func TestCpuacctStatsWithoutUsageAll(t *testing.T) { + path := tempDir(t, "cpuacct") + writeFileContents(t, path, map[string]string{ + "cpuacct.usage": cpuAcctUsageContents, + "cpuacct.usage_percpu": cpuAcctUsagePerCPUContents, + "cpuacct.stat": cpuAcctStatContents, + }) + + cpuacct := &CpuacctGroup{} + actualStats := *cgroups.NewStats() + err := cpuacct.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.CpuUsage{ + TotalUsage: uint64(12262454190222160), + PercpuUsage: []uint64{ + 1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187, + 1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086, + }, + PercpuUsageInKernelmode: []uint64{}, + PercpuUsageInUsermode: []uint64{}, + UsageInKernelmode: (uint64(291429664) * nanosecondsInSecond) / clockTicks, + UsageInUsermode: (uint64(452278264) * nanosecondsInSecond) / clockTicks, + } + + if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) { + t.Errorf("Expected CPU usage %#v but found %#v\n", + 
expectedStats, actualStats.CpuStats.CpuUsage) + } +} diff --git a/libcontainer/cgroups/fs/cpuset.go b/libcontainer/cgroups/fs/cpuset.go index bfc900e..550baa4 100644 --- a/libcontainer/cgroups/fs/cpuset.go +++ b/libcontainer/cgroups/fs/cpuset.go @@ -1,75 +1,159 @@ -// +build linux - package fs import ( - "bytes" - "fmt" - "io/ioutil" + "errors" "os" "path/filepath" + "strconv" + "strings" + + "golang.org/x/sys/unix" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" - libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" ) -type CpusetGroup struct { -} +type CpusetGroup struct{} func (s *CpusetGroup) Name() string { return "cpuset" } -func (s *CpusetGroup) Apply(d *cgroupData) error { - dir, err := d.path("cpuset") - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return s.ApplyDir(dir, d.config, d.pid) +func (s *CpusetGroup) Apply(path string, r *configs.Resources, pid int) error { + return s.ApplyDir(path, r, pid) } -func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error { - if cgroup.Resources.CpusetCpus != "" { - if err := fscommon.WriteFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil { +func (s *CpusetGroup) Set(path string, r *configs.Resources) error { + if r.CpusetCpus != "" { + if err := cgroups.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil { return err } } - if cgroup.Resources.CpusetMems != "" { - if err := fscommon.WriteFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil { + if r.CpusetMems != "" { + if err := cgroups.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil { return err } } return nil } -func (s *CpusetGroup) Remove(d *cgroupData) error { - return removePath(d.path("cpuset")) +func getCpusetStat(path string, file string) ([]uint16, error) { + var extracted []uint16 + fileContent, err := fscommon.GetCgroupParamString(path, file) + 
if err != nil { + return extracted, err + } + if len(fileContent) == 0 { + return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")} + } + + for _, s := range strings.Split(fileContent, ",") { + sp := strings.SplitN(s, "-", 3) + switch len(sp) { + case 3: + return extracted, &parseError{Path: path, File: file, Err: errors.New("extra dash")} + case 2: + min, err := strconv.ParseUint(sp[0], 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + max, err := strconv.ParseUint(sp[1], 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + if min > max { + return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, min > max")} + } + for i := min; i <= max; i++ { + extracted = append(extracted, uint16(i)) + } + case 1: + value, err := strconv.ParseUint(s, 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + extracted = append(extracted, uint16(value)) + } + } + + return extracted, nil } func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { + var err error + + stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.Mems, err = getCpusetStat(path, "cpuset.mems") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryMigrate, err = 
fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + return nil } -func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error { +func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error { // This might happen if we have no cpuset cgroup mounted. // Just do nothing and don't fail. if dir == "" { return nil } - mountInfo, err := ioutil.ReadFile("/proc/self/mountinfo") - if err != nil { - return err - } - root := filepath.Dir(cgroups.GetClosestMountpointAncestor(dir, string(mountInfo))) // 'ensureParent' start with parent because we don't want to // explicitly inherit from parent, it could conflict with // 'cpuset.cpu_exclusive'. 
- if err := s.ensureParent(filepath.Dir(dir), root); err != nil { + if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil { return err } - if err := os.MkdirAll(dir, 0755); err != nil { + if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) { return err } // We didn't inherit cpuset configs from parent, but we have @@ -79,82 +163,83 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro // specified configs, otherwise, inherit from parent. This makes // cpuset configs work correctly with 'cpuset.cpu_exclusive', and // keep backward compatibility. - if err := s.ensureCpusAndMems(dir, cgroup); err != nil { + if err := s.ensureCpusAndMems(dir, r); err != nil { return err } - - // because we are not using d.join we need to place the pid into the procs file - // unlike the other subsystems + // Since we are not using apply(), we need to place the pid + // into the procs file. return cgroups.WriteCgroupProc(dir, pid) } -func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) { - if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil { +func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { + if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil { return } - if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil { + if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil { return } return cpus, mems, nil } -// ensureParent makes sure that the parent directory of current is created -// and populated with the proper cpus and mems files copied from -// it's parent. -func (s *CpusetGroup) ensureParent(current, root string) error { +// cpusetEnsureParent makes sure that the parent directories of current +// are created and populated with the proper cpus and mems files copied +// from their respective parent. It does that recursively, starting from +// the top of the cpuset hierarchy (i.e. 
cpuset cgroup mount point). +func cpusetEnsureParent(current string) error { + var st unix.Statfs_t + parent := filepath.Dir(current) - if libcontainerUtils.CleanPath(parent) == root { + err := unix.Statfs(parent, &st) + if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { return nil } - // Avoid infinite recursion. - if parent == current { - return fmt.Errorf("cpuset: cgroup parent path outside cgroup root") + // Treat non-existing directory as cgroupfs as it will be created, + // and the root cpuset directory obviously exists. + if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare + return &os.PathError{Op: "statfs", Path: parent, Err: err} } - if err := s.ensureParent(parent, root); err != nil { + + if err := cpusetEnsureParent(parent); err != nil { return err } - if err := os.MkdirAll(current, 0755); err != nil { + if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) { return err } - return s.copyIfNeeded(current, parent) + return cpusetCopyIfNeeded(current, parent) } -// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent // directory to the current directory if the file's contents are 0 -func (s *CpusetGroup) copyIfNeeded(current, parent string) error { - var ( - err error - currentCpus, currentMems []byte - parentCpus, parentMems []byte - ) - - if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil { +func cpusetCopyIfNeeded(current, parent string) error { + currentCpus, currentMems, err := getCpusetSubsystemSettings(current) + if err != nil { return err } - if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil { + parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) + if err != nil { return err } - if s.isEmpty(currentCpus) { - if err := fscommon.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil { + if isEmptyCpuset(currentCpus) { + if err := 
cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil { return err } } - if s.isEmpty(currentMems) { - if err := fscommon.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil { + if isEmptyCpuset(currentMems) { + if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil { return err } } return nil } -func (s *CpusetGroup) isEmpty(b []byte) bool { - return len(bytes.Trim(b, "\n")) == 0 +func isEmptyCpuset(str string) bool { + return str == "" || str == "\n" } -func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error { - if err := s.Set(path, cgroup); err != nil { +func (s *CpusetGroup) ensureCpusAndMems(path string, r *configs.Resources) error { + if err := s.Set(path, r); err != nil { return err } - return s.copyIfNeeded(path, filepath.Dir(path)) + return cpusetCopyIfNeeded(path, filepath.Dir(path)) } diff --git a/libcontainer/cgroups/fs/cpuset_test.go b/libcontainer/cgroups/fs/cpuset_test.go index 927e631..8933b3c 100644 --- a/libcontainer/cgroups/fs/cpuset_test.go +++ b/libcontainer/cgroups/fs/cpuset_test.go @@ -1,67 +1,242 @@ -// +build linux - package fs import ( + "reflect" "testing" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" ) -func TestCpusetSetCpus(t *testing.T) { - helper := NewCgroupTestUtil("cpuset", t) - defer helper.cleanup() +const ( + cpus = "0-2,7,12-14\n" + cpuExclusive = "1\n" + mems = "1-4,6,9\n" + memHardwall = "0\n" + memExclusive = "0\n" + memoryMigrate = "1\n" + memorySpreadPage = "0\n" + memorySpeadSlab = "1\n" + memoryPressure = "34377\n" + schedLoadBalance = "1\n" + schedRelaxDomainLevel = "-1\n" +) + +var cpusetTestFiles = map[string]string{ + "cpuset.cpus": cpus, + "cpuset.cpu_exclusive": cpuExclusive, + "cpuset.mems": mems, + "cpuset.mem_hardwall": memHardwall, + "cpuset.mem_exclusive": memExclusive, + "cpuset.memory_migrate": 
memoryMigrate, + "cpuset.memory_spread_page": memorySpreadPage, + "cpuset.memory_spread_slab": memorySpeadSlab, + "cpuset.memory_pressure": memoryPressure, + "cpuset.sched_load_balance": schedLoadBalance, + "cpuset.sched_relax_domain_level": schedRelaxDomainLevel, +} + +func TestCPUSetSetCpus(t *testing.T) { + path := tempDir(t, "cpuset") const ( cpusBefore = "0" cpusAfter = "1-3" ) - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "cpuset.cpus": cpusBefore, }) - helper.CgroupData.config.Resources.CpusetCpus = cpusAfter + r := &configs.Resources{ + CpusetCpus: cpusAfter, + } cpuset := &CpusetGroup{} - if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := cpuset.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.cpus") + value, err := fscommon.GetCgroupParamString(path, "cpuset.cpus") if err != nil { - t.Fatalf("Failed to parse cpuset.cpus - %s", err) + t.Fatal(err) } - if value != cpusAfter { t.Fatal("Got the wrong value, set cpuset.cpus failed.") } } -func TestCpusetSetMems(t *testing.T) { - helper := NewCgroupTestUtil("cpuset", t) - defer helper.cleanup() +func TestCPUSetSetMems(t *testing.T) { + path := tempDir(t, "cpuset") const ( memsBefore = "0" memsAfter = "1" ) - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "cpuset.mems": memsBefore, }) - helper.CgroupData.config.Resources.CpusetMems = memsAfter + r := &configs.Resources{ + CpusetMems: memsAfter, + } cpuset := &CpusetGroup{} - if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := cpuset.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.mems") + value, err := fscommon.GetCgroupParamString(path, "cpuset.mems") if err != nil { - t.Fatalf("Failed to parse cpuset.mems - %s", err) + t.Fatal(err) } - if value != memsAfter { 
t.Fatal("Got the wrong value, set cpuset.mems failed.") } } + +func TestCPUSetStatsCorrect(t *testing.T) { + path := tempDir(t, "cpuset") + writeFileContents(t, path, cpusetTestFiles) + + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(path, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.CPUSetStats{ + CPUs: []uint16{0, 1, 2, 7, 12, 13, 14}, + CPUExclusive: 1, + Mems: []uint16{1, 2, 3, 4, 6, 9}, + MemoryMigrate: 1, + MemHardwall: 0, + MemExclusive: 0, + MemorySpreadPage: 0, + MemorySpreadSlab: 1, + MemoryPressure: 34377, + SchedLoadBalance: 1, + SchedRelaxDomainLevel: -1, + } + if !reflect.DeepEqual(expectedStats, actualStats.CPUSetStats) { + t.Fatalf("Expected Cpuset stats usage %#v but found %#v", + expectedStats, actualStats.CPUSetStats) + } +} + +func TestCPUSetStatsMissingFiles(t *testing.T) { + for _, testCase := range []struct { + desc string + filename, contents string + removeFile bool + }{ + { + desc: "empty cpus file", + filename: "cpuset.cpus", + contents: "", + removeFile: false, + }, + { + desc: "empty mems file", + filename: "cpuset.mems", + contents: "", + removeFile: false, + }, + { + desc: "corrupted cpus file", + filename: "cpuset.cpus", + contents: "0-3,*4^2", + removeFile: false, + }, + { + desc: "corrupted mems file", + filename: "cpuset.mems", + contents: "0,1,2-5,8-7", + removeFile: false, + }, + { + desc: "missing cpu_exclusive file", + filename: "cpuset.cpu_exclusive", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_migrate file", + filename: "cpuset.memory_migrate", + contents: "", + removeFile: true, + }, + { + desc: "missing mem_hardwall file", + filename: "cpuset.mem_hardwall", + contents: "", + removeFile: true, + }, + { + desc: "missing mem_exclusive file", + filename: "cpuset.mem_exclusive", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_spread_page file", + filename: "cpuset.memory_spread_page", + contents: "", + 
removeFile: true, + }, + { + desc: "missing memory_spread_slab file", + filename: "cpuset.memory_spread_slab", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_pressure file", + filename: "cpuset.memory_pressure", + contents: "", + removeFile: true, + }, + { + desc: "missing sched_load_balance file", + filename: "cpuset.sched_load_balance", + contents: "", + removeFile: true, + }, + { + desc: "missing sched_relax_domain_level file", + filename: "cpuset.sched_relax_domain_level", + contents: "", + removeFile: true, + }, + } { + t.Run(testCase.desc, func(t *testing.T) { + path := tempDir(t, "cpuset") + + tempCpusetTestFiles := map[string]string{} + for i, v := range cpusetTestFiles { + tempCpusetTestFiles[i] = v + } + + if testCase.removeFile { + delete(tempCpusetTestFiles, testCase.filename) + writeFileContents(t, path, tempCpusetTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(path, &actualStats) + if err != nil { + t.Errorf("failed unexpectedly: %q", err) + } + } else { + tempCpusetTestFiles[testCase.filename] = testCase.contents + writeFileContents(t, path, tempCpusetTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(path, &actualStats) + + if err == nil { + t.Error("failed to return expected error") + } + } + }) + } +} diff --git a/libcontainer/cgroups/fs/devices.go b/libcontainer/cgroups/fs/devices.go index 036c8db..4527a70 100644 --- a/libcontainer/cgroups/fs/devices.go +++ b/libcontainer/cgroups/fs/devices.go @@ -1,81 +1,109 @@ -// +build linux - package fs import ( + "bytes" + "errors" + "reflect" + "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/system" + 
"github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/userns" ) type DevicesGroup struct { + TestingSkipFinalCheck bool } func (s *DevicesGroup) Name() string { return "devices" } -func (s *DevicesGroup) Apply(d *cgroupData) error { - _, err := d.join("devices") +func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error { + if r.SkipDevices { + return nil + } + if path == "" { + // Return error here, since devices cgroup + // is a hard requirement for container's security. + return errSubsystemDoesNotExist + } + + return apply(path, pid) +} + +func loadEmulator(path string) (*cgroupdevices.Emulator, error) { + list, err := cgroups.ReadFile(path, "devices.list") + if err != nil { + return nil, err + } + return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list)) +} + +func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) { + // This defaults to a white-list -- which is what we want! + emu := &cgroupdevices.Emulator{} + for _, rule := range rules { + if err := emu.Apply(*rule); err != nil { + return nil, err + } + } + return emu, nil +} + +func (s *DevicesGroup) Set(path string, r *configs.Resources) error { + if userns.RunningInUserNS() || r.SkipDevices { + return nil + } + + // Generate two emulators, one for the current state of the cgroup and one + // for the requested state by the user. + current, err := loadEmulator(path) if err != nil { - // We will return error even it's `not found` error, devices - // cgroup is hard requirement for container's security. 
return err } - return nil -} - -func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { - if system.RunningInUserNS() { - return nil + target, err := buildEmulator(r.Devices) + if err != nil { + return err } - devices := cgroup.Resources.Devices - if len(devices) > 0 { - for _, dev := range devices { - file := "devices.deny" - if dev.Allow { - file = "devices.allow" - } - if err := fscommon.WriteFile(path, file, dev.CgroupString()); err != nil { - return err - } - } - return nil + // Compute the minimal set of transition rules needed to achieve the + // requested state. + transitionRules, err := current.Transition(target) + if err != nil { + return err } - if cgroup.Resources.AllowAllDevices != nil { - if *cgroup.Resources.AllowAllDevices == false { - if err := fscommon.WriteFile(path, "devices.deny", "a"); err != nil { - return err - } - - for _, dev := range cgroup.Resources.AllowedDevices { - if err := fscommon.WriteFile(path, "devices.allow", dev.CgroupString()); err != nil { - return err - } - } - return nil + for _, rule := range transitionRules { + file := "devices.deny" + if rule.Allow { + file = "devices.allow" } - - if err := fscommon.WriteFile(path, "devices.allow", "a"); err != nil { + if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil { return err } } - for _, dev := range cgroup.Resources.DeniedDevices { - if err := fscommon.WriteFile(path, "devices.deny", dev.CgroupString()); err != nil { + // Final safety check -- ensure that the resulting state is what was + // requested. This is only really correct for white-lists, but for + // black-lists we can at least check that the cgroup is in the right mode. + // + // This safety-check is skipped for the unit tests because we cannot + // currently mock devices.list correctly. 
+ if !s.TestingSkipFinalCheck { + currentAfter, err := loadEmulator(path) + if err != nil { return err } + if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) { + return errors.New("resulting devices cgroup doesn't precisely match target") + } else if target.IsBlacklist() != currentAfter.IsBlacklist() { + return errors.New("resulting devices cgroup doesn't match target mode") + } } - return nil } -func (s *DevicesGroup) Remove(d *cgroupData) error { - return removePath(d.path("devices")) -} - func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } diff --git a/libcontainer/cgroups/fs/devices_test.go b/libcontainer/cgroups/fs/devices_test.go index 648f4a2..bdd1967 100644 --- a/libcontainer/cgroups/fs/devices_test.go +++ b/libcontainer/cgroups/fs/devices_test.go @@ -1,5 +1,3 @@ -// +build linux - package fs import ( @@ -7,93 +5,48 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" -) - -var ( - allowedDevices = []*configs.Device{ - { - Path: "/dev/zero", - Type: 'c', - Major: 1, - Minor: 5, - Permissions: "rwm", - FileMode: 0666, - }, - } - allowedList = "c 1:5 rwm" - deniedDevices = []*configs.Device{ - { - Path: "/dev/null", - Type: 'c', - Major: 1, - Minor: 3, - Permissions: "rwm", - FileMode: 0666, - }, - } - deniedList = "c 1:3 rwm" + "github.com/opencontainers/runc/libcontainer/devices" ) func TestDevicesSetAllow(t *testing.T) { - helper := NewCgroupTestUtil("devices", t) - defer helper.cleanup() + path := tempDir(t, "devices") - helper.writeFileContents(map[string]string{ - "devices.deny": "a", - }) - allowAllDevices := false - helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices - helper.CgroupData.config.Resources.AllowedDevices = allowedDevices - devices := &DevicesGroup{} - if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { - t.Fatal(err) - } - - value, err := 
fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow") - if err != nil { - t.Fatalf("Failed to parse devices.allow - %s", err) - } - - if value != allowedList { - t.Fatal("Got the wrong value, set devices.allow failed.") - } - - // When AllowAllDevices is nil, devices.allow file should not be modified. - helper.CgroupData.config.Resources.AllowAllDevices = nil - if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { - t.Fatal(err) - } - value, err = fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow") - if err != nil { - t.Fatalf("Failed to parse devices.allow - %s", err) - } - if value != allowedList { - t.Fatal("devices policy shouldn't have changed on AllowedAllDevices=nil.") - } -} - -func TestDevicesSetDeny(t *testing.T) { - helper := NewCgroupTestUtil("devices", t) - defer helper.cleanup() - - helper.writeFileContents(map[string]string{ - "devices.allow": "a", + writeFileContents(t, path, map[string]string{ + "devices.allow": "", + "devices.deny": "", + "devices.list": "a *:* rwm", }) - allowAllDevices := true - helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices - helper.CgroupData.config.Resources.DeniedDevices = deniedDevices - devices := &DevicesGroup{} - if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + r := &configs.Resources{ + Devices: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 5, + Permissions: devices.Permissions("rwm"), + Allow: true, + }, + }, + } + + d := &DevicesGroup{TestingSkipFinalCheck: true} + if err := d.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.deny") + // The default deny rule must be written. 
+ value, err := fscommon.GetCgroupParamString(path, "devices.deny") if err != nil { - t.Fatalf("Failed to parse devices.deny - %s", err) + t.Fatal(err) + } + if value[0] != 'a' { + t.Errorf("Got the wrong value (%q), set devices.deny failed.", value) } - if value != deniedList { - t.Fatal("Got the wrong value, set devices.deny failed.") + // Permitted rule must be written. + if value, err := fscommon.GetCgroupParamString(path, "devices.allow"); err != nil { + t.Fatal(err) + } else if value != "c 1:5 rwm" { + t.Errorf("Got the wrong value (%q), set devices.allow failed.", value) } } diff --git a/libcontainer/cgroups/fs/error.go b/libcontainer/cgroups/fs/error.go new file mode 100644 index 0000000..f2ab6f1 --- /dev/null +++ b/libcontainer/cgroups/fs/error.go @@ -0,0 +1,15 @@ +package fs + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +type parseError = fscommon.ParseError + +// malformedLine is used by all cgroupfs file parsers that expect a line +// in a particular format but get some garbage instead. 
+func malformedLine(path, file, line string) error { + return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)} +} diff --git a/libcontainer/cgroups/fs/freezer.go b/libcontainer/cgroups/fs/freezer.go index 9dc81bd..987f1bf 100644 --- a/libcontainer/cgroups/fs/freezer.go +++ b/libcontainer/cgroups/fs/freezer.go @@ -1,67 +1,158 @@ -// +build linux - package fs import ( + "errors" "fmt" + "os" "strings" "time" "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" ) -type FreezerGroup struct { -} +type FreezerGroup struct{} func (s *FreezerGroup) Name() string { return "freezer" } -func (s *FreezerGroup) Apply(d *cgroupData) error { - _, err := d.join("freezer") - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil +func (s *FreezerGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) } -func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { - switch cgroup.Resources.Freezer { - case configs.Frozen, configs.Thawed: - for { - // In case this loop does not exit because it doesn't get the expected - // state, let's write again this state, hoping it's going to be properly - // set this time. Otherwise, this loop could run infinitely, waiting for - // a state change that would never happen. - if err := fscommon.WriteFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil { +func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) { + switch r.Freezer { + case configs.Frozen: + defer func() { + if Err != nil { + // Freezing failed, and it is bad and dangerous + // to leave the cgroup in FROZEN or FREEZING + // state, so (try to) thaw it back. 
+ _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) + } + }() + + // As per older kernel docs (freezer-subsystem.txt before + // kernel commit ef9fe980c6fcc1821), if FREEZING is seen, + // userspace should either retry or thaw. While current + // kernel cgroup v1 docs no longer mention a need to retry, + // even a recent kernel (v5.4, Ubuntu 20.04) can't reliably + // freeze a cgroup v1 while new processes keep appearing in it + // (either via fork/clone or by writing new PIDs to + // cgroup.procs). + // + // The numbers below are empirically chosen to have a decent + // chance to succeed in various scenarios ("runc pause/unpause + // with parallel runc exec" and "bare freeze/unfreeze on a very + // slow system"), tested on RHEL7 and Ubuntu 20.04 kernels. + // + // Adding any amount of sleep in between retries did not + // increase the chances of successful freeze in "pause/unpause + // with parallel exec" reproducer. OTOH, adding an occasional + // sleep helped for the case where the system is extremely slow + // (CentOS 7 VM on GHA CI). + // + // Alas, this is still a game of chances, since the real fix + // belong to the kernel (cgroup v2 do not have this bug). + + for i := 0; i < 1000; i++ { + if i%50 == 49 { + // Occasional thaw and sleep improves + // the chances to succeed in freezing + // in case new processes keep appearing + // in the cgroup. + _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) + time.Sleep(10 * time.Millisecond) + } + + if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil { return err } - state, err := fscommon.ReadFile(path, "freezer.state") + if i%25 == 24 { + // Occasional short sleep before reading + // the state back also improves the chances to + // succeed in freezing in case of a very slow + // system. 
+ time.Sleep(10 * time.Microsecond) + } + state, err := cgroups.ReadFile(path, "freezer.state") if err != nil { return err } - if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) { - break + state = strings.TrimSpace(state) + switch state { + case "FREEZING": + continue + case string(configs.Frozen): + if i > 1 { + logrus.Debugf("frozen after %d retries", i) + } + return nil + default: + // should never happen + return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state)) } - - time.Sleep(1 * time.Millisecond) } + // Despite our best efforts, it got stuck in FREEZING. + return errors.New("unable to freeze") + case configs.Thawed: + return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) case configs.Undefined: return nil default: - return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer)) + return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer)) } - - return nil -} - -func (s *FreezerGroup) Remove(d *cgroupData) error { - return removePath(d.path("freezer")) } func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } + +func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) { + for { + state, err := cgroups.ReadFile(path, "freezer.state") + if err != nil { + // If the kernel is too old, then we just treat the freezer as + // being in an "undefined" state. + if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Undefined, err + } + switch strings.TrimSpace(state) { + case "THAWED": + return configs.Thawed, nil + case "FROZEN": + // Find out whether the cgroup is frozen directly, + // or indirectly via an ancestor. + self, err := cgroups.ReadFile(path, "freezer.self_freezing") + if err != nil { + // If the kernel is too old, then we just treat + // it as being frozen. 
+ if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Frozen, err + } + switch self { + case "0\n": + return configs.Thawed, nil + case "1\n": + return configs.Frozen, nil + default: + return configs.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self) + } + case "FREEZING": + // Make sure we get a stable freezer state, so retry if the cgroup + // is still undergoing freezing. This should be a temporary delay. + time.Sleep(1 * time.Millisecond) + continue + default: + return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state) + } + } +} diff --git a/libcontainer/cgroups/fs/freezer_test.go b/libcontainer/cgroups/fs/freezer_test.go index ad80261..bbdd371 100644 --- a/libcontainer/cgroups/fs/freezer_test.go +++ b/libcontainer/cgroups/fs/freezer_test.go @@ -1,5 +1,3 @@ -// +build linux - package fs import ( @@ -10,22 +8,23 @@ import ( ) func TestFreezerSetState(t *testing.T) { - helper := NewCgroupTestUtil("freezer", t) - defer helper.cleanup() + path := tempDir(t, "freezer") - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "freezer.state": string(configs.Frozen), }) - helper.CgroupData.config.Resources.Freezer = configs.Thawed + r := &configs.Resources{ + Freezer: configs.Thawed, + } freezer := &FreezerGroup{} - if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := freezer.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "freezer.state") + value, err := fscommon.GetCgroupParamString(path, "freezer.state") if err != nil { - t.Fatalf("Failed to parse freezer.state - %s", err) + t.Fatal(err) } if value != string(configs.Thawed) { t.Fatal("Got the wrong value, set freezer.state failed.") @@ -33,16 +32,15 @@ func TestFreezerSetState(t *testing.T) { } func TestFreezerSetInvalidState(t *testing.T) { - helper := NewCgroupTestUtil("freezer", t) - defer 
helper.cleanup() + path := tempDir(t, "freezer") - const ( - invalidArg configs.FreezerState = "Invalid" - ) + const invalidArg configs.FreezerState = "Invalid" - helper.CgroupData.config.Resources.Freezer = invalidArg + r := &configs.Resources{ + Freezer: invalidArg, + } freezer := &FreezerGroup{} - if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err == nil { + if err := freezer.Set(path, r); err == nil { t.Fatal("Failed to return invalid argument error") } } diff --git a/libcontainer/cgroups/fs/fs.go b/libcontainer/cgroups/fs/fs.go new file mode 100644 index 0000000..fb4fcc7 --- /dev/null +++ b/libcontainer/cgroups/fs/fs.go @@ -0,0 +1,264 @@ +package fs + +import ( + "errors" + "fmt" + "os" + "sync" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +var subsystems = []subsystem{ + &CpusetGroup{}, + &DevicesGroup{}, + &MemoryGroup{}, + &CpuGroup{}, + &CpuacctGroup{}, + &PidsGroup{}, + &BlkioGroup{}, + &HugetlbGroup{}, + &NetClsGroup{}, + &NetPrioGroup{}, + &PerfEventGroup{}, + &FreezerGroup{}, + &RdmaGroup{}, + &NameGroup{GroupName: "name=systemd", Join: true}, +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +func init() { + // If using cgroups-hybrid mode then add a "" controller indicating + // it should join the cgroups v2. + if cgroups.IsCgroup2HybridMode() { + subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true}) + } +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // GetStats fills in the stats for the subsystem. + GetStats(path string, stats *cgroups.Stats) error + // Apply creates and joins a cgroup, adding pid into it. Some + // subsystems use resources to pre-configure the cgroup parents + // before creating or joining it. 
+ Apply(path string, r *configs.Resources, pid int) error + // Set sets the cgroup resources. + Set(path string, r *configs.Resources) error +} + +type manager struct { + mu sync.Mutex + cgroups *configs.Cgroup + paths map[string]string +} + +func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) { + // Some v1 controllers (cpu, cpuset, and devices) expect + // cgroups.Resources to not be nil in Apply. + if cg.Resources == nil { + return nil, errors.New("cgroup v1 manager needs configs.Resources to be set during manager creation") + } + if cg.Resources.Unified != nil { + return nil, cgroups.ErrV1NoUnified + } + + if paths == nil { + var err error + paths, err = initPaths(cg) + if err != nil { + return nil, err + } + } + + return &manager{ + cgroups: cg, + paths: paths, + }, nil +} + +// isIgnorableError returns whether err is a permission error (in the loose +// sense of the word). This includes EROFS (which for an unprivileged user is +// basically a permission error) and EACCES (for similar reasons) as well as +// the normal EPERM. +func isIgnorableError(rootless bool, err error) bool { + // We do not ignore errors if we are root. + if !rootless { + return false + } + // Is it an ordinary EPERM? + if errors.Is(err, os.ErrPermission) { + return true + } + // Handle some specific syscall errors. 
+ var errno unix.Errno + if errors.As(err, &errno) { + return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES + } + return false +} + +func (m *manager) Apply(pid int) (err error) { + m.mu.Lock() + defer m.mu.Unlock() + + c := m.cgroups + + for _, sys := range subsystems { + name := sys.Name() + p, ok := m.paths[name] + if !ok { + continue + } + + if err := sys.Apply(p, c.Resources, pid); err != nil { + // In the case of rootless (including euid=0 in userns), where an + // explicit cgroup path hasn't been set, we don't bail on error in + // case of permission problems here, but do delete the path from + // the m.paths map, since it is either non-existent and could not + // be created, or the pid could not be added to it. + // + // Cases where limits for the subsystem have been set are handled + // later by Set, which fails with a friendly error (see + // if path == "" in Set). + if isIgnorableError(c.Rootless, err) && c.Path == "" { + delete(m.paths, name) + continue + } + return err + } + + } + return nil +} + +func (m *manager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + return cgroups.RemovePaths(m.paths) +} + +func (m *manager) Path(subsys string) string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths[subsys] +} + +func (m *manager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if path == "" { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + return stats, nil +} + +func (m *manager) Set(r *configs.Resources) error { + if r == nil { + return nil + } + + if r.Unified != nil { + return cgroups.ErrV1NoUnified + } + + m.mu.Lock() + defer m.mu.Unlock() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if err := sys.Set(path, r); err != nil { + // When rootless is true, errors from the device subsystem + // are ignored, as it is really not 
expected to work. + if m.cgroups.Rootless && sys.Name() == "devices" { + continue + } + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if path == "" { + // We never created a path for this cgroup, so we cannot set + // limits for it (though we have already tried at this point). + return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) + } + return err + } + } + + return nil +} + +// Freeze toggles the container's freezer cgroup depending on the state +// provided +func (m *manager) Freeze(state configs.FreezerState) error { + path := m.Path("freezer") + if path == "" { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + + prevState := m.cgroups.Resources.Freezer + m.cgroups.Resources.Freezer = state + freezer := &FreezerGroup{} + if err := freezer.Set(path, m.cgroups.Resources); err != nil { + m.cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *manager) GetPids() ([]int, error) { + return cgroups.GetPids(m.Path("devices")) +} + +func (m *manager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.Path("devices")) +} + +func (m *manager) GetPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths +} + +func (m *manager) GetCgroups() (*configs.Cgroup, error) { + return m.cgroups, nil +} + +func (m *manager) GetFreezerState() (configs.FreezerState, error) { + dir := m.Path("freezer") + // If the container doesn't have the freezer cgroup, say it's undefined. 
+ if dir == "" { + return configs.Undefined, nil + } + freezer := &FreezerGroup{} + return freezer.GetState(dir) +} + +func (m *manager) Exists() bool { + return cgroups.PathExists(m.Path("devices")) +} + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill") +} + +func (m *manager) OOMKillCount() (uint64, error) { + c, err := OOMKillCount(m.Path("memory")) + // Ignore ENOENT when rootless as it couldn't create cgroup. + if err != nil && m.cgroups.Rootless && os.IsNotExist(err) { + err = nil + } + + return c, err +} diff --git a/libcontainer/cgroups/fs/fs_test.go b/libcontainer/cgroups/fs/fs_test.go new file mode 100644 index 0000000..01293ad --- /dev/null +++ b/libcontainer/cgroups/fs/fs_test.go @@ -0,0 +1,50 @@ +package fs + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func BenchmarkGetStats(b *testing.B) { + if cgroups.IsCgroup2UnifiedMode() { + b.Skip("cgroup v2 is not supported") + } + + // Unset TestMode as we work with real cgroupfs here, + // and we want OpenFile to perform the fstype check. 
+ cgroups.TestMode = false + defer func() { + cgroups.TestMode = true + }() + + cg := &configs.Cgroup{ + Path: "/some/kind/of/a/path/here", + Resources: &configs.Resources{}, + } + m, err := NewManager(cg, nil) + if err != nil { + b.Fatal(err) + } + err = m.Apply(-1) + if err != nil { + b.Fatal(err) + } + defer func() { + _ = m.Destroy() + }() + + var st *cgroups.Stats + + b.ResetTimer() + for i := 0; i < b.N; i++ { + st, err = m.GetStats() + if err != nil { + b.Fatal(err) + } + } + if st.CpuStats.CpuUsage.TotalUsage != 0 { + b.Fatalf("stats: %+v", st) + } +} diff --git a/libcontainer/cgroups/fs/fs_unsupported.go b/libcontainer/cgroups/fs/fs_unsupported.go deleted file mode 100644 index 3ef9e03..0000000 --- a/libcontainer/cgroups/fs/fs_unsupported.go +++ /dev/null @@ -1,3 +0,0 @@ -// +build !linux - -package fs diff --git a/libcontainer/cgroups/fs/hugetlb.go b/libcontainer/cgroups/fs/hugetlb.go index 68719c2..8ddd6fd 100644 --- a/libcontainer/cgroups/fs/hugetlb.go +++ b/libcontainer/cgroups/fs/hugetlb.go @@ -1,35 +1,26 @@ -// +build linux - package fs import ( - "fmt" "strconv" - "strings" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" ) -type HugetlbGroup struct { -} +type HugetlbGroup struct{} func (s *HugetlbGroup) Name() string { return "hugetlb" } -func (s *HugetlbGroup) Apply(d *cgroupData) error { - _, err := d.join("hugetlb") - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil +func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) } -func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error { - for _, hugetlb := range cgroup.Resources.HugetlbLimit { - if err := fscommon.WriteFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil { +func (s *HugetlbGroup) Set(path 
string, r *configs.Resources) error { + for _, hugetlb := range r.HugetlbLimit { + if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { return err } } @@ -37,31 +28,30 @@ func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error { return nil } -func (s *HugetlbGroup) Remove(d *cgroupData) error { - return removePath(d.path("hugetlb")) -} - func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } hugetlbStats := cgroups.HugetlbStats{} - for _, pageSize := range HugePageSizes { - usage := strings.Join([]string{"hugetlb", pageSize, "usage_in_bytes"}, ".") + for _, pageSize := range cgroups.HugePageSizes() { + usage := "hugetlb." + pageSize + ".usage_in_bytes" value, err := fscommon.GetCgroupParamUint(path, usage) if err != nil { - return fmt.Errorf("failed to parse %s - %v", usage, err) + return err } hugetlbStats.Usage = value - maxUsage := strings.Join([]string{"hugetlb", pageSize, "max_usage_in_bytes"}, ".") + maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes" value, err = fscommon.GetCgroupParamUint(path, maxUsage) if err != nil { - return fmt.Errorf("failed to parse %s - %v", maxUsage, err) + return err } hugetlbStats.MaxUsage = value - failcnt := strings.Join([]string{"hugetlb", pageSize, "failcnt"}, ".") + failcnt := "hugetlb." 
+ pageSize + ".failcnt" value, err = fscommon.GetCgroupParamUint(path, failcnt) if err != nil { - return fmt.Errorf("failed to parse %s - %v", failcnt, err) + return err } hugetlbStats.Failcnt = value diff --git a/libcontainer/cgroups/fs/hugetlb_test.go b/libcontainer/cgroups/fs/hugetlb_test.go index 9b60650..f4aea7e 100644 --- a/libcontainer/cgroups/fs/hugetlb_test.go +++ b/libcontainer/cgroups/fs/hugetlb_test.go @@ -1,5 +1,3 @@ -// +build linux - package fs import ( @@ -18,7 +16,7 @@ const ( hugetlbFailcnt = "100\n" ) -var ( +const ( usage = "hugetlb.%s.usage_in_bytes" limit = "hugetlb.%s.limit_in_bytes" maxUsage = "hugetlb.%s.max_usage_in_bytes" @@ -26,38 +24,38 @@ var ( ) func TestHugetlbSetHugetlb(t *testing.T) { - helper := NewCgroupTestUtil("hugetlb", t) - defer helper.cleanup() + path := tempDir(t, "hugetlb") const ( hugetlbBefore = 256 hugetlbAfter = 512 ) - for _, pageSize := range HugePageSizes { - helper.writeFileContents(map[string]string{ + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ fmt.Sprintf(limit, pageSize): strconv.Itoa(hugetlbBefore), }) } - for _, pageSize := range HugePageSizes { - helper.CgroupData.config.Resources.HugetlbLimit = []*configs.HugepageLimit{ + r := &configs.Resources{} + for _, pageSize := range cgroups.HugePageSizes() { + r.HugetlbLimit = []*configs.HugepageLimit{ { Pagesize: pageSize, Limit: hugetlbAfter, }, } hugetlb := &HugetlbGroup{} - if err := hugetlb.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := hugetlb.Set(path, r); err != nil { t.Fatal(err) } } - for _, pageSize := range HugePageSizes { + for _, pageSize := range cgroups.HugePageSizes() { limit := fmt.Sprintf(limit, pageSize) - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, limit) + value, err := fscommon.GetCgroupParamUint(path, limit) if err != nil { - t.Fatalf("Failed to parse %s - %s", limit, err) + t.Fatal(err) } if value != hugetlbAfter { t.Fatalf("Set 
hugetlb.limit_in_bytes failed. Expected: %v, Got: %v", hugetlbAfter, value) @@ -66,10 +64,9 @@ func TestHugetlbSetHugetlb(t *testing.T) { } func TestHugetlbStats(t *testing.T) { - helper := NewCgroupTestUtil("hugetlb", t) - defer helper.cleanup() - for _, pageSize := range HugePageSizes { - helper.writeFileContents(map[string]string{ + path := tempDir(t, "hugetlb") + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ fmt.Sprintf(usage, pageSize): hugetlbUsageContents, fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents, fmt.Sprintf(failcnt, pageSize): hugetlbFailcnt, @@ -78,56 +75,50 @@ func TestHugetlbStats(t *testing.T) { hugetlb := &HugetlbGroup{} actualStats := *cgroups.NewStats() - err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + err := hugetlb.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100} - for _, pageSize := range HugePageSizes { + for _, pageSize := range cgroups.HugePageSizes() { expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize]) } } func TestHugetlbStatsNoUsageFile(t *testing.T) { -t.Skip("Disabled unreliable test") - helper := NewCgroupTestUtil("hugetlb", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "hugetlb") + writeFileContents(t, path, map[string]string{ maxUsage: hugetlbMaxUsageContents, }) hugetlb := &HugetlbGroup{} actualStats := *cgroups.NewStats() - err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + err := hugetlb.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestHugetlbStatsNoMaxUsageFile(t *testing.T) { -t.Skip("Disabled unreliable test") - helper := NewCgroupTestUtil("hugetlb", t) - defer helper.cleanup() - for _, pageSize := range HugePageSizes { - helper.writeFileContents(map[string]string{ + path := tempDir(t, "hugetlb") + for _, pageSize := range 
cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ fmt.Sprintf(usage, pageSize): hugetlbUsageContents, }) } hugetlb := &HugetlbGroup{} actualStats := *cgroups.NewStats() - err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + err := hugetlb.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestHugetlbStatsBadUsageFile(t *testing.T) { -t.Skip("Disabled unreliable test") - helper := NewCgroupTestUtil("hugetlb", t) - defer helper.cleanup() - for _, pageSize := range HugePageSizes { - helper.writeFileContents(map[string]string{ + path := tempDir(t, "hugetlb") + for _, pageSize := range cgroups.HugePageSizes() { + writeFileContents(t, path, map[string]string{ fmt.Sprintf(usage, pageSize): "bad", maxUsage: hugetlbMaxUsageContents, }) @@ -135,24 +126,22 @@ t.Skip("Disabled unreliable test") hugetlb := &HugetlbGroup{} actualStats := *cgroups.NewStats() - err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + err := hugetlb.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestHugetlbStatsBadMaxUsageFile(t *testing.T) { -t.Skip("Disabled unreliable test") - helper := NewCgroupTestUtil("hugetlb", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "hugetlb") + writeFileContents(t, path, map[string]string{ usage: hugetlbUsageContents, maxUsage: "bad", }) hugetlb := &HugetlbGroup{} actualStats := *cgroups.NewStats() - err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + err := hugetlb.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } diff --git a/libcontainer/cgroups/fs/kmem.go b/libcontainer/cgroups/fs/kmem.go deleted file mode 100644 index 69b5a19..0000000 --- a/libcontainer/cgroups/fs/kmem.go +++ /dev/null @@ -1,62 +0,0 @@ -// +build linux,!nokmem - -package fs - -import ( - "errors" - "fmt" - "io/ioutil" - "os" - "path/filepath" - "strconv" - "syscall" // for Errno type only - - 
"github.com/opencontainers/runc/libcontainer/cgroups" - "golang.org/x/sys/unix" -) - -const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" - -func EnableKernelMemoryAccounting(path string) error { - // Ensure that kernel memory is available in this kernel build. If it - // isn't, we just ignore it because EnableKernelMemoryAccounting is - // automatically called for all memory limits. - if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { - return nil - } - // We have to limit the kernel memory here as it won't be accounted at all - // until a limit is set on the cgroup and limit cannot be set once the - // cgroup has children, or if there are already tasks in the cgroup. - for _, i := range []int64{1, -1} { - if err := setKernelMemory(path, i); err != nil { - return err - } - } - return nil -} - -func setKernelMemory(path string, kernelMemoryLimit int64) error { - if path == "" { - return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) - } - if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { - // We have specifically been asked to set a kmem limit. If the kernel - // doesn't support it we *must* error out. 
- return errors.New("kernel memory accounting not supported by this kernel") - } - if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { - // Check if the error number returned by the syscall is "EBUSY" - // The EBUSY signal is returned on attempts to write to the - // memory.kmem.limit_in_bytes file if the cgroup has children or - // once tasks have been attached to the cgroup - if pathErr, ok := err.(*os.PathError); ok { - if errNo, ok := pathErr.Err.(syscall.Errno); ok { - if errNo == unix.EBUSY { - return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) - } - } - } - return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) - } - return nil -} diff --git a/libcontainer/cgroups/fs/kmem_disabled.go b/libcontainer/cgroups/fs/kmem_disabled.go deleted file mode 100644 index ac290fd..0000000 --- a/libcontainer/cgroups/fs/kmem_disabled.go +++ /dev/null @@ -1,15 +0,0 @@ -// +build linux,nokmem - -package fs - -import ( - "errors" -) - -func EnableKernelMemoryAccounting(path string) error { - return nil -} - -func setKernelMemory(path string, kernelMemoryLimit int64) error { - return errors.New("kernel memory accounting disabled in this runc build") -} diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go index f81ed05..b7c75f9 100644 --- a/libcontainer/cgroups/fs/memory.go +++ b/libcontainer/cgroups/fs/memory.go @@ -1,15 +1,17 @@ -// +build linux - package fs import ( "bufio" + "errors" "fmt" + "math" "os" "path/filepath" "strconv" "strings" + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" @@ -18,65 +20,66 @@ import ( const ( cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" 
cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupMemoryUsage = "memory.usage_in_bytes" + cgroupMemoryMaxUsage = "memory.max_usage_in_bytes" ) -type MemoryGroup struct { -} +type MemoryGroup struct{} func (s *MemoryGroup) Name() string { return "memory" } -func (s *MemoryGroup) Apply(d *cgroupData) (err error) { - path, err := d.path("memory") - if err != nil && !cgroups.IsNotFound(err) { - return err - } else if path == "" { - return nil - } - if memoryAssigned(d.config) { - if _, err := os.Stat(path); os.IsNotExist(err) { - if err := os.MkdirAll(path, 0755); err != nil { - return err - } - // Only enable kernel memory accouting when this cgroup - // is created by libcontainer, otherwise we might get - // error when people use `cgroupsPath` to join an existed - // cgroup whose kernel memory is not initialized. - if err := EnableKernelMemoryAccounting(path); err != nil { - return err - } - } - } - defer func() { - if err != nil { - os.RemoveAll(path) - } - }() - - // We need to join memory cgroup after set memory limits, because - // kmem.limit_in_bytes can only be set when the cgroup is empty. - _, err = d.join("memory") - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil +func (s *MemoryGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) } -func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { - // If the memory update is set to -1 we should also - // set swap to -1, it means unlimited memory. - if cgroup.Resources.Memory == -1 { +func setMemory(path string, val int64) error { + if val == 0 { + return nil + } + + err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10)) + if !errors.Is(err, unix.EBUSY) { + return err + } + + // EBUSY means the kernel can't set new limit as it's too low + // (lower than the current usage). Return more specific error. 
+ usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage) + if err != nil { + return err + } + max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage) + if err != nil { + return err + } + + return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max) +} + +func setSwap(path string, val int64) error { + if val == 0 { + return nil + } + + return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10)) +} + +func setMemoryAndSwap(path string, r *configs.Resources) error { + // If the memory update is set to -1 and the swap is not explicitly + // set, we should also set swap to -1, it means unlimited memory. + if r.Memory == -1 && r.MemorySwap == 0 { // Only set swap if it's enabled in kernel if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) { - cgroup.Resources.MemorySwap = -1 + r.MemorySwap = -1 } } // When memory and swap memory are both set, we need to handle the cases // for updating container. - if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 { - memoryUsage, err := getMemoryData(path, "") + if r.Memory != 0 && r.MemorySwap != 0 { + curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit) if err != nil { return err } @@ -84,84 +87,61 @@ func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { // When update memory limit, we should adapt the write sequence // for memory and swap memory, so it won't fail because the new // value and the old value don't fit kernel's validation. 
- if cgroup.Resources.MemorySwap == -1 || memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) { - if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) { + if err := setSwap(path, r.MemorySwap); err != nil { return err } - if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { - return err - } - } else { - if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { - return err - } - if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { - return err - } - } - } else { - if cgroup.Resources.Memory != 0 { - if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { - return err - } - } - if cgroup.Resources.MemorySwap != 0 { - if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + if err := setMemory(path, r.Memory); err != nil { return err } + return nil } } - return nil -} - -func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { - if err := setMemoryAndSwap(path, cgroup); err != nil { + if err := setMemory(path, r.Memory); err != nil { + return err + } + if err := setSwap(path, r.MemorySwap); err != nil { return err } - if cgroup.Resources.KernelMemory != 0 { - if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil { + return nil +} + +func (s *MemoryGroup) Set(path string, r *configs.Resources) error { + if err := setMemoryAndSwap(path, r); err != nil { + return err + } + + // ignore KernelMemory and KernelMemoryTCP + + if r.MemoryReservation != 0 { + if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil { return err } } - 
if cgroup.Resources.MemoryReservation != 0 { - if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { + if r.OomKillDisable { + if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil { return err } } - - if cgroup.Resources.KernelMemoryTCP != 0 { - if err := fscommon.WriteFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil { - return err - } - } - if cgroup.Resources.OomKillDisable { - if err := fscommon.WriteFile(path, "memory.oom_control", "1"); err != nil { - return err - } - } - if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 { + if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 { return nil - } else if *cgroup.Resources.MemorySwappiness <= 100 { - if err := fscommon.WriteFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil { + } else if *r.MemorySwappiness <= 100 { + if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil { return err } } else { - return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness) + return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness) } return nil } -func (s *MemoryGroup) Remove(d *cgroupData) error { - return removePath(d.path("memory")) -} - func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { - // Set stats from memory.stat. 
- statsFile, err := os.Open(filepath.Join(path, "memory.stat")) + const file = "memory.stat" + statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY) if err != nil { if os.IsNotExist(err) { return nil @@ -172,9 +152,9 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { sc := bufio.NewScanner(statsFile) for sc.Scan() { - t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + t, v, err := fscommon.ParseKeyValue(sc.Text()) if err != nil { - return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err) + return &parseError{Path: path, File: file, Err: err} } stats.MemoryStats.Stats[t] = v } @@ -201,25 +181,21 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { } stats.MemoryStats.KernelTCPUsage = kernelTCPUsage - useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".") - value, err := fscommon.GetCgroupParamUint(path, useHierarchy) + value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy") if err != nil { return err } if value == 1 { stats.MemoryStats.UseHierarchy = true } - return nil -} -func memoryAssigned(cgroup *configs.Cgroup) bool { - return cgroup.Resources.Memory != 0 || - cgroup.Resources.MemoryReservation != 0 || - cgroup.Resources.MemorySwap > 0 || - cgroup.Resources.KernelMemory > 0 || - cgroup.Resources.KernelMemoryTCP > 0 || - cgroup.Resources.OomKillDisable || - (cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1) + pagesByNUMA, err := getPageUsageByNUMA(path) + if err != nil { + return err + } + stats.MemoryStats.PageUsageByNUMA = pagesByNUMA + + return nil } func getMemoryData(path, name string) (cgroups.MemoryData, error) { @@ -227,45 +203,146 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) { moduleName := "memory" if name != "" { - moduleName = strings.Join([]string{"memory", name}, ".") + moduleName = "memory." 
+ name } - usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".") - maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".") - failcnt := strings.Join([]string{moduleName, "failcnt"}, ".") - limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".") + var ( + usage = moduleName + ".usage_in_bytes" + maxUsage = moduleName + ".max_usage_in_bytes" + failcnt = moduleName + ".failcnt" + limit = moduleName + ".limit_in_bytes" + ) value, err := fscommon.GetCgroupParamUint(path, usage) if err != nil { - if moduleName != "memory" && os.IsNotExist(err) { + if name != "" && os.IsNotExist(err) { + // Ignore ENOENT as swap and kmem controllers + // are optional in the kernel. return cgroups.MemoryData{}, nil } - return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err) + return cgroups.MemoryData{}, err } memoryData.Usage = value value, err = fscommon.GetCgroupParamUint(path, maxUsage) if err != nil { - if moduleName != "memory" && os.IsNotExist(err) { - return cgroups.MemoryData{}, nil - } - return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", maxUsage, err) + return cgroups.MemoryData{}, err } memoryData.MaxUsage = value value, err = fscommon.GetCgroupParamUint(path, failcnt) if err != nil { - if moduleName != "memory" && os.IsNotExist(err) { - return cgroups.MemoryData{}, nil - } - return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err) + return cgroups.MemoryData{}, err } memoryData.Failcnt = value value, err = fscommon.GetCgroupParamUint(path, limit) if err != nil { - if moduleName != "memory" && os.IsNotExist(err) { - return cgroups.MemoryData{}, nil - } - return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err) + return cgroups.MemoryData{}, err } memoryData.Limit = value return memoryData, nil } + +func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) { + const ( + maxColumns = math.MaxUint8 + 1 + file = "memory.numa_stat" + ) + stats := 
cgroups.PageUsageByNUMA{} + + fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if os.IsNotExist(err) { + return stats, nil + } else if err != nil { + return stats, err + } + defer fd.Close() + + // File format is documented in linux/Documentation/cgroup-v1/memory.txt + // and it looks like this: + // + // total= N0= N1= ... + // file= N0= N1= ... + // anon= N0= N1= ... + // unevictable= N0= N1= ... + // hierarchical_= N0= N1= ... + + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + var field *cgroups.PageStats + + line := scanner.Text() + columns := strings.SplitN(line, " ", maxColumns) + for i, column := range columns { + byNode := strings.SplitN(column, "=", 2) + // Some custom kernels have non-standard fields, like + // numa_locality 0 0 0 0 0 0 0 0 0 0 + // numa_exectime 0 + if len(byNode) < 2 { + if i == 0 { + // Ignore/skip those. + break + } else { + // The first column was already validated, + // so be strict to the rest. + return stats, malformedLine(path, file, line) + } + } + key, val := byNode[0], byNode[1] + if i == 0 { // First column: key is name, val is total. + field = getNUMAField(&stats, key) + if field == nil { // unknown field (new kernel?) + break + } + field.Total, err = strconv.ParseUint(val, 0, 64) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + field.Nodes = map[uint8]uint64{} + } else { // Subsequent columns: key is N, val is usage. + if len(key) < 2 || key[0] != 'N' { + // This is definitely an error. 
+ return stats, malformedLine(path, file, line) + } + + n, err := strconv.ParseUint(key[1:], 10, 8) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + + usage, err := strconv.ParseUint(val, 10, 64) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + + field.Nodes[uint8(n)] = usage + } + + } + } + if err := scanner.Err(); err != nil { + return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err} + } + + return stats, nil +} + +func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats { + switch name { + case "total": + return &stats.Total + case "file": + return &stats.File + case "anon": + return &stats.Anon + case "unevictable": + return &stats.Unevictable + case "hierarchical_total": + return &stats.Hierarchical.Total + case "hierarchical_file": + return &stats.Hierarchical.File + case "hierarchical_anon": + return &stats.Hierarchical.Anon + case "hierarchical_unevictable": + return &stats.Hierarchical.Unevictable + } + return nil +} diff --git a/libcontainer/cgroups/fs/memory_test.go b/libcontainer/cgroups/fs/memory_test.go index 62de563..d305a62 100644 --- a/libcontainer/cgroups/fs/memory_test.go +++ b/libcontainer/cgroups/fs/memory_test.go @@ -1,5 +1,3 @@ -// +build linux - package fs import ( @@ -8,6 +6,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" ) const ( @@ -18,11 +17,29 @@ rss 1024` memoryFailcnt = "100\n" memoryLimitContents = "8192\n" memoryUseHierarchyContents = "1\n" + memoryNUMAStatContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497 +file=44428 N0=32614 N1=7335 N2=1982 N3=2497 +anon=183 N0=17 N1=166 N2=0 N3=0 +unevictable=0 N0=0 N1=0 N2=0 N3=0 +hierarchical_total=768133 N0=509113 N1=138887 N2=20464 N3=99669 +hierarchical_file=722017 N0=496516 N1=119997 N2=20181 N3=85323 +hierarchical_anon=46096 
N0=12597 N1=18890 N2=283 N3=14326 +hierarchical_unevictable=20 N0=0 N1=0 N2=0 N3=20 +` + memoryNUMAStatNoHierarchyContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497 +file=44428 N0=32614 N1=7335 N2=1982 N3=2497 +anon=183 N0=17 N1=166 N2=0 N3=0 +unevictable=0 N0=0 N1=0 N2=0 N3=0 +` + // Some custom kernels has extra fields that should be ignored + memoryNUMAStatExtraContents = `numa_locality 0 0 0 0 0 0 0 0 0 0 +numa_exectime 0 +whatever=100 N0=0 +` ) func TestMemorySetMemory(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() + path := tempDir(t, "memory") const ( memoryBefore = 314572800 // 300M @@ -31,29 +48,31 @@ func TestMemorySetMemory(t *testing.T) { reservationAfter = 314572800 // 300M ) - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "memory.limit_in_bytes": strconv.Itoa(memoryBefore), "memory.soft_limit_in_bytes": strconv.Itoa(reservationBefore), }) - helper.CgroupData.config.Resources.Memory = memoryAfter - helper.CgroupData.config.Resources.MemoryReservation = reservationAfter + r := &configs.Resources{ + Memory: memoryAfter, + MemoryReservation: reservationAfter, + } memory := &MemoryGroup{} - if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := memory.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes") + value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes") if err != nil { - t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err) + t.Fatal(err) } if value != memoryAfter { t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") } - value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.soft_limit_in_bytes") + value, err = fscommon.GetCgroupParamUint(path, "memory.soft_limit_in_bytes") if err != nil { - t.Fatalf("Failed to parse memory.soft_limit_in_bytes - %s", err) + t.Fatal(err) } if value != reservationAfter 
{ t.Fatal("Got the wrong value, set memory.soft_limit_in_bytes failed.") @@ -61,27 +80,28 @@ func TestMemorySetMemory(t *testing.T) { } func TestMemorySetMemoryswap(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() + path := tempDir(t, "memory") const ( memoryswapBefore = 314572800 // 300M memoryswapAfter = 524288000 // 500M ) - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), }) - helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter + r := &configs.Resources{ + MemorySwap: memoryswapAfter, + } memory := &MemoryGroup{} - if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := memory.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes") + value, err := fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes") if err != nil { - t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err) + t.Fatal(err) } if value != memoryswapAfter { t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") @@ -89,8 +109,7 @@ func TestMemorySetMemoryswap(t *testing.T) { } func TestMemorySetMemoryLargerThanSwap(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() + path := tempDir(t, "memory") const ( memoryBefore = 314572800 // 300M @@ -99,7 +118,7 @@ func TestMemorySetMemoryLargerThanSwap(t *testing.T) { memoryswapAfter = 838860800 // 800M ) - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "memory.limit_in_bytes": strconv.Itoa(memoryBefore), "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), // Set will call getMemoryData when memory and swap memory are @@ -109,23 +128,26 @@ func TestMemorySetMemoryLargerThanSwap(t *testing.T) { "memory.failcnt": "0", }) - helper.CgroupData.config.Resources.Memory = 
memoryAfter - helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter + r := &configs.Resources{ + Memory: memoryAfter, + MemorySwap: memoryswapAfter, + } memory := &MemoryGroup{} - if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := memory.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes") + value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes") if err != nil { - t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err) + t.Fatal(err) } if value != memoryAfter { t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") } - value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes") + + value, err = fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes") if err != nil { - t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err) + t.Fatal(err) } if value != memoryswapAfter { t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") @@ -133,8 +155,7 @@ func TestMemorySetMemoryLargerThanSwap(t *testing.T) { } func TestMemorySetSwapSmallerThanMemory(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() + path := tempDir(t, "memory") const ( memoryBefore = 629145600 // 600M @@ -143,115 +164,58 @@ func TestMemorySetSwapSmallerThanMemory(t *testing.T) { memoryswapAfter = 524288000 // 500M ) - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "memory.limit_in_bytes": strconv.Itoa(memoryBefore), "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), - // Set will call getMemoryData when memory and swap memory are - // both set, fake these fields so we don't get error. 
- "memory.usage_in_bytes": "0", - "memory.max_usage_in_bytes": "0", - "memory.failcnt": "0", }) - helper.CgroupData.config.Resources.Memory = memoryAfter - helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter + r := &configs.Resources{ + Memory: memoryAfter, + MemorySwap: memoryswapAfter, + } memory := &MemoryGroup{} - if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := memory.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes") + value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes") if err != nil { - t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err) + t.Fatal(err) } if value != memoryAfter { - t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") + t.Fatalf("Got the wrong value (%d != %d), set memory.limit_in_bytes failed", value, memoryAfter) } - value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes") + + value, err = fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes") if err != nil { - t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err) + t.Fatal(err) } if value != memoryswapAfter { - t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") - } -} - -func TestMemorySetKernelMemory(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - - const ( - kernelMemoryBefore = 314572800 // 300M - kernelMemoryAfter = 524288000 // 500M - ) - - helper.writeFileContents(map[string]string{ - "memory.kmem.limit_in_bytes": strconv.Itoa(kernelMemoryBefore), - }) - - helper.CgroupData.config.Resources.KernelMemory = kernelMemoryAfter - memory := &MemoryGroup{} - if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { - t.Fatal(err) - } - - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.limit_in_bytes") - if err != nil { - t.Fatalf("Failed to parse 
memory.kmem.limit_in_bytes - %s", err) - } - if value != kernelMemoryAfter { - t.Fatal("Got the wrong value, set memory.kmem.limit_in_bytes failed.") - } -} - -func TestMemorySetKernelMemoryTCP(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - - const ( - kernelMemoryTCPBefore = 314572800 // 300M - kernelMemoryTCPAfter = 524288000 // 500M - ) - - helper.writeFileContents(map[string]string{ - "memory.kmem.tcp.limit_in_bytes": strconv.Itoa(kernelMemoryTCPBefore), - }) - - helper.CgroupData.config.Resources.KernelMemoryTCP = kernelMemoryTCPAfter - memory := &MemoryGroup{} - if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { - t.Fatal(err) - } - - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.tcp.limit_in_bytes") - if err != nil { - t.Fatalf("Failed to parse memory.kmem.tcp.limit_in_bytes - %s", err) - } - if value != kernelMemoryTCPAfter { - t.Fatal("Got the wrong value, set memory.kmem.tcp.limit_in_bytes failed.") + t.Fatalf("Got the wrong value (%d != %d), set memory.memsw.limit_in_bytes failed", value, memoryswapAfter) } } func TestMemorySetMemorySwappinessDefault(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() + path := tempDir(t, "memory") - swappinessBefore := 60 //default is 60 + swappinessBefore := 60 // default is 60 swappinessAfter := uint64(0) - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "memory.swappiness": strconv.Itoa(swappinessBefore), }) - helper.CgroupData.config.Resources.MemorySwappiness = &swappinessAfter + r := &configs.Resources{ + MemorySwappiness: &swappinessAfter, + } memory := &MemoryGroup{} - if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := memory.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.swappiness") + value, err := fscommon.GetCgroupParamUint(path, 
"memory.swappiness") if err != nil { - t.Fatalf("Failed to parse memory.swappiness - %s", err) + t.Fatal(err) } if value != swappinessAfter { t.Fatalf("Got the wrong value (%d), set memory.swappiness = %d failed.", value, swappinessAfter) @@ -259,9 +223,8 @@ func TestMemorySetMemorySwappinessDefault(t *testing.T) { } func TestMemoryStats(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.usage_in_bytes": memoryUsageContents, "memory.limit_in_bytes": memoryLimitContents, @@ -276,22 +239,43 @@ func TestMemoryStats(t *testing.T) { "memory.kmem.failcnt": memoryFailcnt, "memory.kmem.limit_in_bytes": memoryLimitContents, "memory.use_hierarchy": memoryUseHierarchyContents, + "memory.numa_stat": memoryNUMAStatContents + memoryNUMAStatExtraContents, }) memory := &MemoryGroup{} actualStats := *cgroups.NewStats() - err := memory.GetStats(helper.CgroupPath, &actualStats) + err := memory.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } - expectedStats := cgroups.MemoryStats{Cache: 512, Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, Stats: map[string]uint64{"cache": 512, "rss": 1024}, UseHierarchy: true} + expectedStats := cgroups.MemoryStats{ + Cache: 512, + Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + Stats: map[string]uint64{"cache": 512, "rss": 1024}, + UseHierarchy: true, + PageUsageByNUMA: cgroups.PageUsageByNUMA{ + PageUsageByNUMAInner: 
cgroups.PageUsageByNUMAInner{ + Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}}, + File: cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}}, + Anon: cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}}, + Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}}, + }, + Hierarchical: cgroups.PageUsageByNUMAInner{ + Total: cgroups.PageStats{Total: 768133, Nodes: map[uint8]uint64{0: 509113, 1: 138887, 2: 20464, 3: 99669}}, + File: cgroups.PageStats{Total: 722017, Nodes: map[uint8]uint64{0: 496516, 1: 119997, 2: 20181, 3: 85323}}, + Anon: cgroups.PageStats{Total: 46096, Nodes: map[uint8]uint64{0: 12597, 1: 18890, 2: 283, 3: 14326}}, + Unevictable: cgroups.PageStats{Total: 20, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 20}}, + }, + }, + } expectMemoryStatEquals(t, expectedStats, actualStats.MemoryStats) } func TestMemoryStatsNoStatFile(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ "memory.usage_in_bytes": memoryUsageContents, "memory.max_usage_in_bytes": memoryMaxUsageContents, "memory.limit_in_bytes": memoryLimitContents, @@ -299,16 +283,15 @@ func TestMemoryStatsNoStatFile(t *testing.T) { memory := &MemoryGroup{} actualStats := *cgroups.NewStats() - err := memory.GetStats(helper.CgroupPath, &actualStats) + err := memory.GetStats(path, &actualStats) if err != nil { t.Fatal(err) } } func TestMemoryStatsNoUsageFile(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.max_usage_in_bytes": memoryMaxUsageContents, "memory.limit_in_bytes": memoryLimitContents, @@ 
-316,16 +299,15 @@ func TestMemoryStatsNoUsageFile(t *testing.T) { memory := &MemoryGroup{} actualStats := *cgroups.NewStats() - err := memory.GetStats(helper.CgroupPath, &actualStats) + err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemoryStatsNoMaxUsageFile(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.usage_in_bytes": memoryUsageContents, "memory.limit_in_bytes": memoryLimitContents, @@ -333,16 +315,15 @@ func TestMemoryStatsNoMaxUsageFile(t *testing.T) { memory := &MemoryGroup{} actualStats := *cgroups.NewStats() - err := memory.GetStats(helper.CgroupPath, &actualStats) + err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemoryStatsNoLimitInBytesFile(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.usage_in_bytes": memoryUsageContents, "memory.max_usage_in_bytes": memoryMaxUsageContents, @@ -350,16 +331,15 @@ func TestMemoryStatsNoLimitInBytesFile(t *testing.T) { memory := &MemoryGroup{} actualStats := *cgroups.NewStats() - err := memory.GetStats(helper.CgroupPath, &actualStats) + err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemoryStatsBadStatFile(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ "memory.stat": "rss rss", "memory.usage_in_bytes": memoryUsageContents, "memory.max_usage_in_bytes": memoryMaxUsageContents, @@ -368,16 +348,15 @@ func 
TestMemoryStatsBadStatFile(t *testing.T) { memory := &MemoryGroup{} actualStats := *cgroups.NewStats() - err := memory.GetStats(helper.CgroupPath, &actualStats) + err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemoryStatsBadUsageFile(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.usage_in_bytes": "bad", "memory.max_usage_in_bytes": memoryMaxUsageContents, @@ -386,16 +365,15 @@ func TestMemoryStatsBadUsageFile(t *testing.T) { memory := &MemoryGroup{} actualStats := *cgroups.NewStats() - err := memory.GetStats(helper.CgroupPath, &actualStats) + err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemoryStatsBadMaxUsageFile(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.usage_in_bytes": memoryUsageContents, "memory.max_usage_in_bytes": "bad", @@ -404,16 +382,15 @@ func TestMemoryStatsBadMaxUsageFile(t *testing.T) { memory := &MemoryGroup{} actualStats := *cgroups.NewStats() - err := memory.GetStats(helper.CgroupPath, &actualStats) + err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemoryStatsBadLimitInBytesFile(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() - helper.writeFileContents(map[string]string{ + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ "memory.stat": memoryStatContents, "memory.usage_in_bytes": memoryUsageContents, "memory.max_usage_in_bytes": memoryMaxUsageContents, @@ -422,35 +399,108 @@ func TestMemoryStatsBadLimitInBytesFile(t 
*testing.T) { memory := &MemoryGroup{} actualStats := *cgroups.NewStats() - err := memory.GetStats(helper.CgroupPath, &actualStats) + err := memory.GetStats(path, &actualStats) if err == nil { t.Fatal("Expected failure") } } func TestMemorySetOomControl(t *testing.T) { - helper := NewCgroupTestUtil("memory", t) - defer helper.cleanup() + path := tempDir(t, "memory") const ( oomKillDisable = 1 // disable oom killer, default is 0 ) - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "memory.oom_control": strconv.Itoa(oomKillDisable), }) memory := &MemoryGroup{} - if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + r := &configs.Resources{} + if err := memory.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.oom_control") + value, err := fscommon.GetCgroupParamUint(path, "memory.oom_control") if err != nil { - t.Fatalf("Failed to parse memory.oom_control - %s", err) + t.Fatal(err) } - if value != oomKillDisable { t.Fatalf("Got the wrong value, set memory.oom_control failed.") } } + +func TestNoHierarchicalNumaStat(t *testing.T) { + path := tempDir(t, "memory") + writeFileContents(t, path, map[string]string{ + "memory.numa_stat": memoryNUMAStatNoHierarchyContents + memoryNUMAStatExtraContents, + }) + + actualStats, err := getPageUsageByNUMA(path) + if err != nil { + t.Fatal(err) + } + pageUsageByNUMA := cgroups.PageUsageByNUMA{ + PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{ + Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}}, + File: cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}}, + Anon: cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}}, + Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}}, + }, + Hierarchical: cgroups.PageUsageByNUMAInner{}, + } + 
expectPageUsageByNUMAEquals(t, pageUsageByNUMA, actualStats) +} + +func TestBadNumaStat(t *testing.T) { + memoryNUMAStatBadContents := []struct { + desc, contents string + }{ + { + desc: "Nx where x is not a number", + contents: `total=44611 N0=44611, +file=44428 Nx=0 +`, + }, { + desc: "Nx where x > 255", + contents: `total=44611 N333=444`, + }, { + desc: "Nx argument missing", + contents: `total=44611 N0=123 N1=`, + }, { + desc: "Nx argument is not a number", + contents: `total=44611 N0=123 N1=a`, + }, { + desc: "Missing = after Nx", + contents: `total=44611 N0=123 N1`, + }, { + desc: "No Nx at non-first position", + contents: `total=44611 N0=32631 +file=44428 N0=32614 +anon=183 N0=12 badone +`, + }, + } + path := tempDir(t, "memory") + for _, c := range memoryNUMAStatBadContents { + writeFileContents(t, path, map[string]string{ + "memory.numa_stat": c.contents, + }) + + _, err := getPageUsageByNUMA(path) + if err == nil { + t.Errorf("case %q: expected error, got nil", c.desc) + } + } +} + +func TestWithoutNumaStat(t *testing.T) { + path := tempDir(t, "memory") + + actualStats, err := getPageUsageByNUMA(path) + if err != nil { + t.Fatal(err) + } + expectPageUsageByNUMAEquals(t, cgroups.PageUsageByNUMA{}, actualStats) +} diff --git a/libcontainer/cgroups/fs/name.go b/libcontainer/cgroups/fs/name.go index d8cf1d8..b8d5d84 100644 --- a/libcontainer/cgroups/fs/name.go +++ b/libcontainer/cgroups/fs/name.go @@ -1,5 +1,3 @@ -// +build linux - package fs import ( @@ -16,22 +14,15 @@ func (s *NameGroup) Name() string { return s.GroupName } -func (s *NameGroup) Apply(d *cgroupData) error { +func (s *NameGroup) Apply(path string, _ *configs.Resources, pid int) error { if s.Join { - // ignore errors if the named cgroup does not exist - d.join(s.GroupName) + // Ignore errors if the named cgroup does not exist. 
+ _ = apply(path, pid) } return nil } -func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error { - return nil -} - -func (s *NameGroup) Remove(d *cgroupData) error { - if s.Join { - removePath(d.path(s.GroupName)) - } +func (s *NameGroup) Set(_ string, _ *configs.Resources) error { return nil } diff --git a/libcontainer/cgroups/fs/net_cls.go b/libcontainer/cgroups/fs/net_cls.go index 0212015..abfd09c 100644 --- a/libcontainer/cgroups/fs/net_cls.go +++ b/libcontainer/cgroups/fs/net_cls.go @@ -1,33 +1,25 @@ -// +build linux - package fs import ( "strconv" "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" ) -type NetClsGroup struct { -} +type NetClsGroup struct{} func (s *NetClsGroup) Name() string { return "net_cls" } -func (s *NetClsGroup) Apply(d *cgroupData) error { - _, err := d.join("net_cls") - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil +func (s *NetClsGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) } -func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error { - if cgroup.Resources.NetClsClassid != 0 { - if err := fscommon.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil { +func (s *NetClsGroup) Set(path string, r *configs.Resources) error { + if r.NetClsClassid != 0 { + if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil { return err } } @@ -35,10 +27,6 @@ func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error { return nil } -func (s *NetClsGroup) Remove(d *cgroupData) error { - return removePath(d.path("net_cls")) -} - func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } diff --git a/libcontainer/cgroups/fs/net_cls_test.go b/libcontainer/cgroups/fs/net_cls_test.go index 
602133a..085c061 100644 --- a/libcontainer/cgroups/fs/net_cls_test.go +++ b/libcontainer/cgroups/fs/net_cls_test.go @@ -1,5 +1,3 @@ -// +build linux - package fs import ( @@ -7,6 +5,7 @@ import ( "testing" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" ) const ( @@ -15,25 +14,26 @@ const ( ) func TestNetClsSetClassid(t *testing.T) { - helper := NewCgroupTestUtil("net_cls", t) - defer helper.cleanup() + path := tempDir(t, "net_cls") - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "net_cls.classid": strconv.FormatUint(classidBefore, 10), }) - helper.CgroupData.config.Resources.NetClsClassid = classidAfter + r := &configs.Resources{ + NetClsClassid: classidAfter, + } netcls := &NetClsGroup{} - if err := netcls.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := netcls.Set(path, r); err != nil { t.Fatal(err) } // As we are in mock environment, we can't get correct value of classid from // net_cls.classid. // So. 
we just judge if we successfully write classid into file - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "net_cls.classid") + value, err := fscommon.GetCgroupParamUint(path, "net_cls.classid") if err != nil { - t.Fatalf("Failed to parse net_cls.classid - %s", err) + t.Fatal(err) } if value != classidAfter { t.Fatal("Got the wrong value, set net_cls.classid failed.") diff --git a/libcontainer/cgroups/fs/net_prio.go b/libcontainer/cgroups/fs/net_prio.go index 2bdeedf..da74d37 100644 --- a/libcontainer/cgroups/fs/net_prio.go +++ b/libcontainer/cgroups/fs/net_prio.go @@ -1,31 +1,23 @@ -// +build linux - package fs import ( "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" ) -type NetPrioGroup struct { -} +type NetPrioGroup struct{} func (s *NetPrioGroup) Name() string { return "net_prio" } -func (s *NetPrioGroup) Apply(d *cgroupData) error { - _, err := d.join("net_prio") - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil +func (s *NetPrioGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) } -func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error { - for _, prioMap := range cgroup.Resources.NetPrioIfpriomap { - if err := fscommon.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { +func (s *NetPrioGroup) Set(path string, r *configs.Resources) error { + for _, prioMap := range r.NetPrioIfpriomap { + if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { return err } } @@ -33,10 +25,6 @@ func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error { return nil } -func (s *NetPrioGroup) Remove(d *cgroupData) error { - return removePath(d.path("net_prio")) -} - func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } diff --git 
a/libcontainer/cgroups/fs/net_prio_test.go b/libcontainer/cgroups/fs/net_prio_test.go index 2ce8e19..453ff36 100644 --- a/libcontainer/cgroups/fs/net_prio_test.go +++ b/libcontainer/cgroups/fs/net_prio_test.go @@ -1,5 +1,3 @@ -// +build linux - package fs import ( @@ -10,28 +8,27 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" ) -var ( - prioMap = []*configs.IfPrioMap{ - { - Interface: "test", - Priority: 5, - }, - } -) +var prioMap = []*configs.IfPrioMap{ + { + Interface: "test", + Priority: 5, + }, +} func TestNetPrioSetIfPrio(t *testing.T) { - helper := NewCgroupTestUtil("net_prio", t) - defer helper.cleanup() + path := tempDir(t, "net_prio") - helper.CgroupData.config.Resources.NetPrioIfpriomap = prioMap + r := &configs.Resources{ + NetPrioIfpriomap: prioMap, + } netPrio := &NetPrioGroup{} - if err := netPrio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := netPrio.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "net_prio.ifpriomap") + value, err := fscommon.GetCgroupParamString(path, "net_prio.ifpriomap") if err != nil { - t.Fatalf("Failed to parse net_prio.ifpriomap - %s", err) + t.Fatal(err) } if !strings.Contains(value, "test 5") { t.Fatal("Got the wrong value, set net_prio.ifpriomap failed.") diff --git a/libcontainer/cgroups/fs/paths.go b/libcontainer/cgroups/fs/paths.go new file mode 100644 index 0000000..1092331 --- /dev/null +++ b/libcontainer/cgroups/fs/paths.go @@ -0,0 +1,186 @@ +package fs + +import ( + "errors" + "os" + "path/filepath" + "sync" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" +) + +// The absolute path to the root of the cgroup hierarchies. 
+var ( + cgroupRootLock sync.Mutex + cgroupRoot string +) + +const defaultCgroupRoot = "/sys/fs/cgroup" + +func initPaths(cg *configs.Cgroup) (map[string]string, error) { + root, err := rootPath() + if err != nil { + return nil, err + } + + inner, err := innerPath(cg) + if err != nil { + return nil, err + } + + paths := make(map[string]string) + for _, sys := range subsystems { + name := sys.Name() + path, err := subsysPath(root, inner, name) + if err != nil { + // The non-presence of the devices subsystem + // is considered fatal for security reasons. + if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") { + continue + } + + return nil, err + } + paths[name] = path + } + + return paths, nil +} + +func tryDefaultCgroupRoot() string { + var st, pst unix.Stat_t + + // (1) it should be a directory... + err := unix.Lstat(defaultCgroupRoot, &st) + if err != nil || st.Mode&unix.S_IFDIR == 0 { + return "" + } + + // (2) ... and a mount point ... + err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst) + if err != nil { + return "" + } + + if st.Dev == pst.Dev { + // parent dir has the same dev -- not a mount point + return "" + } + + // (3) ... of 'tmpfs' fs type. + var fst unix.Statfs_t + err = unix.Statfs(defaultCgroupRoot, &fst) + if err != nil || fst.Type != unix.TMPFS_MAGIC { + return "" + } + + // (4) it should have at least 1 entry ... + dir, err := os.Open(defaultCgroupRoot) + if err != nil { + return "" + } + names, err := dir.Readdirnames(1) + if err != nil { + return "" + } + if len(names) < 1 { + return "" + } + // ... which is a cgroup mount point. + err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst) + if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { + return "" + } + + return defaultCgroupRoot +} + +// rootPath finds and returns path to the root of the cgroup hierarchies. 
+func rootPath() (string, error) { + cgroupRootLock.Lock() + defer cgroupRootLock.Unlock() + + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // fast path + cgroupRoot = tryDefaultCgroupRoot() + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // slow path: parse mountinfo + mi, err := cgroups.GetCgroupMounts(false) + if err != nil { + return "", err + } + if len(mi) < 1 { + return "", errors.New("no cgroup mount found in mountinfo") + } + + // Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"), + // use its parent directory. + root := filepath.Dir(mi[0].Mountpoint) + + if _, err := os.Stat(root); err != nil { + return "", err + } + + cgroupRoot = root + return cgroupRoot, nil +} + +func innerPath(c *configs.Cgroup) (string, error) { + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return "", errors.New("cgroup: either Path or Name and Parent should be used") + } + + // XXX: Do not remove CleanPath. Path safety is important! -- cyphar + innerPath := utils.CleanPath(c.Path) + if innerPath == "" { + cgParent := utils.CleanPath(c.Parent) + cgName := utils.CleanPath(c.Name) + innerPath = filepath.Join(cgParent, cgName) + } + + return innerPath, nil +} + +func subsysPath(root, inner, subsystem string) (string, error) { + // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. + if filepath.IsAbs(inner) { + mnt, err := cgroups.FindCgroupMountpoint(root, subsystem) + // If we didn't mount the subsystem, there is no point we make the path. + if err != nil { + return "", err + } + + // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. + return filepath.Join(root, filepath.Base(mnt), inner), nil + } + + // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating + // process could in container and shared pid namespace with host, and + // /proc/1/cgroup could point to whole other world of cgroups. 
+ parentPath, err := cgroups.GetOwnCgroupPath(subsystem) + if err != nil { + return "", err + } + + return filepath.Join(parentPath, inner), nil +} + +func apply(path string, pid int) error { + if path == "" { + return nil + } + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + return cgroups.WriteCgroupProc(path, pid) +} diff --git a/libcontainer/cgroups/fs/paths_test.go b/libcontainer/cgroups/fs/paths_test.go new file mode 100644 index 0000000..3a4d45f --- /dev/null +++ b/libcontainer/cgroups/fs/paths_test.go @@ -0,0 +1,104 @@ +package fs + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func TestInvalidCgroupPath(t *testing.T) { + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v2 is not supported") + } + + root, err := rootPath() + if err != nil { + t.Fatalf("couldn't get cgroup root: %v", err) + } + + testCases := []struct { + test string + path, name, parent string + }{ + { + test: "invalid cgroup path", + path: "../../../../../../../../../../some/path", + }, + { + test: "invalid absolute cgroup path", + path: "/../../../../../../../../../../some/path", + }, + { + test: "invalid cgroup parent", + parent: "../../../../../../../../../../some/path", + name: "name", + }, + { + test: "invalid absolute cgroup parent", + parent: "/../../../../../../../../../../some/path", + name: "name", + }, + { + test: "invalid cgroup name", + parent: "parent", + name: "../../../../../../../../../../some/path", + }, + { + test: "invalid absolute cgroup name", + parent: "parent", + name: "/../../../../../../../../../../some/path", + }, + { + test: "invalid cgroup name and parent", + parent: "../../../../../../../../../../some/path", + name: "../../../../../../../../../../some/path", + }, + { + test: "invalid absolute cgroup name and parent", + parent: "/../../../../../../../../../../some/path", + name: 
"/../../../../../../../../../../some/path", + }, + } + + for _, tc := range testCases { + t.Run(tc.test, func(t *testing.T) { + config := &configs.Cgroup{Path: tc.path, Name: tc.name, Parent: tc.parent} + + inner, err := innerPath(config) + if err != nil { + t.Fatalf("couldn't get cgroup data: %v", err) + } + + // Make sure the final inner path doesn't go outside the cgroup mountpoint. + if strings.HasPrefix(inner, "..") { + t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") + } + + // Double-check, using an actual cgroup. + deviceRoot := filepath.Join(root, "devices") + devicePath, err := subsysPath(root, inner, "devices") + if err != nil { + t.Fatalf("couldn't get cgroup path: %v", err) + } + if !strings.HasPrefix(devicePath, deviceRoot) { + t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") + } + }) + } +} + +func TestTryDefaultCgroupRoot(t *testing.T) { + res := tryDefaultCgroupRoot() + exp := defaultCgroupRoot + if cgroups.IsCgroup2UnifiedMode() { + // checking that tryDefaultCgroupRoot does return "" + // in case /sys/fs/cgroup is not cgroup v1 root dir. 
+ exp = "" + } + if res != exp { + t.Errorf("tryDefaultCgroupRoot: want %q, got %q", exp, res) + } +} diff --git a/libcontainer/cgroups/fs/perf_event.go b/libcontainer/cgroups/fs/perf_event.go index 5693676..b86955c 100644 --- a/libcontainer/cgroups/fs/perf_event.go +++ b/libcontainer/cgroups/fs/perf_event.go @@ -1,5 +1,3 @@ -// +build linux - package fs import ( @@ -7,29 +5,20 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" ) -type PerfEventGroup struct { -} +type PerfEventGroup struct{} func (s *PerfEventGroup) Name() string { return "perf_event" } -func (s *PerfEventGroup) Apply(d *cgroupData) error { - // we just want to join this group even though we don't set anything - if _, err := d.join("perf_event"); err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil +func (s *PerfEventGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) } -func (s *PerfEventGroup) Set(path string, cgroup *configs.Cgroup) error { +func (s *PerfEventGroup) Set(_ string, _ *configs.Resources) error { return nil } -func (s *PerfEventGroup) Remove(d *cgroupData) error { - return removePath(d.path("perf_event")) -} - func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error { return nil } diff --git a/libcontainer/cgroups/fs/pids.go b/libcontainer/cgroups/fs/pids.go index 7bf6801..1f13532 100644 --- a/libcontainer/cgroups/fs/pids.go +++ b/libcontainer/cgroups/fs/pids.go @@ -1,10 +1,7 @@ -// +build linux - package fs import ( - "fmt" - "path/filepath" + "math" "strconv" "github.com/opencontainers/runc/libcontainer/cgroups" @@ -12,31 +9,26 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" ) -type PidsGroup struct { -} +type PidsGroup struct{} func (s *PidsGroup) Name() string { return "pids" } -func (s *PidsGroup) Apply(d *cgroupData) error { - _, err := d.join("pids") - if err != nil && !cgroups.IsNotFound(err) { - return err - } - return nil +func (s *PidsGroup) Apply(path string, _ 
*configs.Resources, pid int) error { + return apply(path, pid) } -func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error { - if cgroup.Resources.PidsLimit != 0 { +func (s *PidsGroup) Set(path string, r *configs.Resources) error { + if r.PidsLimit != 0 { // "max" is the fallback value. limit := "max" - if cgroup.Resources.PidsLimit > 0 { - limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10) + if r.PidsLimit > 0 { + limit = strconv.FormatInt(r.PidsLimit, 10) } - if err := fscommon.WriteFile(path, "pids.max", limit); err != nil { + if err := cgroups.WriteFile(path, "pids.max", limit); err != nil { return err } } @@ -44,28 +36,24 @@ func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error { return nil } -func (s *PidsGroup) Remove(d *cgroupData) error { - return removePath(d.path("pids")) -} - func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } current, err := fscommon.GetCgroupParamUint(path, "pids.current") if err != nil { - return fmt.Errorf("failed to parse pids.current - %s", err) + return err } - maxString, err := fscommon.GetCgroupParamString(path, "pids.max") + max, err := fscommon.GetCgroupParamUint(path, "pids.max") if err != nil { - return fmt.Errorf("failed to parse pids.max - %s", err) + return err } - - // Default if pids.max == "max" is 0 -- which represents "no limit". - var max uint64 - if maxString != "max" { - max, err = fscommon.ParseUint(maxString, 10, 64) - if err != nil { - return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max")) - } + // If no limit is set, read from pids.max returns "max", which is + // converted to MaxUint64 by GetCgroupParamUint. Historically, we + // represent "no limit" for pids as 0, thus this conversion. 
+ if max == math.MaxUint64 { + max = 0 } stats.PidsStats.Current = current diff --git a/libcontainer/cgroups/fs/pids_test.go b/libcontainer/cgroups/fs/pids_test.go index 66f3aa3..9d9a7ce 100644 --- a/libcontainer/cgroups/fs/pids_test.go +++ b/libcontainer/cgroups/fs/pids_test.go @@ -1,5 +1,3 @@ -// +build linux - package fs import ( @@ -8,6 +6,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" ) const ( @@ -16,65 +15,64 @@ const ( ) func TestPidsSetMax(t *testing.T) { - helper := NewCgroupTestUtil("pids", t) - defer helper.cleanup() + path := tempDir(t, "pids") - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "pids.max": "max", }) - helper.CgroupData.config.Resources.PidsLimit = maxLimited + r := &configs.Resources{ + PidsLimit: maxLimited, + } pids := &PidsGroup{} - if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := pids.Set(path, r); err != nil { t.Fatal(err) } - value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "pids.max") + value, err := fscommon.GetCgroupParamUint(path, "pids.max") if err != nil { - t.Fatalf("Failed to parse pids.max - %s", err) + t.Fatal(err) } - if value != maxLimited { t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value) } } func TestPidsSetUnlimited(t *testing.T) { - helper := NewCgroupTestUtil("pids", t) - defer helper.cleanup() + path := tempDir(t, "pids") - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "pids.max": strconv.Itoa(maxLimited), }) - helper.CgroupData.config.Resources.PidsLimit = maxUnlimited + r := &configs.Resources{ + PidsLimit: maxUnlimited, + } pids := &PidsGroup{} - if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + if err := pids.Set(path, r); err != nil { t.Fatal(err) } - value, err := 
fscommon.GetCgroupParamString(helper.CgroupPath, "pids.max") + value, err := fscommon.GetCgroupParamString(path, "pids.max") if err != nil { - t.Fatalf("Failed to parse pids.max - %s", err) + t.Fatal(err) } - if value != "max" { t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value) } } func TestPidsStats(t *testing.T) { - helper := NewCgroupTestUtil("pids", t) - defer helper.cleanup() + path := tempDir(t, "pids") - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "pids.current": strconv.Itoa(1337), "pids.max": strconv.Itoa(maxLimited), }) pids := &PidsGroup{} stats := *cgroups.NewStats() - if err := pids.GetStats(helper.CgroupPath, &stats); err != nil { + if err := pids.GetStats(path, &stats); err != nil { t.Fatal(err) } @@ -88,17 +86,16 @@ func TestPidsStats(t *testing.T) { } func TestPidsStatsUnlimited(t *testing.T) { - helper := NewCgroupTestUtil("pids", t) - defer helper.cleanup() + path := tempDir(t, "pids") - helper.writeFileContents(map[string]string{ + writeFileContents(t, path, map[string]string{ "pids.current": strconv.Itoa(4096), "pids.max": "max", }) pids := &PidsGroup{} stats := *cgroups.NewStats() - if err := pids.GetStats(helper.CgroupPath, &stats); err != nil { + if err := pids.GetStats(path, &stats); err != nil { t.Fatal(err) } diff --git a/libcontainer/cgroups/fs/rdma.go b/libcontainer/cgroups/fs/rdma.go new file mode 100644 index 0000000..5bbe0f3 --- /dev/null +++ b/libcontainer/cgroups/fs/rdma.go @@ -0,0 +1,25 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type RdmaGroup struct{} + +func (s *RdmaGroup) Name() string { + return "rdma" +} + +func (s *RdmaGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *RdmaGroup) Set(path string, r *configs.Resources) error { + 
return fscommon.RdmaSet(path, r) +} + +func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error { + return fscommon.RdmaGetStats(path, stats) +} diff --git a/libcontainer/cgroups/fs/stats_util_test.go b/libcontainer/cgroups/fs/stats_util_test.go index c5a8d18..9b7d840 100644 --- a/libcontainer/cgroups/fs/stats_util_test.go +++ b/libcontainer/cgroups/fs/stats_util_test.go @@ -1,123 +1,138 @@ -// +build linux - package fs import ( + "errors" "fmt" + "reflect" "testing" "github.com/opencontainers/runc/libcontainer/cgroups" - - "github.com/sirupsen/logrus" ) func blkioStatEntryEquals(expected, actual []cgroups.BlkioStatEntry) error { if len(expected) != len(actual) { - return fmt.Errorf("blkioStatEntries length do not match") + return errors.New("blkioStatEntries length do not match") } for i, expValue := range expected { actValue := actual[i] if expValue != actValue { - return fmt.Errorf("Expected blkio stat entry %v but found %v", expValue, actValue) + return fmt.Errorf("expected: %v, actual: %v", expValue, actValue) } } return nil } func expectBlkioStatsEquals(t *testing.T, expected, actual cgroups.BlkioStats) { + t.Helper() if err := blkioStatEntryEquals(expected.IoServiceBytesRecursive, actual.IoServiceBytesRecursive); err != nil { - logrus.Printf("blkio IoServiceBytesRecursive do not match - %s\n", err) - t.Fail() + t.Errorf("blkio IoServiceBytesRecursive do not match: %s", err) } if err := blkioStatEntryEquals(expected.IoServicedRecursive, actual.IoServicedRecursive); err != nil { - logrus.Printf("blkio IoServicedRecursive do not match - %s\n", err) - t.Fail() + t.Errorf("blkio IoServicedRecursive do not match: %s", err) } if err := blkioStatEntryEquals(expected.IoQueuedRecursive, actual.IoQueuedRecursive); err != nil { - logrus.Printf("blkio IoQueuedRecursive do not match - %s\n", err) - t.Fail() + t.Errorf("blkio IoQueuedRecursive do not match: %s", err) } if err := blkioStatEntryEquals(expected.SectorsRecursive, actual.SectorsRecursive); err != 
nil { - logrus.Printf("blkio SectorsRecursive do not match - %s\n", err) - t.Fail() + t.Errorf("blkio SectorsRecursive do not match: %s", err) } if err := blkioStatEntryEquals(expected.IoServiceTimeRecursive, actual.IoServiceTimeRecursive); err != nil { - logrus.Printf("blkio IoServiceTimeRecursive do not match - %s\n", err) - t.Fail() + t.Errorf("blkio IoServiceTimeRecursive do not match: %s", err) } if err := blkioStatEntryEquals(expected.IoWaitTimeRecursive, actual.IoWaitTimeRecursive); err != nil { - logrus.Printf("blkio IoWaitTimeRecursive do not match - %s\n", err) - t.Fail() + t.Errorf("blkio IoWaitTimeRecursive do not match: %s", err) } if err := blkioStatEntryEquals(expected.IoMergedRecursive, actual.IoMergedRecursive); err != nil { - logrus.Printf("blkio IoMergedRecursive do not match - %v vs %v\n", expected.IoMergedRecursive, actual.IoMergedRecursive) - t.Fail() + t.Errorf("blkio IoMergedRecursive do not match: expected: %v, actual: %v", expected.IoMergedRecursive, actual.IoMergedRecursive) } if err := blkioStatEntryEquals(expected.IoTimeRecursive, actual.IoTimeRecursive); err != nil { - logrus.Printf("blkio IoTimeRecursive do not match - %s\n", err) - t.Fail() + t.Errorf("blkio IoTimeRecursive do not match: %s", err) } } func expectThrottlingDataEquals(t *testing.T, expected, actual cgroups.ThrottlingData) { + t.Helper() if expected != actual { - logrus.Printf("Expected throttling data %v but found %v\n", expected, actual) - t.Fail() + t.Errorf("Expected throttling data: %v, actual: %v", expected, actual) } } func expectHugetlbStatEquals(t *testing.T, expected, actual cgroups.HugetlbStats) { + t.Helper() if expected != actual { - logrus.Printf("Expected hugetlb stats %v but found %v\n", expected, actual) - t.Fail() + t.Errorf("Expected hugetlb stats: %v, actual: %v", expected, actual) } } func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats) { + t.Helper() expectMemoryDataEquals(t, expected.Usage, actual.Usage) 
expectMemoryDataEquals(t, expected.SwapUsage, actual.SwapUsage) expectMemoryDataEquals(t, expected.KernelUsage, actual.KernelUsage) + expectPageUsageByNUMAEquals(t, expected.PageUsageByNUMA, actual.PageUsageByNUMA) if expected.UseHierarchy != actual.UseHierarchy { - logrus.Printf("Expected memory use hierarchy %v, but found %v\n", expected.UseHierarchy, actual.UseHierarchy) - t.Fail() + t.Errorf("Expected memory use hierarchy: %v, actual: %v", expected.UseHierarchy, actual.UseHierarchy) } for key, expValue := range expected.Stats { actValue, ok := actual.Stats[key] if !ok { - logrus.Printf("Expected memory stat key %s not found\n", key) - t.Fail() + t.Errorf("Expected memory stat key %s not found", key) } if expValue != actValue { - logrus.Printf("Expected memory stat value %d but found %d\n", expValue, actValue) - t.Fail() + t.Errorf("Expected memory stat value: %d, actual: %d", expValue, actValue) } } } func expectMemoryDataEquals(t *testing.T, expected, actual cgroups.MemoryData) { + t.Helper() if expected.Usage != actual.Usage { - logrus.Printf("Expected memory usage %d but found %d\n", expected.Usage, actual.Usage) - t.Fail() + t.Errorf("Expected memory usage: %d, actual: %d", expected.Usage, actual.Usage) } if expected.MaxUsage != actual.MaxUsage { - logrus.Printf("Expected memory max usage %d but found %d\n", expected.MaxUsage, actual.MaxUsage) - t.Fail() + t.Errorf("Expected memory max usage: %d, actual: %d", expected.MaxUsage, actual.MaxUsage) } if expected.Failcnt != actual.Failcnt { - logrus.Printf("Expected memory failcnt %d but found %d\n", expected.Failcnt, actual.Failcnt) - t.Fail() + t.Errorf("Expected memory failcnt %d, actual: %d", expected.Failcnt, actual.Failcnt) } if expected.Limit != actual.Limit { - logrus.Printf("Expected memory limit %d but found %d\n", expected.Limit, actual.Limit) - t.Fail() + t.Errorf("Expected memory limit: %d, actual: %d", expected.Limit, actual.Limit) + } +} + +func expectPageUsageByNUMAEquals(t *testing.T, expected, 
actual cgroups.PageUsageByNUMA) { + t.Helper() + if !reflect.DeepEqual(expected.Total, actual.Total) { + t.Errorf("Expected total page usage by NUMA: %#v, actual: %#v", expected.Total, actual.Total) + } + if !reflect.DeepEqual(expected.File, actual.File) { + t.Errorf("Expected file page usage by NUMA: %#v, actual: %#v", expected.File, actual.File) + } + if !reflect.DeepEqual(expected.Anon, actual.Anon) { + t.Errorf("Expected anon page usage by NUMA: %#v, actual: %#v", expected.Anon, actual.Anon) + } + if !reflect.DeepEqual(expected.Unevictable, actual.Unevictable) { + t.Errorf("Expected unevictable page usage by NUMA: %#v, actual: %#v", expected.Unevictable, actual.Unevictable) + } + if !reflect.DeepEqual(expected.Hierarchical.Total, actual.Hierarchical.Total) { + t.Errorf("Expected hierarchical total page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.Total, actual.Hierarchical.Total) + } + if !reflect.DeepEqual(expected.Hierarchical.File, actual.Hierarchical.File) { + t.Errorf("Expected hierarchical file page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.File, actual.Hierarchical.File) + } + if !reflect.DeepEqual(expected.Hierarchical.Anon, actual.Hierarchical.Anon) { + t.Errorf("Expected hierarchical anon page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.Anon, actual.Hierarchical.Anon) + } + if !reflect.DeepEqual(expected.Hierarchical.Unevictable, actual.Hierarchical.Unevictable) { + t.Errorf("Expected hierarchical total page usage by NUMA: %#v, actual: %#v", expected.Hierarchical.Unevictable, actual.Hierarchical.Unevictable) } } diff --git a/libcontainer/cgroups/fs/util_test.go b/libcontainer/cgroups/fs/util_test.go index 2c50d6f..85842b7 100644 --- a/libcontainer/cgroups/fs/util_test.go +++ b/libcontainer/cgroups/fs/util_test.go @@ -1,5 +1,3 @@ -// +build linux - /* Utility for testing cgroup operations. @@ -8,61 +6,34 @@ Creates a mock of the cgroup filesystem for the duration of the test. 
package fs import ( - "io/ioutil" "os" "path/filepath" "testing" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/cgroups" ) -type cgroupTestUtil struct { - // cgroup data to use in tests. - CgroupData *cgroupData - - // Path to the mock cgroup directory. - CgroupPath string - - // Temporary directory to store mock cgroup filesystem. - tempDir string - t *testing.T +func init() { + cgroups.TestMode = true } -// Creates a new test util for the specified subsystem -func NewCgroupTestUtil(subsystem string, t *testing.T) *cgroupTestUtil { - d := &cgroupData{ - config: &configs.Cgroup{}, - } - d.config.Resources = &configs.Resources{} - tempDir, err := ioutil.TempDir("", "cgroup_test") - if err != nil { - t.Fatal(err) - } - d.root = tempDir - testCgroupPath := filepath.Join(d.root, subsystem) - if err != nil { - t.Fatal(err) - } - +// tempDir creates a new test directory for the specified subsystem. +func tempDir(t *testing.T, subsystem string) string { + path := filepath.Join(t.TempDir(), subsystem) // Ensure the full mock cgroup path exists. - err = os.MkdirAll(testCgroupPath, 0755) - if err != nil { + if err := os.Mkdir(path, 0o755); err != nil { t.Fatal(err) } - return &cgroupTestUtil{CgroupData: d, CgroupPath: testCgroupPath, tempDir: tempDir, t: t} + return path } -func (c *cgroupTestUtil) cleanup() { - os.RemoveAll(c.tempDir) -} - -// Write the specified contents on the mock of the specified cgroup files. -func (c *cgroupTestUtil) writeFileContents(fileContents map[string]string) { +// writeFileContents writes the specified contents on the mock of the specified +// cgroup files. 
+func writeFileContents(t *testing.T, path string, fileContents map[string]string) { for file, contents := range fileContents { - err := fscommon.WriteFile(c.CgroupPath, file, contents) + err := cgroups.WriteFile(path, file, contents) if err != nil { - c.t.Fatal(err) + t.Fatal(err) } } } diff --git a/libcontainer/cgroups/fs2/cpu.go b/libcontainer/cgroups/fs2/cpu.go index f0f5df0..bbbae4d 100644 --- a/libcontainer/cgroups/fs2/cpu.go +++ b/libcontainer/cgroups/fs2/cpu.go @@ -1,11 +1,8 @@ -// +build linux - package fs2 import ( "bufio" "os" - "path/filepath" "strconv" "github.com/opencontainers/runc/libcontainer/cgroups" @@ -13,23 +10,45 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" ) -func setCpu(dirPath string, cgroup *configs.Cgroup) error { - if cgroup.Resources.CpuWeight != 0 { - if err := fscommon.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(cgroup.Resources.CpuWeight, 10)); err != nil { +func isCpuSet(r *configs.Resources) bool { + return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 +} + +func setCpu(dirPath string, r *configs.Resources) error { + if !isCpuSet(r) { + return nil + } + + // NOTE: .CpuShares is not used here. Conversion is the caller's responsibility. 
+ if r.CpuWeight != 0 { + if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil { return err } } - if cgroup.Resources.CpuMax != "" { - if err := fscommon.WriteFile(dirPath, "cpu.max", cgroup.Resources.CpuMax); err != nil { + if r.CpuQuota != 0 || r.CpuPeriod != 0 { + str := "max" + if r.CpuQuota > 0 { + str = strconv.FormatInt(r.CpuQuota, 10) + } + period := r.CpuPeriod + if period == 0 { + // This default value is documented in + // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + period = 100000 + } + str += " " + strconv.FormatUint(period, 10) + if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil { return err } } return nil } + func statCpu(dirPath string, stats *cgroups.Stats) error { - f, err := os.Open(filepath.Join(dirPath, "cpu.stat")) + const file = "cpu.stat" + f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) if err != nil { return err } @@ -37,9 +56,9 @@ func statCpu(dirPath string, stats *cgroups.Stats) error { sc := bufio.NewScanner(f) for sc.Scan() { - t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + t, v, err := fscommon.ParseKeyValue(sc.Text()) if err != nil { - return err + return &parseError{Path: dirPath, File: file, Err: err} } switch t { case "usage_usec": @@ -50,7 +69,19 @@ func statCpu(dirPath string, stats *cgroups.Stats) error { case "system_usec": stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000 + + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_usec": + stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000 } } + if err := sc.Err(); err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } return nil } diff --git a/libcontainer/cgroups/fs2/cpuset.go b/libcontainer/cgroups/fs2/cpuset.go index 6492ac9..16c45ba 100644 --- a/libcontainer/cgroups/fs2/cpuset.go +++ b/libcontainer/cgroups/fs2/cpuset.go @@ -1,20 +1,26 @@ 
-// +build linux - package fs2 import ( - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" ) -func setCpuset(dirPath string, cgroup *configs.Cgroup) error { - if cgroup.Resources.CpusetCpus != "" { - if err := fscommon.WriteFile(dirPath, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil { +func isCpusetSet(r *configs.Resources) bool { + return r.CpusetCpus != "" || r.CpusetMems != "" +} + +func setCpuset(dirPath string, r *configs.Resources) error { + if !isCpusetSet(r) { + return nil + } + + if r.CpusetCpus != "" { + if err := cgroups.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil { return err } } - if cgroup.Resources.CpusetMems != "" { - if err := fscommon.WriteFile(dirPath, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil { + if r.CpusetMems != "" { + if err := cgroups.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil { return err } } diff --git a/libcontainer/cgroups/fs2/create.go b/libcontainer/cgroups/fs2/create.go new file mode 100644 index 0000000..641123a --- /dev/null +++ b/libcontainer/cgroups/fs2/create.go @@ -0,0 +1,152 @@ +package fs2 + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func supportedControllers() (string, error) { + return cgroups.ReadFile(UnifiedMountpoint, "/cgroup.controllers") +} + +// needAnyControllers returns whether we enable some supported controllers or not, +// based on (1) controllers available and (2) resources that are being set. +// We don't check "pseudo" controllers such as +// "freezer" and "devices". 
+func needAnyControllers(r *configs.Resources) (bool, error) { + if r == nil { + return false, nil + } + + // list of all available controllers + content, err := supportedControllers() + if err != nil { + return false, err + } + avail := make(map[string]struct{}) + for _, ctr := range strings.Fields(content) { + avail[ctr] = struct{}{} + } + + // check whether the controller if available or not + have := func(controller string) bool { + _, ok := avail[controller] + return ok + } + + if isPidsSet(r) && have("pids") { + return true, nil + } + if isMemorySet(r) && have("memory") { + return true, nil + } + if isIoSet(r) && have("io") { + return true, nil + } + if isCpuSet(r) && have("cpu") { + return true, nil + } + if isCpusetSet(r) && have("cpuset") { + return true, nil + } + if isHugeTlbSet(r) && have("hugetlb") { + return true, nil + } + + return false, nil +} + +// containsDomainController returns whether the current config contains domain controller or not. +// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html +// As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids. +func containsDomainController(r *configs.Resources) bool { + return isMemorySet(r) || isIoSet(r) || isCpuSet(r) || isHugeTlbSet(r) +} + +// CreateCgroupPath creates cgroupv2 path, enabling all the supported controllers. 
+func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) { + if !strings.HasPrefix(path, UnifiedMountpoint) { + return fmt.Errorf("invalid cgroup path %s", path) + } + + content, err := supportedControllers() + if err != nil { + return err + } + + const ( + cgTypeFile = "cgroup.type" + cgStCtlFile = "cgroup.subtree_control" + ) + ctrs := strings.Fields(content) + res := "+" + strings.Join(ctrs, " +") + + elements := strings.Split(path, "/") + elements = elements[3:] + current := "/sys/fs" + for i, e := range elements { + current = filepath.Join(current, e) + if i > 0 { + if err := os.Mkdir(current, 0o755); err != nil { + if !os.IsExist(err) { + return err + } + } else { + // If the directory was created, be sure it is not left around on errors. + current := current + defer func() { + if Err != nil { + os.Remove(current) + } + }() + } + cgType, _ := cgroups.ReadFile(current, cgTypeFile) + cgType = strings.TrimSpace(cgType) + switch cgType { + // If the cgroup is in an invalid mode (usually this means there's an internal + // process in the cgroup tree, because we created a cgroup under an + // already-populated-by-other-processes cgroup), then we have to error out if + // the user requested controllers which are not thread-aware. However, if all + // the controllers requested are thread-aware we can simply put the cgroup into + // threaded mode. + case "domain invalid": + if containsDomainController(c.Resources) { + return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current) + } else { + // Not entirely correct (in theory we'd always want to be a domain -- + // since that means we're a properly delegated cgroup subtree) but in + // this case there's not much we can do and it's better than giving an + // error. 
+ _ = cgroups.WriteFile(current, cgTypeFile, "threaded") + } + // If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers + // (and you cannot usually take a cgroup out of threaded mode). + case "domain threaded": + fallthrough + case "threaded": + if containsDomainController(c.Resources) { + return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType) + } + } + } + // enable all supported controllers + if i < len(elements)-1 { + if err := cgroups.WriteFile(current, cgStCtlFile, res); err != nil { + // try write one by one + allCtrs := strings.Split(res, " ") + for _, ctr := range allCtrs { + _ = cgroups.WriteFile(current, cgStCtlFile, ctr) + } + } + // Some controllers might not be enabled when rootless or containerized, + // but we don't catch the error here. (Caught in setXXX() functions.) + } + } + + return nil +} diff --git a/libcontainer/cgroups/fs2/defaultpath.go b/libcontainer/cgroups/fs2/defaultpath.go index e84b33f..9c949c9 100644 --- a/libcontainer/cgroups/fs2/defaultpath.go +++ b/libcontainer/cgroups/fs2/defaultpath.go @@ -18,50 +18,52 @@ package fs2 import ( "bufio" + "errors" + "fmt" "io" "os" "path/filepath" "strings" "github.com/opencontainers/runc/libcontainer/configs" - libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" - "github.com/pkg/errors" + "github.com/opencontainers/runc/libcontainer/utils" ) const UnifiedMountpoint = "/sys/fs/cgroup" func defaultDirPath(c *configs.Cgroup) (string, error) { if (c.Name != "" || c.Parent != "") && c.Path != "" { - return "", errors.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c) - } - if len(c.Paths) != 0 { - // never set by specconv - return "", errors.Errorf("cgroup: Paths is unsupported, use Path, got %+v", c) + return "", fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c) } - // XXX: Do not remove this code. Path safety is important! 
-- cyphar - cgPath := libcontainerUtils.CleanPath(c.Path) - cgParent := libcontainerUtils.CleanPath(c.Parent) - cgName := libcontainerUtils.CleanPath(c.Name) - - ownCgroup, err := parseCgroupFile("/proc/self/cgroup") - if err != nil { - return "", err - } - return _defaultDirPath(UnifiedMountpoint, cgPath, cgParent, cgName, ownCgroup) + return _defaultDirPath(UnifiedMountpoint, c.Path, c.Parent, c.Name) } -func _defaultDirPath(root, cgPath, cgParent, cgName, ownCgroup string) (string, error) { +func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) { if (cgName != "" || cgParent != "") && cgPath != "" { return "", errors.New("cgroup: either Path or Name and Parent should be used") } - innerPath := cgPath + + // XXX: Do not remove CleanPath. Path safety is important! -- cyphar + innerPath := utils.CleanPath(cgPath) if innerPath == "" { + cgParent := utils.CleanPath(cgParent) + cgName := utils.CleanPath(cgName) innerPath = filepath.Join(cgParent, cgName) } if filepath.IsAbs(innerPath) { return filepath.Join(root, innerPath), nil } + + ownCgroup, err := parseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + // The current user scope most probably has tasks in it already, + // making it impossible to enable controllers for its sub-cgroup. + // A parent cgroup (with no tasks in it) is what we need. 
+ ownCgroup = filepath.Dir(ownCgroup) + return filepath.Join(root, ownCgroup, innerPath), nil } @@ -76,24 +78,22 @@ func parseCgroupFile(path string) (string, error) { } func parseCgroupFromReader(r io.Reader) (string, error) { - var ( - s = bufio.NewScanner(r) - ) + s := bufio.NewScanner(r) for s.Scan() { - if err := s.Err(); err != nil { - return "", err - } var ( text = s.Text() parts = strings.SplitN(text, ":", 3) ) if len(parts) < 3 { - return "", errors.Errorf("invalid cgroup entry: %q", text) + return "", fmt.Errorf("invalid cgroup entry: %q", text) } // text is like "0::/user.slice/user-1001.slice/session-1.scope" if parts[0] == "0" && parts[1] == "" { return parts[2], nil } } + if err := s.Err(); err != nil { + return "", err + } return "", errors.New("cgroup path not found") } diff --git a/libcontainer/cgroups/fs2/defaultpath_test.go b/libcontainer/cgroups/fs2/defaultpath_test.go index 6d5d117..30f1c62 100644 --- a/libcontainer/cgroups/fs2/defaultpath_test.go +++ b/libcontainer/cgroups/fs2/defaultpath_test.go @@ -17,8 +17,11 @@ package fs2 import ( + "path/filepath" "strings" "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" ) func TestParseCgroupFromReader(t *testing.T) { @@ -30,8 +33,8 @@ func TestParseCgroupFromReader(t *testing.T) { for s, expected := range cases { g, err := parseCgroupFromReader(strings.NewReader(s)) if expected != "" { - if string(g) != expected { - t.Errorf("expected %q, got %q", expected, string(g)) + if g != expected { + t.Errorf("expected %q, got %q", expected, g) } if err != nil { t.Error(err) @@ -45,27 +48,35 @@ func TestParseCgroupFromReader(t *testing.T) { } func TestDefaultDirPath(t *testing.T) { - root := "/sys/fs/cgroup" + if !cgroups.IsCgroup2UnifiedMode() { + t.Skip("need cgroupv2") + } + // same code as in defaultDirPath() + ownCgroup, err := parseCgroupFile("/proc/self/cgroup") + if err != nil { + // Not a test failure, but rather some weird + // environment so we can't run this test. 
+ t.Skipf("can't get own cgroup: %v", err) + } + ownCgroup = filepath.Dir(ownCgroup) + cases := []struct { - cgPath string - cgParent string - cgName string - ownCgroup string - expected string + cgPath string + cgParent string + cgName string + expected string }{ { - cgPath: "/foo/bar", - ownCgroup: "/apple/banana", - expected: "/sys/fs/cgroup/foo/bar", + cgPath: "/foo/bar", + expected: "/sys/fs/cgroup/foo/bar", }, { - cgPath: "foo/bar", - ownCgroup: "/apple/banana", - expected: "/sys/fs/cgroup/apple/banana/foo/bar", + cgPath: "foo/bar", + expected: filepath.Join(UnifiedMountpoint, ownCgroup, "foo/bar"), }, } for _, c := range cases { - got, err := _defaultDirPath(root, c.cgPath, c.cgParent, c.cgName, c.ownCgroup) + got, err := _defaultDirPath(UnifiedMountpoint, c.cgPath, c.cgParent, c.cgName) if err != nil { t.Fatal(err) } diff --git a/libcontainer/cgroups/fs2/devices.go b/libcontainer/cgroups/fs2/devices.go index e0fd685..0d23456 100644 --- a/libcontainer/cgroups/fs2/devices.go +++ b/libcontainer/cgroups/fs2/devices.go @@ -1,21 +1,21 @@ -// +build linux - package fs2 import ( + "fmt" + + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups/ebpf" "github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/pkg/errors" - "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/userns" ) -func isRWM(cgroupPermissions string) bool { - r := false - w := false - m := false - for _, rn := range cgroupPermissions { - switch rn { +func isRWM(perms devices.Permissions) bool { + var r, w, m bool + for _, perm := range perms { + switch perm { case 'r': r = true case 'w': @@ -27,45 +27,47 @@ func isRWM(cgroupPermissions string) bool { return r && w && m } -// the logic is from crun -// https://github.com/containers/crun/blob/0.10.2/src/libcrun/cgroup.c#L1644-L1652 -func canSkipEBPFError(cgroup 
*configs.Cgroup) bool { - for _, dev := range cgroup.Resources.Devices { - if dev.Allow || !isRWM(dev.Permissions) { +// This is similar to the logic applied in crun for handling errors from bpf(2) +// . +func canSkipEBPFError(r *configs.Resources) bool { + // If we're running in a user namespace we can ignore eBPF rules because we + // usually cannot use bpf(2), as well as rootless containers usually don't + // have the necessary privileges to mknod(2) device inodes or access + // host-level instances (though ideally we would be blocking device access + // for rootless containers anyway). + if userns.RunningInUserNS() { + return true + } + + // We cannot ignore an eBPF load error if any rule if is a block rule or it + // doesn't permit all access modes. + // + // NOTE: This will sometimes trigger in cases where access modes are split + // between different rules but to handle this correctly would require + // using ".../libcontainer/cgroup/devices".Emulator. + for _, dev := range r.Devices { + if !dev.Allow || !isRWM(dev.Permissions) { return false } } return true } -func setDevices(dirPath string, cgroup *configs.Cgroup) error { - devices := cgroup.Devices - if allowAllDevices := cgroup.Resources.AllowAllDevices; allowAllDevices != nil { - // never set by OCI specconv, but *allowAllDevices=false is still used by the integration test - if *allowAllDevices == true { - return errors.New("libcontainer AllowAllDevices is not supported, use Devices") - } - for _, ad := range cgroup.Resources.AllowedDevices { - d := *ad - d.Allow = true - devices = append(devices, &d) - } +func setDevices(dirPath string, r *configs.Resources) error { + if r.SkipDevices { + return nil } - if len(cgroup.Resources.DeniedDevices) != 0 { - // never set by OCI specconv - return errors.New("libcontainer DeniedDevices is not supported, use Devices") - } - insts, license, err := devicefilter.DeviceFilter(devices) + insts, license, err := devicefilter.DeviceFilter(r.Devices) if err != nil { 
return err } - dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0600) + dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600) if err != nil { - return errors.Errorf("cannot get dir FD for %s", dirPath) + return fmt.Errorf("cannot get dir FD for %s", dirPath) } defer unix.Close(dirFD) if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { - if !canSkipEBPFError(cgroup) { + if !canSkipEBPFError(r) { return err } } diff --git a/libcontainer/cgroups/fs2/freezer.go b/libcontainer/cgroups/fs2/freezer.go index 130c63f..8917a64 100644 --- a/libcontainer/cgroups/fs2/freezer.go +++ b/libcontainer/cgroups/fs2/freezer.go @@ -1,53 +1,127 @@ -// +build linux - package fs2 import ( - "strconv" + "bufio" + "errors" + "fmt" + "os" "strings" + "time" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/pkg/errors" ) func setFreezer(dirPath string, state configs.FreezerState) error { - var desired int + var stateStr string switch state { case configs.Undefined: return nil case configs.Frozen: - desired = 1 + stateStr = "1" case configs.Thawed: - desired = 0 + stateStr = "0" default: - return errors.Errorf("unknown freezer state %+v", state) + return fmt.Errorf("invalid freezer state %q requested", state) } - supportedErr := supportsFreezer(dirPath) - if supportedErr != nil && desired != 0 { - // can ignore error if desired == 1 - return errors.Wrap(supportedErr, "freezer not supported") - } - return freezeWithInt(dirPath, desired) -} -func supportsFreezer(dirPath string) error { - _, err := fscommon.ReadFile(dirPath, "cgroup.freeze") - return err -} - -// freeze writes desired int to "cgroup.freeze". 
-func freezeWithInt(dirPath string, desired int) error { - desiredS := strconv.Itoa(desired) - if err := fscommon.WriteFile(dirPath, "cgroup.freeze", desiredS); err != nil { - return err - } - got, err := fscommon.ReadFile(dirPath, "cgroup.freeze") + fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR) if err != nil { + // We can ignore this request as long as the user didn't ask us to + // freeze the container (since without the freezer cgroup, that's a + // no-op). + if state != configs.Frozen { + return nil + } + return fmt.Errorf("freezer not supported: %w", err) + } + defer fd.Close() + + if _, err := fd.WriteString(stateStr); err != nil { return err } - if gotS := strings.TrimSpace(string(got)); gotS != desiredS { - return errors.Errorf("expected \"cgroup.freeze\" in %q to be %q, got %q", dirPath, desiredS, gotS) + // Confirm that the cgroup did actually change states. + if actualState, err := readFreezer(dirPath, fd); err != nil { + return err + } else if actualState != state { + return fmt.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState) } return nil } + +func getFreezer(dirPath string) (configs.FreezerState, error) { + fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDONLY) + if err != nil { + // If the kernel is too old, then we just treat the freezer as being in + // an "undefined" state. 
+ if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Undefined, err + } + defer fd.Close() + + return readFreezer(dirPath, fd) +} + +func readFreezer(dirPath string, fd *os.File) (configs.FreezerState, error) { + if _, err := fd.Seek(0, 0); err != nil { + return configs.Undefined, err + } + state := make([]byte, 2) + if _, err := fd.Read(state); err != nil { + return configs.Undefined, err + } + switch string(state) { + case "0\n": + return configs.Thawed, nil + case "1\n": + return waitFrozen(dirPath) + default: + return configs.Undefined, fmt.Errorf(`unknown "cgroup.freeze" state: %q`, state) + } +} + +// waitFrozen polls cgroup.events until it sees "frozen 1" in it. +func waitFrozen(dirPath string) (configs.FreezerState, error) { + fd, err := cgroups.OpenFile(dirPath, "cgroup.events", unix.O_RDONLY) + if err != nil { + return configs.Undefined, err + } + defer fd.Close() + + // XXX: Simple wait/read/retry is used here. An implementation + // based on poll(2) or inotify(7) is possible, but it makes the code + // much more complicated. Maybe address this later. + const ( + // Perform maxIter with waitTime in between iterations. + waitTime = 10 * time.Millisecond + maxIter = 1000 + ) + scanner := bufio.NewScanner(fd) + for i := 0; scanner.Scan(); { + if i == maxIter { + return configs.Undefined, fmt.Errorf("timeout of %s reached waiting for the cgroup to freeze", waitTime*maxIter) + } + line := scanner.Text() + val := strings.TrimPrefix(line, "frozen ") + if val != line { // got prefix + if val[0] == '1' { + return configs.Frozen, nil + } + + i++ + // wait, then re-read + time.Sleep(waitTime) + _, err := fd.Seek(0, 0) + if err != nil { + return configs.Undefined, err + } + } + } + // Should only reach here either on read error, + // or if the file does not contain "frozen " line. 
+ return configs.Undefined, scanner.Err() +} diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index 4bb7091..492778e 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -1,70 +1,17 @@ -// +build linux - package fs2 import ( - "io/ioutil" + "errors" + "fmt" "os" - "path/filepath" "strings" - securejoin "github.com/cyphar/filepath-securejoin" "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/pkg/errors" ) -// NewManager creates a manager for cgroup v2 unified hierarchy. -// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope". -// If dirPath is empty, it is automatically set using config. -func NewManager(config *configs.Cgroup, dirPath string, rootless bool) (cgroups.Manager, error) { - if config == nil { - config = &configs.Cgroup{} - } - if dirPath != "" { - if filepath.Clean(dirPath) != dirPath || !filepath.IsAbs(dirPath) { - return nil, errors.Errorf("invalid dir path %q", dirPath) - } - } else { - var err error - dirPath, err = defaultDirPath(config) - if err != nil { - return nil, err - } - } - controllers, err := detectControllers(dirPath) - if err != nil && !rootless { - return nil, err - } - - m := &manager{ - config: config, - dirPath: dirPath, - controllers: controllers, - rootless: rootless, - } - return m, nil -} - -func detectControllers(dirPath string) (map[string]struct{}, error) { - if err := os.MkdirAll(dirPath, 0755); err != nil { - return nil, err - } - controllersPath, err := securejoin.SecureJoin(dirPath, "cgroup.controllers") - if err != nil { - return nil, err - } - controllersData, err := ioutil.ReadFile(controllersPath) - if err != nil { - return nil, err - } - controllersFields := strings.Fields(string(controllersData)) - controllers := make(map[string]struct{}, len(controllersFields)) - for _, c := range 
controllersFields { - controllers[c] = struct{}{} - } - return controllers, nil -} +type parseError = fscommon.ParseError type manager struct { config *configs.Cgroup @@ -73,11 +20,65 @@ type manager struct { // controllers is content of "cgroup.controllers" file. // excludes pseudo-controllers ("devices" and "freezer"). controllers map[string]struct{} - rootless bool +} + +// NewManager creates a manager for cgroup v2 unified hierarchy. +// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope". +// If dirPath is empty, it is automatically set using config. +func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) { + if dirPath == "" { + var err error + dirPath, err = defaultDirPath(config) + if err != nil { + return nil, err + } + } + + m := &manager{ + config: config, + dirPath: dirPath, + } + return m, nil +} + +func (m *manager) getControllers() error { + if m.controllers != nil { + return nil + } + + data, err := cgroups.ReadFile(m.dirPath, "cgroup.controllers") + if err != nil { + if m.config.Rootless && m.config.Path == "" { + return nil + } + return err + } + fields := strings.Fields(data) + m.controllers = make(map[string]struct{}, len(fields)) + for _, c := range fields { + m.controllers[c] = struct{}{} + } + + return nil } func (m *manager) Apply(pid int) error { - if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil && !m.rootless { + if err := CreateCgroupPath(m.dirPath, m.config); err != nil { + // Related tests: + // - "runc create (no limits + no cgrouppath + no permission) succeeds" + // - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" + // - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if m.config.Rootless { + if m.config.Path == "" { + if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed { + return nil + } + return fmt.Errorf("rootless needs no limits + no 
cgrouppath when no permission is granted for cgroups: %w", err) + } + } + return err + } + if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil { return err } return nil @@ -92,45 +93,45 @@ func (m *manager) GetAllPids() ([]int, error) { } func (m *manager) GetStats() (*cgroups.Stats, error) { - var ( - st cgroups.Stats - errs []error - ) + var errs []error + + st := cgroups.NewStats() + // pids (since kernel 4.5) - if _, ok := m.controllers["pids"]; ok { - if err := statPids(m.dirPath, &st); err != nil { - errs = append(errs, err) - } - } else { - if err := statPidsWithoutController(m.dirPath, &st); err != nil { - errs = append(errs, err) - } + if err := statPids(m.dirPath, st); err != nil { + errs = append(errs, err) } - // memory (since kenrel 4.5) - if _, ok := m.controllers["memory"]; ok { - if err := statMemory(m.dirPath, &st); err != nil { - errs = append(errs, err) - } + // memory (since kernel 4.5) + if err := statMemory(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) } // io (since kernel 4.5) - if _, ok := m.controllers["io"]; ok { - if err := statIo(m.dirPath, &st); err != nil { - errs = append(errs, err) - } + if err := statIo(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) } // cpu (since kernel 4.15) - if _, ok := m.controllers["cpu"]; ok { - if err := statCpu(m.dirPath, &st); err != nil { - errs = append(errs, err) - } + // Note cpu.stat is available even if the controller is not enabled. 
+ if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) } - if len(errs) > 0 && !m.rootless { - return &st, errors.Errorf("error while statting cgroup v2: %+v", errs) + // hugetlb (since kernel 5.6) + if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) } - return &st, nil + // rdma (since kernel 4.11) + if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + if len(errs) > 0 && !m.config.Rootless { + return st, fmt.Errorf("error while statting cgroup v2: %+v", errs) + } + return st, nil } func (m *manager) Freeze(state configs.FreezerState) error { + if m.config.Resources == nil { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } if err := setFreezer(m.dirPath, state); err != nil { return err } @@ -139,76 +140,120 @@ func (m *manager) Freeze(state configs.FreezerState) error { } func (m *manager) Destroy() error { - return os.RemoveAll(m.dirPath) + return cgroups.RemovePath(m.dirPath) } -// GetPaths is for compatibility purpose and should be removed in future -func (m *manager) GetPaths() map[string]string { - paths := map[string]string{ - // pseudo-controller for compatibility - "devices": m.dirPath, - "freezer": m.dirPath, - } - for c := range m.controllers { - paths[c] = m.dirPath - } - return paths +func (m *manager) Path(_ string) string { + return m.dirPath } -func (m *manager) GetUnifiedPath() (string, error) { - return m.dirPath, nil -} - -func (m *manager) Set(container *configs.Config) error { - if container == nil || container.Cgroups == nil { +func (m *manager) Set(r *configs.Resources) error { + if r == nil { return nil } - var errs []error + if err := m.getControllers(); err != nil { + return err + } // pids (since kernel 4.5) - if _, ok := m.controllers["pids"]; ok { - if err := setPids(m.dirPath, container.Cgroups); err != nil { - errs = append(errs, err) - } + 
if err := setPids(m.dirPath, r); err != nil { + return err } // memory (since kernel 4.5) - if _, ok := m.controllers["memory"]; ok { - if err := setMemory(m.dirPath, container.Cgroups); err != nil { - errs = append(errs, err) - } + if err := setMemory(m.dirPath, r); err != nil { + return err } // io (since kernel 4.5) - if _, ok := m.controllers["io"]; ok { - if err := setIo(m.dirPath, container.Cgroups); err != nil { - errs = append(errs, err) - } + if err := setIo(m.dirPath, r); err != nil { + return err } // cpu (since kernel 4.15) - if _, ok := m.controllers["cpu"]; ok { - if err := setCpu(m.dirPath, container.Cgroups); err != nil { - errs = append(errs, err) - } + if err := setCpu(m.dirPath, r); err != nil { + return err } // devices (since kernel 4.15, pseudo-controller) - if err := setDevices(m.dirPath, container.Cgroups); err != nil { - errs = append(errs, err) + // + // When rootless is true, errors from the device subsystem are ignored because it is really not expected to work. + // However, errors from other subsystems are not ignored. 
+ // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if err := setDevices(m.dirPath, r); err != nil && !m.config.Rootless { + return err } // cpuset (since kernel 5.0) - if _, ok := m.controllers["cpuset"]; ok { - if err := setCpuset(m.dirPath, container.Cgroups); err != nil { - errs = append(errs, err) - } + if err := setCpuset(m.dirPath, r); err != nil { + return err + } + // hugetlb (since kernel 5.6) + if err := setHugeTlb(m.dirPath, r); err != nil { + return err + } + // rdma (since kernel 4.11) + if err := fscommon.RdmaSet(m.dirPath, r); err != nil { + return err } // freezer (since kernel 5.2, pseudo-controller) - if err := setFreezer(m.dirPath, container.Cgroups.Freezer); err != nil { - errs = append(errs, err) + if err := setFreezer(m.dirPath, r.Freezer); err != nil { + return err } - if len(errs) > 0 && !m.rootless { - return errors.Errorf("error while setting cgroup v2: %+v", errs) + if err := m.setUnified(r.Unified); err != nil { + return err } - m.config = container.Cgroups + m.config.Resources = r return nil } +func (m *manager) setUnified(res map[string]string) error { + for k, v := range res { + if strings.Contains(k, "/") { + return fmt.Errorf("unified resource %q must be a file name (no slashes)", k) + } + if err := cgroups.WriteFile(m.dirPath, k, v); err != nil { + // Check for both EPERM and ENOENT since O_CREAT is used by WriteFile. + if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) { + // Check if a controller is available, + // to give more specific error if not. 
+ sk := strings.SplitN(k, ".", 2) + if len(sk) != 2 { + return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) + } + c := sk[0] + if _, ok := m.controllers[c]; !ok && c != "cgroup" { + return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c) + } + } + return fmt.Errorf("unable to set unified resource %q: %w", k, err) + } + } + + return nil +} + +func (m *manager) GetPaths() map[string]string { + paths := make(map[string]string, 1) + paths[""] = m.dirPath + return paths +} + func (m *manager) GetCgroups() (*configs.Cgroup, error) { return m.config, nil } + +func (m *manager) GetFreezerState() (configs.FreezerState, error) { + return getFreezer(m.dirPath) +} + +func (m *manager) Exists() bool { + return cgroups.PathExists(m.dirPath) +} + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.events", "oom_kill") +} + +func (m *manager) OOMKillCount() (uint64, error) { + c, err := OOMKillCount(m.dirPath) + if err != nil && m.config.Rootless && os.IsNotExist(err) { + err = nil + } + + return c, err +} diff --git a/libcontainer/cgroups/fs2/hugetlb.go b/libcontainer/cgroups/fs2/hugetlb.go new file mode 100644 index 0000000..c92a7e6 --- /dev/null +++ b/libcontainer/cgroups/fs2/hugetlb.go @@ -0,0 +1,48 @@ +package fs2 + +import ( + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isHugeTlbSet(r *configs.Resources) bool { + return len(r.HugetlbLimit) > 0 +} + +func setHugeTlb(dirPath string, r *configs.Resources) error { + if !isHugeTlbSet(r) { + return nil + } + for _, hugetlb := range r.HugetlbLimit { + if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + return err + } + } + + return nil +} + +func statHugeTlb(dirPath string, stats *cgroups.Stats) 
error { + hugetlbStats := cgroups.HugetlbStats{} + for _, pagesize := range cgroups.HugePageSizes() { + value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current") + if err != nil { + return err + } + hugetlbStats.Usage = value + + fileName := "hugetlb." + pagesize + ".events" + value, err = fscommon.GetValueByKey(dirPath, fileName, "max") + if err != nil { + return err + } + hugetlbStats.Failcnt = value + + stats.HugetlbStats[pagesize] = hugetlbStats + } + + return nil +} diff --git a/libcontainer/cgroups/fs2/io.go b/libcontainer/cgroups/fs2/io.go index 9a07308..b2ff7d3 100644 --- a/libcontainer/cgroups/fs2/io.go +++ b/libcontainer/cgroups/fs2/io.go @@ -1,44 +1,96 @@ -// +build linux - package fs2 import ( "bufio" + "bytes" + "fmt" "os" - "path/filepath" "strconv" "strings" + "github.com/sirupsen/logrus" + "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" ) -func setIo(dirPath string, cgroup *configs.Cgroup) error { - if cgroup.Resources.BlkioWeight != 0 { - filename := "io.bfq.weight" - if err := fscommon.WriteFile(dirPath, filename, strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil { +func isIoSet(r *configs.Resources) bool { + return r.BlkioWeight != 0 || + len(r.BlkioWeightDevice) > 0 || + len(r.BlkioThrottleReadBpsDevice) > 0 || + len(r.BlkioThrottleWriteBpsDevice) > 0 || + len(r.BlkioThrottleReadIOPSDevice) > 0 || + len(r.BlkioThrottleWriteIOPSDevice) > 0 +} + +// bfqDeviceWeightSupported checks for per-device BFQ weight support (added +// in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight". +func bfqDeviceWeightSupported(bfq *os.File) bool { + if bfq == nil { + return false + } + _, _ = bfq.Seek(0, 0) + buf := make([]byte, 32) + _, _ = bfq.Read(buf) + // If only a single number (default weight) if read back, we have older kernel. 
+ _, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64) + return err != nil +} + +func setIo(dirPath string, r *configs.Resources) error { + if !isIoSet(r) { + return nil + } + + // If BFQ IO scheduler is available, use it. + var bfq *os.File + if r.BlkioWeight != 0 || len(r.BlkioWeightDevice) > 0 { + var err error + bfq, err = cgroups.OpenFile(dirPath, "io.bfq.weight", os.O_RDWR) + if err == nil { + defer bfq.Close() + } else if !os.IsNotExist(err) { return err } } - for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice { - if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil { + if r.BlkioWeight != 0 { + if bfq != nil { // Use BFQ. + if _, err := bfq.WriteString(strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { + return err + } + } else { + // Fallback to io.weight with a conversion scheme. + v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight) + if err := cgroups.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil { + return err + } + } + } + if bfqDeviceWeightSupported(bfq) { + for _, wd := range r.BlkioWeightDevice { + if _, err := bfq.WriteString(wd.WeightString() + "\n"); err != nil { + return fmt.Errorf("setting device weight %q: %w", wd.WeightString(), err) + } + } + } + for _, td := range r.BlkioThrottleReadBpsDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil { return err } } - for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice { - if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil { + for _, td := range r.BlkioThrottleWriteBpsDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil { return err } } - for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice { - if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil { + for _, td := range r.BlkioThrottleReadIOPSDevice { + if err := cgroups.WriteFile(dirPath, "io.max", 
td.StringName("riops")); err != nil { return err } } - for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice { - if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil { + for _, td := range r.BlkioThrottleWriteIOPSDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil { return err } } @@ -48,8 +100,7 @@ func setIo(dirPath string, cgroup *configs.Cgroup) error { func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) { ret := map[string][]string{} - p := filepath.Join(dirPath, name) - f, err := os.Open(p) + f, err := cgroups.OpenFile(dirPath, name, os.O_RDONLY) if err != nil { return nil, err } @@ -64,30 +115,31 @@ func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error ret[parts[0]] = parts[1:] } if err := scanner.Err(); err != nil { - return nil, err + return nil, &parseError{Path: dirPath, File: name, Err: err} } return ret, nil } func statIo(dirPath string, stats *cgroups.Stats) error { - // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt - var ioServiceBytesRecursive []cgroups.BlkioStatEntry - values, err := readCgroup2MapFile(dirPath, "io.stat") + const file = "io.stat" + values, err := readCgroup2MapFile(dirPath, file) if err != nil { return err } + // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt + var parsedStats cgroups.BlkioStats for k, v := range values { d := strings.Split(k, ":") if len(d) != 2 { continue } - minor, err := strconv.ParseUint(d[0], 10, 0) + major, err := strconv.ParseUint(d[0], 10, 64) if err != nil { - return err + return &parseError{Path: dirPath, File: file, Err: err} } - major, err := strconv.ParseUint(d[1], 10, 0) + minor, err := strconv.ParseUint(d[1], 10, 64) if err != nil { - return err + return &parseError{Path: dirPath, File: file, Err: err} } for _, item := range v { @@ -97,17 +149,34 @@ func 
statIo(dirPath string, stats *cgroups.Stats) error { } op := d[0] - // Accommodate the cgroup v1 naming + // Map to the cgroupv1 naming and layout (in separate tables). + var targetTable *[]cgroups.BlkioStatEntry switch op { + // Equivalent to cgroupv1's blkio.io_service_bytes. case "rbytes": - op = "read" + op = "Read" + targetTable = &parsedStats.IoServiceBytesRecursive case "wbytes": - op = "write" + op = "Write" + targetTable = &parsedStats.IoServiceBytesRecursive + // Equivalent to cgroupv1's blkio.io_serviced. + case "rios": + op = "Read" + targetTable = &parsedStats.IoServicedRecursive + case "wios": + op = "Write" + targetTable = &parsedStats.IoServicedRecursive + default: + // Skip over entries we cannot map to cgroupv1 stats for now. + // In the future we should expand the stats struct to include + // them. + logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item) + continue } - value, err := strconv.ParseUint(d[1], 10, 0) + value, err := strconv.ParseUint(d[1], 10, 64) if err != nil { - return err + return &parseError{Path: dirPath, File: file, Err: err} } entry := cgroups.BlkioStatEntry{ @@ -116,9 +185,9 @@ func statIo(dirPath string, stats *cgroups.Stats) error { Minor: minor, Value: value, } - ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry) + *targetTable = append(*targetTable, entry) } } - stats.BlkioStats = cgroups.BlkioStats{IoServiceBytesRecursive: ioServiceBytesRecursive} + stats.BlkioStats = parsedStats return nil } diff --git a/libcontainer/cgroups/fs2/io_test.go b/libcontainer/cgroups/fs2/io_test.go new file mode 100644 index 0000000..15bb64a --- /dev/null +++ b/libcontainer/cgroups/fs2/io_test.go @@ -0,0 +1,81 @@ +package fs2 + +import ( + "os" + "path/filepath" + "reflect" + "sort" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +const exampleIoStatData = `254:1 rbytes=6901432320 wbytes=14245535744 rios=263278 wios=248603 dbytes=0 dios=0 +254:0 rbytes=2702336 wbytes=0 rios=97 
wios=0 dbytes=0 dios=0 +259:0 rbytes=6911345664 wbytes=14245536256 rios=264538 wios=244914 dbytes=530485248 dios=2` + +var exampleIoStatsParsed = cgroups.BlkioStats{ + IoServiceBytesRecursive: []cgroups.BlkioStatEntry{ + {Major: 254, Minor: 1, Value: 6901432320, Op: "Read"}, + {Major: 254, Minor: 1, Value: 14245535744, Op: "Write"}, + {Major: 254, Minor: 0, Value: 2702336, Op: "Read"}, + {Major: 254, Minor: 0, Value: 0, Op: "Write"}, + {Major: 259, Minor: 0, Value: 6911345664, Op: "Read"}, + {Major: 259, Minor: 0, Value: 14245536256, Op: "Write"}, + }, + IoServicedRecursive: []cgroups.BlkioStatEntry{ + {Major: 254, Minor: 1, Value: 263278, Op: "Read"}, + {Major: 254, Minor: 1, Value: 248603, Op: "Write"}, + {Major: 254, Minor: 0, Value: 97, Op: "Read"}, + {Major: 254, Minor: 0, Value: 0, Op: "Write"}, + {Major: 259, Minor: 0, Value: 264538, Op: "Read"}, + {Major: 259, Minor: 0, Value: 244914, Op: "Write"}, + }, +} + +func lessBlkioStatEntry(a, b cgroups.BlkioStatEntry) bool { + if a.Major != b.Major { + return a.Major < b.Major + } + if a.Minor != b.Minor { + return a.Minor < b.Minor + } + if a.Op != b.Op { + return a.Op < b.Op + } + return a.Value < b.Value +} + +func sortBlkioStats(stats *cgroups.BlkioStats) { + for _, table := range []*[]cgroups.BlkioStatEntry{ + &stats.IoServicedRecursive, + &stats.IoServiceBytesRecursive, + } { + sort.SliceStable(*table, func(i, j int) bool { return lessBlkioStatEntry((*table)[i], (*table)[j]) }) + } +} + +func TestStatIo(t *testing.T) { + // We're using a fake cgroupfs. + cgroups.TestMode = true + + fakeCgroupDir := t.TempDir() + statPath := filepath.Join(fakeCgroupDir, "io.stat") + + if err := os.WriteFile(statPath, []byte(exampleIoStatData), 0o644); err != nil { + t.Fatal(err) + } + + var gotStats cgroups.Stats + if err := statIo(fakeCgroupDir, &gotStats); err != nil { + t.Error(err) + } + + // Sort the output since statIo uses a map internally. 
+ sortBlkioStats(&gotStats.BlkioStats) + sortBlkioStats(&exampleIoStatsParsed) + + if !reflect.DeepEqual(gotStats.BlkioStats, exampleIoStatsParsed) { + t.Errorf("parsed cgroupv2 io.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.BlkioStats, exampleIoStatsParsed) + } +} diff --git a/libcontainer/cgroups/fs2/memory.go b/libcontainer/cgroups/fs2/memory.go index 23eccbe..adbc4b2 100644 --- a/libcontainer/cgroups/fs2/memory.go +++ b/libcontainer/cgroups/fs2/memory.go @@ -1,36 +1,71 @@ -// +build linux - package fs2 import ( "bufio" + "errors" + "math" "os" - "path/filepath" "strconv" "strings" + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/pkg/errors" ) -func setMemory(dirPath string, cgroup *configs.Cgroup) error { - if cgroup.Resources.MemorySwap != 0 { - if err := fscommon.WriteFile(dirPath, "memory.swap.max", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { +// numToStr converts an int64 value to a string for writing to a +// cgroupv2 files with .min, .max, .low, or .high suffix. +// The value of -1 is converted to "max" for cgroupv1 compatibility +// (which used to write -1 to remove the limit). 
+func numToStr(value int64) (ret string) { + switch { + case value == 0: + ret = "" + case value == -1: + ret = "max" + default: + ret = strconv.FormatInt(value, 10) + } + + return ret +} + +func isMemorySet(r *configs.Resources) bool { + return r.MemoryReservation != 0 || r.Memory != 0 || r.MemorySwap != 0 +} + +func setMemory(dirPath string, r *configs.Resources) error { + if !isMemorySet(r) { + return nil + } + swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) + if err != nil { + return err + } + swapStr := numToStr(swap) + if swapStr == "" && swap == 0 && r.MemorySwap > 0 { + // memory and memorySwap set to the same value -- disable swap + swapStr = "0" + } + // never write empty string to `memory.swap.max`, it means set to 0. + if swapStr != "" { + if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil { return err } } - if cgroup.Resources.Memory != 0 { - if err := fscommon.WriteFile(dirPath, "memory.max", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + + if val := numToStr(r.Memory); val != "" { + if err := cgroups.WriteFile(dirPath, "memory.max", val); err != nil { return err } } // cgroup.Resources.KernelMemory is ignored - if cgroup.Resources.MemoryReservation != 0 { - if err := fscommon.WriteFile(dirPath, "memory.low", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { + if val := numToStr(r.MemoryReservation); val != "" { + if err := cgroups.WriteFile(dirPath, "memory.low", val); err != nil { return err } } @@ -39,8 +74,8 @@ func setMemory(dirPath string, cgroup *configs.Cgroup) error { } func statMemory(dirPath string, stats *cgroups.Stats) error { - // Set stats from memory.stat. 
- statsFile, err := os.Open(filepath.Join(dirPath, "memory.stat")) + const file = "memory.stat" + statsFile, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) if err != nil { return err } @@ -48,16 +83,27 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { sc := bufio.NewScanner(statsFile) for sc.Scan() { - t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + t, v, err := fscommon.ParseKeyValue(sc.Text()) if err != nil { - return errors.Wrapf(err, "failed to parse memory.stat (%q)", sc.Text()) + return &parseError{Path: dirPath, File: file, Err: err} } stats.MemoryStats.Stats[t] = v } - stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + if err := sc.Err(); err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"] + // Unlike cgroup v1 which has memory.use_hierarchy binary knob, + // cgroup v2 is always hierarchical. + stats.MemoryStats.UseHierarchy = true memoryUsage, err := getMemoryDataV2(dirPath, "") if err != nil { + if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint { + // The root cgroup does not have memory.{current,max} + // so emulate those using data from /proc/meminfo. + return statsFromMeminfo(stats) + } return err } stats.MemoryStats.Usage = memoryUsage @@ -65,9 +111,15 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { if err != nil { return err } + // As cgroup v1 reports SwapUsage values as mem+swap combined, + // while in cgroup v2 swap values do not include memory, + // report combined mem+swap for v1 compatibility. 
+ swapUsage.Usage += memoryUsage.Usage + if swapUsage.Limit != math.MaxUint64 { + swapUsage.Limit += memoryUsage.Limit + } stats.MemoryStats.SwapUsage = swapUsage - stats.MemoryStats.UseHierarchy = true return nil } @@ -76,28 +128,89 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { moduleName := "memory" if name != "" { - moduleName = strings.Join([]string{"memory", name}, ".") + moduleName = "memory." + name } - usage := strings.Join([]string{moduleName, "current"}, ".") - limit := strings.Join([]string{moduleName, "max"}, ".") + usage := moduleName + ".current" + limit := moduleName + ".max" value, err := fscommon.GetCgroupParamUint(path, usage) if err != nil { - if moduleName != "memory" && os.IsNotExist(err) { + if name != "" && os.IsNotExist(err) { + // Ignore EEXIST as there's no swap accounting + // if kernel CONFIG_MEMCG_SWAP is not set or + // swapaccount=0 kernel boot parameter is given. return cgroups.MemoryData{}, nil } - return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", usage) + return cgroups.MemoryData{}, err } memoryData.Usage = value value, err = fscommon.GetCgroupParamUint(path, limit) if err != nil { - if moduleName != "memory" && os.IsNotExist(err) { - return cgroups.MemoryData{}, nil - } - return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", limit) + return cgroups.MemoryData{}, err } memoryData.Limit = value return memoryData, nil } + +func statsFromMeminfo(stats *cgroups.Stats) error { + const file = "/proc/meminfo" + f, err := os.Open(file) + if err != nil { + return err + } + defer f.Close() + + // Fields we are interested in. 
+ var ( + swap_free uint64 + swap_total uint64 + main_total uint64 + main_free uint64 + ) + mem := map[string]*uint64{ + "SwapFree": &swap_free, + "SwapTotal": &swap_total, + "MemTotal": &main_total, + "MemFree": &main_free, + } + + found := 0 + sc := bufio.NewScanner(f) + for sc.Scan() { + parts := strings.SplitN(sc.Text(), ":", 3) + if len(parts) != 2 { + // Should not happen. + continue + } + k := parts[0] + p, ok := mem[k] + if !ok { + // Unknown field -- not interested. + continue + } + vStr := strings.TrimSpace(strings.TrimSuffix(parts[1], " kB")) + *p, err = strconv.ParseUint(vStr, 10, 64) + if err != nil { + return &parseError{File: file, Err: errors.New("bad value for " + k)} + } + + found++ + if found == len(mem) { + // Got everything we need -- skip the rest. + break + } + } + if err := sc.Err(); err != nil { + return &parseError{Path: "", File: file, Err: err} + } + + stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024 + stats.MemoryStats.SwapUsage.Limit = math.MaxUint64 + + stats.MemoryStats.Usage.Usage = (main_total - main_free) * 1024 + stats.MemoryStats.Usage.Limit = math.MaxUint64 + + return nil +} diff --git a/libcontainer/cgroups/fs2/pids.go b/libcontainer/cgroups/fs2/pids.go index db2d7ac..c8c4a36 100644 --- a/libcontainer/cgroups/fs2/pids.go +++ b/libcontainer/cgroups/fs2/pids.go @@ -1,31 +1,28 @@ -// +build linux - package fs2 import ( - "io/ioutil" + "errors" + "math" "os" - "path/filepath" - "strconv" "strings" + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/pkg/errors" - "golang.org/x/sys/unix" ) -func setPids(dirPath string, cgroup *configs.Cgroup) error { - if cgroup.Resources.PidsLimit != 0 { - // "max" is the fallback value. 
- limit := "max" +func isPidsSet(r *configs.Resources) bool { + return r.PidsLimit != 0 +} - if cgroup.Resources.PidsLimit > 0 { - limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10) - } - - if err := fscommon.WriteFile(dirPath, "pids.max", limit); err != nil { +func setPids(dirPath string, r *configs.Resources) error { + if !isPidsSet(r) { + return nil + } + if val := numToStr(r.PidsLimit); val != "" { + if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil { return err } } @@ -33,32 +30,18 @@ func setPids(dirPath string, cgroup *configs.Cgroup) error { return nil } -func isNOTSUP(err error) bool { - switch err := err.(type) { - case *os.PathError: - return err.Err == unix.ENOTSUP - default: - return false - } -} - -func statPidsWithoutController(dirPath string, stats *cgroups.Stats) error { +func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error { // if the controller is not enabled, let's read PIDS from cgroups.procs // (or threads if cgroup.threads is enabled) - contents, err := ioutil.ReadFile(filepath.Join(dirPath, "cgroup.procs")) - if err != nil && isNOTSUP(err) { - contents, err = ioutil.ReadFile(filepath.Join(dirPath, "cgroup.threads")) + contents, err := cgroups.ReadFile(dirPath, "cgroup.procs") + if errors.Is(err, unix.ENOTSUP) { + contents, err = cgroups.ReadFile(dirPath, "cgroup.threads") } if err != nil { return err } - pids := make(map[string]string) - for _, i := range strings.Split(string(contents), "\n") { - if i != "" { - pids[i] = i - } - } - stats.PidsStats.Current = uint64(len(pids)) + pids := strings.Count(contents, "\n") + stats.PidsStats.Current = uint64(pids) stats.PidsStats.Limit = 0 return nil } @@ -66,22 +49,21 @@ func statPidsWithoutController(dirPath string, stats *cgroups.Stats) error { func statPids(dirPath string, stats *cgroups.Stats) error { current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current") if err != nil { - return errors.Wrap(err, "failed to parse pids.current") - } - - 
maxString, err := fscommon.GetCgroupParamString(dirPath, "pids.max") - if err != nil { - return errors.Wrap(err, "failed to parse pids.max") - } - - // Default if pids.max == "max" is 0 -- which represents "no limit". - var max uint64 - if maxString != "max" { - max, err = fscommon.ParseUint(maxString, 10, 64) - if err != nil { - return errors.Wrapf(err, "failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", - maxString, filepath.Join(dirPath, "pids.max")) + if os.IsNotExist(err) { + return statPidsFromCgroupProcs(dirPath, stats) } + return err + } + + max, err := fscommon.GetCgroupParamUint(dirPath, "pids.max") + if err != nil { + return err + } + // If no limit is set, read from pids.max returns "max", which is + // converted to MaxUint64 by GetCgroupParamUint. Historically, we + // represent "no limit" for pids as 0, thus this conversion. + if max == math.MaxUint64 { + max = 0 } stats.PidsStats.Current = current diff --git a/libcontainer/cgroups/fscommon/fscommon.go b/libcontainer/cgroups/fscommon/fscommon.go deleted file mode 100644 index dd92e8c..0000000 --- a/libcontainer/cgroups/fscommon/fscommon.go +++ /dev/null @@ -1,36 +0,0 @@ -// +build linux - -package fscommon - -import ( - "io/ioutil" - - securejoin "github.com/cyphar/filepath-securejoin" - "github.com/pkg/errors" -) - -func WriteFile(dir, file, data string) error { - if dir == "" { - return errors.Errorf("no directory specified for %s", file) - } - path, err := securejoin.SecureJoin(dir, file) - if err != nil { - return err - } - if err := ioutil.WriteFile(path, []byte(data), 0700); err != nil { - return errors.Wrapf(err, "failed to write %q to %q", data, path) - } - return nil -} - -func ReadFile(dir, file string) (string, error) { - if dir == "" { - return "", errors.Errorf("no directory specified for %s", file) - } - path, err := securejoin.SecureJoin(dir, file) - if err != nil { - return "", err - } - data, err := ioutil.ReadFile(path) - return string(data), err -} diff 
--git a/libcontainer/cgroups/fscommon/rdma.go b/libcontainer/cgroups/fscommon/rdma.go new file mode 100644 index 0000000..d463d15 --- /dev/null +++ b/libcontainer/cgroups/fscommon/rdma.go @@ -0,0 +1,121 @@ +package fscommon + +import ( + "bufio" + "errors" + "math" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +// parseRdmaKV parses raw string to RdmaEntry. +func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error { + var value uint32 + + parts := strings.SplitN(raw, "=", 3) + + if len(parts) != 2 { + return errors.New("Unable to parse RDMA entry") + } + + k, v := parts[0], parts[1] + + if v == "max" { + value = math.MaxUint32 + } else { + val64, err := strconv.ParseUint(v, 10, 32) + if err != nil { + return err + } + value = uint32(val64) + } + if k == "hca_handle" { + entry.HcaHandles = value + } else if k == "hca_object" { + entry.HcaObjects = value + } + + return nil +} + +// readRdmaEntries reads and converts array of rawstrings to RdmaEntries from file. +// example entry: mlx4_0 hca_handle=2 hca_object=2000 +func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) { + rdmaEntries := make([]cgroups.RdmaEntry, 0) + fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return nil, err + } + defer fd.Close() //nolint:errorlint + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + parts := strings.SplitN(scanner.Text(), " ", 4) + if len(parts) == 3 { + entry := new(cgroups.RdmaEntry) + entry.Device = parts[0] + err = parseRdmaKV(parts[1], entry) + if err != nil { + continue + } + err = parseRdmaKV(parts[2], entry) + if err != nil { + continue + } + + rdmaEntries = append(rdmaEntries, *entry) + } + } + return rdmaEntries, scanner.Err() +} + +// RdmaGetStats returns rdma stats such as totalLimit and current entries. 
+func RdmaGetStats(path string, stats *cgroups.Stats) error { + currentEntries, err := readRdmaEntries(path, "rdma.current") + if err != nil { + if errors.Is(err, os.ErrNotExist) { + err = nil + } + return err + } + maxEntries, err := readRdmaEntries(path, "rdma.max") + if err != nil { + return err + } + // If device got removed between reading two files, ignore returning stats. + if len(currentEntries) != len(maxEntries) { + return nil + } + + stats.RdmaStats = cgroups.RdmaStats{ + RdmaLimit: maxEntries, + RdmaCurrent: currentEntries, + } + + return nil +} + +func createCmdString(device string, limits configs.LinuxRdma) string { + cmdString := device + if limits.HcaHandles != nil { + cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10) + } + if limits.HcaObjects != nil { + cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10) + } + return cmdString +} + +// RdmaSet sets RDMA resources. +func RdmaSet(path string, r *configs.Resources) error { + for device, limits := range r.Rdma { + if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil { + return err + } + } + return nil +} diff --git a/libcontainer/cgroups/fscommon/rdma_test.go b/libcontainer/cgroups/fscommon/rdma_test.go new file mode 100644 index 0000000..9bc0819 --- /dev/null +++ b/libcontainer/cgroups/fscommon/rdma_test.go @@ -0,0 +1,57 @@ +package fscommon + +import ( + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +/* Roadmap for future */ +// (Low-priority) TODO: Check if it is possible to virtually mimic an actual RDMA device. +// TODO: Think of more edge-cases to add. + +// TestRdmaSet performs an E2E test of RdmaSet(), parseRdmaKV() using dummy device and a dummy cgroup file-system. +// Note: Following test does not guarantees that your host supports RDMA since this mocks underlying infrastructure. 
+func TestRdmaSet(t *testing.T) { + testCgroupPath := filepath.Join(t.TempDir(), "rdma") + + // Ensure the full mock cgroup path exists. + err := os.Mkdir(testCgroupPath, 0o755) + if err != nil { + t.Fatal(err) + } + + rdmaDevice := "mlx5_1" + maxHandles := uint32(100) + maxObjects := uint32(300) + + rdmaStubResource := &configs.Resources{ + Rdma: map[string]configs.LinuxRdma{ + rdmaDevice: { + HcaHandles: &maxHandles, + HcaObjects: &maxObjects, + }, + }, + } + + if err := RdmaSet(testCgroupPath, rdmaStubResource); err != nil { + t.Fatal(err) + } + + // The default rdma.max must be written. + rdmaEntries, err := readRdmaEntries(testCgroupPath, "rdma.max") + if err != nil { + t.Fatal(err) + } + if len(rdmaEntries) != 1 { + t.Fatal("rdma_test: Got the wrong values while parsing entries from rdma.max") + } + if rdmaEntries[0].HcaHandles != maxHandles { + t.Fatalf("rdma_test: Got the wrong value for hca_handles") + } + if rdmaEntries[0].HcaObjects != maxObjects { + t.Fatalf("rdma_test: Got the wrong value for hca_Objects") + } +} diff --git a/libcontainer/cgroups/fscommon/utils.go b/libcontainer/cgroups/fscommon/utils.go index 46c3c77..f4a51c9 100644 --- a/libcontainer/cgroups/fscommon/utils.go +++ b/libcontainer/cgroups/fscommon/utils.go @@ -1,23 +1,41 @@ -// +build linux - package fscommon import ( "errors" "fmt" - "io/ioutil" "math" - "path/filepath" + "path" "strconv" "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" ) var ( - ErrNotValidFormat = errors.New("line is not a valid key value format") + // Deprecated: use cgroups.OpenFile instead. + OpenFile = cgroups.OpenFile + // Deprecated: use cgroups.ReadFile instead. + ReadFile = cgroups.ReadFile + // Deprecated: use cgroups.WriteFile instead. + WriteFile = cgroups.WriteFile ) -// Saturates negative values at zero and returns a uint64. -// Due to kernel bugs, some of the memory cgroup stats can be negative. +// ParseError records a parse error details, including the file path. 
+type ParseError struct { + Path string + File string + Err error +} + +func (e *ParseError) Error() string { + return "unable to parse " + path.Join(e.Path, e.File) + ": " + e.Err.Error() +} + +func (e *ParseError) Unwrap() error { return e.Err } + +// ParseUint converts a string to an uint64 integer. +// Negative values are returned at zero as, due to kernel bugs, +// some of the memory cgroup stats can be negative. func ParseUint(s string, base, bitSize int) (uint64, error) { value, err := strconv.ParseUint(s, base, bitSize) if err != nil { @@ -26,7 +44,7 @@ func ParseUint(s string, base, bitSize int) (uint64, error) { // 2. Handle negative values lesser than MinInt64 if intErr == nil && intValue < 0 { return 0, nil - } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { + } else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 { return 0, nil } @@ -36,48 +54,92 @@ func ParseUint(s string, base, bitSize int) (uint64, error) { return value, nil } -// Parses a cgroup param and returns as name, value -// i.e. "io_service_bytes 1234" will return as io_service_bytes, 1234 -func GetCgroupParamKeyValue(t string) (string, uint64, error) { - parts := strings.Fields(t) - switch len(parts) { - case 2: - value, err := ParseUint(parts[1], 10, 64) - if err != nil { - return "", 0, fmt.Errorf("unable to convert param value (%q) to uint64: %v", parts[1], err) - } - - return parts[0], value, nil - default: - return "", 0, ErrNotValidFormat +// ParseKeyValue parses a space-separated "name value" kind of cgroup +// parameter and returns its key as a string, and its value as uint64 +// (ParseUint is used to convert the value). For example, +// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234. 
+func ParseKeyValue(t string) (string, uint64, error) { + parts := strings.SplitN(t, " ", 3) + if len(parts) != 2 { + return "", 0, fmt.Errorf("line %q is not in key value format", t) } + + value, err := ParseUint(parts[1], 10, 64) + if err != nil { + return "", 0, err + } + + return parts[0], value, nil } -// Gets a single uint64 value from the specified cgroup file. -func GetCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) { - fileName := filepath.Join(cgroupPath, cgroupFile) - contents, err := ioutil.ReadFile(fileName) +// GetValueByKey reads a key-value pairs from the specified cgroup file, +// and returns a value of the specified key. ParseUint is used for value +// conversion. +func GetValueByKey(path, file, key string) (uint64, error) { + content, err := cgroups.ReadFile(path, file) if err != nil { return 0, err } - trimmed := strings.TrimSpace(string(contents)) - if trimmed == "max" { + + lines := strings.Split(content, "\n") + for _, line := range lines { + arr := strings.Split(line, " ") + if len(arr) == 2 && arr[0] == key { + val, err := ParseUint(arr[1], 10, 64) + if err != nil { + err = &ParseError{Path: path, File: file, Err: err} + } + return val, err + } + } + + return 0, nil +} + +// GetCgroupParamUint reads a single uint64 value from the specified cgroup file. +// If the value read is "max", the math.MaxUint64 is returned. 
+func GetCgroupParamUint(path, file string) (uint64, error) { + contents, err := GetCgroupParamString(path, file) + if err != nil { + return 0, err + } + contents = strings.TrimSpace(contents) + if contents == "max" { return math.MaxUint64, nil } - res, err := ParseUint(trimmed, 10, 64) + res, err := ParseUint(contents, 10, 64) if err != nil { - return res, fmt.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), fileName) + return res, &ParseError{Path: path, File: file, Err: err} } return res, nil } -// Gets a string value from the specified cgroup file -func GetCgroupParamString(cgroupPath, cgroupFile string) (string, error) { - contents, err := ioutil.ReadFile(filepath.Join(cgroupPath, cgroupFile)) +// GetCgroupParamInt reads a single int64 value from specified cgroup file. +// If the value read is "max", the math.MaxInt64 is returned. +func GetCgroupParamInt(path, file string) (int64, error) { + contents, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, err + } + contents = strings.TrimSpace(contents) + if contents == "max" { + return math.MaxInt64, nil + } + + res, err := strconv.ParseInt(contents, 10, 64) + if err != nil { + return res, &ParseError{Path: path, File: file, Err: err} + } + return res, nil +} + +// GetCgroupParamString reads a string from the specified cgroup file. 
+func GetCgroupParamString(path, file string) (string, error) { + contents, err := cgroups.ReadFile(path, file) if err != nil { return "", err } - return strings.TrimSpace(string(contents)), nil + return strings.TrimSpace(contents), nil } diff --git a/libcontainer/cgroups/fscommon/utils_test.go b/libcontainer/cgroups/fscommon/utils_test.go index d0c5668..0339e99 100644 --- a/libcontainer/cgroups/fscommon/utils_test.go +++ b/libcontainer/cgroups/fscommon/utils_test.go @@ -1,14 +1,13 @@ -// +build linux - package fscommon import ( - "io/ioutil" "math" "os" "path/filepath" "strconv" "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" ) const ( @@ -17,18 +16,17 @@ const ( floatString = "2048" ) +func init() { + cgroups.TestMode = true +} + func TestGetCgroupParamsInt(t *testing.T) { // Setup tempdir. - tempDir, err := ioutil.TempDir("", "cgroup_utils_test") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(tempDir) + tempDir := t.TempDir() tempFile := filepath.Join(tempDir, cgroupFile) // Success. - err = ioutil.WriteFile(tempFile, []byte(floatString), 0755) - if err != nil { + if err := os.WriteFile(tempFile, []byte(floatString), 0o755); err != nil { t.Fatal(err) } value, err := GetCgroupParamUint(tempDir, cgroupFile) @@ -39,7 +37,7 @@ func TestGetCgroupParamsInt(t *testing.T) { } // Success with new line. 
- err = ioutil.WriteFile(tempFile, []byte(floatString+"\n"), 0755) + err = os.WriteFile(tempFile, []byte(floatString+"\n"), 0o755) if err != nil { t.Fatal(err) } @@ -51,7 +49,7 @@ func TestGetCgroupParamsInt(t *testing.T) { } // Success with negative values - err = ioutil.WriteFile(tempFile, []byte("-12345"), 0755) + err = os.WriteFile(tempFile, []byte("-12345"), 0o755) if err != nil { t.Fatal(err) } @@ -64,7 +62,7 @@ func TestGetCgroupParamsInt(t *testing.T) { // Success with negative values lesser than min int64 s := strconv.FormatFloat(math.MinInt64, 'f', -1, 64) - err = ioutil.WriteFile(tempFile, []byte(s), 0755) + err = os.WriteFile(tempFile, []byte(s), 0o755) if err != nil { t.Fatal(err) } @@ -76,7 +74,7 @@ func TestGetCgroupParamsInt(t *testing.T) { } // Not a float. - err = ioutil.WriteFile(tempFile, []byte("not-a-float"), 0755) + err = os.WriteFile(tempFile, []byte("not-a-float"), 0o755) if err != nil { t.Fatal(err) } diff --git a/libcontainer/cgroups/getallpids.go b/libcontainer/cgroups/getallpids.go new file mode 100644 index 0000000..1355a51 --- /dev/null +++ b/libcontainer/cgroups/getallpids.go @@ -0,0 +1,27 @@ +package cgroups + +import ( + "io/fs" + "path/filepath" +) + +// GetAllPids returns all pids from the cgroup identified by path, and all its +// sub-cgroups. +func GetAllPids(path string) ([]int, error) { + var pids []int + err := filepath.WalkDir(path, func(p string, d fs.DirEntry, iErr error) error { + if iErr != nil { + return iErr + } + if !d.IsDir() { + return nil + } + cPids, err := readProcsFile(p) + if err != nil { + return err + } + pids = append(pids, cPids...) 
+ return nil + }) + return pids, err +} diff --git a/libcontainer/cgroups/getallpids_test.go b/libcontainer/cgroups/getallpids_test.go new file mode 100644 index 0000000..e6b0632 --- /dev/null +++ b/libcontainer/cgroups/getallpids_test.go @@ -0,0 +1,17 @@ +package cgroups + +import ( + "testing" +) + +func BenchmarkGetAllPids(b *testing.B) { + total := 0 + for i := 0; i < b.N; i++ { + i, err := GetAllPids("/sys/fs/cgroup") + if err != nil { + b.Fatal(err) + } + total += len(i) + } + b.Logf("iter: %d, total: %d", b.N, total) +} diff --git a/libcontainer/cgroups/manager/manager_test.go b/libcontainer/cgroups/manager/manager_test.go new file mode 100644 index 0000000..b53e6f1 --- /dev/null +++ b/libcontainer/cgroups/manager/manager_test.go @@ -0,0 +1,44 @@ +package manager + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +// TestNilResources checks that a cgroup manager do not panic when +// config.Resources is nil. While it does not make sense to use a +// manager with no resources, it should not result in a panic. +// +// This tests either v1 or v2 managers (both fs and systemd), +// depending on what cgroup version is available on the host. +func TestNilResources(t *testing.T) { + for _, sd := range []bool{false, true} { + cg := &configs.Cgroup{} // .Resources is nil + cg.Systemd = sd + mgr, err := New(cg) + if err != nil { + // Some managers require non-nil Resources during + // instantiation -- provide and retry. In such case + // we're mostly testing Set(nil) below. 
+ cg.Resources = &configs.Resources{} + mgr, err = New(cg) + if err != nil { + t.Error(err) + continue + } + } + _ = mgr.Apply(-1) + _ = mgr.Set(nil) + _ = mgr.Freeze(configs.Thawed) + _ = mgr.Exists() + _, _ = mgr.GetAllPids() + _, _ = mgr.GetCgroups() + _, _ = mgr.GetFreezerState() + _ = mgr.Path("") + _ = mgr.GetPaths() + _, _ = mgr.GetStats() + _, _ = mgr.OOMKillCount() + _ = mgr.Destroy() + } +} diff --git a/libcontainer/cgroups/manager/new.go b/libcontainer/cgroups/manager/new.go new file mode 100644 index 0000000..5df120d --- /dev/null +++ b/libcontainer/cgroups/manager/new.go @@ -0,0 +1,78 @@ +package manager + +import ( + "errors" + "fmt" + "path/filepath" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" +) + +// New returns the instance of a cgroup manager, which is chosen +// based on the local environment (whether cgroup v1 or v2 is used) +// and the config (whether config.Systemd is set or not). +func New(config *configs.Cgroup) (cgroups.Manager, error) { + return NewWithPaths(config, nil) +} + +// NewWithPaths is similar to New, and can be used in case cgroup paths +// are already well known, which can save some resources. +// +// For cgroup v1, the keys are controller/subsystem name, and the values +// are absolute filesystem paths to the appropriate cgroups. +// +// For cgroup v2, the only key allowed is "" (empty string), and the value +// is the unified cgroup path. 
+func NewWithPaths(config *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) { + if config == nil { + return nil, errors.New("cgroups/manager.New: config must not be nil") + } + if config.Systemd && !systemd.IsRunningSystemd() { + return nil, errors.New("systemd not running on this host, cannot use systemd cgroups manager") + } + + // Cgroup v2 aka unified hierarchy. + if cgroups.IsCgroup2UnifiedMode() { + path, err := getUnifiedPath(paths) + if err != nil { + return nil, fmt.Errorf("manager.NewWithPaths: inconsistent paths: %w", err) + } + if config.Systemd { + return systemd.NewUnifiedManager(config, path) + } + return fs2.NewManager(config, path) + } + + // Cgroup v1. + if config.Systemd { + return systemd.NewLegacyManager(config, paths) + } + + return fs.NewManager(config, paths) +} + +// getUnifiedPath is an implementation detail of libcontainer factory. +// Historically, it saves cgroup paths as per-subsystem path map (as returned +// by cm.GetPaths(""), but with v2 we only have one single unified path +// (with "" as a key). +// +// This function converts from that map to string (using "" as a key), +// and also checks that the map itself is sane. +func getUnifiedPath(paths map[string]string) (string, error) { + if len(paths) > 1 { + return "", fmt.Errorf("expected a single path, got %+v", paths) + } + path := paths[""] + // can be empty + if path != "" { + if filepath.Clean(path) != path || !filepath.IsAbs(path) { + return "", fmt.Errorf("invalid path: %q", path) + } + } + + return path, nil +} diff --git a/libcontainer/cgroups/stats.go b/libcontainer/cgroups/stats.go index 8eeedc5..40a81dd 100644 --- a/libcontainer/cgroups/stats.go +++ b/libcontainer/cgroups/stats.go @@ -1,5 +1,3 @@ -// +build linux - package cgroups type ThrottlingData struct { @@ -20,6 +18,12 @@ type CpuUsage struct { // Total CPU time consumed per core. // Units: nanoseconds. 
PercpuUsage []uint64 `json:"percpu_usage,omitempty"` + // CPU time consumed per core in kernel mode + // Units: nanoseconds. + PercpuUsageInKernelmode []uint64 `json:"percpu_usage_in_kernelmode"` + // CPU time consumed per core in user mode + // Units: nanoseconds. + PercpuUsageInUsermode []uint64 `json:"percpu_usage_in_usermode"` // Time spent by tasks of the cgroup in kernel mode. // Units: nanoseconds. UsageInKernelmode uint64 `json:"usage_in_kernelmode"` @@ -33,6 +37,33 @@ type CpuStats struct { ThrottlingData ThrottlingData `json:"throttling_data,omitempty"` } +type CPUSetStats struct { + // List of the physical numbers of the CPUs on which processes + // in that cpuset are allowed to execute + CPUs []uint16 `json:"cpus,omitempty"` + // cpu_exclusive flag + CPUExclusive uint64 `json:"cpu_exclusive"` + // List of memory nodes on which processes in that cpuset + // are allowed to allocate memory + Mems []uint16 `json:"mems,omitempty"` + // mem_hardwall flag + MemHardwall uint64 `json:"mem_hardwall"` + // mem_exclusive flag + MemExclusive uint64 `json:"mem_exclusive"` + // memory_migrate flag + MemoryMigrate uint64 `json:"memory_migrate"` + // memory_spread page flag + MemorySpreadPage uint64 `json:"memory_spread_page"` + // memory_spread slab flag + MemorySpreadSlab uint64 `json:"memory_spread_slab"` + // memory_pressure + MemoryPressure uint64 `json:"memory_pressure"` + // sched_load balance flag + SchedLoadBalance uint64 `json:"sched_load_balance"` + // sched_relax_domain_level + SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"` +} + type MemoryData struct { Usage uint64 `json:"usage,omitempty"` MaxUsage uint64 `json:"max_usage,omitempty"` @@ -51,12 +82,33 @@ type MemoryStats struct { KernelUsage MemoryData `json:"kernel_usage,omitempty"` // usage of kernel TCP memory KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"` + // usage of memory pages by NUMA node + // see chapter 5.6 of memory controller documentation + PageUsageByNUMA 
PageUsageByNUMA `json:"page_usage_by_numa,omitempty"` // if true, memory usage is accounted for throughout a hierarchy of cgroups. UseHierarchy bool `json:"use_hierarchy"` Stats map[string]uint64 `json:"stats,omitempty"` } +type PageUsageByNUMA struct { + // Embedding is used as types can't be recursive. + PageUsageByNUMAInner + Hierarchical PageUsageByNUMAInner `json:"hierarchical,omitempty"` +} + +type PageUsageByNUMAInner struct { + Total PageStats `json:"total,omitempty"` + File PageStats `json:"file,omitempty"` + Anon PageStats `json:"anon,omitempty"` + Unevictable PageStats `json:"unevictable,omitempty"` +} + +type PageStats struct { + Total uint64 `json:"total,omitempty"` + Nodes map[uint8]uint64 `json:"nodes,omitempty"` +} + type PidsStats struct { // number of pids in the cgroup Current uint64 `json:"current,omitempty"` @@ -72,7 +124,7 @@ type BlkioStatEntry struct { } type BlkioStats struct { - // number of bytes tranferred to and from the block device + // number of bytes transferred to and from the block device IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"` IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"` IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"` @@ -92,13 +144,26 @@ type HugetlbStats struct { Failcnt uint64 `json:"failcnt"` } +type RdmaEntry struct { + Device string `json:"device,omitempty"` + HcaHandles uint32 `json:"hca_handles,omitempty"` + HcaObjects uint32 `json:"hca_objects,omitempty"` +} + +type RdmaStats struct { + RdmaLimit []RdmaEntry `json:"rdma_limit,omitempty"` + RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"` +} + type Stats struct { CpuStats CpuStats `json:"cpu_stats,omitempty"` + CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"` MemoryStats MemoryStats `json:"memory_stats,omitempty"` PidsStats PidsStats `json:"pids_stats,omitempty"` BlkioStats BlkioStats `json:"blkio_stats,omitempty"` // the map is in the format "size of 
hugepage: stats of the hugepage" HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` + RdmaStats RdmaStats `json:"rdma_stats,omitempty"` } func NewStats() *Stats { diff --git a/libcontainer/cgroups/systemd/apply_nosystemd.go b/libcontainer/cgroups/systemd/apply_nosystemd.go deleted file mode 100644 index ef0db5a..0000000 --- a/libcontainer/cgroups/systemd/apply_nosystemd.go +++ /dev/null @@ -1,67 +0,0 @@ -// +build !linux - -package systemd - -import ( - "fmt" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type Manager struct { - Cgroups *configs.Cgroup - Paths map[string]string -} - -func UseSystemd() bool { - return false -} - -func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) { - return nil, fmt.Errorf("Systemd not supported") -} - -func (m *Manager) Apply(pid int) error { - return fmt.Errorf("Systemd not supported") -} - -func (m *Manager) GetPids() ([]int, error) { - return nil, fmt.Errorf("Systemd not supported") -} - -func (m *Manager) GetAllPids() ([]int, error) { - return nil, fmt.Errorf("Systemd not supported") -} - -func (m *Manager) Destroy() error { - return fmt.Errorf("Systemd not supported") -} - -func (m *Manager) GetPaths() map[string]string { - return nil -} - -func (m *Manager) GetUnifiedPath() (string, error) { - return "", fmt.Errorf("Systemd not supported") -} - -func (m *Manager) GetStats() (*cgroups.Stats, error) { - return nil, fmt.Errorf("Systemd not supported") -} - -func (m *Manager) Set(container *configs.Config) error { - return fmt.Errorf("Systemd not supported") -} - -func (m *Manager) Freeze(state configs.FreezerState) error { - return fmt.Errorf("Systemd not supported") -} - -func Freeze(c *configs.Cgroup, state configs.FreezerState) error { - return fmt.Errorf("Systemd not supported") -} - -func (m *Manager) GetCgroups() (*configs.Cgroup, error) { - return nil, fmt.Errorf("Systemd 
not supported") -} diff --git a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go deleted file mode 100644 index c4b19b3..0000000 --- a/libcontainer/cgroups/systemd/apply_systemd.go +++ /dev/null @@ -1,534 +0,0 @@ -// +build linux - -package systemd - -import ( - "errors" - "fmt" - "io/ioutil" - "math" - "os" - "path/filepath" - "strings" - "sync" - "time" - - systemdDbus "github.com/coreos/go-systemd/dbus" - "github.com/godbus/dbus" - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fs" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/sirupsen/logrus" -) - -type LegacyManager struct { - mu sync.Mutex - Cgroups *configs.Cgroup - Paths map[string]string -} - -type subsystem interface { - // Name returns the name of the subsystem. - Name() string - // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. - GetStats(path string, stats *cgroups.Stats) error - // Set the cgroup represented by cgroup. 
- Set(path string, cgroup *configs.Cgroup) error -} - -var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") - -type subsystemSet []subsystem - -func (s subsystemSet) Get(name string) (subsystem, error) { - for _, ss := range s { - if ss.Name() == name { - return ss, nil - } - } - return nil, errSubsystemDoesNotExist -} - -var legacySubsystems = subsystemSet{ - &fs.CpusetGroup{}, - &fs.DevicesGroup{}, - &fs.MemoryGroup{}, - &fs.CpuGroup{}, - &fs.CpuacctGroup{}, - &fs.PidsGroup{}, - &fs.BlkioGroup{}, - &fs.HugetlbGroup{}, - &fs.PerfEventGroup{}, - &fs.FreezerGroup{}, - &fs.NetPrioGroup{}, - &fs.NetClsGroup{}, - &fs.NameGroup{GroupName: "name=systemd"}, -} - -const ( - testScopeWait = 4 - testSliceWait = 4 -) - -var ( - connLock sync.Mutex - theConn *systemdDbus.Conn -) - -func newProp(name string, units interface{}) systemdDbus.Property { - return systemdDbus.Property{ - Name: name, - Value: dbus.MakeVariant(units), - } -} - -// NOTE: This function comes from package github.com/coreos/go-systemd/util -// It was borrowed here to avoid a dependency on cgo. -// -// IsRunningSystemd checks whether the host was booted with systemd as its init -// system. This functions similarly to systemd's `sd_booted(3)`: internally, it -// checks whether /run/systemd/system/ exists and is a directory. 
-// http://www.freedesktop.org/software/systemd/man/sd_booted.html -func isRunningSystemd() bool { - fi, err := os.Lstat("/run/systemd/system") - if err != nil { - return false - } - return fi.IsDir() -} - -func UseSystemd() bool { - if !isRunningSystemd() { - return false - } - - connLock.Lock() - defer connLock.Unlock() - - if theConn == nil { - var err error - theConn, err = systemdDbus.New() - if err != nil { - return false - } - } - return true -} - -func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) { - if !isRunningSystemd() { - return nil, fmt.Errorf("systemd not running on this host, can't use systemd as a cgroups.Manager") - } - if cgroups.IsCgroup2UnifiedMode() { - return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { - return &UnifiedManager{ - Cgroups: config, - Paths: paths, - } - }, nil - } - return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { - return &LegacyManager{ - Cgroups: config, - Paths: paths, - } - }, nil -} - -func (m *LegacyManager) Apply(pid int) error { - var ( - c = m.Cgroups - unitName = getUnitName(c) - slice = "system.slice" - properties []systemdDbus.Property - ) - - if c.Paths != nil { - paths := make(map[string]string) - for name, path := range c.Paths { - _, err := getSubsystemPath(m.Cgroups, name) - if err != nil { - // Don't fail if a cgroup hierarchy was not found, just skip this subsystem - if cgroups.IsNotFound(err) { - continue - } - return err - } - paths[name] = path - } - m.Paths = paths - return cgroups.EnterPid(m.Paths, pid) - } - - if c.Parent != "" { - slice = c.Parent - } - - properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) - - // if we create a slice, the parent is defined via a Wants= - if strings.HasSuffix(unitName, ".slice") { - properties = append(properties, systemdDbus.PropWants(slice)) - } else { - // otherwise, we use Slice= - properties = 
append(properties, systemdDbus.PropSlice(slice)) - } - - // only add pid if its valid, -1 is used w/ general slice creation. - if pid != -1 { - properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) - } - - // Check if we can delegate. This is only supported on systemd versions 218 and above. - if !strings.HasSuffix(unitName, ".slice") { - // Assume scopes always support delegation. - properties = append(properties, newProp("Delegate", true)) - } - - // Always enable accounting, this gets us the same behaviour as the fs implementation, - // plus the kernel has some problems with joining the memory cgroup at a later time. - properties = append(properties, - newProp("MemoryAccounting", true), - newProp("CPUAccounting", true), - newProp("BlockIOAccounting", true)) - - // Assume DefaultDependencies= will always work (the check for it was previously broken.) - properties = append(properties, - newProp("DefaultDependencies", false)) - - if c.Resources.Memory != 0 { - properties = append(properties, - newProp("MemoryLimit", uint64(c.Resources.Memory))) - } - - if c.Resources.CpuShares != 0 { - properties = append(properties, - newProp("CPUShares", c.Resources.CpuShares)) - } - - // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. - if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { - // corresponds to USEC_INFINITY in systemd - // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd - // always setting a property value ensures we can apply a quota and remove it later - cpuQuotaPerSecUSec := uint64(math.MaxUint64) - if c.Resources.CpuQuota > 0 { - // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota - // (integer percentage of CPU) internally. This means that if a fractional percent of - // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest - // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. 
- cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod - if cpuQuotaPerSecUSec%10000 != 0 { - cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 - } - } - properties = append(properties, - newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) - } - - if c.Resources.BlkioWeight != 0 { - properties = append(properties, - newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) - } - - if c.Resources.PidsLimit > 0 { - properties = append(properties, - newProp("TasksAccounting", true), - newProp("TasksMax", uint64(c.Resources.PidsLimit))) - } - - // We have to set kernel memory here, as we can't change it once - // processes have been attached to the cgroup. - if c.Resources.KernelMemory != 0 { - if err := setKernelMemory(c); err != nil { - return err - } - } - - statusChan := make(chan string, 1) - if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil { - select { - case <-statusChan: - case <-time.After(time.Second): - logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. 
Continuing...", unitName) - } - } else if !isUnitExists(err) { - return err - } - - if err := joinCgroups(c, pid); err != nil { - return err - } - - paths := make(map[string]string) - for _, s := range legacySubsystems { - subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) - if err != nil { - // Don't fail if a cgroup hierarchy was not found, just skip this subsystem - if cgroups.IsNotFound(err) { - continue - } - return err - } - paths[s.Name()] = subsystemPath - } - m.Paths = paths - return nil -} - -func (m *LegacyManager) Destroy() error { - if m.Cgroups.Paths != nil { - return nil - } - m.mu.Lock() - defer m.mu.Unlock() - theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil) - if err := cgroups.RemovePaths(m.Paths); err != nil { - return err - } - m.Paths = make(map[string]string) - return nil -} - -func (m *LegacyManager) GetPaths() map[string]string { - m.mu.Lock() - paths := m.Paths - m.mu.Unlock() - return paths -} - -func (m *LegacyManager) GetUnifiedPath() (string, error) { - return "", errors.New("unified path is only supported when running in unified mode") -} - -func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { - path, err := getSubsystemPath(c, subsystem) - if err != nil { - return "", err - } - - if err := os.MkdirAll(path, 0755); err != nil { - return "", err - } - if err := cgroups.WriteCgroupProc(path, pid); err != nil { - return "", err - } - return path, nil -} - -func joinCgroups(c *configs.Cgroup, pid int) error { - for _, sys := range legacySubsystems { - name := sys.Name() - switch name { - case "name=systemd": - // let systemd handle this - case "cpuset": - path, err := getSubsystemPath(c, name) - if err != nil && !cgroups.IsNotFound(err) { - return err - } - s := &fs.CpusetGroup{} - if err := s.ApplyDir(path, c, pid); err != nil { - return err - } - default: - _, err := join(c, name, pid) - if err != nil { - // Even if it's `not found` error, we'll return err - // because devices cgroup is hard 
requirement for - // container security. - if name == "devices" { - return err - } - // For other subsystems, omit the `not found` error - // because they are optional. - if !cgroups.IsNotFound(err) { - return err - } - } - } - } - - return nil -} - -// systemd represents slice hierarchy using `-`, so we need to follow suit when -// generating the path of slice. Essentially, test-a-b.slice becomes -// /test.slice/test-a.slice/test-a-b.slice. -func ExpandSlice(slice string) (string, error) { - suffix := ".slice" - // Name has to end with ".slice", but can't be just ".slice". - if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { - return "", fmt.Errorf("invalid slice name: %s", slice) - } - - // Path-separators are not allowed. - if strings.Contains(slice, "/") { - return "", fmt.Errorf("invalid slice name: %s", slice) - } - - var path, prefix string - sliceName := strings.TrimSuffix(slice, suffix) - // if input was -.slice, we should just return root now - if sliceName == "-" { - return "/", nil - } - for _, component := range strings.Split(sliceName, "-") { - // test--a.slice isn't permitted, nor is -test.slice. - if component == "" { - return "", fmt.Errorf("invalid slice name: %s", slice) - } - - // Append the component to the path and to the prefix. 
- path += "/" + prefix + component + suffix - prefix += component + "-" - } - return path, nil -} - -func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { - mountpoint, err := cgroups.FindCgroupMountpoint(c.Path, subsystem) - if err != nil { - return "", err - } - - initPath, err := cgroups.GetInitCgroup(subsystem) - if err != nil { - return "", err - } - // if pid 1 is systemd 226 or later, it will be in init.scope, not the root - initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope") - - slice := "system.slice" - if c.Parent != "" { - slice = c.Parent - } - - slice, err = ExpandSlice(slice) - if err != nil { - return "", err - } - - return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil -} - -func (m *LegacyManager) Freeze(state configs.FreezerState) error { - path, err := getSubsystemPath(m.Cgroups, "freezer") - if err != nil { - return err - } - prevState := m.Cgroups.Resources.Freezer - m.Cgroups.Resources.Freezer = state - freezer, err := legacySubsystems.Get("freezer") - if err != nil { - return err - } - err = freezer.Set(path, m.Cgroups) - if err != nil { - m.Cgroups.Resources.Freezer = prevState - return err - } - return nil -} - -func (m *LegacyManager) GetPids() ([]int, error) { - path, err := getSubsystemPath(m.Cgroups, "devices") - if err != nil { - return nil, err - } - return cgroups.GetPids(path) -} - -func (m *LegacyManager) GetAllPids() ([]int, error) { - path, err := getSubsystemPath(m.Cgroups, "devices") - if err != nil { - return nil, err - } - return cgroups.GetAllPids(path) -} - -func (m *LegacyManager) GetStats() (*cgroups.Stats, error) { - m.mu.Lock() - defer m.mu.Unlock() - stats := cgroups.NewStats() - for name, path := range m.Paths { - sys, err := legacySubsystems.Get(name) - if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { - continue - } - if err := sys.GetStats(path, stats); err != nil { - return nil, err - } - } - - return stats, nil -} - -func (m *LegacyManager) 
Set(container *configs.Config) error { - // If Paths are set, then we are just joining cgroups paths - // and there is no need to set any values. - if m.Cgroups.Paths != nil { - return nil - } - for _, sys := range legacySubsystems { - // Get the subsystem path, but don't error out for not found cgroups. - path, err := getSubsystemPath(container.Cgroups, sys.Name()) - if err != nil && !cgroups.IsNotFound(err) { - return err - } - - if err := sys.Set(path, container.Cgroups); err != nil { - return err - } - } - - if m.Paths["cpu"] != "" { - if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { - return err - } - } - return nil -} - -func getUnitName(c *configs.Cgroup) string { - // by default, we create a scope unless the user explicitly asks for a slice. - if !strings.HasSuffix(c.Name, ".slice") { - return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name) - } - return c.Name -} - -func setKernelMemory(c *configs.Cgroup) error { - path, err := getSubsystemPath(c, "memory") - if err != nil && !cgroups.IsNotFound(err) { - return err - } - - if err := os.MkdirAll(path, 0755); err != nil { - return err - } - // do not try to enable the kernel memory if we already have - // tasks in the cgroup. - content, err := ioutil.ReadFile(filepath.Join(path, "tasks")) - if err != nil { - return err - } - if len(content) > 0 { - return nil - } - return fs.EnableKernelMemoryAccounting(path) -} - -// isUnitExists returns true if the error is that a systemd unit already exists. 
-func isUnitExists(err error) bool { - if err != nil { - if dbusError, ok := err.(dbus.Error); ok { - return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists") - } - } - return false -} - -func (m *LegacyManager) GetCgroups() (*configs.Cgroup, error) { - return m.Cgroups, nil -} diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go new file mode 100644 index 0000000..98ccc51 --- /dev/null +++ b/libcontainer/cgroups/systemd/common.go @@ -0,0 +1,528 @@ +package systemd + +import ( + "bufio" + "context" + "errors" + "fmt" + "math" + "os" + "regexp" + "strconv" + "strings" + "sync" + "time" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" + "github.com/sirupsen/logrus" + + cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" +) + +const ( + // Default kernel value for cpu quota period is 100000 us (100 ms), same for v1 and v2. + // v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and + // v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + defCPUQuotaPeriod = uint64(100000) +) + +var ( + versionOnce sync.Once + version int + + isRunningSystemdOnce sync.Once + isRunningSystemd bool +) + +// NOTE: This function comes from package github.com/coreos/go-systemd/util +// It was borrowed here to avoid a dependency on cgo. +// +// IsRunningSystemd checks whether the host was booted with systemd as its init +// system. This functions similarly to systemd's `sd_booted(3)`: internally, it +// checks whether /run/systemd/system/ exists and is a directory. 
+// http://www.freedesktop.org/software/systemd/man/sd_booted.html +func IsRunningSystemd() bool { + isRunningSystemdOnce.Do(func() { + fi, err := os.Lstat("/run/systemd/system") + isRunningSystemd = err == nil && fi.IsDir() + }) + return isRunningSystemd +} + +// systemd represents slice hierarchy using `-`, so we need to follow suit when +// generating the path of slice. Essentially, test-a-b.slice becomes +// /test.slice/test-a.slice/test-a-b.slice. +func ExpandSlice(slice string) (string, error) { + suffix := ".slice" + // Name has to end with ".slice", but can't be just ".slice". + if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Path-separators are not allowed. + if strings.Contains(slice, "/") { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + var path, prefix string + sliceName := strings.TrimSuffix(slice, suffix) + // if input was -.slice, we should just return root now + if sliceName == "-" { + return "/", nil + } + for _, component := range strings.Split(sliceName, "-") { + // test--a.slice isn't permitted, nor is -test.slice. + if component == "" { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Append the component to the path and to the prefix. + path += "/" + prefix + component + suffix + prefix += component + "-" + } + return path, nil +} + +func groupPrefix(ruleType devices.Type) (string, error) { + switch ruleType { + case devices.BlockDevice: + return "block-", nil + case devices.CharDevice: + return "char-", nil + default: + return "", fmt.Errorf("device type %v has no group prefix", ruleType) + } +} + +// findDeviceGroup tries to find the device group name (as listed in +// /proc/devices) with the type prefixed as required for DeviceAllow, for a +// given (type, major) combination. If more than one device group exists, an +// arbitrary one is chosen. 
+func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) { + fh, err := os.Open("/proc/devices") + if err != nil { + return "", err + } + defer fh.Close() + + prefix, err := groupPrefix(ruleType) + if err != nil { + return "", err + } + + scanner := bufio.NewScanner(fh) + var currentType devices.Type + for scanner.Scan() { + // We need to strip spaces because the first number is column-aligned. + line := strings.TrimSpace(scanner.Text()) + + // Handle the "header" lines. + switch line { + case "Block devices:": + currentType = devices.BlockDevice + continue + case "Character devices:": + currentType = devices.CharDevice + continue + case "": + continue + } + + // Skip lines unrelated to our type. + if currentType != ruleType { + continue + } + + // Parse out the (major, name). + var ( + currMajor int64 + currName string + ) + if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 { + if err == nil { + err = errors.New("wrong number of fields") + } + return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err) + } + + if currMajor == ruleMajor { + return prefix + currName, nil + } + } + if err := scanner.Err(); err != nil { + return "", fmt.Errorf("reading /proc/devices: %w", err) + } + // Couldn't find the device group. + return "", nil +} + +// DeviceAllow is the dbus type "a(ss)" which means we need a struct +// to represent it in Go. +type deviceAllowEntry struct { + Path string + Perms string +} + +func allowAllDevices() []systemdDbus.Property { + // Setting mode to auto and removing all DeviceAllow rules + // results in allowing access to all devices. + return []systemdDbus.Property{ + newProp("DevicePolicy", "auto"), + newProp("DeviceAllow", []deviceAllowEntry{}), + } +} + +// generateDeviceProperties takes the configured device rules and generates a +// corresponding set of systemd properties to configure the devices correctly. 
+func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, error) { + if r.SkipDevices { + return nil, nil + } + + properties := []systemdDbus.Property{ + // Always run in the strictest white-list mode. + newProp("DevicePolicy", "strict"), + // Empty the DeviceAllow array before filling it. + newProp("DeviceAllow", []deviceAllowEntry{}), + } + + // Figure out the set of rules. + configEmu := &cgroupdevices.Emulator{} + for _, rule := range r.Devices { + if err := configEmu.Apply(*rule); err != nil { + return nil, fmt.Errorf("unable to apply rule for systemd: %w", err) + } + } + // systemd doesn't support blacklists. So we log a warning, and tell + // systemd to act as a deny-all whitelist. This ruleset will be replaced + // with our normal fallback code. This may result in spurious errors, but + // the only other option is to error out here. + if configEmu.IsBlacklist() { + // However, if we're dealing with an allow-all rule then we can do it. + if configEmu.IsAllowAll() { + return allowAllDevices(), nil + } + logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule") + return properties, nil + } + + // Now generate the set of rules we actually need to apply. Unlike the + // normal devices cgroup, in "strict" mode systemd defaults to a deny-all + // whitelist which is the default for devices.Emulator. + finalRules, err := configEmu.Rules() + if err != nil { + return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err) + } + var deviceAllowList []deviceAllowEntry + for _, rule := range finalRules { + if !rule.Allow { + // Should never happen. + return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule) + } + switch rule.Type { + case devices.BlockDevice, devices.CharDevice: + default: + // Should never happen. 
+ return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type) + } + + entry := deviceAllowEntry{ + Perms: string(rule.Permissions), + } + + // systemd has a fairly odd (though understandable) syntax here, and + // because of the OCI configuration format we have to do quite a bit of + // trickery to convert things: + // + // * Concrete rules with non-wildcard major/minor numbers have to use + // /dev/{block,char} paths. This is slightly odd because it means + // that we cannot add whitelist rules for devices that don't exist, + // but there's not too much we can do about that. + // + // However, path globbing is not support for path-based rules so we + // need to handle wildcards in some other manner. + // + // * Wildcard-minor rules have to specify a "device group name" (the + // second column in /proc/devices). + // + // * Wildcard (major and minor) rules can just specify a glob with the + // type ("char-*" or "block-*"). + // + // The only type of rule we can't handle is wildcard-major rules, and + // so we'll give a warning in that case (note that the fallback code + // will insert any rules systemd couldn't handle). What amazing fun. + + if rule.Major == devices.Wildcard { + // "_ *:n _" rules aren't supported by systemd. + if rule.Minor != devices.Wildcard { + logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) + continue + } + + // "_ *:* _" rules just wildcard everything. + prefix, err := groupPrefix(rule.Type) + if err != nil { + return nil, err + } + entry.Path = prefix + "*" + } else if rule.Minor == devices.Wildcard { + // "_ n:* _" rules require a device group from /proc/devices. + group, err := findDeviceGroup(rule.Type, rule.Major) + if err != nil { + return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err) + } + if group == "" { + // Couldn't find a group. 
+ logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) + continue + } + entry.Path = group + } else { + // "_ n:m _" rules are just a path in /dev/{block,char}/. + switch rule.Type { + case devices.BlockDevice: + entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor) + case devices.CharDevice: + entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) + } + } + deviceAllowList = append(deviceAllowList, entry) + } + + properties = append(properties, newProp("DeviceAllow", deviceAllowList)) + return properties, nil +} + +func newProp(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +func getUnitName(c *configs.Cgroup) string { + // by default, we create a scope unless the user explicitly asks for a slice. + if !strings.HasSuffix(c.Name, ".slice") { + return c.ScopePrefix + "-" + c.Name + ".scope" + } + return c.Name +} + +// This code should be in sync with getUnitName. +func getUnitType(unitName string) string { + if strings.HasSuffix(unitName, ".slice") { + return "Slice" + } + return "Scope" +} + +// isDbusError returns true if the error is a specific dbus error. +func isDbusError(err error, name string) bool { + if err != nil { + var derr dbus.Error + if errors.As(err, &derr) { + return strings.Contains(derr.Name, name) + } + } + return false +} + +// isUnitExists returns true if the error is that a systemd unit already exists. 
+func isUnitExists(err error) bool { + return isDbusError(err, "org.freedesktop.systemd1.UnitExists") +} + +func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property) error { + statusChan := make(chan string, 1) + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) + return err + }) + if err == nil { + timeout := time.NewTimer(30 * time.Second) + defer timeout.Stop() + + select { + case s := <-statusChan: + close(statusChan) + // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit + if s != "done" { + resetFailedUnit(cm, unitName) + return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) + } + case <-timeout.C: + resetFailedUnit(cm, unitName) + return errors.New("Timeout waiting for systemd to create " + unitName) + } + } else if !isUnitExists(err) { + return err + } + + return nil +} + +func stopUnit(cm *dbusConnManager, unitName string) error { + statusChan := make(chan string, 1) + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + _, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan) + return err + }) + if err == nil { + timeout := time.NewTimer(30 * time.Second) + defer timeout.Stop() + + select { + case s := <-statusChan: + close(statusChan) + // Please refer to https://godoc.org/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit + if s != "done" { + logrus.Warnf("error removing unit `%s`: got `%s`. 
Continuing...", unitName, s) + } + case <-timeout.C: + return errors.New("Timed out while waiting for systemd to remove " + unitName) + } + } + return nil +} + +func resetFailedUnit(cm *dbusConnManager, name string) { + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + return c.ResetFailedUnitContext(context.TODO(), name) + }) + if err != nil { + logrus.Warnf("unable to reset failed unit: %v", err) + } +} + +func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) { + var prop *systemdDbus.Property + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) { + prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName) + return Err + }) + return prop, err +} + +func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error { + return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...) + }) +} + +func getManagerProperty(cm *dbusConnManager, name string) (string, error) { + str := "" + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + var err error + str, err = c.GetManagerProperty(name) + return err + }) + if err != nil { + return "", err + } + return strconv.Unquote(str) +} + +func systemdVersion(cm *dbusConnManager) int { + versionOnce.Do(func() { + version = -1 + verStr, err := getManagerProperty(cm, "Version") + if err == nil { + version, err = systemdVersionAtoi(verStr) + } + + if err != nil { + logrus.WithError(err).Error("unable to get systemd version") + } + }) + + return version +} + +func systemdVersionAtoi(verStr string) (int, error) { + // verStr should be of the form: + // "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes). + // The result for all of the above should be 245. + // Thus, we unconditionally remove the "v" prefix + // and then match on the first integer we can grab. 
+ re := regexp.MustCompile(`v?([0-9]+)`) + matches := re.FindStringSubmatch(verStr) + if len(matches) < 2 { + return 0, fmt.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches) + } + ver, err := strconv.Atoi(matches[1]) + if err != nil { + return -1, fmt.Errorf("can't parse version: %w", err) + } + return ver, nil +} + +func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) { + if period != 0 { + // systemd only supports CPUQuotaPeriodUSec since v242 + sdVer := systemdVersion(cm) + if sdVer >= 242 { + *properties = append(*properties, + newProp("CPUQuotaPeriodUSec", period)) + } else { + logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodSec "+ + " (setting will still be applied to cgroupfs)", sdVer) + } + } + if quota != 0 || period != 0 { + // corresponds to USEC_INFINITY in systemd + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if quota > 0 { + if period == 0 { + // assume the default + period = defCPUQuotaPeriod + } + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. 
+ cpuQuotaPerSecUSec = uint64(quota*1000000) / period + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } + } + *properties = append(*properties, + newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) + } +} + +func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error { + if cpus == "" && mems == "" { + return nil + } + + // systemd only supports AllowedCPUs/AllowedMemoryNodes since v244 + sdVer := systemdVersion(cm) + if sdVer < 244 { + logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+ + " (settings will still be applied to cgroupfs)", sdVer) + return nil + } + + if cpus != "" { + bits, err := RangeToBits(cpus) + if err != nil { + return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w", + cpus, err) + } + *props = append(*props, + newProp("AllowedCPUs", bits)) + } + if mems != "" { + bits, err := RangeToBits(mems) + if err != nil { + return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w", + mems, err) + } + *props = append(*props, + newProp("AllowedMemoryNodes", bits)) + } + return nil +} diff --git a/libcontainer/cgroups/systemd/cpuset.go b/libcontainer/cgroups/systemd/cpuset.go new file mode 100644 index 0000000..83d10dd --- /dev/null +++ b/libcontainer/cgroups/systemd/cpuset.go @@ -0,0 +1,55 @@ +package systemd + +import ( + "errors" + "math/big" + "strconv" + "strings" +) + +// RangeToBits converts a text representation of a CPU mask (as written to +// or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes +// with the corresponding bits set (as consumed by systemd over dbus as +// AllowedCPUs/AllowedMemoryNodes unit property value). 
+func RangeToBits(str string) ([]byte, error) { + bits := new(big.Int) + + for _, r := range strings.Split(str, ",") { + // allow extra spaces around + r = strings.TrimSpace(r) + // allow empty elements (extra commas) + if r == "" { + continue + } + ranges := strings.SplitN(r, "-", 2) + if len(ranges) > 1 { + start, err := strconv.ParseUint(ranges[0], 10, 32) + if err != nil { + return nil, err + } + end, err := strconv.ParseUint(ranges[1], 10, 32) + if err != nil { + return nil, err + } + if start > end { + return nil, errors.New("invalid range: " + r) + } + for i := start; i <= end; i++ { + bits.SetBit(bits, int(i), 1) + } + } else { + val, err := strconv.ParseUint(ranges[0], 10, 32) + if err != nil { + return nil, err + } + bits.SetBit(bits, int(val), 1) + } + } + + ret := bits.Bytes() + if len(ret) == 0 { + // do not allow empty values + return nil, errors.New("empty value") + } + return ret, nil +} diff --git a/libcontainer/cgroups/systemd/cpuset_test.go b/libcontainer/cgroups/systemd/cpuset_test.go new file mode 100644 index 0000000..3030cba --- /dev/null +++ b/libcontainer/cgroups/systemd/cpuset_test.go @@ -0,0 +1,55 @@ +package systemd + +import ( + "bytes" + "testing" +) + +func TestRangeToBits(t *testing.T) { + testCases := []struct { + in string + out []byte + isErr bool + }{ + {in: "", isErr: true}, + {in: "0", out: []byte{1}}, + {in: "1", out: []byte{2}}, + {in: "0-1", out: []byte{3}}, + {in: "0,1", out: []byte{3}}, + {in: ",0,1,", out: []byte{3}}, + {in: "0-3", out: []byte{0x0f}}, + {in: "0,1,2-3", out: []byte{0x0f}}, + {in: "4-7", out: []byte{0xf0}}, + {in: "0-7", out: []byte{0xff}}, + {in: "0-15", out: []byte{0xff, 0xff}}, + {in: "16", out: []byte{1, 0, 0}}, + {in: "0-3,32-33", out: []byte{3, 0, 0, 0, 0x0f}}, + // extra spaces and tabs are ok + {in: "1, 2, 1-2", out: []byte{6}}, + {in: " , 1 , 3 , 5-7, ", out: []byte{0xea}}, + // somewhat large values + {in: "128-130,1", out: []byte{7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2}}, + + {in: "-", 
isErr: true}, + {in: "1-", isErr: true}, + {in: "-3", isErr: true}, + // bad range (start > end) + {in: "54-53", isErr: true}, + // kernel does not allow extra spaces inside a range + {in: "1 - 2", isErr: true}, + } + + for _, tc := range testCases { + out, err := RangeToBits(tc.in) + if err != nil { + if !tc.isErr { + t.Errorf("case %q: unexpected error: %v", tc.in, err) + } + + continue + } + if !bytes.Equal(out, tc.out) { + t.Errorf("case %q: expected %v, got %v", tc.in, tc.out, out) + } + } +} diff --git a/libcontainer/cgroups/systemd/dbus.go b/libcontainer/cgroups/systemd/dbus.go new file mode 100644 index 0000000..3e547e2 --- /dev/null +++ b/libcontainer/cgroups/systemd/dbus.go @@ -0,0 +1,100 @@ +package systemd + +import ( + "context" + "fmt" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" +) + +var ( + dbusC *systemdDbus.Conn + dbusMu sync.RWMutex + dbusInited bool + dbusRootless bool +) + +type dbusConnManager struct{} + +// newDbusConnManager initializes systemd dbus connection manager. +func newDbusConnManager(rootless bool) *dbusConnManager { + dbusMu.Lock() + defer dbusMu.Unlock() + if dbusInited && rootless != dbusRootless { + panic("can't have both root and rootless dbus") + } + dbusInited = true + dbusRootless = rootless + return &dbusConnManager{} +} + +// getConnection lazily initializes and returns systemd dbus connection. +func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) { + // In the case where dbusC != nil + // Use the read lock the first time to ensure + // that Conn can be acquired at the same time. 
+ dbusMu.RLock() + if conn := dbusC; conn != nil { + dbusMu.RUnlock() + return conn, nil + } + dbusMu.RUnlock() + + // In the case where dbusC == nil + // Use write lock to ensure that only one + // will be created + dbusMu.Lock() + defer dbusMu.Unlock() + if conn := dbusC; conn != nil { + return conn, nil + } + + conn, err := d.newConnection() + if err != nil { + // When dbus-user-session is not installed, we can't detect whether we should try to connect to user dbus or system dbus, so d.dbusRootless is set to false. + // This may fail with a cryptic error "read unix @->/run/systemd/private: read: connection reset by peer: unknown." + // https://github.com/moby/moby/issues/42793 + return nil, fmt.Errorf("failed to connect to dbus (hint: for rootless containers, maybe you need to install dbus-user-session package, see https://github.com/opencontainers/runc/blob/master/docs/cgroup-v2.md): %w", err) + } + dbusC = conn + return conn, nil +} + +func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) { + if dbusRootless { + return newUserSystemdDbus() + } + return systemdDbus.NewWithContext(context.TODO()) +} + +// resetConnection resets the connection to its initial state +// (so it can be reconnected if necessary). +func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) { + dbusMu.Lock() + defer dbusMu.Unlock() + if dbusC != nil && dbusC == conn { + dbusC.Close() + dbusC = nil + } +} + +var errDbusConnClosed = dbus.ErrClosed.Error() + +// retryOnDisconnect calls op, and if the error it returns is about closed dbus +// connection, the connection is re-established and the op is retried. This helps +// with the situation when dbus is restarted and we have a stale connection. 
+func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) error { + for { + conn, err := d.getConnection() + if err != nil { + return err + } + err = op(conn) + if !isDbusError(err, errDbusConnClosed) { + return err + } + d.resetConnection(conn) + } +} diff --git a/libcontainer/cgroups/systemd/systemd_test.go b/libcontainer/cgroups/systemd/systemd_test.go new file mode 100644 index 0000000..7417bf2 --- /dev/null +++ b/libcontainer/cgroups/systemd/systemd_test.go @@ -0,0 +1,456 @@ +package systemd + +import ( + "bufio" + "bytes" + "os" + "os/exec" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" +) + +func newManager(t *testing.T, config *configs.Cgroup) (m cgroups.Manager) { + t.Helper() + var err error + + if cgroups.IsCgroup2UnifiedMode() { + m, err = NewUnifiedManager(config, "") + } else { + m, err = NewLegacyManager(config, nil) + } + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = m.Destroy() }) + + return m +} + +func TestSystemdVersion(t *testing.T) { + systemdVersionTests := []struct { + verStr string + expectedVer int + expectErr bool + }{ + {`"219"`, 219, false}, + {`"v245.4-1.fc32"`, 245, false}, + {`"241-1"`, 241, false}, + {`"v241-1"`, 241, false}, + {"NaN", 0, true}, + {"", 0, true}, + } + for _, sdTest := range systemdVersionTests { + ver, err := systemdVersionAtoi(sdTest.verStr) + if !sdTest.expectErr && err != nil { + t.Errorf("systemdVersionAtoi(%s); want nil; got %v", sdTest.verStr, err) + } + if sdTest.expectErr && err == nil { + t.Errorf("systemdVersionAtoi(%s); wanted failure; got nil", sdTest.verStr) + } + if ver != sdTest.expectedVer { + t.Errorf("systemdVersionAtoi(%s); want %d; got %d", sdTest.verStr, sdTest.expectedVer, ver) + } + } +} + +func TestValidUnitTypes(t *testing.T) { + testCases := []struct { + unitName string + expectedUnitType string + }{ + 
{"system.slice", "Slice"}, + {"kubepods.slice", "Slice"}, + {"testing-container:ab.scope", "Scope"}, + } + for _, sdTest := range testCases { + unitType := getUnitType(sdTest.unitName) + if unitType != sdTest.expectedUnitType { + t.Errorf("getUnitType(%s); want %q; got %q", sdTest.unitName, sdTest.expectedUnitType, unitType) + } + } +} + +// TestPodSkipDevicesUpdate checks that updating a pod having SkipDevices: true +// does not result in spurious "permission denied" errors in a container +// running under the pod. The test is somewhat similar in nature to the +// @test "update devices [minimal transition rules]" in tests/integration, +// but uses a pod. +func TestPodSkipDevicesUpdate(t *testing.T) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podName := "system-runc_test_pod" + t.Name() + ".slice" + podConfig := &configs.Cgroup{ + Systemd: true, + Parent: "system.slice", + Name: podName, + Resources: &configs.Resources{ + PidsLimit: 42, + Memory: 32 * 1024 * 1024, + SkipDevices: true, + }, + } + // Create "pod" cgroup (a systemd slice to hold containers). + pm := newManager(t, podConfig) + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + + containerConfig := &configs.Cgroup{ + Parent: podName, + ScopePrefix: "test", + Name: "PodSkipDevicesUpdate", + Resources: &configs.Resources{ + Devices: []*devices.Rule{ + // Allow access to /dev/null. + { + Type: devices.CharDevice, + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, + }, + }, + } + + // Create a "container" within the "pod" cgroup. + // This is not a real container, just a process in the cgroup. 
+ cmd := exec.Command("bash", "-c", "while true; do echo > /dev/null; done") + cmd.Env = append(os.Environ(), "LANG=C") + var stderr bytes.Buffer + cmd.Stderr = &stderr + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + + // Put the process into a cgroup. + cm := newManager(t, containerConfig) + if err := cm.Apply(cmd.Process.Pid); err != nil { + t.Fatal(err) + } + // Check that we put the "container" into the "pod" cgroup. + if !strings.HasPrefix(cm.Path("devices"), pm.Path("devices")) { + t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", + cm.Path("devices"), pm.Path("devices")) + } + if err := cm.Set(containerConfig.Resources); err != nil { + t.Fatal(err) + } + + // Now update the pod a few times. + for i := 0; i < 42; i++ { + podConfig.Resources.PidsLimit++ + podConfig.Resources.Memory += 1024 * 1024 + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + } + // Kill the "container". + if err := cmd.Process.Kill(); err != nil { + t.Fatal(err) + } + + _ = cmd.Wait() + + // "Container" stderr should be empty. + if stderr.Len() != 0 { + t.Fatalf("container stderr not empty: %s", stderr.String()) + } +} + +func testSkipDevices(t *testing.T, skipDevices bool, expected []string) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podConfig := &configs.Cgroup{ + Parent: "system.slice", + Name: "system-runc_test_pods.slice", + Resources: &configs.Resources{ + SkipDevices: skipDevices, + }, + } + // Create "pods" cgroup (a systemd slice to hold containers). 
+ pm := newManager(t, podConfig) + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + + config := &configs.Cgroup{ + Parent: "system-runc_test_pods.slice", + ScopePrefix: "test", + Name: "SkipDevices", + Resources: &configs.Resources{ + Devices: []*devices.Rule{ + // Allow access to /dev/full only. + { + Type: devices.CharDevice, + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, + }, + }, + } + + // Create a "container" within the "pods" cgroup. + // This is not a real container, just a process in the cgroup. + cmd := exec.Command("bash", "-c", "read; echo > /dev/full; cat /dev/null; true") + cmd.Env = append(os.Environ(), "LANG=C") + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + cmd.Stdin = stdinR + var stderr bytes.Buffer + cmd.Stderr = &stderr + err = cmd.Start() + stdinR.Close() + defer stdinW.Close() + if err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _, _ = stdinW.WriteString("hey\n") + _ = cmd.Wait() + }() + + // Put the process into a cgroup. + m := newManager(t, config) + if err := m.Apply(cmd.Process.Pid); err != nil { + t.Fatal(err) + } + // Check that we put the "container" into the "pod" cgroup. + if !strings.HasPrefix(m.Path("devices"), pm.Path("devices")) { + t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", + m.Path("devices"), pm.Path("devices")) + } + if err := m.Set(config.Resources); err != nil { + // failed to write "c 1:7 rwm": write /sys/fs/cgroup/devices/system.slice/system-runc_test_pods.slice/test-SkipDevices.scope/devices.allow: operation not permitted + if skipDevices == false && strings.HasSuffix(err.Error(), "/devices.allow: operation not permitted") { + // Cgroup v1 devices controller gives EPERM on trying + // to enable devices that are not enabled + // (skipDevices=false) in a parent cgroup. 
+ // If this happens, test is passing. + return + } + t.Fatal(err) + } + + // Check that we can access /dev/full but not /dev/zero. + if _, err := stdinW.WriteString("wow\n"); err != nil { + t.Fatal(err) + } + if err := cmd.Wait(); err != nil { + t.Fatal(err) + } + for _, exp := range expected { + if !strings.Contains(stderr.String(), exp) { + t.Errorf("expected %q, got: %s", exp, stderr.String()) + } + } +} + +func TestSkipDevicesTrue(t *testing.T) { + testSkipDevices(t, true, []string{ + "echo: write error: No space left on device", + "cat: /dev/null: Operation not permitted", + }) +} + +func TestSkipDevicesFalse(t *testing.T) { + // If SkipDevices is not set for the parent slice, access to both + // devices should fail. This is done to assess the test correctness. + // For cgroup v1, we check for m.Set returning EPERM. + // For cgroup v2, we check for the errors below. + testSkipDevices(t, false, []string{ + "/dev/full: Operation not permitted", + "cat: /dev/null: Operation not permitted", + }) +} + +func TestUnitExistsIgnored(t *testing.T) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podConfig := &configs.Cgroup{ + Parent: "system.slice", + Name: "system-runc_test_exists.slice", + Resources: &configs.Resources{}, + } + // Create "pods" cgroup (a systemd slice to hold containers). + pm := newManager(t, podConfig) + + // create twice to make sure "UnitExists" error is ignored. 
+ for i := 0; i < 2; i++ { + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + } +} + +func TestFreezePodCgroup(t *testing.T) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podConfig := &configs.Cgroup{ + Parent: "system.slice", + Name: "system-runc_test_pod.slice", + Resources: &configs.Resources{ + SkipDevices: true, + Freezer: configs.Frozen, + }, + } + // Create a "pod" cgroup (a systemd slice to hold containers), + // which is frozen initially. + pm := newManager(t, podConfig) + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + + // Check the pod is frozen. + pf, err := pm.GetFreezerState() + if err != nil { + t.Fatal(err) + } + if pf != configs.Frozen { + t.Fatalf("expected pod to be frozen, got %v", pf) + } + + // Create a "container" within the "pod" cgroup. + // This is not a real container, just a process in the cgroup. + containerConfig := &configs.Cgroup{ + Parent: "system-runc_test_pod.slice", + ScopePrefix: "test", + Name: "inner-container", + Resources: &configs.Resources{}, + } + + cmd := exec.Command("bash", "-c", "while read; do echo $REPLY; done") + cmd.Env = append(os.Environ(), "LANG=C") + + // Setup stdin. + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + cmd.Stdin = stdinR + + // Setup stdout. + stdoutR, stdoutW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + cmd.Stdout = stdoutW + rdr := bufio.NewReader(stdoutR) + + // Setup stderr. + var stderr bytes.Buffer + cmd.Stderr = &stderr + + err = cmd.Start() + stdinR.Close() + stdoutW.Close() + defer func() { + _ = stdinW.Close() + _ = stdoutR.Close() + }() + if err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + + // Put the process into a cgroup. 
+ cm := newManager(t, containerConfig) + + if err := cm.Apply(cmd.Process.Pid); err != nil { + t.Fatal(err) + } + if err := cm.Set(containerConfig.Resources); err != nil { + t.Fatal(err) + } + // Check that we put the "container" into the "pod" cgroup. + if !strings.HasPrefix(cm.Path("freezer"), pm.Path("freezer")) { + t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", + cm.Path("freezer"), pm.Path("freezer")) + } + // Check the container is not reported as frozen despite the frozen parent. + cf, err := cm.GetFreezerState() + if err != nil { + t.Fatal(err) + } + if cf != configs.Thawed { + t.Fatalf("expected container to be thawed, got %v", cf) + } + + // Unfreeze the pod. + if err := pm.Freeze(configs.Thawed); err != nil { + t.Fatal(err) + } + + cf, err = cm.GetFreezerState() + if err != nil { + t.Fatal(err) + } + if cf != configs.Thawed { + t.Fatalf("expected container to be thawed, got %v", cf) + } + + // Check the "container" works. + marker := "one two\n" + _, err = stdinW.WriteString(marker) + if err != nil { + t.Fatal(err) + } + reply, err := rdr.ReadString('\n') + if err != nil { + t.Fatalf("reading from container: %v", err) + } + if reply != marker { + t.Fatalf("expected %q, got %q", marker, reply) + } +} diff --git a/libcontainer/cgroups/systemd/unified_hierarchy.go b/libcontainer/cgroups/systemd/unified_hierarchy.go deleted file mode 100644 index 6605099..0000000 --- a/libcontainer/cgroups/systemd/unified_hierarchy.go +++ /dev/null @@ -1,312 +0,0 @@ -// +build linux - -package systemd - -import ( - "fmt" - "io/ioutil" - "math" - "os" - "path/filepath" - "strings" - "sync" - "time" - - systemdDbus "github.com/coreos/go-systemd/dbus" - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fs2" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" -) - -type UnifiedManager struct { - mu sync.Mutex - Cgroups 
*configs.Cgroup - Paths map[string]string -} - -func (m *UnifiedManager) Apply(pid int) error { - var ( - c = m.Cgroups - unitName = getUnitName(c) - slice = "system.slice" - properties []systemdDbus.Property - ) - - if c.Paths != nil { - paths := make(map[string]string) - for name, path := range c.Paths { - _, err := getSubsystemPath(m.Cgroups, name) - if err != nil { - // Don't fail if a cgroup hierarchy was not found, just skip this subsystem - if cgroups.IsNotFound(err) { - continue - } - return err - } - paths[name] = path - } - m.Paths = paths - return cgroups.EnterPid(m.Paths, pid) - } - - if c.Parent != "" { - slice = c.Parent - } - - properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) - - // if we create a slice, the parent is defined via a Wants= - if strings.HasSuffix(unitName, ".slice") { - properties = append(properties, systemdDbus.PropWants(slice)) - } else { - // otherwise, we use Slice= - properties = append(properties, systemdDbus.PropSlice(slice)) - } - - // only add pid if its valid, -1 is used w/ general slice creation. - if pid != -1 { - properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) - } - - // Check if we can delegate. This is only supported on systemd versions 218 and above. - if !strings.HasSuffix(unitName, ".slice") { - // Assume scopes always support delegation. - properties = append(properties, newProp("Delegate", true)) - } - - // Always enable accounting, this gets us the same behaviour as the fs implementation, - // plus the kernel has some problems with joining the memory cgroup at a later time. - properties = append(properties, - newProp("MemoryAccounting", true), - newProp("CPUAccounting", true), - newProp("BlockIOAccounting", true)) - - // Assume DefaultDependencies= will always work (the check for it was previously broken.) 
- properties = append(properties, - newProp("DefaultDependencies", false)) - - if c.Resources.Memory != 0 { - properties = append(properties, - newProp("MemoryLimit", uint64(c.Resources.Memory))) - } - - if c.Resources.CpuShares != 0 { - properties = append(properties, - newProp("CPUShares", c.Resources.CpuShares)) - } - - // cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd. - if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 { - // corresponds to USEC_INFINITY in systemd - // if USEC_INFINITY is provided, CPUQuota is left unbound by systemd - // always setting a property value ensures we can apply a quota and remove it later - cpuQuotaPerSecUSec := uint64(math.MaxUint64) - if c.Resources.CpuQuota > 0 { - // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota - // (integer percentage of CPU) internally. This means that if a fractional percent of - // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest - // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. - cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod - if cpuQuotaPerSecUSec%10000 != 0 { - cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 - } - } - properties = append(properties, - newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) - } - - if c.Resources.BlkioWeight != 0 { - properties = append(properties, - newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) - } - - if c.Resources.PidsLimit > 0 { - properties = append(properties, - newProp("TasksAccounting", true), - newProp("TasksMax", uint64(c.Resources.PidsLimit))) - } - - // We have to set kernel memory here, as we can't change it once - // processes have been attached to the cgroup. 
- if c.Resources.KernelMemory != 0 { - if err := setKernelMemory(c); err != nil { - return err - } - } - - statusChan := make(chan string, 1) - if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil { - select { - case <-statusChan: - case <-time.After(time.Second): - logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName) - } - } else if !isUnitExists(err) { - return err - } - - if err := joinCgroupsV2(c, pid); err != nil { - return err - } - - path, err := getSubsystemPath(m.Cgroups, "") - if err != nil { - return err - } - m.Paths = map[string]string{ - "pids": path, - "memory": path, - "io": path, - "cpu": path, - "devices": path, - "cpuset": path, - "freezer": path, - } - return nil -} - -func (m *UnifiedManager) Destroy() error { - if m.Cgroups.Paths != nil { - return nil - } - m.mu.Lock() - defer m.mu.Unlock() - theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil) - if err := cgroups.RemovePaths(m.Paths); err != nil { - return err - } - m.Paths = make(map[string]string) - return nil -} - -func (m *UnifiedManager) GetPaths() map[string]string { - m.mu.Lock() - paths := m.Paths - m.mu.Unlock() - return paths -} -func (m *UnifiedManager) GetUnifiedPath() (string, error) { - unifiedPath := "" - m.mu.Lock() - defer m.mu.Unlock() - for k, v := range m.Paths { - if unifiedPath == "" { - unifiedPath = v - } else if v != unifiedPath { - return unifiedPath, - errors.Errorf("expected %q path to be unified path %q, got %q", k, unifiedPath, v) - } - } - if unifiedPath == "" { - // FIXME: unified path could be detected even when no controller is available - return unifiedPath, errors.New("cannot detect unified path") - } - return unifiedPath, nil -} -func createCgroupsv2Path(path string) (Err error) { - content, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers") - if err != nil { - return err - } - if !filepath.HasPrefix(path, "/sys/fs/cgroup") { - return 
fmt.Errorf("invalid cgroup path %s", path) - } - - res := "" - for i, c := range strings.Split(strings.TrimSpace(string(content)), " ") { - if i == 0 { - res = fmt.Sprintf("+%s", c) - } else { - res = res + fmt.Sprintf(" +%s", c) - } - } - resByte := []byte(res) - - current := "/sys/fs" - elements := strings.Split(path, "/") - for i, e := range elements[3:] { - current = filepath.Join(current, e) - if i > 0 { - if err := os.Mkdir(current, 0755); err != nil { - if !os.IsExist(err) { - return err - } - } else { - // If the directory was created, be sure it is not left around on errors. - defer func() { - if Err != nil { - os.Remove(current) - } - }() - } - } - if i < len(elements[3:])-1 { - if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), resByte, 0755); err != nil { - return err - } - } - } - return nil -} - -func joinCgroupsV2(c *configs.Cgroup, pid int) error { - path, err := getSubsystemPath(c, "memory") - if err != nil { - return err - } - return createCgroupsv2Path(path) -} - -func (m *UnifiedManager) fsManager() (cgroups.Manager, error) { - path, err := m.GetUnifiedPath() - if err != nil { - return nil, err - } - return fs2.NewManager(m.Cgroups, path, false) -} - -func (m *UnifiedManager) Freeze(state configs.FreezerState) error { - fsMgr, err := m.fsManager() - if err != nil { - return err - } - return fsMgr.Freeze(state) -} - -func (m *UnifiedManager) GetPids() ([]int, error) { - path, err := m.GetUnifiedPath() - if err != nil { - return nil, err - } - return cgroups.GetPids(path) -} - -func (m *UnifiedManager) GetAllPids() ([]int, error) { - path, err := m.GetUnifiedPath() - if err != nil { - return nil, err - } - return cgroups.GetAllPids(path) -} - -func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) { - fsMgr, err := m.fsManager() - if err != nil { - return nil, err - } - return fsMgr.GetStats() -} - -func (m *UnifiedManager) Set(container *configs.Config) error { - fsMgr, err := m.fsManager() - if err != nil { - 
return err - } - return fsMgr.Set(container) -} - -func (m *UnifiedManager) GetCgroups() (*configs.Cgroup, error) { - return m.Cgroups, nil -} diff --git a/libcontainer/cgroups/systemd/user.go b/libcontainer/cgroups/systemd/user.go new file mode 100644 index 0000000..0f50f76 --- /dev/null +++ b/libcontainer/cgroups/systemd/user.go @@ -0,0 +1,106 @@ +package systemd + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" + + "github.com/opencontainers/runc/libcontainer/userns" +) + +// newUserSystemdDbus creates a connection for systemd user-instance. +func newUserSystemdDbus() (*systemdDbus.Conn, error) { + addr, err := DetectUserDbusSessionBusAddress() + if err != nil { + return nil, err + } + uid, err := DetectUID() + if err != nil { + return nil, err + } + + return systemdDbus.NewConnection(func() (*dbus.Conn, error) { + conn, err := dbus.Dial(addr) + if err != nil { + return nil, fmt.Errorf("error while dialing %q: %w", addr, err) + } + methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))} + err = conn.Auth(methods) + if err != nil { + conn.Close() + return nil, fmt.Errorf("error while authenticating connection (address=%q, UID=%d): %w", addr, uid, err) + } + if err = conn.Hello(); err != nil { + conn.Close() + return nil, fmt.Errorf("error while sending Hello message (address=%q, UID=%d): %w", addr, uid, err) + } + return conn, nil + }) +} + +// DetectUID detects UID from the OwnerUID field of `busctl --user status` +// if running in userNS. The value corresponds to sd_bus_creds_get_owner_uid(3) . +// +// Otherwise returns os.Getuid() . 
+func DetectUID() (int, error) { + if !userns.RunningInUserNS() { + return os.Getuid(), nil + } + b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput() + if err != nil { + return -1, fmt.Errorf("could not execute `busctl --user --no-pager status` (output: %q): %w", string(b), err) + } + scanner := bufio.NewScanner(bytes.NewReader(b)) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if strings.HasPrefix(s, "OwnerUID=") { + uidStr := strings.TrimPrefix(s, "OwnerUID=") + i, err := strconv.Atoi(uidStr) + if err != nil { + return -1, fmt.Errorf("could not detect the OwnerUID: %w", err) + } + return i, nil + } + } + if err := scanner.Err(); err != nil { + return -1, err + } + return -1, errors.New("could not detect the OwnerUID") +} + +// DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS if set. +// Otherwise returns "unix:path=$XDG_RUNTIME_DIR/bus" if $XDG_RUNTIME_DIR/bus exists. +// Otherwise parses the value from `systemctl --user show-environment` . +func DetectUserDbusSessionBusAddress() (string, error) { + if env := os.Getenv("DBUS_SESSION_BUS_ADDRESS"); env != "" { + return env, nil + } + if xdr := os.Getenv("XDG_RUNTIME_DIR"); xdr != "" { + busPath := filepath.Join(xdr, "bus") + if _, err := os.Stat(busPath); err == nil { + busAddress := "unix:path=" + busPath + return busAddress, nil + } + } + b, err := exec.Command("systemctl", "--user", "--no-pager", "show-environment").CombinedOutput() + if err != nil { + return "", fmt.Errorf("could not execute `systemctl --user --no-pager show-environment` (output=%q): %w", string(b), err) + } + scanner := bufio.NewScanner(bytes.NewReader(b)) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if strings.HasPrefix(s, "DBUS_SESSION_BUS_ADDRESS=") { + return strings.TrimPrefix(s, "DBUS_SESSION_BUS_ADDRESS="), nil + } + } + return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`. 
Make sure you have installed the dbus-user-session or dbus-daemon package and then run: `systemctl --user start dbus`") +} diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go new file mode 100644 index 0000000..a74a05a --- /dev/null +++ b/libcontainer/cgroups/systemd/v1.go @@ -0,0 +1,477 @@ +package systemd + +import ( + "errors" + "os" + "path/filepath" + "reflect" + "strings" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/godbus/dbus/v5" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type legacyManager struct { + mu sync.Mutex + cgroups *configs.Cgroup + paths map[string]string + dbus *dbusConnManager +} + +func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) { + if cg.Rootless { + return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1") + } + if cg.Resources != nil && cg.Resources.Unified != nil { + return nil, cgroups.ErrV1NoUnified + } + if paths == nil { + var err error + paths, err = initPaths(cg) + if err != nil { + return nil, err + } + } + return &legacyManager{ + cgroups: cg, + paths: paths, + dbus: newDbusConnManager(false), + }, nil +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error + // Set sets cgroup resource limits. 
+ Set(path string, r *configs.Resources) error +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +var legacySubsystems = []subsystem{ + &fs.CpusetGroup{}, + &fs.DevicesGroup{}, + &fs.MemoryGroup{}, + &fs.CpuGroup{}, + &fs.CpuacctGroup{}, + &fs.PidsGroup{}, + &fs.BlkioGroup{}, + &fs.HugetlbGroup{}, + &fs.PerfEventGroup{}, + &fs.FreezerGroup{}, + &fs.NetPrioGroup{}, + &fs.NetClsGroup{}, + &fs.NameGroup{GroupName: "name=systemd"}, + &fs.RdmaGroup{}, +} + +func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { + var properties []systemdDbus.Property + + deviceProperties, err := generateDeviceProperties(r) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + + if r.Memory != 0 { + properties = append(properties, + newProp("MemoryLimit", uint64(r.Memory))) + } + + if r.CpuShares != 0 { + properties = append(properties, + newProp("CPUShares", r.CpuShares)) + } + + addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod) + + if r.BlkioWeight != 0 { + properties = append(properties, + newProp("BlockIOWeight", uint64(r.BlkioWeight))) + } + + if r.PidsLimit > 0 || r.PidsLimit == -1 { + properties = append(properties, + newProp("TasksMax", uint64(r.PidsLimit))) + } + + err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) + if err != nil { + return nil, err + } + + return properties, nil +} + +// initPaths figures out and returns paths to cgroups. 
+func initPaths(c *configs.Cgroup) (map[string]string, error) { + slice := "system.slice" + if c.Parent != "" { + var err error + slice, err = ExpandSlice(c.Parent) + if err != nil { + return nil, err + } + } + + unit := getUnitName(c) + + paths := make(map[string]string) + for _, s := range legacySubsystems { + subsystemPath, err := getSubsystemPath(slice, unit, s.Name()) + if err != nil { + // Even if it's `not found` error, we'll return err + // because devices cgroup is hard requirement for + // container security. + if s.Name() == "devices" { + return nil, err + } + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return nil, err + } + paths[s.Name()] = subsystemPath + } + + // If systemd is using cgroups-hybrid mode then add the slice path of + // this container to the paths so the following process executed with + // "runc exec" joins that cgroup as well. + if cgroups.IsCgroup2HybridMode() { + // "" means cgroup-hybrid path + cgroupsHybridPath, err := getSubsystemPath(slice, unit, "") + if err != nil && cgroups.IsNotFound(err) { + return nil, err + } + paths[""] = cgroupsHybridPath + } + + return paths, nil +} + +func (m *legacyManager) Apply(pid int) error { + var ( + c = m.cgroups + unitName = getUnitName(c) + slice = "system.slice" + properties []systemdDbus.Property + ) + + m.mu.Lock() + defer m.mu.Unlock() + + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + if strings.HasSuffix(unitName, ".slice") { + // If we create a slice, the parent is defined via a Wants=. + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // Otherwise it's a scope, which we put into a Slice=. + properties = append(properties, systemdDbus.PropSlice(slice)) + // Assume scopes always support delegation (supported since systemd v218). 
+ properties = append(properties, newProp("Delegate", true)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("BlockIOAccounting", true), + newProp("TasksAccounting", true), + ) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + properties = append(properties, c.SystemdProps...) + + if err := startUnit(m.dbus, unitName, properties); err != nil { + return err + } + + if err := m.joinCgroups(pid); err != nil { + return err + } + + return nil +} + +func (m *legacyManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + + stopErr := stopUnit(m.dbus, getUnitName(m.cgroups)) + + // Both on success and on error, cleanup all the cgroups + // we are aware of, as some of them were created directly + // by Apply() and are not managed by systemd. 
+ if err := cgroups.RemovePaths(m.paths); err != nil && stopErr == nil { + return err + } + + return stopErr +} + +func (m *legacyManager) Path(subsys string) string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths[subsys] +} + +func (m *legacyManager) joinCgroups(pid int) error { + for _, sys := range legacySubsystems { + name := sys.Name() + switch name { + case "name=systemd": + // let systemd handle this + case "cpuset": + if path, ok := m.paths[name]; ok { + s := &fs.CpusetGroup{} + if err := s.ApplyDir(path, m.cgroups.Resources, pid); err != nil { + return err + } + } + default: + if path, ok := m.paths[name]; ok { + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + if err := cgroups.WriteCgroupProc(path, pid); err != nil { + return err + } + } + } + } + + return nil +} + +func getSubsystemPath(slice, unit, subsystem string) (string, error) { + mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem) + if err != nil { + return "", err + } + + initPath, err := cgroups.GetInitCgroup(subsystem) + if err != nil { + return "", err + } + // if pid 1 is systemd 226 or later, it will be in init.scope, not the root + initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope") + + return filepath.Join(mountpoint, initPath, slice, unit), nil +} + +func (m *legacyManager) Freeze(state configs.FreezerState) error { + err := m.doFreeze(state) + if err == nil { + m.cgroups.Resources.Freezer = state + } + return err +} + +// doFreeze is the same as Freeze but without +// changing the m.cgroups.Resources.Frozen field. 
+func (m *legacyManager) doFreeze(state configs.FreezerState) error { + path, ok := m.paths["freezer"] + if !ok { + return errSubsystemDoesNotExist + } + freezer := &fs.FreezerGroup{} + resources := &configs.Resources{Freezer: state} + return freezer.Set(path, resources) +} + +func (m *legacyManager) GetPids() ([]int, error) { + path, ok := m.paths["devices"] + if !ok { + return nil, errSubsystemDoesNotExist + } + return cgroups.GetPids(path) +} + +func (m *legacyManager) GetAllPids() ([]int, error) { + path, ok := m.paths["devices"] + if !ok { + return nil, errSubsystemDoesNotExist + } + return cgroups.GetAllPids(path) +} + +func (m *legacyManager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for _, sys := range legacySubsystems { + path := m.paths[sys.Name()] + if path == "" { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + + return stats, nil +} + +// freezeBeforeSet answers whether there is a need to freeze the cgroup before +// applying its systemd unit properties, and thaw after, while avoiding +// unnecessary freezer state changes. +// +// The reason why we have to freeze is that systemd's application of device +// rules is done disruptively, resulting in spurious errors to common devices +// (unlike our fs driver, they will happily write deny-all rules to running +// containers). So we have to freeze the container to avoid the container get +// an occasional "permission denied" error. +func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) (needsFreeze, needsThaw bool, err error) { + // Special case for SkipDevices, as used by Kubernetes to create pod + // cgroups with allow-all device policy). + if r.SkipDevices { + if r.SkipFreezeOnSet { + // Both needsFreeze and needsThaw are false. 
+ return + } + + // No need to freeze if SkipDevices is set, and either + // (1) systemd unit does not (yet) exist, or + // (2) it has DevicePolicy=auto and empty DeviceAllow list. + // + // Interestingly, (1) and (2) are the same here because + // a non-existent unit returns default properties, + // and settings in (2) are the defaults. + // + // Do not return errors from getUnitTypeProperty, as they alone + // should not prevent Set from working. + + unitType := getUnitType(unitName) + + devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy") + if e == nil && devPolicy.Value == dbus.MakeVariant("auto") { + devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow") + if e == nil { + if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 { + needsFreeze = false + needsThaw = false + return + } + } + } + } + + needsFreeze = true + needsThaw = true + + // Check the current freezer state. + freezerState, err := m.GetFreezerState() + if err != nil { + return + } + if freezerState == configs.Frozen { + // Already frozen, and should stay frozen. + needsFreeze = false + needsThaw = false + } + + if r.Freezer == configs.Frozen { + // Will be frozen anyway -- no need to thaw. + needsThaw = false + } + return +} + +func (m *legacyManager) Set(r *configs.Resources) error { + if r == nil { + return nil + } + if r.Unified != nil { + return cgroups.ErrV1NoUnified + } + properties, err := genV1ResourcesProperties(r, m.dbus) + if err != nil { + return err + } + + unitName := getUnitName(m.cgroups) + needsFreeze, needsThaw, err := m.freezeBeforeSet(unitName, r) + if err != nil { + return err + } + + if needsFreeze { + if err := m.doFreeze(configs.Frozen); err != nil { + // If freezer cgroup isn't supported, we just warn about it. + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + } + } + setErr := setUnitProperties(m.dbus, unitName, properties...) 
+ if needsThaw { + if err := m.doFreeze(configs.Thawed); err != nil { + logrus.Infof("thaw container after SetUnitProperties failed: %v", err) + } + } + if setErr != nil { + return setErr + } + + for _, sys := range legacySubsystems { + // Get the subsystem path, but don't error out for not found cgroups. + path, ok := m.paths[sys.Name()] + if !ok { + continue + } + if err := sys.Set(path, r); err != nil { + return err + } + } + + return nil +} + +func (m *legacyManager) GetPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths +} + +func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) { + return m.cgroups, nil +} + +func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) { + path, ok := m.paths["freezer"] + if !ok { + return configs.Undefined, nil + } + freezer := &fs.FreezerGroup{} + return freezer.GetState(path) +} + +func (m *legacyManager) Exists() bool { + return cgroups.PathExists(m.Path("devices")) +} + +func (m *legacyManager) OOMKillCount() (uint64, error) { + return fs.OOMKillCount(m.Path("memory")) +} diff --git a/libcontainer/cgroups/systemd/v1_test.go b/libcontainer/cgroups/systemd/v1_test.go new file mode 100644 index 0000000..7c7ef55 --- /dev/null +++ b/libcontainer/cgroups/systemd/v1_test.go @@ -0,0 +1,220 @@ +package systemd + +import ( + "os" + "os/exec" + "strings" + "testing" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func TestFreezeBeforeSet(t *testing.T) { + requireV1(t) + + testCases := []struct { + desc string + // Test input. + cg *configs.Cgroup + preFreeze bool + // Expected values. + // Before unit creation (Apply). + freeze0, thaw0 bool + // After unit creation. + freeze1, thaw1 bool + }{ + { + // A slice with SkipDevices. 
+ desc: "slice,skip-devices", + cg: &configs.Cgroup{ + Name: "system-runc_test_freeze_1.slice", + Parent: "system.slice", + Resources: &configs.Resources{ + SkipDevices: true, + }, + }, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A scope with SkipDevices. Not a realistic scenario with runc + // (as container can't have SkipDevices == true), but possible + // for a standalone cgroup manager. + desc: "scope,skip-devices", + cg: &configs.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze2", + Parent: "system.slice", + Resources: &configs.Resources{ + SkipDevices: true, + }, + }, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A slice that is about to be frozen in Set. + desc: "slice,will-freeze", + cg: &configs.Cgroup{ + Name: "system-runc_test_freeze_3.slice", + Parent: "system.slice", + Resources: &configs.Resources{ + Freezer: configs.Frozen, + }, + }, + // Expected. + freeze0: true, + thaw0: false, + freeze1: true, + thaw1: false, + }, + { + // A pre-frozen slice that should stay frozen. + desc: "slice,pre-frozen,will-freeze", + cg: &configs.Cgroup{ + Name: "system-runc_test_freeze_4.slice", + Parent: "system.slice", + Resources: &configs.Resources{ + Freezer: configs.Frozen, + }, + }, + preFreeze: true, + // Expected. + freeze0: true, // not actually frozen yet. + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A pre-frozen scope with skip devices set. + desc: "scope,pre-frozen,skip-devices", + cg: &configs.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze5", + Parent: "system.slice", + Resources: &configs.Resources{ + SkipDevices: true, + }, + }, + preFreeze: true, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A pre-frozen scope which will be thawed. 
+ desc: "scope,pre-frozen", + cg: &configs.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze6", + Parent: "system.slice", + Resources: &configs.Resources{}, + }, + preFreeze: true, + // Expected. + freeze0: true, // not actually frozen yet. + thaw0: true, + freeze1: false, + thaw1: false, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.desc, func(t *testing.T) { + m, err := NewLegacyManager(tc.cg, nil) + if err != nil { + t.Fatal(err) + } + defer m.Destroy() //nolint:errcheck + lm := m.(*legacyManager) + + // Checks for a non-existent unit. + freeze, thaw, err := lm.freezeBeforeSet(getUnitName(tc.cg), tc.cg.Resources) + if err != nil { + t.Fatal(err) + } + if freeze != tc.freeze0 || thaw != tc.thaw0 { + t.Errorf("before Apply (non-existent unit): expected freeze: %v, thaw: %v, got freeze: %v, thaw: %v", + tc.freeze0, tc.thaw0, freeze, thaw) + } + + // Create systemd unit. + pid := -1 + if strings.HasSuffix(getUnitName(tc.cg), ".scope") { + // Scopes require a process inside. + cmd := exec.Command("bash", "-c", "sleep 1m") + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + pid = cmd.Process.Pid + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + } + if err := m.Apply(pid); err != nil { + t.Fatal(err) + } + if tc.preFreeze { + if err := m.Freeze(configs.Frozen); err != nil { + t.Error(err) + return // no more checks + } + } + freeze, thaw, err = lm.freezeBeforeSet(getUnitName(tc.cg), tc.cg.Resources) + if err != nil { + t.Error(err) + return // no more checks + } + if freeze != tc.freeze1 || thaw != tc.thaw1 { + t.Errorf("expected freeze: %v, thaw: %v, got freeze: %v, thaw: %v", + tc.freeze1, tc.thaw1, freeze, thaw) + } + // Destroy() timeouts on a frozen container, so we need to thaw it. + if tc.preFreeze { + if err := m.Freeze(configs.Thawed); err != nil { + t.Error(err) + } + } + // Destroy() does not kill processes in cgroup, so we should. 
+ if pid != -1 { + if err = unix.Kill(pid, unix.SIGKILL); err != nil { + t.Errorf("unable to kill pid %d: %s", pid, err) + } + } + // Not really needed, but may help catch some bugs. + if err := m.Destroy(); err != nil { + t.Errorf("destroy: %s", err) + } + }) + } +} + +// requireV1 skips the test unless a set of requirements (cgroup v1, +// systemd, root) is met. +func requireV1(t *testing.T) { + t.Helper() + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("Test requires cgroup v1.") + } + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } +} diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go new file mode 100644 index 0000000..c31f0ec --- /dev/null +++ b/libcontainer/cgroups/systemd/v2.go @@ -0,0 +1,462 @@ +package systemd + +import ( + "bufio" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type unifiedManager struct { + mu sync.Mutex + cgroups *configs.Cgroup + // path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" + path string + dbus *dbusConnManager + fsMgr cgroups.Manager +} + +func NewUnifiedManager(config *configs.Cgroup, path string) (cgroups.Manager, error) { + m := &unifiedManager{ + cgroups: config, + path: path, + dbus: newDbusConnManager(config.Rootless), + } + if err := m.initPath(); err != nil { + return nil, err + } + + fsMgr, err := fs2.NewManager(config, m.path) + if err != nil { + return nil, err + } + m.fsMgr = fsMgr + + return m, nil +} + +// unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified +// key/value map (where key is cgroupfs file name) to systemd 
unit properties. +// This is on a best-effort basis, so the properties that are not known +// (to this function and/or systemd) are ignored (but logged with "debug" +// log level). +// +// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt +// +// For the list of systemd unit properties, see systemd.resource-control(5). +func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) { + var err error + + for k, v := range res { + if strings.Contains(k, "/") { + return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k) + } + sk := strings.SplitN(k, ".", 2) + if len(sk) != 2 { + return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) + } + // Kernel is quite forgiving to extra whitespace + // around the value, and so should we. + v = strings.TrimSpace(v) + // Please keep cases in alphabetical order. + switch k { + case "cpu.max": + // value: quota [period] + quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set + period := defCPUQuotaPeriod + sv := strings.Fields(v) + if len(sv) < 1 || len(sv) > 2 { + return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v) + } + // quota + if sv[0] != "max" { + quota, err = strconv.ParseInt(sv[0], 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err) + } + } + // period + if len(sv) == 2 { + period, err = strconv.ParseUint(sv[1], 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err) + } + } + addCpuQuota(cm, &props, quota, period) + + case "cpu.weight": + num, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + props = append(props, + newProp("CPUWeight", num)) + + case "cpuset.cpus", "cpuset.mems": + bits, err := RangeToBits(v) + if err != nil { + 
return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err) + } + m := map[string]string{ + "cpuset.cpus": "AllowedCPUs", + "cpuset.mems": "AllowedMemoryNodes", + } + // systemd only supports these properties since v244 + sdVer := systemdVersion(cm) + if sdVer >= 244 { + props = append(props, + newProp(m[k], bits)) + } else { + logrus.Debugf("systemd v%d is too old to support %s"+ + " (setting will still be applied to cgroupfs)", + sdVer, m[k]) + } + + case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max": + num := uint64(math.MaxUint64) + if v != "max" { + num, err = strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + } + m := map[string]string{ + "memory.high": "MemoryHigh", + "memory.low": "MemoryLow", + "memory.min": "MemoryMin", + "memory.max": "MemoryMax", + "memory.swap.max": "MemorySwapMax", + } + props = append(props, + newProp(m[k], num)) + + case "pids.max": + num := uint64(math.MaxUint64) + if v != "max" { + var err error + num, err = strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + } + props = append(props, + newProp("TasksMax", num)) + + case "memory.oom.group": + // Setting this to 1 is roughly equivalent to OOMPolicy=kill + // (as per systemd.service(5) and + // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html), + // but it's not clear what to do if it is unset or set + // to 0 in runc update, as there are two other possible + // values for OOMPolicy (continue/stop). + fallthrough + + default: + // Ignore the unknown resource here -- will still be + // applied in Set which calls fs2.Set. 
+ logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v) + } + } + + return props, nil +} + +func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { + var properties []systemdDbus.Property + + // NOTE: This is of questionable correctness because we insert our own + // devices eBPF program later. Two programs with identical rules + // aren't the end of the world, but it is a bit concerning. However + // it's unclear if systemd removes all eBPF programs attached when + // doing SetUnitProperties... + deviceProperties, err := generateDeviceProperties(r) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + + if r.Memory != 0 { + properties = append(properties, + newProp("MemoryMax", uint64(r.Memory))) + } + if r.MemoryReservation != 0 { + properties = append(properties, + newProp("MemoryLow", uint64(r.MemoryReservation))) + } + + swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) + if err != nil { + return nil, err + } + if swap != 0 { + properties = append(properties, + newProp("MemorySwapMax", uint64(swap))) + } + + if r.CpuWeight != 0 { + properties = append(properties, + newProp("CPUWeight", r.CpuWeight)) + } + + addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod) + + if r.PidsLimit > 0 || r.PidsLimit == -1 { + properties = append(properties, + newProp("TasksMax", uint64(r.PidsLimit))) + } + + err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) + if err != nil { + return nil, err + } + + // ignore r.KernelMemory + + // convert Resources.Unified map to systemd properties + if r.Unified != nil { + unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified) + if err != nil { + return nil, err + } + properties = append(properties, unifiedProps...) 
+ } + + return properties, nil +} + +func (m *unifiedManager) Apply(pid int) error { + var ( + c = m.cgroups + unitName = getUnitName(c) + properties []systemdDbus.Property + ) + + slice := "system.slice" + if m.cgroups.Rootless { + slice = "user.slice" + } + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + if strings.HasSuffix(unitName, ".slice") { + // If we create a slice, the parent is defined via a Wants=. + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // Otherwise it's a scope, which we put into a Slice=. + properties = append(properties, systemdDbus.PropSlice(slice)) + // Assume scopes always support delegation (supported since systemd v218). + properties = append(properties, newProp("Delegate", true)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("IOAccounting", true), + newProp("TasksAccounting", true), + ) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + properties = append(properties, c.SystemdProps...) 
+ + if err := startUnit(m.dbus, unitName, properties); err != nil { + return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err) + } + + if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil { + return err + } + + if c.OwnerUID != nil { + filesToChown, err := cgroupFilesToChown() + if err != nil { + return err + } + + for _, v := range filesToChown { + err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1) + if err != nil { + return err + } + } + } + + return nil +} + +// The kernel exposes a list of files that should be chowned to the delegate +// uid in /sys/kernel/cgroup/delegate. If the file is not present +// (Linux < 4.15), use the initial values mentioned in cgroups(7). +func cgroupFilesToChown() ([]string, error) { + filesToChown := []string{"."} // the directory itself must be chowned + const cgroupDelegateFile = "/sys/kernel/cgroup/delegate" + f, err := os.Open(cgroupDelegateFile) + if err == nil { + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + filesToChown = append(filesToChown, scanner.Text()) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err) + } + } else { + filesToChown = append(filesToChown, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads") + } + return filesToChown, nil +} + +func (m *unifiedManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + + unitName := getUnitName(m.cgroups) + if err := stopUnit(m.dbus, unitName); err != nil { + return err + } + + // systemd 239 do not remove sub-cgroups. + err := m.fsMgr.Destroy() + // fsMgr.Destroy has handled ErrNotExist + if err != nil { + return err + } + + return nil +} + +func (m *unifiedManager) Path(_ string) string { + return m.path +} + +// getSliceFull value is used in initPath. +// The value is incompatible with systemdDbus.PropSlice. 
+func (m *unifiedManager) getSliceFull() (string, error) { + c := m.cgroups + slice := "system.slice" + if c.Rootless { + slice = "user.slice" + } + if c.Parent != "" { + var err error + slice, err = ExpandSlice(c.Parent) + if err != nil { + return "", err + } + } + + if c.Rootless { + // managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service". + managerCG, err := getManagerProperty(m.dbus, "ControlGroup") + if err != nil { + return "", err + } + slice = filepath.Join(managerCG, slice) + } + + // an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice" + // NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified. + return slice, nil +} + +func (m *unifiedManager) initPath() error { + if m.path != "" { + return nil + } + + sliceFull, err := m.getSliceFull() + if err != nil { + return err + } + + c := m.cgroups + path := filepath.Join(sliceFull, getUnitName(c)) + path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path) + if err != nil { + return err + } + + // an example of the final path in rootless: + // "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope" + m.path = path + + return nil +} + +func (m *unifiedManager) Freeze(state configs.FreezerState) error { + return m.fsMgr.Freeze(state) +} + +func (m *unifiedManager) GetPids() ([]int, error) { + return cgroups.GetPids(m.path) +} + +func (m *unifiedManager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.path) +} + +func (m *unifiedManager) GetStats() (*cgroups.Stats, error) { + return m.fsMgr.GetStats() +} + +func (m *unifiedManager) Set(r *configs.Resources) error { + if r == nil { + return nil + } + properties, err := genV2ResourcesProperties(r, m.dbus) + if err != nil { + return err + } + + if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != 
nil { + return fmt.Errorf("unable to set unit properties: %w", err) + } + + return m.fsMgr.Set(r) +} + +func (m *unifiedManager) GetPaths() map[string]string { + paths := make(map[string]string, 1) + paths[""] = m.path + return paths +} + +func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) { + return m.cgroups, nil +} + +func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) { + return m.fsMgr.GetFreezerState() +} + +func (m *unifiedManager) Exists() bool { + return cgroups.PathExists(m.path) +} + +func (m *unifiedManager) OOMKillCount() (uint64, error) { + return m.fsMgr.OOMKillCount() +} diff --git a/libcontainer/cgroups/utils.go b/libcontainer/cgroups/utils.go index dbcc58f..13ebf52 100644 --- a/libcontainer/cgroups/utils.go +++ b/libcontainer/cgroups/utils.go @@ -1,179 +1,70 @@ -// +build linux - package cgroups import ( "bufio" + "errors" "fmt" "io" - "io/ioutil" "os" "path/filepath" "strconv" "strings" "sync" - "syscall" "time" - units "github.com/docker/go-units" + "github.com/opencontainers/runc/libcontainer/userns" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) const ( - CgroupNamePrefix = "name=" CgroupProcesses = "cgroup.procs" unifiedMountpoint = "/sys/fs/cgroup" + hybridMountpoint = "/sys/fs/cgroup/unified" ) var ( isUnifiedOnce sync.Once isUnified bool + isHybridOnce sync.Once + isHybrid bool ) -// HugePageSizeUnitList is a list of the units used by the linux kernel when -// naming the HugePage control files. -// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt -// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed, -// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393 -var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"} - // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. 
func IsCgroup2UnifiedMode() bool { isUnifiedOnce.Do(func() { - var st syscall.Statfs_t - if err := syscall.Statfs(unifiedMountpoint, &st); err != nil { - panic("cannot statfs cgroup root") + var st unix.Statfs_t + err := unix.Statfs(unifiedMountpoint, &st) + if err != nil { + if os.IsNotExist(err) && userns.RunningInUserNS() { + // ignore the "not found" error if running in userns + logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint) + isUnified = false + return + } + panic(fmt.Sprintf("cannot statfs cgroup root: %s", err)) } isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC }) return isUnified } -// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt -func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { - if IsCgroup2UnifiedMode() { - return unifiedMountpoint, nil - } - mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) - return mnt, err -} - -func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { - // We are not using mount.GetMounts() because it's super-inefficient, - // parsing it directly sped up x10 times because of not using Sscanf. - // It was one of two major performance drawbacks in container start. 
- if !isSubsystemAvailable(subsystem) { - return "", "", NewNotFoundError(subsystem) - } - - f, err := os.Open("/proc/self/mountinfo") - if err != nil { - return "", "", err - } - defer f.Close() - - if IsCgroup2UnifiedMode() { - subsystem = "" - } - - return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem) -} - -func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) { - scanner := bufio.NewScanner(reader) - for scanner.Scan() { - txt := scanner.Text() - fields := strings.Fields(txt) - if len(fields) < 9 { - continue - } - if strings.HasPrefix(fields[4], cgroupPath) { - for _, opt := range strings.Split(fields[len(fields)-1], ",") { - if (subsystem == "" && fields[9] == "cgroup2") || opt == subsystem { - return fields[4], fields[3], nil - } - } - } - } - if err := scanner.Err(); err != nil { - return "", "", err - } - - return "", "", NewNotFoundError(subsystem) -} - -func isSubsystemAvailable(subsystem string) bool { - if IsCgroup2UnifiedMode() { - controllers, err := GetAllSubsystems() +// IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode. 
+func IsCgroup2HybridMode() bool { + isHybridOnce.Do(func() { + var st unix.Statfs_t + err := unix.Statfs(hybridMountpoint, &st) if err != nil { - return false - } - for _, c := range controllers { - if c == subsystem { - return true + if os.IsNotExist(err) { + // ignore the "not found" error + isHybrid = false + return } + panic(fmt.Sprintf("cannot statfs cgroup root: %s", err)) } - return false - } - - cgroups, err := ParseCgroupFile("/proc/self/cgroup") - if err != nil { - return false - } - _, avail := cgroups[subsystem] - return avail -} - -func GetClosestMountpointAncestor(dir, mountinfo string) string { - deepestMountPoint := "" - for _, mountInfoEntry := range strings.Split(mountinfo, "\n") { - mountInfoParts := strings.Fields(mountInfoEntry) - if len(mountInfoParts) < 5 { - continue - } - mountPoint := mountInfoParts[4] - if strings.HasPrefix(mountPoint, deepestMountPoint) && strings.HasPrefix(dir, mountPoint) { - deepestMountPoint = mountPoint - } - } - return deepestMountPoint -} - -func FindCgroupMountpointDir() (string, error) { - f, err := os.Open("/proc/self/mountinfo") - if err != nil { - return "", err - } - defer f.Close() - - scanner := bufio.NewScanner(f) - for scanner.Scan() { - text := scanner.Text() - fields := strings.Split(text, " ") - // Safe as mountinfo encodes mountpoints with spaces as \040. - index := strings.Index(text, " - ") - postSeparatorFields := strings.Fields(text[index+3:]) - numPostFields := len(postSeparatorFields) - - // This is an error as we can't detect if the mount is for "cgroup" - if numPostFields == 0 { - return "", fmt.Errorf("Found no fields post '-' in %q", text) - } - - if postSeparatorFields[0] == "cgroup" || postSeparatorFields[0] == "cgroup2" { - // Check that the mount is properly formatted. 
- if numPostFields < 3 { - return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) - } - - return filepath.Dir(fields[4]), nil - } - } - if err := scanner.Err(); err != nil { - return "", err - } - - return "", NewNotFoundError("cgroup") + isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC + }) + return isHybrid } type Mount struct { @@ -182,58 +73,13 @@ type Mount struct { Subsystems []string } -func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { - if len(m.Subsystems) == 0 { - return "", fmt.Errorf("no subsystem for mount") - } - - return getControllerPath(m.Subsystems[0], cgroups) -} - -func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) { - res := make([]Mount, 0, len(ss)) - scanner := bufio.NewScanner(mi) - numFound := 0 - for scanner.Scan() && numFound < len(ss) { - txt := scanner.Text() - sepIdx := strings.Index(txt, " - ") - if sepIdx == -1 { - return nil, fmt.Errorf("invalid mountinfo format") - } - if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" { - continue - } - fields := strings.Split(txt, " ") - m := Mount{ - Mountpoint: fields[4], - Root: fields[3], - } - for _, opt := range strings.Split(fields[len(fields)-1], ",") { - seen, known := ss[opt] - if !known || (!all && seen) { - continue - } - ss[opt] = true - if strings.HasPrefix(opt, CgroupNamePrefix) { - opt = opt[len(CgroupNamePrefix):] - } - m.Subsystems = append(m.Subsystems, opt) - numFound++ - } - if len(m.Subsystems) > 0 || all { - res = append(res, m) - } - } - if err := scanner.Err(); err != nil { - return nil, err - } - return res, nil -} - // GetCgroupMounts returns the mounts for the cgroup subsystems. // all indicates whether to return just the first instance or all the mounts. +// This function should not be used from cgroupv2 code, as in this case +// all the controllers are available under the constant unifiedMountpoint. 
func GetCgroupMounts(all bool) ([]Mount, error) { if IsCgroup2UnifiedMode() { + // TODO: remove cgroupv2 case once all external users are converted availableControllers, err := GetAllSubsystems() if err != nil { return nil, err @@ -246,22 +92,7 @@ func GetCgroupMounts(all bool) ([]Mount, error) { return []Mount{m}, nil } - f, err := os.Open("/proc/self/mountinfo") - if err != nil { - return nil, err - } - defer f.Close() - - allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") - if err != nil { - return nil, err - } - - allMap := make(map[string]bool) - for s := range allSubsystems { - allMap[s] = false - } - return getCgroupMountsHelper(allMap, f, all) + return getCgroupMountsV1(all) } // GetAllSubsystems returns all the cgroup subsystems supported by the kernel @@ -274,11 +105,11 @@ func GetAllSubsystems() ([]string, error) { // - freezer: implemented in kernel 5.2 // We assume these are always available, as it is hard to detect availability. pseudo := []string{"devices", "freezer"} - data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers") + data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers") if err != nil { return nil, err } - subsystems := append(pseudo, strings.Fields(string(data))...) + subsystems := append(pseudo, strings.Fields(data)...) return subsystems, nil } f, err := os.Open("/proc/cgroups") @@ -305,61 +136,8 @@ func GetAllSubsystems() ([]string, error) { return subsystems, nil } -// GetOwnCgroup returns the relative path to the cgroup docker is running in. 
-func GetOwnCgroup(subsystem string) (string, error) { - cgroups, err := ParseCgroupFile("/proc/self/cgroup") - if err != nil { - return "", err - } - - return getControllerPath(subsystem, cgroups) -} - -func GetOwnCgroupPath(subsystem string) (string, error) { - cgroup, err := GetOwnCgroup(subsystem) - if err != nil { - return "", err - } - - return getCgroupPathHelper(subsystem, cgroup) -} - -func GetInitCgroup(subsystem string) (string, error) { - cgroups, err := ParseCgroupFile("/proc/1/cgroup") - if err != nil { - return "", err - } - - return getControllerPath(subsystem, cgroups) -} - -func GetInitCgroupPath(subsystem string) (string, error) { - cgroup, err := GetInitCgroup(subsystem) - if err != nil { - return "", err - } - - return getCgroupPathHelper(subsystem, cgroup) -} - -func getCgroupPathHelper(subsystem, cgroup string) (string, error) { - mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) - if err != nil { - return "", err - } - - // This is needed for nested containers, because in /proc/self/cgroup we - // see paths from host, which don't exist in container. - relCgroup, err := filepath.Rel(root, cgroup) - if err != nil { - return "", err - } - - return filepath.Join(mnt, relCgroup), nil -} - func readProcsFile(dir string) ([]int, error) { - f, err := os.Open(filepath.Join(dir, CgroupProcesses)) + f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY) if err != nil { return nil, err } @@ -379,11 +157,18 @@ func readProcsFile(dir string) ([]int, error) { out = append(out, pid) } } - return out, nil + return out, s.Err() } -// ParseCgroupFile parses the given cgroup file, typically from -// /proc//cgroup, into a map of subgroups to cgroup names. +// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup +// or /proc//cgroup, into a map of subsystems to cgroup paths, e.g. +// "cpu": "/user.slice/user-1000.slice" +// "pids": "/user.slice/user-1000.slice" +// etc. 
+// +// Note that for cgroup v2 unified hierarchy, there are no per-controller +// cgroup paths, so the resulting map will have a single element where the key +// is empty string ("") and the value is the cgroup path the is in. func ParseCgroupFile(path string) (map[string]string, error) { f, err := os.Open(path) if err != nil { @@ -423,22 +208,6 @@ func parseCgroupFromReader(r io.Reader) (map[string]string, error) { return cgroups, nil } -func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { - if IsCgroup2UnifiedMode() { - return "/", nil - } - - if p, ok := cgroups[subsystem]; ok { - return p, nil - } - - if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { - return p, nil - } - - return "", NewNotFoundError(subsystem) -} - func PathExists(path string) bool { if _, err := os.Stat(path); err != nil { return false @@ -457,20 +226,65 @@ func EnterPid(cgroupPaths map[string]string, pid int) error { return nil } +func rmdir(path string) error { + err := unix.Rmdir(path) + if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare + return nil + } + return &os.PathError{Op: "rmdir", Path: path, Err: err} +} + +// RemovePath aims to remove cgroup path. It does so recursively, +// by removing any subdirectories (sub-cgroups) first. +func RemovePath(path string) error { + // try the fast path first + if err := rmdir(path); err == nil { + return nil + } + + infos, err := os.ReadDir(path) + if err != nil { + if os.IsNotExist(err) { + err = nil + } + return err + } + for _, info := range infos { + if info.IsDir() { + // We should remove subcgroups dir first + if err = RemovePath(filepath.Join(path, info.Name())); err != nil { + break + } + } + } + if err == nil { + err = rmdir(path) + } + return err +} + // RemovePaths iterates over the provided paths removing them. // We trying to remove all paths five times with increasing delay between tries. 
// If after all there are not removed cgroups - appropriate error will be // returned. func RemovePaths(paths map[string]string) (err error) { + const retries = 5 delay := 10 * time.Millisecond - for i := 0; i < 5; i++ { + for i := 0; i < retries; i++ { if i != 0 { time.Sleep(delay) delay *= 2 } for s, p := range paths { - os.RemoveAll(p) - // TODO: here probably should be logging + if err := RemovePath(p); err != nil { + // do not log intermediate iterations + switch i { + case 0: + logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)") + case retries - 1: + logrus.WithError(err).Error("Failed to remove cgroup") + } + } _, err := os.Stat(p) // We need this strange way of checking cgroups existence because // RemoveAll almost always returns error, even on already removed @@ -480,65 +294,88 @@ func RemovePaths(paths map[string]string) (err error) { } } if len(paths) == 0 { + //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506 + paths = make(map[string]string) return nil } } return fmt.Errorf("Failed to remove paths: %v", paths) } -func GetHugePageSize() ([]string, error) { - files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages") - if err != nil { - return []string{}, err - } - var fileNames []string - for _, st := range files { - fileNames = append(fileNames, st.Name()) - } - return getHugePageSizeFromFilenames(fileNames) +var ( + hugePageSizes []string + initHPSOnce sync.Once +) + +func HugePageSizes() []string { + initHPSOnce.Do(func() { + dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return + } + files, err := dir.Readdirnames(0) + dir.Close() + if err != nil { + return + } + + hugePageSizes, err = getHugePageSizeFromFilenames(files) + if err != nil { + logrus.Warn("HugePageSizes: ", err) + } + }) + + return hugePageSizes } func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) { - var pageSizes []string - for _, fileName := range 
fileNames { - nameArray := strings.Split(fileName, "-") - pageSize, err := units.RAMInBytes(nameArray[1]) - if err != nil { - return []string{}, err + pageSizes := make([]string, 0, len(fileNames)) + var warn error + + for _, file := range fileNames { + // example: hugepages-1048576kB + val := strings.TrimPrefix(file, "hugepages-") + if len(val) == len(file) { + // Unexpected file name: no prefix found, ignore it. + continue } - sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList) - pageSizes = append(pageSizes, sizeString) + // The suffix is always "kB" (as of Linux 5.13). If we find + // something else, produce an error but keep going. + eLen := len(val) - 2 + val = strings.TrimSuffix(val, "kB") + if len(val) != eLen { + // Highly unlikely. + if warn == nil { + warn = errors.New(file + `: invalid suffix (expected "kB")`) + } + continue + } + size, err := strconv.Atoi(val) + if err != nil { + // Highly unlikely. + if warn == nil { + warn = fmt.Errorf("%s: %w", file, err) + } + continue + } + // Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574 + // but in our case the size is in KB already. + if size >= (1 << 20) { + val = strconv.Itoa(size>>20) + "GB" + } else if size >= (1 << 10) { + val = strconv.Itoa(size>>10) + "MB" + } else { + val += "KB" + } + pageSizes = append(pageSizes, val) } - return pageSizes, nil + return pageSizes, warn } // GetPids returns all pids, that were added to cgroup at path. -func GetPids(path string) ([]int, error) { - return readProcsFile(path) -} - -// GetAllPids returns all pids, that were added to cgroup at path and to all its -// subcgroups. 
-func GetAllPids(path string) ([]int, error) { - var pids []int - // collect pids from all sub-cgroups - err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error { - dir, file := filepath.Split(p) - if file != CgroupProcesses { - return nil - } - if iErr != nil { - return iErr - } - cPids, err := readProcsFile(dir) - if err != nil { - return err - } - pids = append(pids, cPids...) - return nil - }) - return pids, err +func GetPids(dir string) ([]int, error) { + return readProcsFile(dir) } // WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file @@ -554,35 +391,77 @@ func WriteCgroupProc(dir string, pid int) error { return nil } - cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700) + file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY) if err != nil { - return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + return fmt.Errorf("failed to write %v: %w", pid, err) } - defer cgroupProcessesFile.Close() + defer file.Close() for i := 0; i < 5; i++ { - _, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid)) + _, err = file.WriteString(strconv.Itoa(pid)) if err == nil { return nil } // EINVAL might mean that the task being added to cgroup.procs is in state // TASK_NEW. We should attempt to do so again. 
- if isEINVAL(err) { + if errors.Is(err, unix.EINVAL) { time.Sleep(30 * time.Millisecond) continue } - return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + return fmt.Errorf("failed to write %v: %w", pid, err) } return err } -func isEINVAL(err error) bool { - switch err := err.(type) { - case *os.PathError: - return err.Err == unix.EINVAL - default: - return false +// Since the OCI spec is designed for cgroup v1, in some cases +// there is need to convert from the cgroup v1 configuration to cgroup v2 +// the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142) +// convert from [2-262144] to [1-10000] +// 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)" +func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 { + if cpuShares == 0 { + return 0 } + return (1 + ((cpuShares-2)*9999)/262142) +} + +// ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec +// for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap +// is defined as memory+swap combined, while in cgroup v2 swap is a separate value. +func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) { + // for compatibility with cgroup1 controller, set swap to unlimited in + // case the memory is set to unlimited, and swap is not explicitly set, + // treating the request as "set both memory and swap to unlimited". 
+ if memory == -1 && memorySwap == 0 { + return -1, nil + } + if memorySwap == -1 || memorySwap == 0 { + // -1 is "max", 0 is "unset", so treat as is + return memorySwap, nil + } + // sanity checks + if memory == 0 || memory == -1 { + return 0, errors.New("unable to set swap limit without memory limit") + } + if memory < 0 { + return 0, fmt.Errorf("invalid memory value: %d", memory) + } + if memorySwap < memory { + return 0, errors.New("memory+swap limit should be >= memory limit") + } + + return memorySwap - memory, nil +} + +// Since the OCI spec is designed for cgroup v1, in some cases +// there is need to convert from the cgroup v1 configuration to cgroup v2 +// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990) +// convert linearly from [10-1000] to [1-10000] +func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 { + if blkIoWeight == 0 { + return 0 + } + return 1 + (uint64(blkIoWeight)-10)*9999/990 } diff --git a/libcontainer/cgroups/utils_test.go b/libcontainer/cgroups/utils_test.go index 3214b9d..c9feb84 100644 --- a/libcontainer/cgroups/utils_test.go +++ b/libcontainer/cgroups/utils_test.go @@ -1,14 +1,13 @@ -// +build linux - package cgroups import ( "bytes" "errors" - "fmt" "reflect" "strings" "testing" + + "github.com/moby/sys/mountinfo" ) const fedoraMountinfo = `15 35 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw @@ -180,8 +179,11 @@ const cgroup2Mountinfo = `18 64 0:18 / /sys rw,nosuid,nodev,noexec,relatime shar func TestGetCgroupMounts(t *testing.T) { type testData struct { - mountInfo string - root string + mountInfo string + root string + // all is the total number of records expected with all=true, + // or 0 for no extra records expected (most cases). 
+ all int subsystems map[string]bool } testTable := []testData{ @@ -189,51 +191,64 @@ func TestGetCgroupMounts(t *testing.T) { mountInfo: fedoraMountinfo, root: "/", subsystems: map[string]bool{ - "cpuset": false, - "cpu": false, - "cpuacct": false, - "memory": false, - "devices": false, - "freezer": false, - "net_cls": false, - "blkio": false, - "perf_event": false, - "hugetlb": false, + "name=systemd": false, + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "hugetlb": false, }, }, { mountInfo: systemdMountinfo, root: "/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope", subsystems: map[string]bool{ - "cpuset": false, - "cpu": false, - "cpuacct": false, - "memory": false, - "devices": false, - "freezer": false, - "net_cls": false, - "blkio": false, - "perf_event": false, + "name=systemd": false, + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "net_prio": false, + "blkio": false, + "perf_event": false, }, }, { mountInfo: bedrockMountinfo, root: "/", + all: 50, subsystems: map[string]bool{ - "cpuset": false, - "cpu": false, - "cpuacct": false, - "memory": false, - "devices": false, - "freezer": false, - "net_cls": false, - "blkio": false, - "perf_event": false, + "name=systemd": false, + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "net_prio": false, + "blkio": false, + "perf_event": false, + "pids": false, }, }, } for _, td := range testTable { - mi := bytes.NewBufferString(td.mountInfo) + mi, err := mountinfo.GetMountsFromReader( + bytes.NewBufferString(td.mountInfo), + mountinfo.FSTypeFilter("cgroup"), + ) + if err != nil { + t.Fatal(err) + } cgMounts, err := getCgroupMountsHelper(td.subsystems, mi, false) if err != nil { 
t.Fatal(err) @@ -245,6 +260,7 @@ func TestGetCgroupMounts(t *testing.T) { } } for ss := range td.subsystems { + ss = strings.TrimPrefix(ss, CgroupNamePrefix) m, ok := cgMap[ss] if !ok { t.Fatalf("%s not found", ss) @@ -266,6 +282,28 @@ func TestGetCgroupMounts(t *testing.T) { t.Fatalf("subsystem %s not found in Subsystems field %v", ss, m.Subsystems) } } + // Test the all=true case. + + // Reset the test input. + for k := range td.subsystems { + td.subsystems[k] = false + } + cgMountsAll, err := getCgroupMountsHelper(td.subsystems, mi, true) + if err != nil { + t.Fatal(err) + } + if td.all == 0 { + // Results with and without "all" should be the same. + if len(cgMounts) != len(cgMountsAll) || !reflect.DeepEqual(cgMounts, cgMountsAll) { + t.Errorf("expected same results, got (all=false) %v, (all=true) %v", cgMounts, cgMountsAll) + } + } else { + // Make sure we got all records. + if len(cgMountsAll) != td.all { + t.Errorf("expected %d records, got %d (%+v)", td.all, len(cgMountsAll), cgMountsAll) + } + } + } } @@ -282,11 +320,15 @@ func BenchmarkGetCgroupMounts(b *testing.B) { "perf_event": false, "hugetlb": false, } + mi, err := mountinfo.GetMountsFromReader( + bytes.NewBufferString(fedoraMountinfo), + mountinfo.FSTypeFilter("cgroup"), + ) + if err != nil { + b.Fatal(err) + } b.ResetTimer() for i := 0; i < b.N; i++ { - b.StopTimer() - mi := bytes.NewBufferString(fedoraMountinfo) - b.StartTimer() if _, err := getCgroupMountsHelper(subsystems, mi, false); err != nil { b.Fatal(err) } @@ -327,7 +369,7 @@ func TestParseCgroupString(t *testing.T) { }, { input: `malformed input`, - expectedError: fmt.Errorf(`invalid cgroup entry: must contain at least two colons: malformed input`), + expectedError: errors.New(`invalid cgroup entry: must contain at least two colons: malformed input`), }, } @@ -343,7 +385,6 @@ func TestParseCgroupString(t *testing.T) { } } } - } func TestIgnoreCgroup2Mount(t *testing.T) { @@ -361,7 +402,13 @@ func TestIgnoreCgroup2Mount(t *testing.T) { 
"name=systemd": false, } - mi := bytes.NewBufferString(cgroup2Mountinfo) + mi, err := mountinfo.GetMountsFromReader( + bytes.NewBufferString(cgroup2Mountinfo), + mountinfo.FSTypeFilter("cgroup"), + ) + if err != nil { + t.Fatal(err) + } cgMounts, err := getCgroupMountsHelper(subsystems, mi, false) if err != nil { t.Fatal(err) @@ -373,40 +420,9 @@ func TestIgnoreCgroup2Mount(t *testing.T) { } } -func TestGetClosestMountpointAncestor(t *testing.T) { - fakeMountInfo := ` 18 24 0:17 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw -100 99 1:31 / /foo/bar rw,relatime - fake fake rw,fake -100 99 1:31 / /foo/bar/baz2 rw,relatime - fake fake rw,fake -100 99 1:31 / /foo/bar/baz rw,relatime - fake fake rw,fake -100 99 1:31 / /foo/bar/bazza rw,relatime - fake fake rw,fake -100 99 1:31 / /foo/bar/baz3 rw,relatime - fake fake rw,fake -100 99 1:31 / /foo rw,relatime - fake fake rw,fake -100 99 1:31 / /unrelated rw,relatime - fake fake rw,fake -100 99 1:31 / / rw,relatime - fake fake rw,fake -` - testCases := []struct { - input string - output string - }{ - {input: "/foo/bar/baz/a/b/c", output: "/foo/bar/baz"}, - {input: "/foo/bar/baz", output: "/foo/bar/baz"}, - {input: "/foo/bar/bazza", output: "/foo/bar/bazza"}, - {input: "/a/b/c/d", output: "/"}, - } - - for _, c := range testCases { - mountpoint := GetClosestMountpointAncestor(c.input, fakeMountInfo) - if mountpoint != c.output { - t.Errorf("expected %s, got %s", c.output, mountpoint) - } - } -} - func TestFindCgroupMountpointAndRoot(t *testing.T) { - fakeMountInfo := ` -35 27 0:29 / /foo rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices -35 27 0:29 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices -` + fakeMountInfo := `35 27 0:29 / /foo rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices +35 27 0:29 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices` testCases := []struct { cgroupPath string 
output string @@ -415,45 +431,218 @@ func TestFindCgroupMountpointAndRoot(t *testing.T) { {cgroupPath: "", output: "/foo"}, } + mi, err := mountinfo.GetMountsFromReader( + bytes.NewBufferString(fakeMountInfo), + mountinfo.FSTypeFilter("cgroup"), + ) + if err != nil { + t.Fatal(err) + } + for _, c := range testCases { - mountpoint, _, _ := findCgroupMountpointAndRootFromReader(strings.NewReader(fakeMountInfo), c.cgroupPath, "devices") + mountpoint, _, _ := findCgroupMountpointAndRootFromMI(mi, c.cgroupPath, "devices") if mountpoint != c.output { t.Errorf("expected %s, got %s", c.output, mountpoint) } } } -func TestGetHugePageSizeImpl(t *testing.T) { +func BenchmarkGetHugePageSizeImpl(b *testing.B) { + var ( + input = []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"} + output []string + err error + ) + for i := 0; i < b.N; i++ { + output, err = getHugePageSizeFromFilenames(input) + } + if err != nil || len(output) != len(input) { + b.Fatal("unexpected results") + } +} +func TestGetHugePageSizeImpl(t *testing.T) { testCases := []struct { - inputFiles []string - outputPageSizes []string - err error + doc string + input []string + output []string + isErr bool }{ { - inputFiles: []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"}, - outputPageSizes: []string{"1GB", "2MB", "32MB", "64KB"}, - err: nil, + doc: "normal input", + input: []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"}, + output: []string{"1GB", "2MB", "32MB", "64KB"}, }, { - inputFiles: []string{}, - outputPageSizes: []string{}, - err: nil, + doc: "empty input", + input: []string{}, + output: []string{}, }, { - inputFiles: []string{"hugepages-a"}, - outputPageSizes: []string{}, - err: errors.New("invalid size: 'a'"), + doc: "not a number", + input: []string{"hugepages-akB"}, + isErr: true, + }, + { + doc: "no prefix (silently skipped)", + input: []string{"1024kB"}, + }, + { + doc: "invalid 
prefix (silently skipped)", + input: []string{"whatever-1024kB"}, + }, + { + doc: "invalid suffix", + input: []string{"hugepages-1024gB"}, + isErr: true, + }, + { + doc: "no suffix", + input: []string{"hugepages-1024"}, + isErr: true, + }, + { + doc: "mixed valid and invalid entries", + input: []string{"hugepages-4194304kB", "hugepages-2048kB", "hugepages-akB", "hugepages-64kB"}, + output: []string{"4GB", "2MB", "64KB"}, + isErr: true, + }, + { + doc: "more mixed valid and invalid entries", + input: []string{"hugepages-2048kB", "hugepages-kB", "hugepages-64kB"}, + output: []string{"2MB", "64KB"}, + isErr: true, }, } for _, c := range testCases { - pageSizes, err := getHugePageSizeFromFilenames(c.inputFiles) - if len(pageSizes) != 0 && len(c.outputPageSizes) != 0 && !reflect.DeepEqual(pageSizes, c.outputPageSizes) { - t.Errorf("expected %s, got %s", c.outputPageSizes, pageSizes) - } - if err != nil && err.Error() != c.err.Error() { - t.Errorf("expected error %s, got %s", c.err, err) + c := c + t.Run(c.doc, func(t *testing.T) { + output, err := getHugePageSizeFromFilenames(c.input) + t.Log("input:", c.input, "; output:", output, "; err:", err) + if err != nil { + if !c.isErr { + t.Errorf("input %v, expected nil, got error: %v", c.input, err) + } + // no more checks + return + } + if c.isErr { + t.Errorf("input %v, expected error, got error: nil, output: %v", c.input, output) + } + // check output + if len(output) != len(c.output) || (len(output) > 0 && !reflect.DeepEqual(output, c.output)) { + t.Errorf("input %v, expected %v, got %v", c.input, c.output, output) + } + }) + } +} + +func TestConvertCPUSharesToCgroupV2Value(t *testing.T) { + cases := map[uint64]uint64{ + 0: 0, + 2: 1, + 262144: 10000, + } + for i, expected := range cases { + got := ConvertCPUSharesToCgroupV2Value(i) + if got != expected { + t.Errorf("expected ConvertCPUSharesToCgroupV2Value(%d) to be %d, got %d", i, expected, got) + } + } +} + +func TestConvertMemorySwapToCgroupV2Value(t *testing.T) { + 
cases := []struct { + memswap, memory int64 + expected int64 + expErr bool + }{ + { + memswap: 0, + memory: 0, + expected: 0, + }, + { + memswap: -1, + memory: 0, + expected: -1, + }, + { + memswap: -1, + memory: -1, + expected: -1, + }, + { + memswap: -2, + memory: 0, + expErr: true, + }, + { + memswap: -1, + memory: 1000, + expected: -1, + }, + { + memswap: 1000, + memory: 1000, + expected: 0, + }, + { + memswap: 500, + memory: 200, + expected: 300, + }, + { + memswap: 300, + memory: 400, + expErr: true, + }, + { + memswap: 300, + memory: 0, + expErr: true, + }, + { + memswap: 300, + memory: -300, + expErr: true, + }, + { + memswap: 300, + memory: -1, + expErr: true, + }, + } + + for _, c := range cases { + swap, err := ConvertMemorySwapToCgroupV2Value(c.memswap, c.memory) + if c.expErr { + if err == nil { + t.Errorf("memswap: %d, memory %d, expected error, got %d, nil", c.memswap, c.memory, swap) + } + // no more checks + continue + } + if err != nil { + t.Errorf("memswap: %d, memory %d, expected success, got error %s", c.memswap, c.memory, err) + } + if swap != c.expected { + t.Errorf("memswap: %d, memory %d, expected %d, got %d", c.memswap, c.memory, c.expected, swap) + } + } +} + +func TestConvertBlkIOToIOWeightValue(t *testing.T) { + cases := map[uint16]uint64{ + 0: 0, + 10: 1, + 1000: 10000, + } + for i, expected := range cases { + got := ConvertBlkIOToIOWeightValue(i) + if got != expected { + t.Errorf("expected ConvertBlkIOToIOWeightValue(%d) to be %d, got %d", i, expected, got) } } } diff --git a/libcontainer/cgroups/v1_utils.go b/libcontainer/cgroups/v1_utils.go new file mode 100644 index 0000000..47c75f2 --- /dev/null +++ b/libcontainer/cgroups/v1_utils.go @@ -0,0 +1,290 @@ +package cgroups + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "syscall" + + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/moby/sys/mountinfo" + "golang.org/x/sys/unix" +) + +// Code in this source file are specific to cgroup v1, 
+// and must not be used from any cgroup v2 code. + +const ( + CgroupNamePrefix = "name=" + defaultPrefix = "/sys/fs/cgroup" +) + +var ( + errUnified = errors.New("not implemented for cgroup v2 unified hierarchy") + ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1") + + readMountinfoOnce sync.Once + readMountinfoErr error + cgroupMountinfo []*mountinfo.Info +) + +type NotFoundError struct { + Subsystem string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.Subsystem) +} + +func NewNotFoundError(sub string) error { + return &NotFoundError{ + Subsystem: sub, + } +} + +func IsNotFound(err error) bool { + var nfErr *NotFoundError + return errors.As(err, &nfErr) +} + +func tryDefaultPath(cgroupPath, subsystem string) string { + if !strings.HasPrefix(defaultPrefix, cgroupPath) { + return "" + } + + // remove possible prefix + subsystem = strings.TrimPrefix(subsystem, CgroupNamePrefix) + + // Make sure we're still under defaultPrefix, and resolve + // a possible symlink (like cpu -> cpu,cpuacct). + path, err := securejoin.SecureJoin(defaultPrefix, subsystem) + if err != nil { + return "" + } + + // (1) path should be a directory. + st, err := os.Lstat(path) + if err != nil || !st.IsDir() { + return "" + } + + // (2) path should be a mount point. + pst, err := os.Lstat(filepath.Dir(path)) + if err != nil { + return "" + } + + if st.Sys().(*syscall.Stat_t).Dev == pst.Sys().(*syscall.Stat_t).Dev { + // parent dir has the same dev -- path is not a mount point + return "" + } + + // (3) path should have 'cgroup' fs type. + fst := unix.Statfs_t{} + err = unix.Statfs(path, &fst) + if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { + return "" + } + + return path +} + +// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones +// with fstype of "cgroup") for the current running process. 
+// +// The results are cached (to avoid re-reading mountinfo which is relatively +// expensive), so it is assumed that cgroup mounts are not being changed. +func readCgroupMountinfo() ([]*mountinfo.Info, error) { + readMountinfoOnce.Do(func() { + cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts( + mountinfo.FSTypeFilter("cgroup"), + ) + }) + + return cgroupMountinfo, readMountinfoErr +} + +// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt +func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + + // If subsystem is empty, we look for the cgroupv2 hybrid path. + if len(subsystem) == 0 { + return hybridMountpoint, nil + } + + // Avoid parsing mountinfo by trying the default path first, if possible. + if path := tryDefaultPath(cgroupPath, subsystem); path != "" { + return path, nil + } + + mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) + return mnt, err +} + +func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { + if IsCgroup2UnifiedMode() { + return "", "", errUnified + } + + mi, err := readCgroupMountinfo() + if err != nil { + return "", "", err + } + + return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem) +} + +func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) { + for _, mi := range mounts { + if strings.HasPrefix(mi.Mountpoint, cgroupPath) { + for _, opt := range strings.Split(mi.VFSOptions, ",") { + if opt == subsystem { + return mi.Mountpoint, mi.Root, nil + } + } + } + } + + return "", "", NewNotFoundError(subsystem) +} + +func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { + if len(m.Subsystems) == 0 { + return "", errors.New("no subsystem for mount") + } + + return getControllerPath(m.Subsystems[0], cgroups) +} + +func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) 
([]Mount, error) { + res := make([]Mount, 0, len(ss)) + numFound := 0 + for _, mi := range mounts { + m := Mount{ + Mountpoint: mi.Mountpoint, + Root: mi.Root, + } + for _, opt := range strings.Split(mi.VFSOptions, ",") { + seen, known := ss[opt] + if !known || (!all && seen) { + continue + } + ss[opt] = true + opt = strings.TrimPrefix(opt, CgroupNamePrefix) + m.Subsystems = append(m.Subsystems, opt) + numFound++ + } + if len(m.Subsystems) > 0 || all { + res = append(res, m) + } + if !all && numFound >= len(ss) { + break + } + } + return res, nil +} + +func getCgroupMountsV1(all bool) ([]Mount, error) { + mi, err := readCgroupMountinfo() + if err != nil { + return nil, err + } + + allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return nil, err + } + + allMap := make(map[string]bool) + for s := range allSubsystems { + allMap[s] = false + } + + return getCgroupMountsHelper(allMap, mi, all) +} + +// GetOwnCgroup returns the relative path to the cgroup docker is running in. +func GetOwnCgroup(subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + + return getControllerPath(subsystem, cgroups) +} + +func GetOwnCgroupPath(subsystem string) (string, error) { + cgroup, err := GetOwnCgroup(subsystem) + if err != nil { + return "", err + } + + // If subsystem is empty, we look for the cgroupv2 hybrid path. 
+ if len(subsystem) == 0 { + return hybridMountpoint, nil + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func GetInitCgroup(subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + cgroups, err := ParseCgroupFile("/proc/1/cgroup") + if err != nil { + return "", err + } + + return getControllerPath(subsystem, cgroups) +} + +func GetInitCgroupPath(subsystem string) (string, error) { + cgroup, err := GetInitCgroup(subsystem) + if err != nil { + return "", err + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func getCgroupPathHelper(subsystem, cgroup string) (string, error) { + mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see paths from host, which don't exist in container. + relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + return filepath.Join(mnt, relCgroup), nil +} + +func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + + if p, ok := cgroups[subsystem]; ok { + return p, nil + } + + if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { + return p, nil + } + + return "", NewNotFoundError(subsystem) +} diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go index 58ed19c..2d4a898 100644 --- a/libcontainer/configs/cgroup_linux.go +++ b/libcontainer/configs/cgroup_linux.go @@ -1,5 +1,10 @@ package configs +import ( + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/opencontainers/runc/libcontainer/devices" +) + type FreezerState string const ( @@ -8,12 +13,12 @@ const ( Thawed FreezerState = "THAWED" ) +// Cgroup holds properties of a cgroup on Linux. 
type Cgroup struct { - // Deprecated, use Path instead + // Name specifies the name of the cgroup Name string `json:"name,omitempty"` - // name of parent of cgroup or slice - // Deprecated, use Path instead + // Parent specifies the name of parent of cgroup or slice Parent string `json:"parent,omitempty"` // Path specifies the path to cgroups that are created and/or joined by the container. @@ -23,24 +28,31 @@ type Cgroup struct { // ScopePrefix describes prefix for the scope name ScopePrefix string `json:"scope_prefix"` - // Paths represent the absolute cgroups paths to join. - // This takes precedence over Path. - Paths map[string]string - // Resources contains various cgroups settings to apply *Resources + + // Systemd tells if systemd should be used to manage cgroups. + Systemd bool + + // SystemdProps are any additional properties for systemd, + // derived from org.systemd.property.xxx annotations. + // Ignored unless systemd is used for managing cgroups. + SystemdProps []systemdDbus.Property `json:"-"` + + // Rootless tells if rootless cgroups should be used. + Rootless bool + + // The host UID that should own the cgroup, or nil to accept + // the default ownership. This should only be set when the + // cgroupfs is to be mounted read/write. + // Not all cgroup manager implementations support changing + // the ownership. + OwnerUID *int `json:"owner_uid,omitempty"` } type Resources struct { - // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. - // Deprecated - AllowAllDevices *bool `json:"allow_all_devices,omitempty"` - // Deprecated - AllowedDevices []*Device `json:"allowed_devices,omitempty"` - // Deprecated - DeniedDevices []*Device `json:"denied_devices,omitempty"` - - Devices []*Device `json:"devices"` + // Devices is the set of access rules for devices in the container. 
+ Devices []*devices.Rule `json:"devices"` // Memory limit (in bytes) Memory int64 `json:"memory"` @@ -51,12 +63,6 @@ type Resources struct { // Total memory usage (memory + swap); set `-1` to enable unlimited swap MemorySwap int64 `json:"memory_swap"` - // Kernel memory limit (in bytes) - KernelMemory int64 `json:"kernel_memory"` - - // Kernel memory limit for TCP use (in bytes) - KernelMemoryTCP int64 `json:"kernel_memory_tcp"` - // CPU shares (relative weight vs. other containers) CpuShares uint64 `json:"cpu_shares"` @@ -120,11 +126,33 @@ type Resources struct { // Set class identifier for container's network packets NetClsClassid uint32 `json:"net_cls_classid_u"` + // Rdma resource restriction configuration + Rdma map[string]LinuxRdma `json:"rdma"` + // Used on cgroups v2: // CpuWeight sets a proportional bandwidth limit. CpuWeight uint64 `json:"cpu_weight"` - // CpuMax sets she maximum bandwidth limit (format: max period). - CpuMax string `json:"cpu_max"` + // Unified is cgroupv2-only key-value map. + Unified map[string]string `json:"unified"` + + // SkipDevices allows to skip configuring device permissions. + // Used by e.g. kubelet while creating a parent cgroup (kubepods) + // common for many containers, and by runc update. + // + // NOTE it is impossible to start a container which has this flag set. + SkipDevices bool `json:"-"` + + // SkipFreezeOnSet is a flag for cgroup manager to skip the cgroup + // freeze when setting resources. Only applicable to systemd legacy + // (i.e. cgroup v1) manager (which uses freeze by default to avoid + // spurious permission errors caused by systemd inability to update + // device rules in a non-disruptive manner). + // + // If not set, a few methods (such as looking into cgroup's + // devices.list and querying the systemd unit properties) are used + // during Set() to figure out whether the freeze is required. Those + // methods may be relatively slow, thus this flag. 
+ SkipFreezeOnSet bool `json:"-"` } diff --git a/libcontainer/configs/cgroup_unsupported.go b/libcontainer/configs/cgroup_unsupported.go index c0c23d7..7e38302 100644 --- a/libcontainer/configs/cgroup_unsupported.go +++ b/libcontainer/configs/cgroup_unsupported.go @@ -1,8 +1,9 @@ +//go:build !linux // +build !linux package configs +// Cgroup holds properties of a cgroup on Linux // TODO Windows: This can ultimately be entirely factored out on Windows as // cgroups are a Unix-specific construct. -type Cgroup struct { -} +type Cgroup struct{} diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index 24989e9..c1b4a00 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -7,9 +7,10 @@ import ( "os/exec" "time" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" + + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runtime-spec/specs-go" ) type Rlimit struct { @@ -30,9 +31,12 @@ type IDMap struct { // for syscalls. Additional architectures can be added by specifying them in // Architectures. 
type Seccomp struct { - DefaultAction Action `json:"default_action"` - Architectures []string `json:"architectures"` - Syscalls []*Syscall `json:"syscalls"` + DefaultAction Action `json:"default_action"` + Architectures []string `json:"architectures"` + Syscalls []*Syscall `json:"syscalls"` + DefaultErrnoRet *uint `json:"default_errno_ret"` + ListenerPath string `json:"listener_path,omitempty"` + ListenerMetadata string `json:"listener_metadata,omitempty"` } // Action is taken upon rule match in Seccomp @@ -45,6 +49,9 @@ const ( Allow Trace Log + Notify + KillThread + KillProcess ) // Operator is a comparison operator to be used when matching syscall arguments in Seccomp @@ -70,9 +77,10 @@ type Arg struct { // Syscall is a rule to match a syscall in Seccomp type Syscall struct { - Name string `json:"name"` - Action Action `json:"action"` - Args []*Arg `json:"args"` + Name string `json:"name"` + Action Action `json:"action"` + ErrnoRet *uint `json:"errnoRet"` + Args []*Arg `json:"args"` } // TODO Windows. Many of these fields should be factored out into those parts @@ -91,6 +99,9 @@ type Config struct { // Path to a directory containing the container's root filesystem. Rootfs string `json:"rootfs"` + // Umask is the umask to use inside of the container. + Umask *uint32 `json:"umask"` + // Readonlyfs will remount the container's rootfs as readonly where only externally mounted // bind mounts are writtable. Readonlyfs bool `json:"readonlyfs"` @@ -103,7 +114,7 @@ type Config struct { Mounts []*Mount `json:"mounts"` // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! - Devices []*Device `json:"devices"` + Devices []*devices.Device `json:"devices"` MountLabel string `json:"mount_label"` @@ -175,7 +186,7 @@ type Config struct { // Hooks are a collection of actions to perform at various container lifecycle events. 
// CommandHooks are serialized to JSON, but other hooks are not. - Hooks *Hooks + Hooks Hooks // Version is the version of opencontainer specification that is supported. Version string `json:"version"` @@ -202,16 +213,55 @@ type Config struct { RootlessCgroups bool `json:"rootless_cgroups,omitempty"` } -type Hooks struct { +type ( + HookName string + HookList []Hook + Hooks map[HookName]HookList +) + +const ( // Prestart commands are executed after the container namespaces are created, // but before the user supplied command is executed from init. - Prestart []Hook + // Note: This hook is now deprecated + // Prestart commands are called in the Runtime namespace. + Prestart HookName = "prestart" + + // CreateRuntime commands MUST be called as part of the create operation after + // the runtime environment has been created but before the pivot_root has been executed. + // CreateRuntime is called immediately after the deprecated Prestart hook. + // CreateRuntime commands are called in the Runtime Namespace. + CreateRuntime HookName = "createRuntime" + + // CreateContainer commands MUST be called as part of the create operation after + // the runtime environment has been created but before the pivot_root has been executed. + // CreateContainer commands are called in the Container namespace. + CreateContainer HookName = "createContainer" + + // StartContainer commands MUST be called as part of the start operation and before + // the container process is started. + // StartContainer commands are called in the Container namespace. + StartContainer HookName = "startContainer" // Poststart commands are executed after the container init process starts. - Poststart []Hook + // Poststart commands are called in the Runtime Namespace. + Poststart HookName = "poststart" // Poststop commands are executed after the container init process exits. - Poststop []Hook + // Poststop commands are called in the Runtime Namespace. 
+ Poststop HookName = "poststop" +) + +// KnownHookNames returns the known hook names. +// Used by `runc features`. +func KnownHookNames() []string { + return []string{ + string(Prestart), // deprecated + string(CreateRuntime), + string(CreateContainer), + string(StartContainer), + string(Poststart), + string(Poststop), + } } type Capabilities struct { @@ -227,32 +277,39 @@ type Capabilities struct { Ambient []string } -func (hooks *Hooks) UnmarshalJSON(b []byte) error { - var state struct { - Prestart []CommandHook - Poststart []CommandHook - Poststop []CommandHook +func (hooks HookList) RunHooks(state *specs.State) error { + for i, h := range hooks { + if err := h.Run(state); err != nil { + return fmt.Errorf("error running hook #%d: %w", i, err) + } } + return nil +} + +func (hooks *Hooks) UnmarshalJSON(b []byte) error { + var state map[HookName][]CommandHook + if err := json.Unmarshal(b, &state); err != nil { return err } - deserialize := func(shooks []CommandHook) (hooks []Hook) { - for _, shook := range shooks { - hooks = append(hooks, shook) + *hooks = Hooks{} + for n, commandHooks := range state { + if len(commandHooks) == 0 { + continue } - return hooks + (*hooks)[n] = HookList{} + for _, h := range commandHooks { + (*hooks)[n] = append((*hooks)[n], h) + } } - hooks.Prestart = deserialize(state.Prestart) - hooks.Poststart = deserialize(state.Poststart) - hooks.Poststop = deserialize(state.Poststop) return nil } -func (hooks Hooks) MarshalJSON() ([]byte, error) { +func (hooks *Hooks) MarshalJSON() ([]byte, error) { serialize := func(hooks []Hook) (serializableHooks []CommandHook) { for _, hook := range hooks { switch chook := hook.(type) { @@ -267,9 +324,12 @@ func (hooks Hooks) MarshalJSON() ([]byte, error) { } return json.Marshal(map[string]interface{}{ - "prestart": serialize(hooks.Prestart), - "poststart": serialize(hooks.Poststart), - "poststop": serialize(hooks.Poststop), + "prestart": serialize((*hooks)[Prestart]), + "createRuntime": 
serialize((*hooks)[CreateRuntime]), + "createContainer": serialize((*hooks)[CreateContainer]), + "startContainer": serialize((*hooks)[StartContainer]), + "poststart": serialize((*hooks)[Poststart]), + "poststop": serialize((*hooks)[Poststop]), }) } @@ -333,7 +393,7 @@ func (c Command) Run(s *specs.State) error { go func() { err := cmd.Wait() if err != nil { - err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String()) + err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String()) } errC <- err }() @@ -347,8 +407,8 @@ func (c Command) Run(s *specs.State) error { case err := <-errC: return err case <-timerCh: - cmd.Process.Kill() - cmd.Wait() + _ = cmd.Process.Kill() + <-errC return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) } } diff --git a/libcontainer/configs/config_linux.go b/libcontainer/configs/config_linux.go index 07da108..8c02848 100644 --- a/libcontainer/configs/config_linux.go +++ b/libcontainer/configs/config_linux.go @@ -1,17 +1,24 @@ package configs -import "fmt" +import "errors" + +var ( + errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.") + errNoUserMap = errors.New("User namespaces enabled, but no user mapping found.") + errNoGIDMap = errors.New("User namespaces enabled, but no gid mappings found.") + errNoGroupMap = errors.New("User namespaces enabled, but no group mapping found.") +) // HostUID gets the translated uid for the process on host which could be // different when user namespaces are enabled. 
func (c Config) HostUID(containerId int) (int, error) { if c.Namespaces.Contains(NEWUSER) { if c.UidMappings == nil { - return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.") + return -1, errNoUIDMap } id, found := c.hostIDFromMapping(containerId, c.UidMappings) if !found { - return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.") + return -1, errNoUserMap } return id, nil } @@ -30,11 +37,11 @@ func (c Config) HostRootUID() (int, error) { func (c Config) HostGID(containerId int) (int, error) { if c.Namespaces.Contains(NEWUSER) { if c.GidMappings == nil { - return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.") + return -1, errNoGIDMap } id, found := c.hostIDFromMapping(containerId, c.GidMappings) if !found { - return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.") + return -1, errNoGroupMap } return id, nil } diff --git a/libcontainer/configs/config_linux_test.go b/libcontainer/configs/config_linux_test.go index 9c5f0fe..68d33e6 100644 --- a/libcontainer/configs/config_linux_test.go +++ b/libcontainer/configs/config_linux_test.go @@ -1,55 +1,10 @@ package configs import ( - "encoding/json" - "fmt" - "os" - "path/filepath" "testing" ) -func loadConfig(name string) (*Config, error) { - f, err := os.Open(filepath.Join("../sample_configs", name)) - if err != nil { - return nil, err - } - defer f.Close() - - var container *Config - if err := json.NewDecoder(f).Decode(&container); err != nil { - return nil, err - } - - // Check that a config doesn't contain extra fields - var configMap, abstractMap map[string]interface{} - - if _, err := f.Seek(0, 0); err != nil { - return nil, err - } - - if err := json.NewDecoder(f).Decode(&abstractMap); err != nil { - return nil, err - } - - configData, err := json.Marshal(&container) - if err != nil { - return nil, err - } - - if err := json.Unmarshal(configData, &configMap); err != nil { - return nil, err - } - - for k := range configMap { - 
delete(abstractMap, k) - } - - if len(abstractMap) != 0 { - return nil, fmt.Errorf("unknown fields: %s", abstractMap) - } - - return container, nil -} +var HookNameList = []HookName{Prestart, CreateRuntime, CreateContainer, StartContainer, Poststart, Poststop} func TestRemoveNamespace(t *testing.T) { ns := Namespaces{ diff --git a/libcontainer/configs/config_test.go b/libcontainer/configs/config_test.go index c89a764..5c29b74 100644 --- a/libcontainer/configs/config_test.go +++ b/libcontainer/configs/config_test.go @@ -15,27 +15,29 @@ import ( func TestUnmarshalHooks(t *testing.T) { timeout := time.Second - prestartCmd := configs.NewCommandHook(configs.Command{ - Path: "/var/vcap/hooks/prestart", + hookCmd := configs.NewCommandHook(configs.Command{ + Path: "/var/vcap/hooks/hook", Args: []string{"--pid=123"}, Env: []string{"FOO=BAR"}, Dir: "/var/vcap", Timeout: &timeout, }) - prestart, err := json.Marshal(prestartCmd.Command) + + hookJson, err := json.Marshal(hookCmd) if err != nil { t.Fatal(err) } - hook := configs.Hooks{} - err = hook.UnmarshalJSON([]byte(fmt.Sprintf(`{"Prestart" :[%s]}`, prestart))) - if err != nil { - t.Fatal(err) - } + for _, hookName := range configs.HookNameList { + hooks := configs.Hooks{} + err = hooks.UnmarshalJSON([]byte(fmt.Sprintf(`{"%s" :[%s]}`, hookName, hookJson))) + if err != nil { + t.Fatal(err) + } - if !reflect.DeepEqual(hook.Prestart[0], prestartCmd) { - t.Errorf("Expected prestart to equal %+v but it was %+v", - prestartCmd, hook.Prestart[0]) + if !reflect.DeepEqual(hooks[hookName], configs.HookList{hookCmd}) { + t.Errorf("Expected %s to equal %+v but it was %+v", hookName, hookCmd, hooks[hookName]) + } } } @@ -50,8 +52,8 @@ func TestUnmarshalHooksWithInvalidData(t *testing.T) { func TestMarshalHooks(t *testing.T) { timeout := time.Second - prestartCmd := configs.NewCommandHook(configs.Command{ - Path: "/var/vcap/hooks/prestart", + hookCmd := configs.NewCommandHook(configs.Command{ + Path: "/var/vcap/hooks/hook", Args: 
[]string{"--pid=123"}, Env: []string{"FOO=BAR"}, Dir: "/var/vcap", @@ -59,14 +61,21 @@ func TestMarshalHooks(t *testing.T) { }) hook := configs.Hooks{ - Prestart: []configs.Hook{prestartCmd}, + configs.Prestart: configs.HookList{hookCmd}, + configs.CreateRuntime: configs.HookList{hookCmd}, + configs.CreateContainer: configs.HookList{hookCmd}, + configs.StartContainer: configs.HookList{hookCmd}, + configs.Poststart: configs.HookList{hookCmd}, + configs.Poststop: configs.HookList{hookCmd}, } hooks, err := hook.MarshalJSON() if err != nil { t.Fatal(err) } - h := `{"poststart":null,"poststop":null,"prestart":[{"path":"/var/vcap/hooks/prestart","args":["--pid=123"],"env":["FOO=BAR"],"dir":"/var/vcap","timeout":1000000000}]}` + // Note Marshal seems to output fields in alphabetical order + hookCmdJson := `[{"path":"/var/vcap/hooks/hook","args":["--pid=123"],"env":["FOO=BAR"],"dir":"/var/vcap","timeout":1000000000}]` + h := fmt.Sprintf(`{"createContainer":%[1]s,"createRuntime":%[1]s,"poststart":%[1]s,"poststop":%[1]s,"prestart":%[1]s,"startContainer":%[1]s}`, hookCmdJson) if string(hooks) != h { t.Errorf("Expected hooks %s to equal %s", string(hooks), h) } @@ -75,8 +84,8 @@ func TestMarshalHooks(t *testing.T) { func TestMarshalUnmarshalHooks(t *testing.T) { timeout := time.Second - prestart := configs.NewCommandHook(configs.Command{ - Path: "/var/vcap/hooks/prestart", + hookCmd := configs.NewCommandHook(configs.Command{ + Path: "/var/vcap/hooks/hook", Args: []string{"--pid=123"}, Env: []string{"FOO=BAR"}, Dir: "/var/vcap", @@ -84,7 +93,12 @@ func TestMarshalUnmarshalHooks(t *testing.T) { }) hook := configs.Hooks{ - Prestart: []configs.Hook{prestart}, + configs.Prestart: configs.HookList{hookCmd}, + configs.CreateRuntime: configs.HookList{hookCmd}, + configs.CreateContainer: configs.HookList{hookCmd}, + configs.StartContainer: configs.HookList{hookCmd}, + configs.Poststart: configs.HookList{hookCmd}, + configs.Poststop: configs.HookList{hookCmd}, } hooks, err := 
hook.MarshalJSON() if err != nil { @@ -96,8 +110,8 @@ func TestMarshalUnmarshalHooks(t *testing.T) { if err != nil { t.Fatal(err) } - if !reflect.DeepEqual(umMhook.Prestart[0], prestart) { - t.Errorf("Expected hooks to be equal after mashaling -> unmarshaling them: %+v, %+v", umMhook.Prestart[0], prestart) + if !reflect.DeepEqual(umMhook, hook) { + t.Errorf("Expected hooks to be equal after mashaling -> unmarshaling them: %+v, %+v", umMhook, hook) } } @@ -106,14 +120,14 @@ func TestMarshalHooksWithUnexpectedType(t *testing.T) { return nil }) hook := configs.Hooks{ - Prestart: []configs.Hook{fHook}, + configs.CreateRuntime: configs.HookList{fHook}, } hooks, err := hook.MarshalJSON() if err != nil { t.Fatal(err) } - h := `{"poststart":null,"poststop":null,"prestart":null}` + h := `{"createContainer":null,"createRuntime":null,"poststart":null,"poststop":null,"prestart":null,"startContainer":null}` if string(hooks) != h { t.Errorf("Expected hooks %s to equal %s", string(hooks), h) } @@ -130,12 +144,15 @@ func TestFuncHookRun(t *testing.T) { fHook := configs.NewFunctionHook(func(s *specs.State) error { if !reflect.DeepEqual(state, s) { - t.Errorf("Expected state %+v to equal %+v", state, s) + return fmt.Errorf("expected state %+v to equal %+v", state, s) } return nil }) - fHook.Run(state) + err := fHook.Run(state) + if err != nil { + t.Fatal(err) + } } func TestCommandHookRun(t *testing.T) { @@ -146,18 +163,45 @@ func TestCommandHookRun(t *testing.T) { Pid: 1, Bundle: "/bundle", } - timeout := time.Second + + stateJson, err := json.Marshal(state) + if err != nil { + t.Fatal(err) + } + + verifyCommandTemplate := `#!/bin/sh +if [ "$1" != "testarg" ]; then + echo "Bad value for $1. Expected 'testarg', found '$1'" + exit 1 +fi +if [ -z "$FOO" ] || [ "$FOO" != BAR ]; then + echo "Bad value for FOO. Expected 'BAR', found '$FOO'" + exit 1 +fi +expectedJson=%q +read JSON +if [ "$JSON" != "$expectedJson" ]; then + echo "Bad JSON received. 
Expected '$expectedJson', found '$JSON'" + exit 1 +fi +exit 0 + ` + verifyCommand := fmt.Sprintf(verifyCommandTemplate, stateJson) + filename := "/tmp/runc-hooktest.sh" + os.Remove(filename) + if err := os.WriteFile(filename, []byte(verifyCommand), 0o700); err != nil { + t.Fatalf("Failed to create tmp file: %v", err) + } + defer os.Remove(filename) cmdHook := configs.NewCommandHook(configs.Command{ - Path: os.Args[0], - Args: []string{os.Args[0], "-test.run=TestHelperProcess"}, - Env: []string{"FOO=BAR"}, - Dir: "/", - Timeout: &timeout, + Path: filename, + Args: []string{filename, "testarg"}, + Env: []string{"FOO=BAR"}, + Dir: "/", }) - err := cmdHook.Run(state) - if err != nil { + if err := cmdHook.Run(state); err != nil { t.Errorf(fmt.Sprintf("Expected error to not occur but it was %+v", err)) } } @@ -170,26 +214,15 @@ func TestCommandHookRunTimeout(t *testing.T) { Pid: 1, Bundle: "/bundle", } - timeout := (10 * time.Millisecond) + timeout := 100 * time.Millisecond cmdHook := configs.NewCommandHook(configs.Command{ - Path: os.Args[0], - Args: []string{os.Args[0], "-test.run=TestHelperProcessWithTimeout"}, - Env: []string{"FOO=BAR"}, - Dir: "/", + Path: "/bin/sleep", + Args: []string{"/bin/sleep", "1"}, Timeout: &timeout, }) - err := cmdHook.Run(state) - if err == nil { + if err := cmdHook.Run(state); err == nil { t.Error("Expected error to occur but it was nil") } } - -func TestHelperProcess(*testing.T) { - fmt.Println("Helper Process") - os.Exit(0) -} -func TestHelperProcessWithTimeout(*testing.T) { - time.Sleep(time.Second) -} diff --git a/libcontainer/configs/config_windows_test.go b/libcontainer/configs/config_windows_test.go deleted file mode 100644 index 1a0c8fa..0000000 --- a/libcontainer/configs/config_windows_test.go +++ /dev/null @@ -1,3 +0,0 @@ -package configs - -// All current tests are for Unix-specific functionality diff --git a/libcontainer/configs/configs_fuzzer.go b/libcontainer/configs/configs_fuzzer.go new file mode 100644 index 
0000000..bce829e --- /dev/null +++ b/libcontainer/configs/configs_fuzzer.go @@ -0,0 +1,10 @@ +//go:build gofuzz +// +build gofuzz + +package configs + +func FuzzUnmarshalJSON(data []byte) int { + hooks := Hooks{} + _ = hooks.UnmarshalJSON(data) + return 1 +} diff --git a/libcontainer/configs/device.go b/libcontainer/configs/device.go deleted file mode 100644 index 8701bb2..0000000 --- a/libcontainer/configs/device.go +++ /dev/null @@ -1,57 +0,0 @@ -package configs - -import ( - "fmt" - "os" -) - -const ( - Wildcard = -1 -) - -// TODO Windows: This can be factored out in the future - -type Device struct { - // Device type, block, char, etc. - Type rune `json:"type"` - - // Path to the device. - Path string `json:"path"` - - // Major is the device's major number. - Major int64 `json:"major"` - - // Minor is the device's minor number. - Minor int64 `json:"minor"` - - // Cgroup permissions format, rwm. - Permissions string `json:"permissions"` - - // FileMode permission bits for the device. - FileMode os.FileMode `json:"file_mode"` - - // Uid of the device. - Uid uint32 `json:"uid"` - - // Gid of the device. - Gid uint32 `json:"gid"` - - // Write the file to the allowed list - Allow bool `json:"allow"` -} - -func (d *Device) CgroupString() string { - return fmt.Sprintf("%c %s:%s %s", d.Type, deviceNumberString(d.Major), deviceNumberString(d.Minor), d.Permissions) -} - -func (d *Device) Mkdev() int { - return int((d.Major << 8) | (d.Minor & 0xff) | ((d.Minor & 0xfff00) << 12)) -} - -// deviceNumberString converts the device number to a string return result. 
-func deviceNumberString(number int64) string { - if number == Wildcard { - return "*" - } - return fmt.Sprint(number) -} diff --git a/libcontainer/configs/device_defaults.go b/libcontainer/configs/device_defaults.go deleted file mode 100644 index e4f423c..0000000 --- a/libcontainer/configs/device_defaults.go +++ /dev/null @@ -1,111 +0,0 @@ -// +build linux - -package configs - -var ( - // DefaultSimpleDevices are devices that are to be both allowed and created. - DefaultSimpleDevices = []*Device{ - // /dev/null and zero - { - Path: "/dev/null", - Type: 'c', - Major: 1, - Minor: 3, - Permissions: "rwm", - FileMode: 0666, - }, - { - Path: "/dev/zero", - Type: 'c', - Major: 1, - Minor: 5, - Permissions: "rwm", - FileMode: 0666, - }, - - { - Path: "/dev/full", - Type: 'c', - Major: 1, - Minor: 7, - Permissions: "rwm", - FileMode: 0666, - }, - - // consoles and ttys - { - Path: "/dev/tty", - Type: 'c', - Major: 5, - Minor: 0, - Permissions: "rwm", - FileMode: 0666, - }, - - // /dev/urandom,/dev/random - { - Path: "/dev/urandom", - Type: 'c', - Major: 1, - Minor: 9, - Permissions: "rwm", - FileMode: 0666, - }, - { - Path: "/dev/random", - Type: 'c', - Major: 1, - Minor: 8, - Permissions: "rwm", - FileMode: 0666, - }, - } - DefaultAllowedDevices = append([]*Device{ - // allow mknod for any device - { - Type: 'c', - Major: Wildcard, - Minor: Wildcard, - Permissions: "m", - }, - { - Type: 'b', - Major: Wildcard, - Minor: Wildcard, - Permissions: "m", - }, - - { - Path: "/dev/console", - Type: 'c', - Major: 5, - Minor: 1, - Permissions: "rwm", - }, - // /dev/pts/ - pts namespaces are "coming soon" - { - Path: "", - Type: 'c', - Major: 136, - Minor: Wildcard, - Permissions: "rwm", - }, - { - Path: "", - Type: 'c', - Major: 5, - Minor: 2, - Permissions: "rwm", - }, - - // tuntap - { - Path: "", - Type: 'c', - Major: 10, - Minor: 200, - Permissions: "rwm", - }, - }, DefaultSimpleDevices...) - DefaultAutoCreatedDevices = append([]*Device{}, DefaultSimpleDevices...) 
-) diff --git a/libcontainer/configs/intelrdt.go b/libcontainer/configs/intelrdt.go index 57e9f03..f8d951a 100644 --- a/libcontainer/configs/intelrdt.go +++ b/libcontainer/configs/intelrdt.go @@ -1,6 +1,9 @@ package configs type IntelRdt struct { + // The identity for RDT Class of Service + ClosID string `json:"closID,omitempty"` + // The schema for L3 cache id and capacity bitmask (CBM) // Format: "L3:=;=;..." L3CacheSchema string `json:"l3_cache_schema,omitempty"` diff --git a/libcontainer/configs/mount.go b/libcontainer/configs/mount.go index 670757d..784c618 100644 --- a/libcontainer/configs/mount.go +++ b/libcontainer/configs/mount.go @@ -1,9 +1,11 @@ package configs +import "golang.org/x/sys/unix" + const ( // EXT_COPYUP is a directive to copy up the contents of a directory when // a tmpfs is mounted over it. - EXT_COPYUP = 1 << iota + EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning ) type Mount struct { @@ -28,6 +30,9 @@ type Mount struct { // Relabel source if set, "z" indicates shared, "Z" indicates unshared. Relabel string `json:"relabel"` + // RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2). + RecAttr *unix.MountAttr `json:"rec_attr"` + // Extensions are additional flags that are specific to runc. Extensions int `json:"extensions"` @@ -37,3 +42,7 @@ type Mount struct { // Optional Command to be run after Source is mounted. 
PostmountCmds []Command `json:"postmount_cmds"` } + +func (m *Mount) IsBind() bool { + return m.Flags&unix.MS_BIND != 0 +} diff --git a/libcontainer/configs/namespaces_linux.go b/libcontainer/configs/namespaces_linux.go index 1bbaef9..d52d6fc 100644 --- a/libcontainer/configs/namespaces_linux.go +++ b/libcontainer/configs/namespaces_linux.go @@ -56,7 +56,7 @@ func IsNamespaceSupported(ns NamespaceType) bool { if nsFile == "" { return false } - _, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile)) + _, err := os.Stat("/proc/self/ns/" + nsFile) // a namespace is supported if it exists and we have permissions to read it supported = err == nil supportedNamespaces[ns] = supported diff --git a/libcontainer/configs/namespaces_syscall.go b/libcontainer/configs/namespaces_syscall.go index 2dc7adf..0516dba 100644 --- a/libcontainer/configs/namespaces_syscall.go +++ b/libcontainer/configs/namespaces_syscall.go @@ -1,3 +1,4 @@ +//go:build linux // +build linux package configs diff --git a/libcontainer/configs/namespaces_syscall_unsupported.go b/libcontainer/configs/namespaces_syscall_unsupported.go index 5d9a5c8..fbb0d49 100644 --- a/libcontainer/configs/namespaces_syscall_unsupported.go +++ b/libcontainer/configs/namespaces_syscall_unsupported.go @@ -1,3 +1,4 @@ +//go:build !linux && !windows // +build !linux,!windows package configs diff --git a/libcontainer/configs/namespaces_unsupported.go b/libcontainer/configs/namespaces_unsupported.go index 19bf713..946db30 100644 --- a/libcontainer/configs/namespaces_unsupported.go +++ b/libcontainer/configs/namespaces_unsupported.go @@ -1,8 +1,8 @@ +//go:build !linux // +build !linux package configs // Namespace defines configuration for each namespace. It specifies an // alternate path that is able to be joined via setns. 
-type Namespace struct { -} +type Namespace struct{} diff --git a/libcontainer/configs/network.go b/libcontainer/configs/network.go index ccdb228..c44c3ea 100644 --- a/libcontainer/configs/network.go +++ b/libcontainer/configs/network.go @@ -50,7 +50,10 @@ type Network struct { HairpinMode bool `json:"hairpin_mode"` } -// Routes can be specified to create entries in the route table as the container is started +// Route defines a routing table entry. +// +// Routes can be specified to create entries in the routing table as the container +// is started. // // All of destination, source, and gateway should be either IPv4 or IPv6. // One of the three options must be present, and omitted entries will use their @@ -58,15 +61,15 @@ type Network struct { // gateway to 1.2.3.4 and the interface to eth0 will set up a standard // destination of 0.0.0.0(or *) when viewed in the route table. type Route struct { - // Sets the destination and mask, should be a CIDR. Accepts IPv4 and IPv6 + // Destination specifies the destination IP address and mask in the CIDR form. Destination string `json:"destination"` - // Sets the source and mask, should be a CIDR. Accepts IPv4 and IPv6 + // Source specifies the source IP address and mask in the CIDR form. Source string `json:"source"` - // Sets the gateway. Accepts IPv4 and IPv6 + // Gateway specifies the gateway IP address. Gateway string `json:"gateway"` - // The device to set this route up for, for example: eth0 + // InterfaceName specifies the device to set this route up for, for example eth0. InterfaceName string `json:"interface_name"` } diff --git a/libcontainer/configs/rdma.go b/libcontainer/configs/rdma.go new file mode 100644 index 0000000..c69f2c8 --- /dev/null +++ b/libcontainer/configs/rdma.go @@ -0,0 +1,9 @@ +package configs + +// LinuxRdma for Linux cgroup 'rdma' resource management (Linux 4.11) +type LinuxRdma struct { + // Maximum number of HCA handles that can be opened. Default is "no limit". 
+ HcaHandles *uint32 `json:"hca_handles,omitempty"` + // Maximum number of HCA objects that can be created. Default is "no limit". + HcaObjects *uint32 `json:"hca_objects,omitempty"` +} diff --git a/libcontainer/configs/validate/rootless.go b/libcontainer/configs/validate/rootless.go index 393d9e8..9a6e5eb 100644 --- a/libcontainer/configs/validate/rootless.go +++ b/libcontainer/configs/validate/rootless.go @@ -1,6 +1,7 @@ package validate import ( + "errors" "fmt" "strings" @@ -10,6 +11,9 @@ import ( // rootlessEUID makes sure that the config can be applied when runc // is being executed as a non-root user (euid != 0) in the current user namespace. func (v *ConfigValidator) rootlessEUID(config *configs.Config) error { + if !config.RootlessEUID { + return nil + } if err := rootlessEUIDMappings(config); err != nil { return err } @@ -35,14 +39,14 @@ func hasIDMapping(id int, mappings []configs.IDMap) bool { func rootlessEUIDMappings(config *configs.Config) error { if !config.Namespaces.Contains(configs.NEWUSER) { - return fmt.Errorf("rootless container requires user namespaces") + return errors.New("rootless container requires user namespaces") } if len(config.UidMappings) == 0 { - return fmt.Errorf("rootless containers requires at least one UID mapping") + return errors.New("rootless containers requires at least one UID mapping") } if len(config.GidMappings) == 0 { - return fmt.Errorf("rootless containers requires at least one GID mapping") + return errors.New("rootless containers requires at least one GID mapping") } return nil } @@ -67,7 +71,7 @@ func rootlessEUIDMount(config *configs.Config) error { continue } if !hasIDMapping(uid, config.UidMappings) { - return fmt.Errorf("cannot specify uid= mount options for unmapped uid in rootless containers") + return errors.New("cannot specify uid= mount options for unmapped uid in rootless containers") } } @@ -79,7 +83,7 @@ func rootlessEUIDMount(config *configs.Config) error { continue } if !hasIDMapping(gid, 
config.GidMappings) { - return fmt.Errorf("cannot specify gid= mount options for unmapped gid in rootless containers") + return errors.New("cannot specify gid= mount options for unmapped gid in rootless containers") } } } diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go index 3b42f30..6493124 100644 --- a/libcontainer/configs/validate/validator.go +++ b/libcontainer/configs/validate/validator.go @@ -1,14 +1,19 @@ package validate import ( + "errors" "fmt" "os" "path/filepath" "strings" + "sync" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" selinux "github.com/opencontainers/selinux/go-selinux" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" ) type Validator interface { @@ -19,39 +24,37 @@ func New() Validator { return &ConfigValidator{} } -type ConfigValidator struct { -} +type ConfigValidator struct{} + +type check func(config *configs.Config) error func (v *ConfigValidator) Validate(config *configs.Config) error { - if err := v.rootfs(config); err != nil { - return err + checks := []check{ + v.cgroups, + v.rootfs, + v.network, + v.hostname, + v.security, + v.usernamespace, + v.cgroupnamespace, + v.sysctl, + v.intelrdt, + v.rootlessEUID, } - if err := v.network(config); err != nil { - return err - } - if err := v.hostname(config); err != nil { - return err - } - if err := v.security(config); err != nil { - return err - } - if err := v.usernamespace(config); err != nil { - return err - } - if err := v.cgroupnamespace(config); err != nil { - return err - } - if err := v.sysctl(config); err != nil { - return err - } - if err := v.intelrdt(config); err != nil { - return err - } - if config.RootlessEUID { - if err := v.rootlessEUID(config); err != nil { + for _, c := range checks { + if err := c(config); err != nil { return err } } + // Relaxed validation rules for backward compatibility + 
warns := []check{ + v.mounts, // TODO (runc v1.x.x): make this an error instead of a warning + } + for _, c := range warns { + if err := c(config); err != nil { + logrus.WithError(err).Warn("invalid configuration") + } + } return nil } @@ -59,20 +62,17 @@ func (v *ConfigValidator) Validate(config *configs.Config) error { // to the container's root filesystem. func (v *ConfigValidator) rootfs(config *configs.Config) error { if _, err := os.Stat(config.Rootfs); err != nil { - if os.IsNotExist(err) { - return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs) - } - return err + return fmt.Errorf("invalid rootfs: %w", err) } cleaned, err := filepath.Abs(config.Rootfs) if err != nil { - return err + return fmt.Errorf("invalid rootfs: %w", err) } if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil { - return err + return fmt.Errorf("invalid rootfs: %w", err) } if filepath.Clean(config.Rootfs) != cleaned { - return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs) + return errors.New("invalid rootfs: not an absolute path, or a symlink") } return nil } @@ -80,7 +80,7 @@ func (v *ConfigValidator) rootfs(config *configs.Config) error { func (v *ConfigValidator) network(config *configs.Config) error { if !config.Namespaces.Contains(configs.NEWNET) { if len(config.Networks) > 0 || len(config.Routes) > 0 { - return fmt.Errorf("unable to apply network settings without a private NET namespace") + return errors.New("unable to apply network settings without a private NET namespace") } } return nil @@ -88,7 +88,7 @@ func (v *ConfigValidator) network(config *configs.Config) error { func (v *ConfigValidator) hostname(config *configs.Config) error { if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) { - return fmt.Errorf("unable to set hostname without a private UTS namespace") + return errors.New("unable to set hostname without a private UTS namespace") } return nil } @@ -97,10 +97,10 @@ func (v *ConfigValidator) security(config 
*configs.Config) error { // restrict sys without mount namespace if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) && !config.Namespaces.Contains(configs.NEWNS) { - return fmt.Errorf("unable to restrict sys entries without a private MNT namespace") + return errors.New("unable to restrict sys entries without a private MNT namespace") } if config.ProcessLabel != "" && !selinux.GetEnabled() { - return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported") + return errors.New("selinux label is specified in config, but selinux is disabled or not supported") } return nil @@ -109,11 +109,11 @@ func (v *ConfigValidator) security(config *configs.Config) error { func (v *ConfigValidator) usernamespace(config *configs.Config) error { if config.Namespaces.Contains(configs.NEWUSER) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { - return fmt.Errorf("USER namespaces aren't enabled in the kernel") + return errors.New("USER namespaces aren't enabled in the kernel") } } else { if config.UidMappings != nil || config.GidMappings != nil { - return fmt.Errorf("User namespace mappings specified, but USER namespace isn't enabled in the config") + return errors.New("User namespace mappings specified, but USER namespace isn't enabled in the config") } } return nil @@ -122,12 +122,42 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error { func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error { if config.Namespaces.Contains(configs.NEWCGROUP) { if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) { - return fmt.Errorf("cgroup namespaces aren't enabled in the kernel") + return errors.New("cgroup namespaces aren't enabled in the kernel") } } return nil } +// convertSysctlVariableToDotsSeparator can return sysctl variables in dots separator format. +// The '/' separator is also accepted in place of a '.'. 
+// Convert the sysctl variables to dots separator format for validation. +// More info: +// https://man7.org/linux/man-pages/man8/sysctl.8.html +// https://man7.org/linux/man-pages/man5/sysctl.d.5.html +// For example: +// Input sysctl variable "net/ipv4/conf/eno2.100.rp_filter" +// will return the converted value "net.ipv4.conf.eno2/100.rp_filter" +func convertSysctlVariableToDotsSeparator(val string) string { + if val == "" { + return val + } + firstSepIndex := strings.IndexAny(val, "./") + if firstSepIndex == -1 || val[firstSepIndex] == '.' { + return val + } + + f := func(r rune) rune { + switch r { + case '.': + return '/' + case '/': + return '.' + } + return r + } + return strings.Map(f, val) +} + // sysctl validates that the specified sysctl keys are valid or not. // /proc/sys isn't completely namespaced and depending on which namespaces // are specified, a subset of sysctls are permitted. @@ -143,7 +173,14 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { "kernel.shm_rmid_forced": true, } + var ( + netOnce sync.Once + hostnet bool + hostnetErr error + ) + for s := range config.Sysctl { + s := convertSysctlVariableToDotsSeparator(s) if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") { if config.Namespaces.Contains(configs.NEWIPC) { continue @@ -152,16 +189,27 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { } } if strings.HasPrefix(s, "net.") { - if config.Namespaces.Contains(configs.NEWNET) { - if path := config.Namespaces.PathOf(configs.NEWNET); path != "" { - if err := checkHostNs(s, path); err != nil { - return err - } + // Is container using host netns? + // Here "host" means "current", not "initial". 
+ netOnce.Do(func() { + if !config.Namespaces.Contains(configs.NEWNET) { + hostnet = true + return } - continue - } else { - return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s) + path := config.Namespaces.PathOf(configs.NEWNET) + if path == "" { + // own netns, so hostnet = false + return + } + hostnet, hostnetErr = isHostNetNS(path) + }) + if hostnetErr != nil { + return fmt.Errorf("invalid netns path: %w", hostnetErr) } + if hostnet { + return fmt.Errorf("sysctl %q not allowed in host network namespace", s) + } + continue } if config.Namespaces.Contains(configs.NEWUTS) { switch s { @@ -181,65 +229,75 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { func (v *ConfigValidator) intelrdt(config *configs.Config) error { if config.IntelRdt != nil { - if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() { - return fmt.Errorf("intelRdt is specified in config, but Intel RDT is not supported or enabled") + if !intelrdt.IsCATEnabled() && !intelrdt.IsMBAEnabled() { + return errors.New("intelRdt is specified in config, but Intel RDT is not supported or enabled") } - if !intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema != "" { - return fmt.Errorf("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled") - } - if !intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema != "" { - return fmt.Errorf("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled") + if config.IntelRdt.ClosID == "." || config.IntelRdt.ClosID == ".." 
|| strings.Contains(config.IntelRdt.ClosID, "/") { + return fmt.Errorf("invalid intelRdt.ClosID %q", config.IntelRdt.ClosID) } - if intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema == "" { - return fmt.Errorf("Intel RDT/CAT is enabled and intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") + if !intelrdt.IsCATEnabled() && config.IntelRdt.L3CacheSchema != "" { + return errors.New("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled") } - if intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema == "" { - return fmt.Errorf("Intel RDT/MBA is enabled and intelRdt is specified in config, but intelRdt.memBwSchema is empty") + if !intelrdt.IsMBAEnabled() && config.IntelRdt.MemBwSchema != "" { + return errors.New("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled") } } return nil } -func isSymbolicLink(path string) (bool, error) { - fi, err := os.Lstat(path) - if err != nil { - return false, err - } - - return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil -} - -// checkHostNs checks whether network sysctl is used in host namespace. -func checkHostNs(sysctlConfig string, path string) error { - var currentProcessNetns = "/proc/self/ns/net" - // readlink on the current processes network namespace - destOfCurrentProcess, err := os.Readlink(currentProcessNetns) - if err != nil { - return fmt.Errorf("read soft link %q error", currentProcessNetns) - } - - // First check if the provided path is a symbolic link - symLink, err := isSymbolicLink(path) - if err != nil { - return fmt.Errorf("could not check that %q is a symlink: %v", path, err) - } - - if symLink == false { - // The provided namespace is not a symbolic link, - // it is not the host namespace. 
+func (v *ConfigValidator) cgroups(config *configs.Config) error { + c := config.Cgroups + if c == nil { return nil } - // readlink on the path provided in the struct - destOfContainer, err := os.Readlink(path) - if err != nil { - return fmt.Errorf("read soft link %q error", path) + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c) } - if destOfContainer == destOfCurrentProcess { - return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig) + + r := c.Resources + if r == nil { + return nil } + + if !cgroups.IsCgroup2UnifiedMode() && r.Unified != nil { + return cgroups.ErrV1NoUnified + } + + if cgroups.IsCgroup2UnifiedMode() { + _, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) + if err != nil { + return err + } + } + return nil } + +func (v *ConfigValidator) mounts(config *configs.Config) error { + for _, m := range config.Mounts { + if !filepath.IsAbs(m.Destination) { + return fmt.Errorf("invalid mount %+v: mount destination not absolute", m) + } + } + + return nil +} + +func isHostNetNS(path string) (bool, error) { + const currentProcessNetns = "/proc/self/ns/net" + + var st1, st2 unix.Stat_t + + if err := unix.Stat(currentProcessNetns, &st1); err != nil { + return false, &os.PathError{Op: "stat", Path: currentProcessNetns, Err: err} + } + if err := unix.Stat(path, &st2); err != nil { + return false, &os.PathError{Op: "stat", Path: path, Err: err} + } + + return (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino), nil +} diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go index f6826fb..5181333 100644 --- a/libcontainer/configs/validate/validator_test.go +++ b/libcontainer/configs/validate/validator_test.go @@ -1,11 +1,12 @@ -package validate_test +package validate import ( "os" + "path/filepath" "testing" "github.com/opencontainers/runc/libcontainer/configs" - 
"github.com/opencontainers/runc/libcontainer/configs/validate" + "golang.org/x/sys/unix" ) func TestValidate(t *testing.T) { @@ -13,7 +14,7 @@ func TestValidate(t *testing.T) { Rootfs: "/var", } - validator := validate.New() + validator := New() err := validator.Validate(config) if err != nil { t.Errorf("Expected error to not occur: %+v", err) @@ -22,14 +23,16 @@ func TestValidate(t *testing.T) { func TestValidateWithInvalidRootfs(t *testing.T) { dir := "rootfs" - os.Symlink("/var", dir) + if err := os.Symlink("/var", dir); err != nil { + t.Fatal(err) + } defer os.Remove(dir) config := &configs.Config{ Rootfs: dir, } - validator := validate.New() + validator := New() err := validator.Validate(config) if err == nil { t.Error("Expected error to occur but it was nil") @@ -44,7 +47,7 @@ func TestValidateNetworkWithoutNETNamespace(t *testing.T) { Networks: []*configs.Network{network}, } - validator := validate.New() + validator := New() err := validator.Validate(config) if err == nil { t.Error("Expected error to occur but it was nil") @@ -59,7 +62,7 @@ func TestValidateNetworkRoutesWithoutNETNamespace(t *testing.T) { Routes: []*configs.Route{route}, } - validator := validate.New() + validator := New() err := validator.Validate(config) if err == nil { t.Error("Expected error to occur but it was nil") @@ -77,7 +80,7 @@ func TestValidateHostname(t *testing.T) { ), } - validator := validate.New() + validator := New() err := validator.Validate(config) if err != nil { t.Errorf("Expected error to not occur: %+v", err) @@ -90,7 +93,7 @@ func TestValidateHostnameWithoutUTSNamespace(t *testing.T) { Hostname: "runc", } - validator := validate.New() + validator := New() err := validator.Validate(config) if err == nil { t.Error("Expected error to occur but it was nil") @@ -108,7 +111,7 @@ func TestValidateSecurityWithMaskPaths(t *testing.T) { ), } - validator := validate.New() + validator := New() err := validator.Validate(config) if err != nil { t.Errorf("Expected error to not 
occur: %+v", err) @@ -126,7 +129,7 @@ func TestValidateSecurityWithROPaths(t *testing.T) { ), } - validator := validate.New() + validator := New() err := validator.Validate(config) if err != nil { t.Errorf("Expected error to not occur: %+v", err) @@ -140,7 +143,7 @@ func TestValidateSecurityWithoutNEWNS(t *testing.T) { ReadonlyPaths: []string{"/proc/sys"}, } - validator := validate.New() + validator := New() err := validator.Validate(config) if err == nil { t.Error("Expected error to occur but it was nil") @@ -149,7 +152,7 @@ func TestValidateSecurityWithoutNEWNS(t *testing.T) { func TestValidateUsernamespace(t *testing.T) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { - t.Skip("userns is unsupported") + t.Skip("Test requires userns.") } config := &configs.Config{ Rootfs: "/var", @@ -160,7 +163,7 @@ func TestValidateUsernamespace(t *testing.T) { ), } - validator := validate.New() + validator := New() err := validator.Validate(config) if err != nil { t.Errorf("expected error to not occur %+v", err) @@ -174,18 +177,47 @@ func TestValidateUsernamespaceWithoutUserNS(t *testing.T) { UidMappings: []configs.IDMap{uidMap}, } - validator := validate.New() + validator := New() err := validator.Validate(config) if err == nil { t.Error("Expected error to occur but it was nil") } } +// TestConvertSysctlVariableToDotsSeparator tests whether the sysctl variable +// can be correctly converted to a dot as a separator. 
+func TestConvertSysctlVariableToDotsSeparator(t *testing.T) { + type testCase struct { + in string + out string + } + valid := []testCase{ + {in: "kernel.shm_rmid_forced", out: "kernel.shm_rmid_forced"}, + {in: "kernel/shm_rmid_forced", out: "kernel.shm_rmid_forced"}, + {in: "net.ipv4.conf.eno2/100.rp_filter", out: "net.ipv4.conf.eno2/100.rp_filter"}, + {in: "net/ipv4/conf/eno2.100/rp_filter", out: "net.ipv4.conf.eno2/100.rp_filter"}, + {in: "net/ipv4/ip_local_port_range", out: "net.ipv4.ip_local_port_range"}, + {in: "kernel/msgmax", out: "kernel.msgmax"}, + {in: "kernel/sem", out: "kernel.sem"}, + } + + for _, test := range valid { + convertSysctlVal := convertSysctlVariableToDotsSeparator(test.in) + if convertSysctlVal != test.out { + t.Errorf("The sysctl variable was not converted correctly. got: %s, want: %s", convertSysctlVal, test.out) + } + } +} + func TestValidateSysctl(t *testing.T) { sysctl := map[string]string{ - "fs.mqueue.ctl": "ctl", - "net.ctl": "ctl", - "kernel.ctl": "ctl", + "fs.mqueue.ctl": "ctl", + "fs/mqueue/ctl": "ctl", + "net.ctl": "ctl", + "net/ctl": "ctl", + "net.ipv4.conf.eno2/100.rp_filter": "ctl", + "kernel.ctl": "ctl", + "kernel/ctl": "ctl", } for k, v := range sysctl { @@ -194,7 +226,7 @@ func TestValidateSysctl(t *testing.T) { Sysctl: map[string]string{k: v}, } - validator := validate.New() + validator := New() err := validator.Validate(config) if err == nil { t.Error("Expected error to occur but it was nil") @@ -204,9 +236,13 @@ func TestValidateSysctl(t *testing.T) { func TestValidateValidSysctl(t *testing.T) { sysctl := map[string]string{ - "fs.mqueue.ctl": "ctl", - "net.ctl": "ctl", - "kernel.msgmax": "ctl", + "fs.mqueue.ctl": "ctl", + "fs/mqueue/ctl": "ctl", + "net.ctl": "ctl", + "net/ctl": "ctl", + "net.ipv4.conf.eno2/100.rp_filter": "ctl", + "kernel.msgmax": "ctl", + "kernel/msgmax": "ctl", } for k, v := range sysctl { @@ -223,7 +259,7 @@ func TestValidateValidSysctl(t *testing.T) { }, } - validator := validate.New() + 
validator := New() err := validator.Validate(config) if err != nil { t.Errorf("Expected error to not occur with {%s=%s} but got: %q", k, v, err) @@ -245,13 +281,54 @@ func TestValidateSysctlWithSameNs(t *testing.T) { ), } - validator := validate.New() + validator := New() err := validator.Validate(config) if err == nil { t.Error("Expected error to occur but it was nil") } } +func TestValidateSysctlWithBindHostNetNS(t *testing.T) { + if os.Getuid() != 0 { + t.Skip("requires root") + } + + const selfnet = "/proc/self/ns/net" + + file := filepath.Join(t.TempDir(), "default") + fd, err := os.Create(file) + if err != nil { + t.Fatal(err) + } + defer os.Remove(file) + fd.Close() + + if err := unix.Mount(selfnet, file, "bind", unix.MS_BIND, ""); err != nil { + t.Fatalf("can't bind-mount %s to %s: %s", selfnet, file, err) + } + defer func() { + _ = unix.Unmount(file, unix.MNT_DETACH) + }() + + config := &configs.Config{ + Rootfs: "/var", + Sysctl: map[string]string{"net.ctl": "ctl", "net.foo": "bar"}, + Namespaces: configs.Namespaces( + []configs.Namespace{ + { + Type: configs.NEWNET, + Path: file, + }, + }, + ), + } + + validator := New() + if err := validator.Validate(config); err == nil { + t.Error("Expected error to occur but it was nil") + } +} + func TestValidateSysctlWithoutNETNamespace(t *testing.T) { config := &configs.Config{ Rootfs: "/var", @@ -259,9 +336,44 @@ func TestValidateSysctlWithoutNETNamespace(t *testing.T) { Namespaces: []configs.Namespace{}, } - validator := validate.New() + validator := New() err := validator.Validate(config) if err == nil { t.Error("Expected error to occur but it was nil") } } + +func TestValidateMounts(t *testing.T) { + testCases := []struct { + isErr bool + dest string + }{ + // TODO (runc v1.x.x): make these relative paths an error. 
See https://github.com/opencontainers/runc/pull/3004 + {isErr: false, dest: "not/an/abs/path"}, + {isErr: false, dest: "./rel/path"}, + {isErr: false, dest: "./rel/path"}, + {isErr: false, dest: "../../path"}, + + {isErr: false, dest: "/abs/path"}, + {isErr: false, dest: "/abs/but/../unclean"}, + } + + validator := New() + + for _, tc := range testCases { + config := &configs.Config{ + Rootfs: "/var", + Mounts: []*configs.Mount{ + {Destination: tc.dest}, + }, + } + + err := validator.Validate(config) + if tc.isErr && err == nil { + t.Errorf("mount dest: %s, expected error, got nil", tc.dest) + } + if !tc.isErr && err != nil { + t.Errorf("mount dest: %s, expected nil, got error %v", tc.dest, err) + } + } +} diff --git a/libcontainer/console_linux.go b/libcontainer/console_linux.go index 9997e93..29b9c3b 100644 --- a/libcontainer/console_linux.go +++ b/libcontainer/console_linux.go @@ -9,7 +9,7 @@ import ( // mount initializes the console inside the rootfs mounting with the specified mount label // and applying the correct ownership of the console. func mountConsole(slavePath string) error { - oldMask := unix.Umask(0000) + oldMask := unix.Umask(0o000) defer unix.Umask(oldMask) f, err := os.Create("/dev/console") if err != nil && !os.IsExist(err) { @@ -18,7 +18,7 @@ func mountConsole(slavePath string) error { if f != nil { f.Close() } - return unix.Mount(slavePath, "/dev/console", "bind", unix.MS_BIND, "") + return mount(slavePath, "/dev/console", "", "bind", unix.MS_BIND, "") } // dupStdio opens the slavePath for the console and dups the fds to the current diff --git a/libcontainer/container.go b/libcontainer/container.go index ba7541c..300c952 100644 --- a/libcontainer/container.go +++ b/libcontainer/container.go @@ -74,22 +74,12 @@ type BaseContainer interface { ID() string // Returns the current status of the container. - // - // errors: - // ContainerNotExists - Container no longer exists, - // Systemerror - System error. 
Status() (Status, error) // State returns the current container's state information. - // - // errors: - // SystemError - System error. State() (*State, error) // OCIState returns the current container's state information. - // - // errors: - // SystemError - System error. OCIState() (*specs.State, error) // Returns the current config of the container. @@ -97,48 +87,26 @@ type BaseContainer interface { // Returns the PIDs inside this container. The PIDs are in the namespace of the calling process. // - // errors: - // ContainerNotExists - Container no longer exists, - // Systemerror - System error. - // // Some of the returned PIDs may no longer refer to processes in the Container, unless // the Container state is PAUSED in which case every PID in the slice is valid. Processes() ([]int, error) // Returns statistics for the container. - // - // errors: - // ContainerNotExists - Container no longer exists, - // Systemerror - System error. Stats() (*Stats, error) // Set resources of container as configured // // We can use this to change resources when containers are running. // - // errors: - // SystemError - System error. Set(config configs.Config) error // Start a process inside the container. Returns error if process fails to // start. You can track process lifecycle with passed Process structure. - // - // errors: - // ContainerNotExists - Container no longer exists, - // ConfigInvalid - config is invalid, - // ContainerPaused - Container is paused, - // SystemError - System error. Start(process *Process) (err error) // Run immediately starts the process inside the container. Returns error if process // fails to start. It does not block waiting for the exec fifo after start returns but // opens the fifo after start returns. - // - // errors: - // ContainerNotExists - Container no longer exists, - // ConfigInvalid - config is invalid, - // ContainerPaused - Container is paused, - // SystemError - System error. 
Run(process *Process) (err error) // Destroys the container, if its in a valid state, after killing any @@ -149,25 +117,14 @@ type BaseContainer interface { // // Running containers must first be stopped using Signal(..). // Paused containers must first be resumed using Resume(..). - // - // errors: - // ContainerNotStopped - Container is still running, - // ContainerPaused - Container is paused, - // SystemError - System error. Destroy() error // Signal sends the provided signal code to the container's initial process. // // If all is specified the signal is sent to all processes in the container // including the initial process. - // - // errors: - // SystemError - System error. Signal(s os.Signal, all bool) error // Exec signals the container to exec the users process at the end of the init. - // - // errors: - // SystemError - System error. Exec() error } diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index fe70c93..f6877b7 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -1,5 +1,3 @@ -// +build linux - package libcontainer import ( @@ -8,30 +6,31 @@ import ( "errors" "fmt" "io" - "io/ioutil" "net" "os" "os/exec" + "path" "path/filepath" "reflect" + "strconv" "strings" "sync" - "syscall" // only for SysProcAttr and Signal "time" + "github.com/checkpoint-restore/go-criu/v5" + criurpc "github.com/checkpoint-restore/go-criu/v5/rpc" securejoin "github.com/cyphar/filepath-securejoin" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "github.com/vishvananda/netlink/nl" + "golang.org/x/sys/unix" + "google.golang.org/protobuf/proto" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" - "github.com/opencontainers/runtime-spec/specs-go" - - criurpc 
"github.com/checkpoint-restore/go-criu/rpc" - "github.com/golang/protobuf/proto" - "github.com/sirupsen/logrus" - "github.com/vishvananda/netlink/nl" - "golang.org/x/sys/unix" ) const stdioFdCount = 3 @@ -53,6 +52,7 @@ type linuxContainer struct { criuVersion int state containerState created time.Time + fifo *os.File } // State represents a running container's state @@ -65,8 +65,12 @@ type State struct { // Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups Rootless bool `json:"rootless"` - // Path to all the cgroups setup for a container. Key is cgroup subsystem name - // with the value as the path. + // Paths to all the container's cgroups, as returned by (*cgroups.Manager).GetPaths + // + // For cgroup v1, a key is cgroup subsystem name, and the value is the path + // to the cgroup for this subsystem. + // + // For cgroup v2 unified hierarchy, a key is "", and the value is the unified path. CgroupPaths map[string]string `json:"cgroup_paths"` // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type @@ -91,48 +95,26 @@ type Container interface { // Methods below here are platform specific // Checkpoint checkpoints the running container's state to disk using the criu(8) utility. - // - // errors: - // Systemerror - System error. Checkpoint(criuOpts *CriuOpts) error // Restore restores the checkpointed container to a running state using the criu(8) utility. - // - // errors: - // Systemerror - System error. Restore(process *Process, criuOpts *CriuOpts) error // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses // the execution of any user processes. Asynchronously, when the container finished being paused the // state is changed to PAUSED. // If the Container state is PAUSED, do nothing. - // - // errors: - // ContainerNotExists - Container no longer exists, - // ContainerNotRunning - Container not running or created, - // Systemerror - System error. 
Pause() error // If the Container state is PAUSED, resumes the execution of any user processes in the // Container before setting the Container state to RUNNING. // If the Container state is RUNNING, do nothing. - // - // errors: - // ContainerNotExists - Container no longer exists, - // ContainerNotPaused - Container is not paused, - // Systemerror - System error. Resume() error // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification. - // - // errors: - // Systemerror - System error. NotifyOOM() (<-chan struct{}, error) // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level - // - // errors: - // Systemerror - System error. NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) } @@ -165,9 +147,19 @@ func (c *linuxContainer) OCIState() (*specs.State, error) { } func (c *linuxContainer) Processes() ([]int, error) { - pids, err := c.cgroupManager.GetAllPids() + var pids []int + status, err := c.currentStatus() if err != nil { - return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups") + return pids, err + } + // for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited + if status == Stopped && !c.cgroupManager.Exists() { + return pids, nil + } + + pids, err = c.cgroupManager.GetAllPids() + if err != nil { + return nil, fmt.Errorf("unable to get all container pids: %w", err) } return pids, nil } @@ -178,11 +170,11 @@ func (c *linuxContainer) Stats() (*Stats, error) { stats = &Stats{} ) if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { - return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") + return stats, fmt.Errorf("unable to get container cgroup stats: %w", err) } if c.intelRdtManager != nil { if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil { - return stats, newSystemErrorWithCause(err, "getting container's Intel RDT 
stats") + return stats, fmt.Errorf("unable to get container Intel RDT stats: %w", err) } } for _, iface := range c.config.Networks { @@ -190,7 +182,7 @@ func (c *linuxContainer) Stats() (*Stats, error) { case "veth": istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) if err != nil { - return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName) + return stats, fmt.Errorf("unable to get network stats for interface %q: %w", iface.HostInterfaceName, err) } stats.Interfaces = append(stats.Interfaces, istats) } @@ -206,11 +198,11 @@ func (c *linuxContainer) Set(config configs.Config) error { return err } if status == Stopped { - return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) + return ErrNotRunning } - if err := c.cgroupManager.Set(&config); err != nil { + if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil { // Set configs back - if err2 := c.cgroupManager.Set(c.config); err2 != nil { + if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil { logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) } return err @@ -218,6 +210,9 @@ func (c *linuxContainer) Set(config configs.Config) error { if c.intelRdtManager != nil { if err := c.intelRdtManager.Set(&config); err != nil { // Set configs back + if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil { + logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) + } if err2 := c.intelRdtManager.Set(c.config); err2 != nil { logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) } @@ -233,6 +228,9 @@ func (c *linuxContainer) Set(config configs.Config) error { func (c *linuxContainer) Start(process *Process) error { c.m.Lock() defer c.m.Unlock() + if 
c.config.Cgroups.Resources.SkipDevices { + return errors.New("can't start container with SkipDevices set") + } if process.Init { if err := c.createExecFifo(); err != nil { return err @@ -287,12 +285,12 @@ func (c *linuxContainer) exec() error { } func readFromExecFifo(execFifo io.Reader) error { - data, err := ioutil.ReadAll(execFifo) + data, err := io.ReadAll(execFifo) if err != nil { return err } if len(data) <= 0 { - return fmt.Errorf("cannot start an already running container") + return errors.New("cannot start an already running container") } return nil } @@ -309,11 +307,11 @@ func awaitFifoOpen(path string) <-chan openResult { func fifoOpen(path string, block bool) openResult { flags := os.O_RDONLY if !block { - flags |= syscall.O_NONBLOCK + flags |= unix.O_NONBLOCK } f, err := os.OpenFile(path, flags, 0) if err != nil { - return openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")} + return openResult{err: fmt.Errorf("exec fifo: %w", err)} } return openResult{file: f} } @@ -335,43 +333,41 @@ type openResult struct { err error } -func (c *linuxContainer) start(process *Process) error { +func (c *linuxContainer) start(process *Process) (retErr error) { parent, err := c.newParentProcess(process) if err != nil { - return newSystemErrorWithCause(err, "creating new parent process") + return fmt.Errorf("unable to create new parent process: %w", err) } - parent.forwardChildLogs() - if err := parent.start(); err != nil { - // terminate the process to ensure that it properly is reaped. 
- if err := ignoreTerminateErrors(parent.terminate()); err != nil { - logrus.Warn(err) - } - return newSystemErrorWithCause(err, "starting container process") - } - // generate a timestamp indicating when the container was started - c.created = time.Now().UTC() - if process.Init { - c.state = &createdState{ - c: c, - } - state, err := c.updateState(parent) - if err != nil { - return err - } - c.initProcessStartTime = state.InitProcessStartTime + logsDone := parent.forwardChildLogs() + if logsDone != nil { + defer func() { + // Wait for log forwarder to finish. This depends on + // runc init closing the _LIBCONTAINER_LOGPIPE log fd. + err := <-logsDone + if err != nil && retErr == nil { + retErr = fmt.Errorf("unable to forward init logs: %w", err) + } + }() + } + + if err := parent.start(); err != nil { + return fmt.Errorf("unable to start container process: %w", err) + } + + if process.Init { + c.fifo.Close() if c.config.Hooks != nil { s, err := c.currentOCIState() if err != nil { return err } - for i, hook := range c.config.Hooks.Poststart { - if err := hook.Run(s); err != nil { - if err := ignoreTerminateErrors(parent.terminate()); err != nil { - logrus.Warn(err) - } - return newSystemErrorWithCausef(err, "running poststart hook %d", i) + + if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil { + if err := ignoreTerminateErrors(parent.terminate()); err != nil { + logrus.Warn(fmt.Errorf("error running poststart hook: %w", err)) } + return err } } } @@ -379,21 +375,35 @@ func (c *linuxContainer) start(process *Process) error { } func (c *linuxContainer) Signal(s os.Signal, all bool) error { - if all { - return signalAllProcesses(c.cgroupManager, s) - } + c.m.Lock() + defer c.m.Unlock() status, err := c.currentStatus() if err != nil { return err } + if all { + // for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited + if status == Stopped && !c.cgroupManager.Exists() { + return nil + } + return 
signalAllProcesses(c.cgroupManager, s) + } // to avoid a PID reuse attack if status == Running || status == Created || status == Paused { if err := c.initProcess.signal(s); err != nil { - return newSystemErrorWithCause(err, "signaling init process") + return fmt.Errorf("unable to signal init: %w", err) + } + if status == Paused { + // For cgroup v1, killing a process in a frozen cgroup + // does nothing until it's thawed. Only thaw the cgroup + // for SIGKILL. + if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL { + _ = c.cgroupManager.Freeze(configs.Thawed) + } } return nil } - return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) + return ErrNotRunning } func (c *linuxContainer) createExecFifo() error { @@ -410,8 +420,8 @@ func (c *linuxContainer) createExecFifo() error { if _, err := os.Stat(fifoName); err == nil { return fmt.Errorf("exec fifo %s already exists", fifoName) } - oldMask := unix.Umask(0000) - if err := unix.Mkfifo(fifoName, 0622); err != nil { + oldMask := unix.Umask(0o000) + if err := unix.Mkfifo(fifoName, 0o622); err != nil { unix.Umask(oldMask) return err } @@ -430,34 +440,32 @@ func (c *linuxContainer) deleteExecFifo() { // fd, with _LIBCONTAINER_FIFOFD set to its fd number. 
func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error { fifoName := filepath.Join(c.root, execFifoFilename) - fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0) + fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0) if err != nil { return err } + c.fifo = fifo - cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName)) + cmd.ExtraFiles = append(cmd.ExtraFiles, fifo) cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1)) + "_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) return nil } func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { parentInitPipe, childInitPipe, err := utils.NewSockPair("init") if err != nil { - return nil, newSystemErrorWithCause(err, "creating new init pipe") + return nil, fmt.Errorf("unable to create init pipe: %w", err) } messageSockPair := filePair{parentInitPipe, childInitPipe} parentLogPipe, childLogPipe, err := os.Pipe() if err != nil { - return nil, fmt.Errorf("Unable to create the log pipe: %s", err) + return nil, fmt.Errorf("unable to create log pipe: %w", err) } logFilePair := filePair{parentLogPipe, childLogPipe} - cmd, err := c.commandTemplate(p, childInitPipe, childLogPipe) - if err != nil { - return nil, newSystemErrorWithCause(err, "creating new command template") - } + cmd := c.commandTemplate(p, childInitPipe, childLogPipe) if !p.Init { return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) } @@ -468,12 +476,12 @@ func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { // that problem), but we no longer do that. However, there's no need to do // this for `runc exec` so we just keep it this way to be safe. 
if err := c.includeExecFifo(cmd); err != nil { - return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup") + return nil, fmt.Errorf("unable to setup exec fifo: %w", err) } return c.newInitProcess(p, cmd, messageSockPair, logFilePair) } -func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) (*exec.Cmd, error) { +func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd { cmd := exec.Command(c.initPath, c.initArgs[1:]...) cmd.Args[0] = c.initArgs[0] cmd.Stdin = p.Stdin @@ -481,35 +489,62 @@ func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, chi cmd.Stderr = p.Stderr cmd.Dir = c.config.Rootfs if cmd.SysProcAttr == nil { - cmd.SysProcAttr = &syscall.SysProcAttr{} + cmd.SysProcAttr = &unix.SysProcAttr{} } - cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS"))) + cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS")) cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) 
if p.ConsoleSocket != nil { cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), + "_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), ) } cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe) cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), - fmt.Sprintf("_LIBCONTAINER_STATEDIR=%s", c.root), + "_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), + "_LIBCONTAINER_STATEDIR="+c.root, ) cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe) cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_LOGPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1), - fmt.Sprintf("_LIBCONTAINER_LOGLEVEL=%s", p.LogLevel), + "_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), + "_LIBCONTAINER_LOGLEVEL="+p.LogLevel, ) // NOTE: when running a container with no PID namespace and the parent process spawning the container is // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason // even with the parent still running. if c.config.ParentDeathSignal > 0 { - cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal) + cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal) } - return cmd, nil + return cmd +} + +// shouldSendMountSources says whether the child process must setup bind mounts with +// the source pre-opened (O_PATH) in the host user namespace. +// See https://github.com/opencontainers/runc/issues/2484 +func (c *linuxContainer) shouldSendMountSources() bool { + // Passing the mount sources via SCM_RIGHTS is only necessary when + // both userns and mntns are active. 
+ if !c.config.Namespaces.Contains(configs.NEWUSER) || + !c.config.Namespaces.Contains(configs.NEWNS) { + return false + } + + // nsexec.c send_mountsources() requires setns(mntns) capabilities + // CAP_SYS_CHROOT and CAP_SYS_ADMIN. + if c.config.RootlessEUID { + return false + } + + // We need to send sources if there are bind-mounts. + for _, m := range c.config.Mounts { + if m.IsBind() { + return true + } + } + + return false } func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) { @@ -521,10 +556,40 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPa } } _, sharePidns := nsMaps[configs.NEWPID] - data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard) if err != nil { return nil, err } + + if c.shouldSendMountSources() { + // Elements on this slice will be paired with mounts (see StartInitialization() and + // prepareRootfs()). This slice MUST have the same size as c.config.Mounts. + mountFds := make([]int, len(c.config.Mounts)) + for i, m := range c.config.Mounts { + if !m.IsBind() { + // Non bind-mounts do not use an fd. + mountFds[i] = -1 + continue + } + + // The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need + // to allocate a fd so that we know the number to pass in the environment variable. The fd + // must not be closed before cmd.Start(), so we reuse messageSockPair.child because the + // lifecycle of that fd is already taken care of. 
+ cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child) + mountFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1 + } + + mountFdsJson, err := json.Marshal(mountFds) + if err != nil { + return nil, fmt.Errorf("Error creating _LIBCONTAINER_MOUNT_FDS: %w", err) + } + + cmd.Env = append(cmd.Env, + "_LIBCONTAINER_MOUNT_FDS="+string(mountFdsJson), + ) + } + init := &initProcess{ cmd: cmd, messageSockPair: messageSockPair, @@ -545,25 +610,49 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) state, err := c.currentState() if err != nil { - return nil, newSystemErrorWithCause(err, "getting container's current state") + return nil, fmt.Errorf("unable to get container state: %w", err) } // for setns process, we don't have to set cloneflags as the process namespaces // will only be set via setns syscall - data, err := c.bootstrapData(0, state.NamespacePaths) + data, err := c.bootstrapData(0, state.NamespacePaths, initSetns) if err != nil { return nil, err } - return &setnsProcess{ + proc := &setnsProcess{ cmd: cmd, - cgroupPaths: c.cgroupManager.GetPaths(), + cgroupPaths: state.CgroupPaths, rootlessCgroups: c.config.RootlessCgroups, intelRdtPath: state.IntelRdtPath, messageSockPair: messageSockPair, logFilePair: logFilePair, + manager: c.cgroupManager, config: c.newInitConfig(p), process: p, bootstrapData: data, - }, nil + initProcessPid: state.InitProcessPid, + } + if len(p.SubCgroupPaths) > 0 { + if add, ok := p.SubCgroupPaths[""]; ok { + // cgroup v1: using the same path for all controllers. + // cgroup v2: the only possible way. + for k := range proc.cgroupPaths { + proc.cgroupPaths[k] = path.Join(proc.cgroupPaths[k], add) + } + // cgroup v2: do not try to join init process's cgroup + // as a fallback (see (*setnsProcess).start). + proc.initProcessPid = 0 + } else { + // Per-controller paths. 
+ for ctrl, add := range p.SubCgroupPaths { + if val, ok := proc.cgroupPaths[ctrl]; ok { + proc.cgroupPaths[ctrl] = path.Join(val, add) + } else { + return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl) + } + } + } + } + return proc, nil } func (c *linuxContainer) newInitConfig(process *Process) *initConfig { @@ -583,6 +672,9 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { AppArmorProfile: c.config.AppArmorProfile, ProcessLabel: c.config.ProcessLabel, Rlimits: c.config.Rlimits, + CreateConsole: process.ConsoleSocket != nil, + ConsoleWidth: process.ConsoleWidth, + ConsoleHeight: process.ConsoleHeight, } if process.NoNewPrivileges != nil { cfg.NoNewPrivileges = *process.NoNewPrivileges @@ -596,9 +688,10 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { if len(process.Rlimits) > 0 { cfg.Rlimits = process.Rlimits } - cfg.CreateConsole = process.ConsoleSocket != nil - cfg.ConsoleWidth = process.ConsoleWidth - cfg.ConsoleHeight = process.ConsoleHeight + if cgroups.IsCgroup2UnifiedMode() { + cfg.Cgroup2Path = c.cgroupManager.Path("") + } + return cfg } @@ -624,7 +717,7 @@ func (c *linuxContainer) Pause() error { c: c, }) } - return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning) + return ErrNotRunning } func (c *linuxContainer) Resume() error { @@ -635,7 +728,7 @@ func (c *linuxContainer) Resume() error { return err } if status != Paused { - return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused) + return ErrNotPaused } if err := c.cgroupManager.Freeze(configs.Thawed); err != nil { return err @@ -650,7 +743,11 @@ func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { if c.config.RootlessCgroups { logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups") } - return notifyOnOOM(c.cgroupManager.GetPaths()) + path := c.cgroupManager.Path("memory") + if cgroups.IsCgroup2UnifiedMode() { + return 
notifyOnOOMV2(path) + } + return notifyOnOOM(path) } func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { @@ -658,25 +755,13 @@ func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struc if c.config.RootlessCgroups { logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups") } - return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) + return notifyMemoryPressure(c.cgroupManager.Path("memory"), level) } var criuFeatures *criurpc.CriuFeatures func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error { - - var t criurpc.CriuReqType - t = criurpc.CriuReqType_FEATURE_CHECK - - // criu 1.8 => 10800 - if err := c.checkCriuVersion(10800); err != nil { - // Feature checking was introduced with CRIU 1.8. - // Ignore the feature check if an older CRIU version is used - // and just act as before. - // As all automated PR testing is done using CRIU 1.7 this - // code will not be tested by automated PR testing. - return nil - } + t := criurpc.CriuReqType_FEATURE_CHECK // make sure the features we are looking for are really not from // some previous check @@ -691,13 +776,12 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc. Features: criuFeat, } - err := c.criuSwrk(nil, req, criuOpts, false, nil) + err := c.criuSwrk(nil, req, criuOpts, nil) if err != nil { logrus.Debugf("%s", err) - return fmt.Errorf("CRIU feature check failed") + return errors.New("CRIU feature check failed") } - logrus.Debugf("Feature check says: %s", criuFeatures) missingFeatures := false // The outer if checks if the fields actually exist @@ -721,56 +805,12 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc. 
} if missingFeatures { - return fmt.Errorf("CRIU is missing features") + return errors.New("CRIU is missing features") } return nil } -func parseCriuVersion(path string) (int, error) { - var x, y, z int - - out, err := exec.Command(path, "-V").Output() - if err != nil { - return 0, fmt.Errorf("Unable to execute CRIU command: %s", path) - } - - x = 0 - y = 0 - z = 0 - if ep := strings.Index(string(out), "-"); ep >= 0 { - // criu Git version format - var version string - if sp := strings.Index(string(out), "GitID"); sp > 0 { - version = string(out)[sp:ep] - } else { - return 0, fmt.Errorf("Unable to parse the CRIU version: %s", path) - } - - n, err := fmt.Sscanf(version, "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2 - if err != nil { - n, err = fmt.Sscanf(version, "GitID: v%d.%d", &x, &y) // 1.6 - y++ - } else { - z++ - } - if n < 2 || err != nil { - return 0, fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err) - } - } else { - // criu release version format - n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2 - if err != nil { - n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6 - } - if n < 2 || err != nil { - return 0, fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err) - } - } - - return x*10000 + y*100 + z, nil -} - func compareCriuVersion(criuVersion int, minVersion int) error { // simple function to perform the actual version compare if criuVersion < minVersion { @@ -780,60 +820,20 @@ func compareCriuVersion(criuVersion int, minVersion int) error { return nil } -// This is used to store the result of criu version RPC -var criuVersionRPC *criurpc.CriuVersion - // checkCriuVersion checks Criu version greater than or equal to minVersion func (c *linuxContainer) checkCriuVersion(minVersion int) error { - // If the version of criu has already been determined there is no need // to ask criu for the version again. Use the value from c.criuVersion. 
if c.criuVersion != 0 { return compareCriuVersion(c.criuVersion, minVersion) } - // First try if this version of CRIU support the version RPC. - // The CRIU version RPC was introduced with CRIU 3.0. - - // First, reset the variable for the RPC answer to nil - criuVersionRPC = nil - - var t criurpc.CriuReqType - t = criurpc.CriuReqType_VERSION - req := &criurpc.CriuReq{ - Type: &t, - } - - err := c.criuSwrk(nil, req, nil, false, nil) + criu := criu.MakeCriu() + criu.SetCriuPath(c.criuPath) + var err error + c.criuVersion, err = criu.GetCriuVersion() if err != nil { - return fmt.Errorf("CRIU version check failed: %s", err) - } - - if criuVersionRPC != nil { - logrus.Debugf("CRIU version: %s", criuVersionRPC) - // major and minor are always set - c.criuVersion = int(*criuVersionRPC.Major) * 10000 - c.criuVersion += int(*criuVersionRPC.Minor) * 100 - if criuVersionRPC.Sublevel != nil { - c.criuVersion += int(*criuVersionRPC.Sublevel) - } - if criuVersionRPC.Gitid != nil { - // runc's convention is that a CRIU git release is - // always the same as increasing the minor by 1 - c.criuVersion -= (c.criuVersion % 100) - c.criuVersion += 100 - } - return compareCriuVersion(c.criuVersion, minVersion) - } - - // This is CRIU without the version RPC and therefore - // older than 3.0. Parsing the output is required. 
- - // This can be remove once runc does not work with criu older than 3.0 - - c.criuVersion, err = parseCriuVersion(c.criuPath) - if err != nil { - return err + return fmt.Errorf("CRIU version check failed: %w", err) } return compareCriuVersion(c.criuVersion, minVersion) @@ -842,11 +842,10 @@ func (c *linuxContainer) checkCriuVersion(minVersion int) error { const descriptorsFilename = "descriptors.json" func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) { - mountDest := m.Destination - if strings.HasPrefix(mountDest, c.config.Rootfs) { - mountDest = mountDest[len(c.config.Rootfs):] + mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs) + if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil { + mountDest = dest[len(c.config.Rootfs):] } - extMnt := &criurpc.ExtMountMap{ Key: proto.String(mountDest), Val: proto.String(mountDest), @@ -876,26 +875,6 @@ func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error { return nil } -func waitForCriuLazyServer(r *os.File, status string) error { - - data := make([]byte, 1) - _, err := r.Read(data) - if err != nil { - return err - } - fd, err := os.OpenFile(status, os.O_TRUNC|os.O_WRONLY, os.ModeAppend) - if err != nil { - return err - } - _, err = fd.Write(data) - if err != nil { - return err - } - fd.Close() - - return nil -} - func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) { // CRIU will evaluate a configuration starting with release 3.11. // Settings in the configuration file will overwrite RPC settings. @@ -922,6 +901,117 @@ func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) } } +func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool { + var minVersion int + switch t { + case configs.NEWNET: + // CRIU supports different external namespace with different released CRIU versions. + // For network namespaces to work we need at least criu 3.11.0 => 31100. 
+		minVersion = 31100
+	case configs.NEWPID:
+		// For PID namespaces criu 31500 is needed.
+		minVersion = 31500
+	default:
+		return false
+	}
+	return c.checkCriuVersion(minVersion) == nil
+}
+
+func criuNsToKey(t configs.NamespaceType) string {
+	return "extRoot" + strings.Title(configs.NsName(t)) + "NS"
+}
+
+func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error {
+	if !c.criuSupportsExtNS(t) {
+		return nil
+	}
+
+	nsPath := c.config.Namespaces.PathOf(t)
+	if nsPath == "" {
+		return nil
+	}
+	// CRIU expects the information about an external namespace
+	// like this: --external <TYPE>[<inode>]:<key>
+	// This <key> is always 'extRoot<TYPE>NS'.
+	var ns unix.Stat_t
+	if err := unix.Stat(nsPath, &ns); err != nil {
+		return err
+	}
+	criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t))
+	rpcOpts.External = append(rpcOpts.External, criuExternal)
+
+	return nil
+}
+
+func (c *linuxContainer) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error {
+	for _, ns := range c.config.Namespaces {
+		switch ns.Type {
+		case configs.NEWNET, configs.NEWPID:
+			// If the container is running in a network or PID namespace and has
+			// a path to the network or PID namespace configured, we will dump
+			// that network or PID namespace as an external namespace and we
+			// will expect that the namespace exists during restore.
+			// This basically means that CRIU will ignore the namespace
+			// and expect it to be setup correctly.
+			if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil {
+				return err
+			}
+		default:
+			// For all other namespaces except NET and PID CRIU has
+			// a simpler way of joining the existing namespace if set
+			nsPath := c.config.Namespaces.PathOf(ns.Type)
+			if nsPath == "" {
+				continue
+			}
+			if ns.Type == configs.NEWCGROUP {
+				// CRIU has no code to handle NEWCGROUP
+				return fmt.Errorf("Do not know how to handle namespace %v", ns.Type)
+			}
+			// CRIU has code to handle NEWTIME, but it does not seem to be defined in runc
+
+			// CRIU will issue a warning for NEWUSER:
+			// criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous'
+			rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{
+				Ns:     proto.String(configs.NsName(ns.Type)),
+				NsFile: proto.String(nsPath),
+			})
+		}
+	}
+
+	return nil
+}
+
+func (c *linuxContainer) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error {
+	if !c.criuSupportsExtNS(t) {
+		return nil
+	}
+
+	nsPath := c.config.Namespaces.PathOf(t)
+	if nsPath == "" {
+		return nil
+	}
+	// CRIU wants the information about an existing namespace
+	// like this: --inherit-fd fd[<fd>]:<key>
+	// The <key> needs to be the same as during checkpointing.
+	// We are always using 'extRoot<TYPE>NS' as the key in this.
+	nsFd, err := os.Open(nsPath)
+	if err != nil {
+		logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
+		return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
+	}
+	inheritFd := &criurpc.InheritFd{
+		Key: proto.String(criuNsToKey(t)),
+		// The offset of four is necessary because 0, 1, 2 and 3 are
+		// already used by stdin, stdout, stderr, 'criu swrk' socket.
+ Fd: proto.Int32(int32(4 + len(*extraFiles))), + } + rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd) + // All open FDs need to be transferred to CRIU via extraFiles + *extraFiles = append(*extraFiles, nsFd) + + return nil +} + func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() @@ -932,35 +1022,21 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { // support for doing unprivileged dumps, but the setup of // rootless containers might make this complicated. - // criu 1.5.2 => 10502 - if err := c.checkCriuVersion(10502); err != nil { + // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0 + if err := c.checkCriuVersion(30000); err != nil { return err } if criuOpts.ImagesDirectory == "" { - return fmt.Errorf("invalid directory to save checkpoint") + return errors.New("invalid directory to save checkpoint") } // Since a container can be C/R'ed multiple times, // the checkpoint directory may already exist. 
- if err := os.Mkdir(criuOpts.ImagesDirectory, 0700); err != nil && !os.IsExist(err) { + if err := os.Mkdir(criuOpts.ImagesDirectory, 0o700); err != nil && !os.IsExist(err) { return err } - if criuOpts.WorkDirectory == "" { - criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") - } - - if err := os.Mkdir(criuOpts.WorkDirectory, 0700); err != nil && !os.IsExist(err) { - return err - } - - workDir, err := os.Open(criuOpts.WorkDirectory) - if err != nil { - return err - } - defer workDir.Close() - imageDir, err := os.Open(criuOpts.ImagesDirectory) if err != nil { return err @@ -969,7 +1045,6 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { rpcOpts := criurpc.CriuOpts{ ImagesDirFd: proto.Int32(int32(imageDir.Fd())), - WorkDirFd: proto.Int32(int32(workDir.Fd())), LogLevel: proto.Int32(4), LogFile: proto.String("dump.log"), Root: proto.String(c.config.Rootfs), @@ -987,6 +1062,19 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { LazyPages: proto.Bool(criuOpts.LazyPages), } + // if criuOpts.WorkDirectory is not set, criu default is used. + if criuOpts.WorkDirectory != "" { + if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) { + return err + } + workDir, err := os.Open(criuOpts.WorkDirectory) + if err != nil { + return err + } + defer workDir.Close() + rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd())) + } + c.handleCriuConfigurationFile(&rpcOpts) // If the container is running in a network namespace and has @@ -995,30 +1083,22 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { // will expect that the namespace exists during restore. // This basically means that CRIU will ignore the namespace // and expect to be setup correctly. - nsPath := c.config.Namespaces.PathOf(configs.NEWNET) - if nsPath != "" { - // For this to work we need at least criu 3.11.0 => 31100. - // As there was already a successful version check we will - // not error out if it fails. 
runc will just behave as it used
-		// to do and ignore external network namespaces.
-		err := c.checkCriuVersion(31100)
-		if err == nil {
-			// CRIU expects the information about an external namespace
-			// like this: --external net[<inode>]:<key>
-			// This <key> is always 'extRootNetNS'.
-			var netns syscall.Stat_t
-			err = syscall.Stat(nsPath, &netns)
-			if err != nil {
-				return err
-			}
-			criuExternal := fmt.Sprintf("net[%d]:extRootNetNS", netns.Ino)
-			rpcOpts.External = append(rpcOpts.External, criuExternal)
-		}
+	if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil {
+		return err
 	}
-	fcg := c.cgroupManager.GetPaths()["freezer"]
-	if fcg != "" {
-		rpcOpts.FreezeCgroup = proto.String(fcg)
+	// Same for possible external PID namespaces
+	if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil {
+		return err
+	}
+
+	// CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup
+	// is not set, CRIU uses ptrace() to pause the processes.
+	// Note cgroup v2 freezer is only supported since CRIU release 3.14.
+ if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil { + if fcg := c.cgroupManager.Path("freezer"); fcg != "" { + rpcOpts.FreezeCgroup = proto.String(fcg) + } } // append optional criu opts, e.g., page-server and port @@ -1029,7 +1109,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { } } - //pre-dump may need parentImage param to complete iterative migration + // pre-dump may need parentImage param to complete iterative migration if criuOpts.ParentImage != "" { rpcOpts.ParentImg = proto.String(criuOpts.ParentImage) rpcOpts.TrackMem = proto.Bool(true) @@ -1037,11 +1117,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { // append optional manage cgroups mode if criuOpts.ManageCgroupsMode != 0 { - // criu 1.7 => 10700 - if err := c.checkCriuVersion(10700); err != nil { - return err - } - mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) + mode := criuOpts.ManageCgroupsMode rpcOpts.ManageCgroupsMode = &mode } @@ -1059,36 +1135,53 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { } else { t = criurpc.CriuReqType_DUMP } - req := &criurpc.CriuReq{ - Type: &t, - Opts: &rpcOpts, - } if criuOpts.LazyPages { // lazy migration requested; check if criu supports it feat := criurpc.CriuFeatures{ LazyPages: proto.Bool(true), } - if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { return err } - statusRead, statusWrite, err := os.Pipe() - if err != nil { - return err + if fd := criuOpts.StatusFd; fd != -1 { + // check that the FD is valid + flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0) + if err != nil { + return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err) + } + // and writable + if flags&unix.O_WRONLY == 0 { + return fmt.Errorf("invalid --status-fd argument %d: not writable", fd) + } + + if c.checkCriuVersion(31500) != nil { + // For criu 3.15+, use notifications (see case "status-ready" + // in criuNotifications). Otherwise, rely on criu status fd. 
+ rpcOpts.StatusFd = proto.Int32(int32(fd)) + } } - rpcOpts.StatusFd = proto.Int32(int32(statusWrite.Fd())) - go waitForCriuLazyServer(statusRead, criuOpts.StatusFd) } - //no need to dump these information in pre-dump + req := &criurpc.CriuReq{ + Type: &t, + Opts: &rpcOpts, + } + + // no need to dump all this in pre-dump if !criuOpts.PreDump { + hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP) for _, m := range c.config.Mounts { switch m.Device { case "bind": c.addCriuDumpMount(req, m) case "cgroup": + if cgroups.IsCgroup2UnifiedMode() || hasCgroupns { + // real mount(s) + continue + } + // a set of "external" bind mounts binds, err := getCgroupMounts(m) if err != nil { return err @@ -1114,13 +1207,13 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { return err } - err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0600) + err = os.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600) if err != nil { return err } } - err = c.criuSwrk(nil, req, criuOpts, false, nil) + err = c.criuSwrk(nil, req, criuOpts, nil) if err != nil { return err } @@ -1128,11 +1221,10 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { } func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) { - mountDest := m.Destination - if strings.HasPrefix(mountDest, c.config.Rootfs) { - mountDest = mountDest[len(c.config.Rootfs):] + mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs) + if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil { + mountDest = dest[len(c.config.Rootfs):] } - extMnt := &criurpc.ExtMountMap{ Key: proto.String(mountDest), Val: proto.String(m.Source), @@ -1166,15 +1258,24 @@ func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error { switch m.Device { case "cgroup": - // Do nothing for cgroup, 
CRIU should handle it + // No mount point(s) need to be created: + // + // * for v1, mount points are saved by CRIU because + // /sys/fs/cgroup is a tmpfs mount + // + // * for v2, /sys/fs/cgroup is a real mount, but + // the mountpoint appears as soon as /sys is mounted + return nil case "bind": // The prepareBindMount() function checks if source // exists. So it cannot be used for other filesystem types. - if err := prepareBindMount(m, c.config.Rootfs); err != nil { + // TODO: pass something else than nil? Not sure if criu is + // impacted by issue #2484 + if err := prepareBindMount(m, c.config.Rootfs, nil); err != nil { return err } default: - // for all other file-systems just create the mountpoints + // for all other filesystems just create the mountpoints dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination) if err != nil { return err @@ -1182,8 +1283,7 @@ func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error { if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil { return err } - m.Destination = dest - if err := os.MkdirAll(dest, 0755); err != nil { + if err := os.MkdirAll(dest, 0o755); err != nil { return err } } @@ -1195,10 +1295,10 @@ func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error { func isPathInPrefixList(path string, prefix []string) bool { for _, p := range prefix { if strings.HasPrefix(path, p+"/") { - return false + return true } } - return true + return false } // prepareCriuRestoreMounts tries to set up the rootfs of the @@ -1219,11 +1319,46 @@ func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error // Now go through all mounts and create the mountpoints // if the mountpoints are not on a tmpfs, as CRIU will // restore the complete tmpfs content from its checkpoint. 
+ umounts := []string{} + defer func() { + for _, u := range umounts { + _ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error { + if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil { + if e != unix.EINVAL { //nolint:errorlint // unix errors are bare + // Ignore EINVAL as it means 'target is not a mount point.' + // It probably has already been unmounted. + logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e) + } + } + return nil + }) + } + }() for _, m := range mounts { - if isPathInPrefixList(m.Destination, tmpfs) { + if !isPathInPrefixList(m.Destination, tmpfs) { if err := c.makeCriuRestoreMountpoints(m); err != nil { return err } + // If the mount point is a bind mount, we need to mount + // it now so that runc can create the necessary mount + // points for mounts in bind mounts. + // This also happens during initial container creation. + // Without this CRIU restore will fail + // See: https://github.com/opencontainers/runc/issues/2748 + // It is also not necessary to order the mount points + // because during initial container creation mounts are + // set up in the order they are configured. + if m.Device == "bind" { + if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(procfd string) error { + if err := mount(m.Source, m.Destination, procfd, "", unix.MS_BIND|unix.MS_REC, ""); err != nil { + return err + } + return nil + }); err != nil { + return err + } + umounts = append(umounts, m.Destination) + } } } return nil @@ -1240,25 +1375,12 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have // support for unprivileged restore at the moment. 
- // criu 1.5.2 => 10502 - if err := c.checkCriuVersion(10502); err != nil { + // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0 + if err := c.checkCriuVersion(30000); err != nil { return err } - if criuOpts.WorkDirectory == "" { - criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") - } - // Since a container can be C/R'ed multiple times, - // the work directory may already exist. - if err := os.Mkdir(criuOpts.WorkDirectory, 0700); err != nil && !os.IsExist(err) { - return err - } - workDir, err := os.Open(criuOpts.WorkDirectory) - if err != nil { - return err - } - defer workDir.Close() if criuOpts.ImagesDirectory == "" { - return fmt.Errorf("invalid directory to restore checkpoint") + return errors.New("invalid directory to restore checkpoint") } imageDir, err := os.Open(criuOpts.ImagesDirectory) if err != nil { @@ -1271,7 +1393,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { // c.config.Rootfs is bind-mounted to a temporary directory // to satisfy these requirements. 
root := filepath.Join(c.root, "criu-root") - if err := os.Mkdir(root, 0755); err != nil { + if err := os.Mkdir(root, 0o755); err != nil { return err } defer os.Remove(root) @@ -1279,17 +1401,16 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { if err != nil { return err } - err = unix.Mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "") + err = mount(c.config.Rootfs, root, "", "", unix.MS_BIND|unix.MS_REC, "") if err != nil { return err } - defer unix.Unmount(root, unix.MNT_DETACH) + defer unix.Unmount(root, unix.MNT_DETACH) //nolint: errcheck t := criurpc.CriuReqType_RESTORE req := &criurpc.CriuReq{ Type: &t, Opts: &criurpc.CriuOpts{ ImagesDirFd: proto.Int32(int32(imageDir.Fd())), - WorkDirFd: proto.Int32(int32(workDir.Fd())), EvasiveDevices: proto.Bool(true), LogLevel: proto.Int32(4), LogFile: proto.String("restore.log"), @@ -1308,38 +1429,38 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { }, } + if criuOpts.LsmProfile != "" { + // CRIU older than 3.16 has a bug which breaks the possibility + // to set a different LSM profile. + if err := c.checkCriuVersion(31600); err != nil { + return errors.New("--lsm-profile requires at least CRIU 3.16") + } + req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile) + } + if criuOpts.LsmMountContext != "" { + if err := c.checkCriuVersion(31600); err != nil { + return errors.New("--lsm-mount-context requires at least CRIU 3.16") + } + req.Opts.LsmMountContext = proto.String(criuOpts.LsmMountContext) + } + + if criuOpts.WorkDirectory != "" { + // Since a container can be C/R'ed multiple times, + // the work directory may already exist. 
+		if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
+			return err
+		}
+		workDir, err := os.Open(criuOpts.WorkDirectory)
+		if err != nil {
+			return err
+		}
+		defer workDir.Close()
+		req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
+	}
 	c.handleCriuConfigurationFile(req.Opts)
 
-	// Same as during checkpointing. If the container has a specific network namespace
-	// assigned to it, this now expects that the checkpoint will be restored in a
-	// already created network namespace.
-	nsPath := c.config.Namespaces.PathOf(configs.NEWNET)
-	if nsPath != "" {
-		// For this to work we need at least criu 3.11.0 => 31100.
-		// As there was already a successful version check we will
-		// not error out if it fails. runc will just behave as it used
-		// to do and ignore external network namespaces.
-		err := c.checkCriuVersion(31100)
-		if err == nil {
-			// CRIU wants the information about an existing network namespace
-			// like this: --inherit-fd fd[<fd>]:<key>
-			// The <key> needs to be the same as during checkpointing.
-			// We are always using 'extRootNetNS' as the key in this.
-			netns, err := os.Open(nsPath)
-			defer netns.Close()
-			if err != nil {
-				logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
-				return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
-			}
-			inheritFd := new(criurpc.InheritFd)
-			inheritFd.Key = proto.String("extRootNetNS")
-			// The offset of four is necessary because 0, 1, 2 and 3 is already
-			// used by stdin, stdout, stderr, 'criu swrk' socket.
- inheritFd.Fd = proto.Int32(int32(4 + len(extraFiles))) - req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) - // All open FDs need to be transferred to CRIU via extraFiles - extraFiles = append(extraFiles, netns) - } + if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil { + return err } // This will modify the rootfs of the container in the same way runc @@ -1348,11 +1469,16 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { return err } + hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP) for _, m := range c.config.Mounts { switch m.Device { case "bind": c.addCriuRestoreMount(req, m) case "cgroup": + if cgroups.IsCgroup2UnifiedMode() || hasCgroupns { + continue + } + // cgroup v1 is a set of bind mounts, unless cgroupns is used binds, err := getCgroupMounts(m) if err != nil { return err @@ -1379,11 +1505,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { // append optional manage cgroups mode if criuOpts.ManageCgroupsMode != 0 { - // criu 1.7 => 10700 - if err := c.checkCriuVersion(10700); err != nil { - return err - } - mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) + mode := criuOpts.ManageCgroupsMode req.Opts.ManageCgroupsMode = &mode } @@ -1391,7 +1513,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { fds []string fdJSON []byte ) - if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { + if fdJSON, err = os.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { return err } @@ -1406,19 +1528,36 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) } } - return c.criuSwrk(process, req, criuOpts, true, extraFiles) + err = c.criuSwrk(process, req, criuOpts, extraFiles) + + // Now that CRIU is done let's close all opened FDs CRIU needed. 
+ for _, fd := range extraFiles { + fd.Close() + } + + return err } func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { + // need to apply cgroups only on restore + if req.GetType() != criurpc.CriuReqType_RESTORE { + return nil + } + // XXX: Do we need to deal with this case? AFAIK criu still requires root. if err := c.cgroupManager.Apply(pid); err != nil { return err } - if err := c.cgroupManager.Set(c.config); err != nil { - return newSystemError(err) + if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil { + return err } + if cgroups.IsCgroup2UnifiedMode() { + return nil + } + // the stuff below is cgroupv1-specific + path := fmt.Sprintf("/proc/%d/cgroup", pid) cgroupsPaths, err := cgroups.ParseCgroupFile(path) if err != nil { @@ -1436,7 +1575,7 @@ func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { return nil } -func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool, extraFiles []*os.File) error { +func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) if err != nil { return err @@ -1469,7 +1608,6 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * // the initial CRIU run to detect the version. Skip it. logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath) } - logrus.Debugf("Using CRIU with following args: %s", args) cmd := exec.Command(c.criuPath, args...) if process != nil { cmd.Stdin = process.Stdin @@ -1484,26 +1622,29 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * if err := cmd.Start(); err != nil { return err } + // we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang. criuServer.Close() + // cmd.Process will be replaced by a restored init. 
+ criuProcess := cmd.Process + var criuProcessState *os.ProcessState defer func() { - criuClientCon.Close() - _, err := cmd.Process.Wait() - if err != nil { - return + if criuProcessState == nil { + criuClientCon.Close() + _, err := criuProcess.Wait() + if err != nil { + logrus.Warnf("wait on criuProcess returned %v", err) + } } }() - if applyCgroups { - err := c.criuApplyCgroups(cmd.Process.Pid, req) - if err != nil { - return err - } + if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil { + return err } var extFds []string if process != nil { - extFds, err = getPipeFds(cmd.Process.Pid) + extFds, err = getPipeFds(criuProcess.Pid) if err != nil { return err } @@ -1514,19 +1655,19 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * // should be empty. For older CRIU versions it still will be // available but empty. criurpc.CriuReqType_VERSION actually // has no req.GetOpts(). - if !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK || - req.GetType() == criurpc.CriuReqType_VERSION) { + if logrus.GetLevel() >= logrus.DebugLevel && + !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK || + req.GetType() == criurpc.CriuReqType_VERSION) { val := reflect.ValueOf(req.GetOpts()) v := reflect.Indirect(val) for i := 0; i < v.NumField(); i++ { st := v.Type() name := st.Field(i).Name - if strings.HasPrefix(name, "XXX_") { - continue + if 'A' <= name[0] && name[0] <= 'Z' { + value := val.MethodByName("Get" + name).Call([]reflect.Value{}) + logrus.Debugf("CRIU option %s with value %v", name, value[0]) } - value := val.MethodByName("Get" + name).Call([]reflect.Value{}) - logrus.Debugf("CRIU option %s with value %v", name, value[0]) } } data, err := proto.Marshal(req) @@ -1540,16 +1681,25 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * buf := make([]byte, 10*4096) oob := make([]byte, 4096) - for true { + for { n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob) + if req.Opts != nil && 
req.Opts.StatusFd != nil { + // Close status_fd as soon as we got something back from criu, + // assuming it has consumed (reopened) it by this time. + // Otherwise it will might be left open forever and whoever + // is waiting on it will wait forever. + fd := int(*req.Opts.StatusFd) + _ = unix.Close(fd) + req.Opts.StatusFd = nil + } if err != nil { return err } if n == 0 { - return fmt.Errorf("unexpected EOF") + return errors.New("unexpected EOF") } if n == len(buf) { - return fmt.Errorf("buffer is too small") + return errors.New("buffer is too small") } resp := new(criurpc.CriuResp) @@ -1559,25 +1709,16 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * } if !resp.GetSuccess() { typeString := req.GetType().String() - if typeString == "VERSION" { - // If the VERSION RPC fails this probably means that the CRIU - // version is too old for this RPC. Just return 'nil'. - return nil - } return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath) } t := resp.GetType() switch { - case t == criurpc.CriuReqType_VERSION: - logrus.Debugf("CRIU version: %s", resp) - criuVersionRPC = resp.GetVersion() - break case t == criurpc.CriuReqType_FEATURE_CHECK: logrus.Debugf("Feature check says: %s", resp) criuFeatures = resp.GetFeatures() case t == criurpc.CriuReqType_NOTIFY: - if err := c.criuNotifications(resp, process, opts, extFds, oob[:oobn]); err != nil { + if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil { return err } t = criurpc.CriuReqType_NOTIFY @@ -1604,10 +1745,10 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * break } - criuClientCon.CloseWrite() + _ = criuClientCon.CloseWrite() // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors. // Here we want to wait only the CRIU process. 
- st, err := cmd.Process.Wait() + criuProcessState, err = criuProcess.Wait() if err != nil { return err } @@ -1619,8 +1760,8 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts * // and not the whole series of pre-dump, pre-dump, ...m, dump // If we got the message CriuReqType_PRE_DUMP it means // CRIU was successful and we need to forcefully stop CRIU - if !st.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP { - return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath) + if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP { + return fmt.Errorf("criu failed: %s\nlog file: %s", criuProcessState.String(), logPath) } return nil } @@ -1653,43 +1794,53 @@ func unlockNetwork(config *configs.Config) error { return nil } -func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string, oob []byte) error { +func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error { notify := resp.GetNotify() if notify == nil { return fmt.Errorf("invalid response: %s", resp.String()) } - logrus.Debugf("notify: %s\n", notify.GetScript()) - switch { - case notify.GetScript() == "post-dump": + script := notify.GetScript() + logrus.Debugf("notify: %s\n", script) + switch script { + case "post-dump": f, err := os.Create(filepath.Join(c.root, "checkpoint")) if err != nil { return err } f.Close() - case notify.GetScript() == "network-unlock": + case "network-unlock": if err := unlockNetwork(c.config); err != nil { return err } - case notify.GetScript() == "network-lock": + case "network-lock": if err := lockNetwork(c.config); err != nil { return err } - case notify.GetScript() == "setup-namespaces": + case "setup-namespaces": if c.config.Hooks != nil { s, err := c.currentOCIState() if err != nil { return nil } s.Pid = int(notify.GetPid()) - for i, hook := range c.config.Hooks.Prestart { 
- if err := hook.Run(s); err != nil { - return newSystemErrorWithCausef(err, "running prestart hook %d", i) - } + + if err := c.config.Hooks[configs.Prestart].RunHooks(s); err != nil { + return err + } + if err := c.config.Hooks[configs.CreateRuntime].RunHooks(s); err != nil { + return err } } - case notify.GetScript() == "post-restore": + case "post-restore": pid := notify.GetPid() - r, err := newRestoredProcess(int(pid), fds) + + p, err := os.FindProcess(int(pid)) + if err != nil { + return err + } + cmd.Process = p + + r, err := newRestoredProcess(cmd, fds) if err != nil { return err } @@ -1710,7 +1861,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc logrus.Error(err) } } - case notify.GetScript() == "orphan-pts-master": + case "orphan-pts-master": scm, err := unix.ParseSocketControlMessage(oob) if err != nil { return err @@ -1727,6 +1878,16 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil { return err } + case "status-ready": + if opts.StatusFd != -1 { + // write \0 to status fd to notify that lazy page server is ready + _, err := unix.Write(opts.StatusFd, []byte{0}) + if err != nil { + logrus.Warnf("can't write \\0 to status fd: %v", err) + } + _ = unix.Close(opts.StatusFd) + opts.StatusFd = -1 + } } return nil } @@ -1746,17 +1907,30 @@ func (c *linuxContainer) updateState(process parentProcess) (*State, error) { return state, nil } -func (c *linuxContainer) saveState(s *State) error { - f, err := os.Create(filepath.Join(c.root, stateFilename)) +func (c *linuxContainer) saveState(s *State) (retErr error) { + tmpFile, err := os.CreateTemp(c.root, "state-") if err != nil { return err } - defer f.Close() - return utils.WriteJSON(f, s) -} -func (c *linuxContainer) deleteState() error { - return os.Remove(filepath.Join(c.root, stateFilename)) + defer func() { + if retErr != nil { + tmpFile.Close() + 
os.Remove(tmpFile.Name()) + } + }() + + err = utils.WriteJSON(tmpFile, s) + if err != nil { + return err + } + err = tmpFile.Close() + if err != nil { + return err + } + + stateFilePath := filepath.Join(c.root, stateFilename) + return os.Rename(tmpFile.Name(), stateFilePath) } func (c *linuxContainer) currentStatus() (Status, error) { @@ -1778,10 +1952,7 @@ func (c *linuxContainer) refreshState() error { if paused { return c.state.transition(&pausedState{c: c}) } - t, err := c.runType() - if err != nil { - return err - } + t := c.runType() switch t { case Created: return c.state.transition(&createdState{c: c}) @@ -1791,48 +1962,32 @@ func (c *linuxContainer) refreshState() error { return c.state.transition(&stoppedState{c: c}) } -func (c *linuxContainer) runType() (Status, error) { +func (c *linuxContainer) runType() Status { if c.initProcess == nil { - return Stopped, nil + return Stopped } pid := c.initProcess.pid() stat, err := system.Stat(pid) if err != nil { - return Stopped, nil + return Stopped } if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead { - return Stopped, nil + return Stopped } // We'll create exec fifo and blocking on it after container is created, // and delete it after start container. if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil { - return Created, nil + return Created } - return Running, nil + return Running } func (c *linuxContainer) isPaused() (bool, error) { - fcg := c.cgroupManager.GetPaths()["freezer"] - if fcg == "" { - // A container doesn't have a freezer cgroup - return false, nil - } - pausedState := "FROZEN" - filename := "freezer.state" - if cgroups.IsCgroup2UnifiedMode() { - filename = "cgroup.freeze" - pausedState = "1" - } - - data, err := ioutil.ReadFile(filepath.Join(fcg, filename)) + state, err := c.cgroupManager.GetFreezerState() if err != nil { - // If freezer cgroup is not mounted, the container would just be not paused. 
- if os.IsNotExist(err) || err == syscall.ENODEV { - return false, nil - } - return false, newSystemErrorWithCause(err, "checking if container is paused") + return false, err } - return bytes.Equal(bytes.TrimSpace(data), []byte(pausedState)), nil + return state == configs.Frozen, nil } func (c *linuxContainer) currentState() (*State, error) { @@ -1846,9 +2001,10 @@ func (c *linuxContainer) currentState() (*State, error) { startTime, _ = c.initProcess.startTime() externalDescriptors = c.initProcess.externalDescriptors() } - intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID()) - if err != nil { - intelRdtPath = "" + + intelRdtPath := "" + if c.intelRdtManager != nil { + intelRdtPath = c.intelRdtManager.GetPath() } state := &State{ BaseState: BaseState{ @@ -1893,7 +2049,7 @@ func (c *linuxContainer) currentOCIState() (*specs.State, error) { if err != nil { return nil, err } - state.Status = status.String() + state.Status = specs.ContainerState(status.String()) if status != Stopped { if c.initProcess != nil { state.Pid = c.initProcess.pid() @@ -1916,16 +2072,16 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp if p, ok := namespaces[ns]; ok && p != "" { // check if the requested namespace is supported if !configs.IsNamespaceSupported(ns) { - return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns)) + return nil, fmt.Errorf("namespace %s is not supported", ns) } // only set to join this namespace if it exists if _, err := os.Lstat(p); err != nil { - return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p) + return nil, fmt.Errorf("namespace path: %w", err) } // do not allow namespace path with comma as we use it to separate // the namespace paths if strings.ContainsRune(p, ',') { - return nil, newSystemError(fmt.Errorf("invalid path %s", p)) + return nil, fmt.Errorf("invalid namespace path %s", p) } paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p)) } @@ -1946,16 +2102,34 @@ 
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { return data.Bytes(), nil } +// netlinkError is an error wrapper type for use by custom netlink message +// types. Panics with errors are wrapped in netlinkError so that the recover +// in bootstrapData can distinguish intentional panics. +type netlinkError struct{ error } + // bootstrapData encodes the necessary data in netlink binary format // as a io.Reader. // Consumer can write the data to a bootstrap program // such as one that uses nsenter package to bootstrap the container's // init process correctly, i.e. with correct namespaces, uid/gid // mapping etc. -func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) { +func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) { // create the netlink message r := nl.NewNetlinkRequest(int(InitMsg), 0) + // Our custom messages cannot bubble up an error using returns, instead + // they will panic with the specific error type, netlinkError. In that + // case, recover from the panic and return that as an error. + defer func() { + if r := recover(); r != nil { + if e, ok := r.(netlinkError); ok { + Err = e.error + } else { + panic(r) + } + } + }() + // write cloneFlags r.AddData(&Int32msg{ Type: CloneFlagsAttr, @@ -2024,7 +2198,7 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na // write oom_score_adj r.AddData(&Bytemsg{ Type: OomScoreAdjAttr, - Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)), + Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)), }) } @@ -2034,6 +2208,25 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na Value: c.config.RootlessEUID, }) + // Bind mount source to open. 
+ if it == initStandard && c.shouldSendMountSources() { + var mounts []byte + for _, m := range c.config.Mounts { + if m.IsBind() { + if strings.IndexByte(m.Source, 0) >= 0 { + return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source) + } + mounts = append(mounts, []byte(m.Source)...) + } + mounts = append(mounts, byte(0)) + } + + r.AddData(&Bytemsg{ + Type: MountSourcesAttr, + Value: mounts, + }) + } + return bytes.NewReader(r.Serialize()), nil } @@ -2044,9 +2237,19 @@ func ignoreTerminateErrors(err error) error { if err == nil { return nil } + // terminate() might return an error from either Kill or Wait. + // The (*Cmd).Wait documentation says: "If the command fails to run + // or doesn't complete successfully, the error is of type *ExitError". + // Filter out such errors (like "exit status 1" or "signal: killed"). + var exitErr *exec.ExitError + if errors.As(err, &exitErr) { + return nil + } + if errors.Is(err, os.ErrProcessDone) { + return nil + } s := err.Error() - switch { - case strings.Contains(s, "process already finished"), strings.Contains(s, "Wait was already called"): + if strings.Contains(s, "Wait was already called") { return nil } return err diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go index f8af05d..3eb6e5a 100644 --- a/libcontainer/container_linux_test.go +++ b/libcontainer/container_linux_test.go @@ -1,10 +1,7 @@ -// +build linux - package libcontainer import ( "fmt" - "io/ioutil" "os" "testing" @@ -42,7 +39,7 @@ func (m *mockCgroupManager) Apply(pid int) error { return nil } -func (m *mockCgroupManager) Set(container *configs.Config) error { +func (m *mockCgroupManager) Set(_ *configs.Resources) error { return nil } @@ -50,21 +47,35 @@ func (m *mockCgroupManager) Destroy() error { return nil } +func (m *mockCgroupManager) Exists() bool { + _, err := os.Lstat(m.Path("devices")) + return err == nil +} + +func (m *mockCgroupManager) OOMKillCount() (uint64, error) { + return 0, nil +} 
+ func (m *mockCgroupManager) GetPaths() map[string]string { return m.paths } -func (m *mockCgroupManager) GetUnifiedPath() (string, error) { - return "", fmt.Errorf("unimplemented") +func (m *mockCgroupManager) Path(subsys string) string { + return m.paths[subsys] } func (m *mockCgroupManager) Freeze(state configs.FreezerState) error { return nil } + func (m *mockCgroupManager) GetCgroups() (*configs.Cgroup, error) { return nil, nil } +func (m *mockCgroupManager) GetFreezerState() (configs.FreezerState, error) { + return configs.Thawed, nil +} + func (m *mockIntelRdtManager) Apply(pid int) error { return nil } @@ -125,15 +136,32 @@ func (m *mockProcess) externalDescriptors() []string { func (m *mockProcess) setExternalDescriptors(newFds []string) { } -func (m *mockProcess) forwardChildLogs() { +func (m *mockProcess) forwardChildLogs() chan error { + return nil } func TestGetContainerPids(t *testing.T) { - container := &linuxContainer{ - id: "myid", - config: &configs.Config{}, - cgroupManager: &mockCgroupManager{allPids: []int{1, 2, 3}}, + pid := 1 + stat, err := system.Stat(pid) + if err != nil { + t.Fatalf("can't stat pid %d, got %v", pid, err) } + container := &linuxContainer{ + id: "myid", + config: &configs.Config{}, + cgroupManager: &mockCgroupManager{ + allPids: []int{1, 2, 3}, + paths: map[string]string{ + "device": "/proc/self/cgroups", + }, + }, + initProcess: &mockProcess{ + _pid: 1, + started: 10, + }, + initProcessStartTime: stat.StartTime, + } + container.state = &runningState{c: container} pids, err := container.Processes() if err != nil { t.Fatal(err) @@ -176,7 +204,7 @@ func TestGetContainerStats(t *testing.T) { if stats.CgroupStats.MemoryStats.Usage.Usage != 1024 { t.Fatalf("expected memory usage 1024 but received %d", stats.CgroupStats.MemoryStats.Usage.Usage) } - if intelrdt.IsCatEnabled() { + if intelrdt.IsCATEnabled() { if stats.IntelRdtStats == nil { t.Fatal("intel rdt stats are nil") } @@ -184,7 +212,7 @@ func TestGetContainerStats(t 
*testing.T) { t.Fatalf("expected L3CacheSchema L3:0=f;1=f0 but received %s", stats.IntelRdtStats.L3CacheSchema) } } - if intelrdt.IsMbaEnabled() { + if intelrdt.IsMBAEnabled() { if stats.IntelRdtStats == nil { t.Fatal("intel rdt stats are nil") } @@ -257,7 +285,7 @@ func TestGetContainerState(t *testing.T) { if memPath := paths["memory"]; memPath != expectedMemoryPath { t.Fatalf("expected memory path %q but received %q", expectedMemoryPath, memPath) } - if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { + if intelrdt.IsCATEnabled() || intelrdt.IsMBAEnabled() { intelRdtPath := state.IntelRdtPath if intelRdtPath == "" { t.Fatal("intel rdt path should not be empty") @@ -302,22 +330,14 @@ func TestGetContainerState(t *testing.T) { } func TestGetContainerStateAfterUpdate(t *testing.T) { - var ( - pid = os.Getpid() - ) + pid := os.Getpid() stat, err := system.Stat(pid) if err != nil { t.Fatal(err) } - rootDir, err := ioutil.TempDir("", "TestGetContainerStateAfterUpdate") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(rootDir) - container := &linuxContainer{ - root: rootDir, + root: t.TempDir(), id: "myid", config: &configs.Config{ Namespaces: []configs.Namespace{ diff --git a/libcontainer/criu_opts_linux.go b/libcontainer/criu_opts_linux.go index a2e344f..b39476e 100644 --- a/libcontainer/criu_opts_linux.go +++ b/libcontainer/criu_opts_linux.go @@ -1,14 +1,6 @@ package libcontainer -// cgroup restoring strategy provided by criu -type cgMode uint32 - -const ( - CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu - CRIU_CG_MODE_FULL // always restore all cgroups and their properties - CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system - CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT -) +import criu "github.com/checkpoint-restore/go-criu/v5/rpc" type CriuPageServerInfo struct { Address string // IP address of CRIU page server @@ -32,9 +24,11 @@ type CriuOpts struct { PreDump bool // 
call criu predump to perform iterative checkpoint PageServer CriuPageServerInfo // allow to dump to criu page server VethPairs []VethPairName // pass the veth to criu when restore - ManageCgroupsMode cgMode // dump or restore cgroup mode + ManageCgroupsMode criu.CriuCgMode // dump or restore cgroup mode EmptyNs uint32 // don't c/r properties for namespace from this mask AutoDedup bool // auto deduplication for incremental dumps LazyPages bool // restore memory pages lazily using userfaultfd - StatusFd string // fd for feedback when lazy server is ready + StatusFd int // fd for feedback when lazy server is ready + LsmProfile string // LSM profile used to restore the container + LsmMountContext string // LSM mount context value to use during restore } diff --git a/libcontainer/devices/device.go b/libcontainer/devices/device.go new file mode 100644 index 0000000..c2c2b3b --- /dev/null +++ b/libcontainer/devices/device.go @@ -0,0 +1,174 @@ +package devices + +import ( + "fmt" + "os" + "strconv" +) + +const ( + Wildcard = -1 +) + +type Device struct { + Rule + + // Path to the device. + Path string `json:"path"` + + // FileMode permission bits for the device. + FileMode os.FileMode `json:"file_mode"` + + // Uid of the device. + Uid uint32 `json:"uid"` + + // Gid of the device. + Gid uint32 `json:"gid"` +} + +// Permissions is a cgroupv1-style string to represent device access. It +// has to be a string for backward compatibility reasons, hence why it has +// methods to do set operations. 
+type Permissions string + +const ( + deviceRead uint = (1 << iota) + deviceWrite + deviceMknod +) + +func (p Permissions) toSet() uint { + var set uint + for _, perm := range p { + switch perm { + case 'r': + set |= deviceRead + case 'w': + set |= deviceWrite + case 'm': + set |= deviceMknod + } + } + return set +} + +func fromSet(set uint) Permissions { + var perm string + if set&deviceRead == deviceRead { + perm += "r" + } + if set&deviceWrite == deviceWrite { + perm += "w" + } + if set&deviceMknod == deviceMknod { + perm += "m" + } + return Permissions(perm) +} + +// Union returns the union of the two sets of Permissions. +func (p Permissions) Union(o Permissions) Permissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs | rhs) +} + +// Difference returns the set difference of the two sets of Permissions. +// In set notation, A.Difference(B) gives you A\B. +func (p Permissions) Difference(o Permissions) Permissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs &^ rhs) +} + +// Intersection computes the intersection of the two sets of Permissions. +func (p Permissions) Intersection(o Permissions) Permissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs & rhs) +} + +// IsEmpty returns whether the set of permissions in a Permissions is +// empty. +func (p Permissions) IsEmpty() bool { + return p == Permissions("") +} + +// IsValid returns whether the set of permissions is a subset of valid +// permissions (namely, {r,w,m}). 
+func (p Permissions) IsValid() bool { + return p == fromSet(p.toSet()) +} + +type Type rune + +const ( + WildcardDevice Type = 'a' + BlockDevice Type = 'b' + CharDevice Type = 'c' // or 'u' + FifoDevice Type = 'p' +) + +func (t Type) IsValid() bool { + switch t { + case WildcardDevice, BlockDevice, CharDevice, FifoDevice: + return true + default: + return false + } +} + +func (t Type) CanMknod() bool { + switch t { + case BlockDevice, CharDevice, FifoDevice: + return true + default: + return false + } +} + +func (t Type) CanCgroup() bool { + switch t { + case WildcardDevice, BlockDevice, CharDevice: + return true + default: + return false + } +} + +type Rule struct { + // Type of device ('c' for char, 'b' for block). If set to 'a', this rule + // acts as a wildcard and all fields other than Allow are ignored. + Type Type `json:"type"` + + // Major is the device's major number. + Major int64 `json:"major"` + + // Minor is the device's minor number. + Minor int64 `json:"minor"` + + // Permissions is the set of permissions that this rule applies to (in the + // cgroupv1 format -- any combination of "rwm"). + Permissions Permissions `json:"permissions"` + + // Allow specifies whether this rule is allowed. 
+ Allow bool `json:"allow"` +} + +func (d *Rule) CgroupString() string { + var ( + major = strconv.FormatInt(d.Major, 10) + minor = strconv.FormatInt(d.Minor, 10) + ) + if d.Major == Wildcard { + major = "*" + } + if d.Minor == Wildcard { + minor = "*" + } + return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions) +} + +func (d *Rule) Mkdev() (uint64, error) { + return mkDev(d) +} diff --git a/libcontainer/devices/device_unix.go b/libcontainer/devices/device_unix.go new file mode 100644 index 0000000..7d8e9fc --- /dev/null +++ b/libcontainer/devices/device_unix.go @@ -0,0 +1,120 @@ +//go:build !windows +// +build !windows + +package devices + +import ( + "errors" + "os" + "path/filepath" + + "golang.org/x/sys/unix" +) + +// ErrNotADevice denotes that a file is not a valid linux device. +var ErrNotADevice = errors.New("not a device node") + +// Testing dependencies +var ( + unixLstat = unix.Lstat + osReadDir = os.ReadDir +) + +func mkDev(d *Rule) (uint64, error) { + if d.Major == Wildcard || d.Minor == Wildcard { + return 0, errors.New("cannot mkdev() device with wildcards") + } + return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil +} + +// DeviceFromPath takes the path to a device and its cgroup_permissions (which +// cannot be easily queried) to look up the information about a linux device +// and returns that information as a Device struct. +func DeviceFromPath(path, permissions string) (*Device, error) { + var stat unix.Stat_t + err := unixLstat(path, &stat) + if err != nil { + return nil, err + } + + var ( + devType Type + mode = stat.Mode + devNumber = uint64(stat.Rdev) //nolint:unconvert // Rdev is uint32 on e.g. MIPS. 
+ major = unix.Major(devNumber) + minor = unix.Minor(devNumber) + ) + switch mode & unix.S_IFMT { + case unix.S_IFBLK: + devType = BlockDevice + case unix.S_IFCHR: + devType = CharDevice + case unix.S_IFIFO: + devType = FifoDevice + default: + return nil, ErrNotADevice + } + return &Device{ + Rule: Rule{ + Type: devType, + Major: int64(major), + Minor: int64(minor), + Permissions: Permissions(permissions), + }, + Path: path, + FileMode: os.FileMode(mode &^ unix.S_IFMT), + Uid: stat.Uid, + Gid: stat.Gid, + }, nil +} + +// HostDevices returns all devices that can be found under /dev directory. +func HostDevices() ([]*Device, error) { + return GetDevices("/dev") +} + +// GetDevices recursively traverses a directory specified by path +// and returns all devices found there. +func GetDevices(path string) ([]*Device, error) { + files, err := osReadDir(path) + if err != nil { + return nil, err + } + var out []*Device + for _, f := range files { + switch { + case f.IsDir(): + switch f.Name() { + // ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825 + // ".udev" added to address https://github.com/opencontainers/runc/issues/2093 + case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts", ".udev": + continue + default: + sub, err := GetDevices(filepath.Join(path, f.Name())) + if err != nil { + return nil, err + } + + out = append(out, sub...) 
+ continue + } + case f.Name() == "console": + continue + } + device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm") + if err != nil { + if errors.Is(err, ErrNotADevice) { + continue + } + if os.IsNotExist(err) { + continue + } + return nil, err + } + if device.Type == FifoDevice { + continue + } + out = append(out, device) + } + return out, nil +} diff --git a/libcontainer/devices/device_unix_go116_test.go b/libcontainer/devices/device_unix_go116_test.go new file mode 100644 index 0000000..5e1cdf2 --- /dev/null +++ b/libcontainer/devices/device_unix_go116_test.go @@ -0,0 +1,39 @@ +//go:build !go1.17 +// +build !go1.17 + +package devices + +import "io/fs" + +// The following code is adapted from go1.17.1/src/io/fs/readdir.go +// to compensate for the lack of fs.FileInfoToDirEntry in Go 1.16. + +// dirInfo is a DirEntry based on a FileInfo. +type dirInfo struct { + fileInfo fs.FileInfo +} + +func (di dirInfo) IsDir() bool { + return di.fileInfo.IsDir() +} + +func (di dirInfo) Type() fs.FileMode { + return di.fileInfo.Mode().Type() +} + +func (di dirInfo) Info() (fs.FileInfo, error) { + return di.fileInfo, nil +} + +func (di dirInfo) Name() string { + return di.fileInfo.Name() +} + +// fileInfoToDirEntry returns a DirEntry that returns information from info. +// If info is nil, FileInfoToDirEntry returns nil. 
+func fileInfoToDirEntry(info fs.FileInfo) fs.DirEntry { + if info == nil { + return nil + } + return dirInfo{fileInfo: info} +} diff --git a/libcontainer/devices/device_unix_go117_test.go b/libcontainer/devices/device_unix_go117_test.go new file mode 100644 index 0000000..d74db8f --- /dev/null +++ b/libcontainer/devices/device_unix_go117_test.go @@ -0,0 +1,8 @@ +//go:build go1.17 +// +build go1.17 + +package devices + +import "io/fs" + +var fileInfoToDirEntry = fs.FileInfoToDirEntry diff --git a/libcontainer/devices/device_unix_test.go b/libcontainer/devices/device_unix_test.go new file mode 100644 index 0000000..c58256c --- /dev/null +++ b/libcontainer/devices/device_unix_test.go @@ -0,0 +1,97 @@ +//go:build !windows +// +build !windows + +package devices + +import ( + "errors" + "io/fs" + "os" + "testing" + + "golang.org/x/sys/unix" +) + +func cleanupTest() { + unixLstat = unix.Lstat + osReadDir = os.ReadDir +} + +func TestDeviceFromPathLstatFailure(t *testing.T) { + testError := errors.New("test error") + + // Override unix.Lstat to inject error. + unixLstat = func(path string, stat *unix.Stat_t) error { + return testError + } + defer cleanupTest() + + _, err := DeviceFromPath("", "") + if !errors.Is(err, testError) { + t.Fatalf("Unexpected error %v, expected %v", err, testError) + } +} + +func TestHostDevicesIoutilReadDirFailure(t *testing.T) { + testError := errors.New("test error") + + // Override os.ReadDir to inject error. + osReadDir = func(dirname string) ([]fs.DirEntry, error) { + return nil, testError + } + defer cleanupTest() + + _, err := HostDevices() + if !errors.Is(err, testError) { + t.Fatalf("Unexpected error %v, expected %v", err, testError) + } +} + +func TestHostDevicesIoutilReadDirDeepFailure(t *testing.T) { + testError := errors.New("test error") + called := false + + // Override os.ReadDir to inject error after the first call. 
+ osReadDir = func(dirname string) ([]fs.DirEntry, error) { + if called { + return nil, testError + } + called = true + + // Provoke a second call. + fi, err := os.Lstat("/tmp") + if err != nil { + t.Fatalf("Unexpected error %v", err) + } + + return []fs.DirEntry{fileInfoToDirEntry(fi)}, nil + } + defer cleanupTest() + + _, err := HostDevices() + if !errors.Is(err, testError) { + t.Fatalf("Unexpected error %v, expected %v", err, testError) + } +} + +func TestHostDevicesAllValid(t *testing.T) { + devices, err := HostDevices() + if err != nil { + t.Fatalf("failed to get host devices: %v", err) + } + + for _, device := range devices { + // Devices can't have major number 0. + if device.Major == 0 { + t.Errorf("device entry %+v has zero major number", device) + } + switch device.Type { + case BlockDevice, CharDevice: + case FifoDevice: + t.Logf("fifo devices shouldn't show up from HostDevices") + fallthrough + default: + t.Errorf("device entry %+v has unexpected type %v", device, device.Type) + } + } +} diff --git a/libcontainer/devices/devices.go b/libcontainer/devices/devices.go deleted file mode 100644 index 5dabe06..0000000 --- a/libcontainer/devices/devices.go +++ /dev/null @@ -1,110 +0,0 @@ -package devices - -import ( - "errors" - "io/ioutil" - "os" - "path/filepath" - - "github.com/opencontainers/runc/libcontainer/configs" - "golang.org/x/sys/unix" -) - -var ( - // ErrNotADevice denotes that a file is not a valid linux device. - ErrNotADevice = errors.New("not a device node") -) - -// Testing dependencies -var ( - unixLstat = unix.Lstat - ioutilReadDir = ioutil.ReadDir -) - -// Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the -// information about a linux device and return that information as a Device struct. 
-func DeviceFromPath(path, permissions string) (*configs.Device, error) { - var stat unix.Stat_t - err := unixLstat(path, &stat) - if err != nil { - return nil, err - } - - var ( - devNumber = uint64(stat.Rdev) - major = unix.Major(devNumber) - minor = unix.Minor(devNumber) - ) - if major == 0 { - return nil, ErrNotADevice - } - - var ( - devType rune - mode = stat.Mode - ) - switch { - case mode&unix.S_IFBLK == unix.S_IFBLK: - devType = 'b' - case mode&unix.S_IFCHR == unix.S_IFCHR: - devType = 'c' - } - return &configs.Device{ - Type: devType, - Path: path, - Major: int64(major), - Minor: int64(minor), - Permissions: permissions, - FileMode: os.FileMode(mode), - Uid: stat.Uid, - Gid: stat.Gid, - }, nil -} - -// HostDevices returns all devices that can be found under /dev directory. -func HostDevices() ([]*configs.Device, error) { - return GetDevices("/dev") -} - -// GetDevices recursively traverses a directory specified by path -// and returns all devices found there. -func GetDevices(path string) ([]*configs.Device, error) { - files, err := ioutilReadDir(path) - if err != nil { - return nil, err - } - var out []*configs.Device - for _, f := range files { - switch { - case f.IsDir(): - switch f.Name() { - // ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825 - // ".udev" added to address https://github.com/opencontainers/runc/issues/2093 - case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts", ".udev": - continue - default: - sub, err := GetDevices(filepath.Join(path, f.Name())) - if err != nil { - return nil, err - } - - out = append(out, sub...) 
- continue - } - case f.Name() == "console": - continue - } - device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm") - if err != nil { - if err == ErrNotADevice { - continue - } - if os.IsNotExist(err) { - continue - } - return nil, err - } - out = append(out, device) - } - return out, nil -} diff --git a/libcontainer/devices/devices_test.go b/libcontainer/devices/devices_test.go deleted file mode 100644 index 0afa9d9..0000000 --- a/libcontainer/devices/devices_test.go +++ /dev/null @@ -1,63 +0,0 @@ -package devices - -import ( - "errors" - "os" - "testing" - - "golang.org/x/sys/unix" -) - -func TestDeviceFromPathLstatFailure(t *testing.T) { - testError := errors.New("test error") - - // Override unix.Lstat to inject error. - unixLstat = func(path string, stat *unix.Stat_t) error { - return testError - } - - _, err := DeviceFromPath("", "") - if err != testError { - t.Fatalf("Unexpected error %v, expected %v", err, testError) - } -} - -func TestHostDevicesIoutilReadDirFailure(t *testing.T) { - testError := errors.New("test error") - - // Override ioutil.ReadDir to inject error. - ioutilReadDir = func(dirname string) ([]os.FileInfo, error) { - return nil, testError - } - - _, err := HostDevices() - if err != testError { - t.Fatalf("Unexpected error %v, expected %v", err, testError) - } -} - -func TestHostDevicesIoutilReadDirDeepFailure(t *testing.T) { - testError := errors.New("test error") - called := false - - // Override ioutil.ReadDir to inject error after the first call. - ioutilReadDir = func(dirname string) ([]os.FileInfo, error) { - if called { - return nil, testError - } - called = true - - // Provoke a second call. 
- fi, err := os.Lstat("/tmp") - if err != nil { - t.Fatalf("Unexpected error %v", err) - } - - return []os.FileInfo{fi}, nil - } - - _, err := HostDevices() - if err != testError { - t.Fatalf("Unexpected error %v, expected %v", err, testError) - } -} diff --git a/libcontainer/error.go b/libcontainer/error.go index 21a3789..510c072 100644 --- a/libcontainer/error.go +++ b/libcontainer/error.go @@ -1,70 +1,13 @@ package libcontainer -import "io" +import "errors" -// ErrorCode is the API error code type. -type ErrorCode int - -// API error codes. -const ( - // Factory errors - IdInUse ErrorCode = iota - InvalidIdFormat - - // Container errors - ContainerNotExists - ContainerPaused - ContainerNotStopped - ContainerNotRunning - ContainerNotPaused - - // Process errors - NoProcessOps - - // Common errors - ConfigInvalid - ConsoleExists - SystemError +var ( + ErrExist = errors.New("container with given ID already exists") + ErrInvalidID = errors.New("invalid container ID format") + ErrNotExist = errors.New("container does not exist") + ErrPaused = errors.New("container paused") + ErrRunning = errors.New("container still running") + ErrNotRunning = errors.New("container not running") + ErrNotPaused = errors.New("container not paused") ) - -func (c ErrorCode) String() string { - switch c { - case IdInUse: - return "Id already in use" - case InvalidIdFormat: - return "Invalid format" - case ContainerPaused: - return "Container paused" - case ConfigInvalid: - return "Invalid configuration" - case SystemError: - return "System error" - case ContainerNotExists: - return "Container does not exist" - case ContainerNotStopped: - return "Container is not stopped" - case ContainerNotRunning: - return "Container is not running" - case ConsoleExists: - return "Console exists for process" - case ContainerNotPaused: - return "Container is not paused" - case NoProcessOps: - return "No process operations" - default: - return "Unknown error" - } -} - -// Error is the API error type. 
-type Error interface { - error - - // Returns an error if it failed to write the detail of the Error to w. - // The detail of the Error may include the error message and a - // representation of the stack trace. - Detail(w io.Writer) error - - // Returns the error code for this error. - Code() ErrorCode -} diff --git a/libcontainer/error_test.go b/libcontainer/error_test.go deleted file mode 100644 index 36841ad..0000000 --- a/libcontainer/error_test.go +++ /dev/null @@ -1,25 +0,0 @@ -package libcontainer - -import "testing" - -func TestErrorCode(t *testing.T) { - codes := map[ErrorCode]string{ - IdInUse: "Id already in use", - InvalidIdFormat: "Invalid format", - ContainerPaused: "Container paused", - ConfigInvalid: "Invalid configuration", - SystemError: "System error", - ContainerNotExists: "Container does not exist", - ContainerNotStopped: "Container is not stopped", - ContainerNotRunning: "Container is not running", - ConsoleExists: "Console exists for process", - ContainerNotPaused: "Container is not paused", - NoProcessOps: "No process operations", - } - - for code, expected := range codes { - if actual := code.String(); actual != expected { - t.Fatalf("expected string %q but received %q", expected, actual) - } - } -} diff --git a/libcontainer/factory.go b/libcontainer/factory.go index 0986cd7..9f9e8fc 100644 --- a/libcontainer/factory.go +++ b/libcontainer/factory.go @@ -14,29 +14,15 @@ type Factory interface { // // Returns the new container with a running process. // - // errors: - // IdInUse - id is already in use by a container - // InvalidIdFormat - id has incorrect format - // ConfigInvalid - config is invalid - // Systemerror - System error - // // On error, any partially created container parts are cleaned up (the operation is atomic). Create(id string, config *configs.Config) (Container, error) // Load takes an ID for an existing container and returns the container information // from the state. This presents a read only view of the container. 
- // - // errors: - // Path does not exist - // System error Load(id string) (Container, error) // StartInitialization is an internal API to libcontainer used during the reexec of the // container. - // - // Errors: - // Pipe connection error - // System error StartInitialization() error // Type returns info string about factory type (e.g. lxc, libcontainer...) diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index 437633c..023d623 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -1,9 +1,8 @@ -// +build linux - package libcontainer import ( "encoding/json" + "errors" "fmt" "os" "path/filepath" @@ -11,19 +10,16 @@ import ( "runtime/debug" "strconv" - "github.com/cyphar/filepath-securejoin" - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fs" - "github.com/opencontainers/runc/libcontainer/cgroups/fs2" - "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/moby/sys/mountinfo" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups/manager" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs/validate" "github.com/opencontainers/runc/libcontainer/intelrdt" - "github.com/opencontainers/runc/libcontainer/mount" "github.com/opencontainers/runc/libcontainer/utils" - "github.com/pkg/errors" - - "golang.org/x/sys/unix" + "github.com/sirupsen/logrus" ) const ( @@ -41,7 +37,9 @@ func InitArgs(args ...string) func(*LinuxFactory) error { // Resolve relative paths to ensure that its available // after directory changes. if args[0], err = filepath.Abs(args[0]); err != nil { - return newGenericError(err, ConfigInvalid) + // The only error returned from filepath.Abs is + // the one from os.Getwd, i.e. a system error. 
+ return err } } @@ -50,86 +48,15 @@ func InitArgs(args ...string) func(*LinuxFactory) error { } } -// SystemdCgroups is an options func to configure a LinuxFactory to return -// containers that use systemd to create and manage cgroups. -func SystemdCgroups(l *LinuxFactory) error { - systemdCgroupsManager, err := systemd.NewSystemdCgroupsManager() - if err != nil { - return err - } - l.NewCgroupsManager = systemdCgroupsManager - return nil -} - -func getUnifiedPath(paths map[string]string) string { - unifiedPath := "" - for k, v := range paths { - if unifiedPath == "" { - unifiedPath = v - } else if v != unifiedPath { - panic(errors.Errorf("expected %q path to be unified path %q, got %q", k, unifiedPath, v)) - } - } - // can be empty - return unifiedPath -} - -func cgroupfs2(l *LinuxFactory, rootless bool) error { - l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { - m, err := fs2.NewManager(config, getUnifiedPath(paths), rootless) - if err != nil { - panic(err) - } - return m - } - return nil -} - -// Cgroupfs is an options func to configure a LinuxFactory to return containers -// that use the native cgroups filesystem implementation to create and manage -// cgroups. -func Cgroupfs(l *LinuxFactory) error { - if cgroups.IsCgroup2UnifiedMode() { - return cgroupfs2(l, false) - } - l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { - return &fs.Manager{ - Cgroups: config, - Paths: paths, - } - } - return nil -} - -// RootlessCgroupfs is an options func to configure a LinuxFactory to return -// containers that use the native cgroups filesystem implementation to create -// and manage cgroups. The difference between RootlessCgroupfs and Cgroupfs is -// that RootlessCgroupfs can transparently handle permission errors that occur -// during rootless container (including euid=0 in userns) setup (while still allowing cgroup usage if -// they've been set up properly). 
-func RootlessCgroupfs(l *LinuxFactory) error { - if cgroups.IsCgroup2UnifiedMode() { - return cgroupfs2(l, true) - } - l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { - return &fs.Manager{ - Cgroups: config, - Rootless: true, - Paths: paths, - } - } - return nil -} - // IntelRdtfs is an options func to configure a LinuxFactory to return // containers that use the Intel RDT "resource control" filesystem to // create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth). func IntelRdtFs(l *LinuxFactory) error { - l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager { - return &intelrdt.IntelRdtManager{ - Config: config, - Id: id, - Path: path, + if !intelrdt.IsCATEnabled() && !intelrdt.IsMBAEnabled() { + l.NewIntelRdtManager = nil + } else { + l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager { + return intelrdt.NewManager(config, id, path) } } return nil @@ -137,12 +64,12 @@ func IntelRdtFs(l *LinuxFactory) error { // TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs. func TmpfsRoot(l *LinuxFactory) error { - mounted, err := mount.Mounted(l.Root) + mounted, err := mountinfo.Mounted(l.Root) if err != nil { return err } if !mounted { - if err := unix.Mount("tmpfs", l.Root, "tmpfs", 0, ""); err != nil { + if err := mount("tmpfs", l.Root, "", "tmpfs", 0, ""); err != nil { return err } } @@ -162,8 +89,8 @@ func CriuPath(criupath string) func(*LinuxFactory) error { // configures the factory with the provided option funcs. 
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) { if root != "" { - if err := os.MkdirAll(root, 0700); err != nil { - return nil, newGenericError(err, SystemError) + if err := os.MkdirAll(root, 0o700); err != nil { + return nil, err } } l := &LinuxFactory{ @@ -173,7 +100,7 @@ func New(root string, options ...func(*LinuxFactory) error) (Factory, error) { Validator: validate.New(), CriuPath: "criu", } - Cgroupfs(l) + for _, opt := range options { if opt == nil { continue @@ -202,7 +129,7 @@ type LinuxFactory struct { // containers. CriuPath string - // New{u,g}uidmapPath is the path to the binaries used for mapping with + // New{u,g}idmapPath is the path to the binaries used for mapping with // rootless containers. NewuidmapPath string NewgidmapPath string @@ -210,37 +137,69 @@ type LinuxFactory struct { // Validator provides validation to container configurations. Validator validate.Validator - // NewCgroupsManager returns an initialized cgroups manager for a single container. - NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager - // NewIntelRdtManager returns an initialized Intel RDT manager for a single container. 
NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager } func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) { if l.Root == "" { - return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) + return nil, errors.New("root not set") } if err := l.validateID(id); err != nil { return nil, err } if err := l.Validator.Validate(config); err != nil { - return nil, newGenericError(err, ConfigInvalid) + return nil, err } containerRoot, err := securejoin.SecureJoin(l.Root, id) if err != nil { return nil, err } if _, err := os.Stat(containerRoot); err == nil { - return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) + return nil, ErrExist } else if !os.IsNotExist(err) { - return nil, newGenericError(err, SystemError) + return nil, err } - if err := os.MkdirAll(containerRoot, 0711); err != nil { - return nil, newGenericError(err, SystemError) + + cm, err := manager.New(config.Cgroups) + if err != nil { + return nil, err + } + + // Check that cgroup does not exist or empty (no processes). + // Note for cgroup v1 this check is not thorough, as there are multiple + // separate hierarchies, while both Exists() and GetAllPids() only use + // one for "devices" controller (assuming others are the same, which is + // probably true in almost all scenarios). Checking all the hierarchies + // would be too expensive. + if cm.Exists() { + pids, err := cm.GetAllPids() + // Reading PIDs can race with cgroups removal, so ignore ENOENT and ENODEV. + if err != nil && !errors.Is(err, os.ErrNotExist) && !errors.Is(err, unix.ENODEV) { + return nil, fmt.Errorf("unable to get cgroup PIDs: %w", err) + } + if len(pids) != 0 { + // TODO: return an error. 
+ logrus.Warnf("container's cgroup is not empty: %d process(es) found", len(pids)) + logrus.Warn("DEPRECATED: running container in a non-empty cgroup won't be supported in runc 1.2; https://github.com/opencontainers/runc/issues/3132") + } + } + + // Check that cgroup is not frozen. Do not use Exists() here + // since in cgroup v1 it only checks "devices" controller. + st, err := cm.GetFreezerState() + if err != nil { + return nil, fmt.Errorf("unable to get cgroup freezer state: %w", err) + } + if st == configs.Frozen { + return nil, errors.New("container's cgroup unexpectedly frozen") + } + + if err := os.MkdirAll(containerRoot, 0o711); err != nil { + return nil, err } if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil { - return nil, newGenericError(err, SystemError) + return nil, err } c := &linuxContainer{ id: id, @@ -251,9 +210,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err criuPath: l.CriuPath, newuidmapPath: l.NewuidmapPath, newgidmapPath: l.NewgidmapPath, - cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), + cgroupManager: cm, } - if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { + if l.NewIntelRdtManager != nil { c.intelRdtManager = l.NewIntelRdtManager(config, id, "") } c.state = &stoppedState{c: c} @@ -262,9 +221,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err func (l *LinuxFactory) Load(id string) (Container, error) { if l.Root == "" { - return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) + return nil, errors.New("root not set") } - //when load, we need to check id is valid or not. + // when load, we need to check id is valid or not. 
if err := l.validateID(id); err != nil { return nil, err } @@ -272,7 +231,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) { if err != nil { return nil, err } - state, err := l.loadState(containerRoot, id) + state, err := l.loadState(containerRoot) if err != nil { return nil, err } @@ -281,6 +240,10 @@ func (l *LinuxFactory) Load(id string) (Container, error) { processStartTime: state.InitProcessStartTime, fds: state.ExternalDescriptors, } + cm, err := manager.NewWithPaths(state.Config.Cgroups, state.CgroupPaths) + if err != nil { + return nil, err + } c := &linuxContainer{ initProcess: r, initProcessStartTime: state.InitProcessStartTime, @@ -291,17 +254,17 @@ func (l *LinuxFactory) Load(id string) (Container, error) { criuPath: l.CriuPath, newuidmapPath: l.NewuidmapPath, newgidmapPath: l.NewgidmapPath, - cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), + cgroupManager: cm, root: containerRoot, created: state.Created, } + if l.NewIntelRdtManager != nil { + c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath) + } c.state = &loadedState{c: c} if err := c.refreshState(); err != nil { return nil, err } - if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() { - c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath) - } return c, nil } @@ -312,66 +275,74 @@ func (l *LinuxFactory) Type() string { // StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state // This is a low level implementation detail of the reexec and should not be consumed externally func (l *LinuxFactory) StartInitialization() (err error) { - var ( - pipefd, fifofd int - consoleSocket *os.File - envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE") - envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD") - envConsole = os.Getenv("_LIBCONTAINER_CONSOLE") - ) - // Get the INITPIPE. 
- pipefd, err = strconv.Atoi(envInitPipe) + envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE") + pipefd, err := strconv.Atoi(envInitPipe) if err != nil { - return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err) + err = fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err) + logrus.Error(err) + return err } - - var ( - pipe = os.NewFile(uintptr(pipefd), "pipe") - it = initType(os.Getenv("_LIBCONTAINER_INITTYPE")) - ) + pipe := os.NewFile(uintptr(pipefd), "pipe") defer pipe.Close() + defer func() { + // We have an error during the initialization of the container's init, + // send it back to the parent process in the form of an initError. + if werr := writeSync(pipe, procError); werr != nil { + fmt.Fprintln(os.Stderr, err) + return + } + if werr := utils.WriteJSON(pipe, &initError{Message: err.Error()}); werr != nil { + fmt.Fprintln(os.Stderr, err) + return + } + }() + // Only init processes have FIFOFD. - fifofd = -1 + fifofd := -1 + envInitType := os.Getenv("_LIBCONTAINER_INITTYPE") + it := initType(envInitType) if it == initStandard { + envFifoFd := os.Getenv("_LIBCONTAINER_FIFOFD") if fifofd, err = strconv.Atoi(envFifoFd); err != nil { - return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err) + return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD: %w", err) } } - if envConsole != "" { + var consoleSocket *os.File + if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" { console, err := strconv.Atoi(envConsole) if err != nil { - return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err) + return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE: %w", err) } consoleSocket = os.NewFile(uintptr(console), "console-socket") defer consoleSocket.Close() } + logPipeFdStr := os.Getenv("_LIBCONTAINER_LOGPIPE") + logPipeFd, err := strconv.Atoi(logPipeFdStr) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: 
%w", err) + } + + // Get mount files (O_PATH). + mountFds, err := parseMountFds() + if err != nil { + return err + } + // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() - defer func() { - // We have an error during the initialization of the container's init, - // send it back to the parent process in the form of an initError. - if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil { - fmt.Fprintln(os.Stderr, err) - return - } - if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil { - fmt.Fprintln(os.Stderr, err) - return - } - }() defer func() { if e := recover(); e != nil { - err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack())) + err = fmt.Errorf("panic from initialization: %w, %v", e, string(debug.Stack())) } }() - i, err := newContainerInit(it, pipe, consoleSocket, fifofd) + i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds) if err != nil { return err } @@ -380,7 +351,7 @@ func (l *LinuxFactory) StartInitialization() (err error) { return i.Init() } -func (l *LinuxFactory) loadState(root, id string) (*State, error) { +func (l *LinuxFactory) loadState(root string) (*State, error) { stateFilePath, err := securejoin.SecureJoin(root, stateFilename) if err != nil { return nil, err @@ -388,21 +359,21 @@ func (l *LinuxFactory) loadState(root, id string) (*State, error) { f, err := os.Open(stateFilePath) if err != nil { if os.IsNotExist(err) { - return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists) + return nil, ErrNotExist } - return nil, newGenericError(err, SystemError) + return nil, err } defer f.Close() var state *State if err := json.NewDecoder(f).Decode(&state); err != nil { - return nil, newGenericError(err, SystemError) + return nil, err } return state, nil } func (l *LinuxFactory) validateID(id string) error { if !idRegex.MatchString(id) || string(os.PathSeparator)+id != 
utils.CleanPath(string(os.PathSeparator)+id) { - return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat) + return ErrInvalidID } return nil @@ -425,3 +396,18 @@ func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error { return nil } } + +func parseMountFds() ([]int, error) { + fdsJson := os.Getenv("_LIBCONTAINER_MOUNT_FDS") + if fdsJson == "" { + // Always return the nil slice if no fd is present. + return nil, nil + } + + var mountFds []int + if err := json.Unmarshal([]byte(fdsJson), &mountFds); err != nil { + return nil, fmt.Errorf("Error unmarshalling _LIBCONTAINER_MOUNT_FDS: %w", err) + } + + return mountFds, nil +} diff --git a/libcontainer/factory_linux_test.go b/libcontainer/factory_linux_test.go index 1dc0180..d29c32e 100644 --- a/libcontainer/factory_linux_test.go +++ b/libcontainer/factory_linux_test.go @@ -1,37 +1,23 @@ -// +build linux - package libcontainer import ( - "io/ioutil" + "errors" "os" "path/filepath" "reflect" "testing" + "github.com/moby/sys/mountinfo" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/mount" "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" ) -func newTestRoot() (string, error) { - dir, err := ioutil.TempDir("", "libcontainer") - if err != nil { - return "", err - } - return dir, nil -} - func TestFactoryNew(t *testing.T) { - root, rerr := newTestRoot() - if rerr != nil { - t.Fatal(rerr) - } - defer os.RemoveAll(root) - factory, err := New(root, Cgroupfs) + root := t.TempDir() + factory, err := New(root) if err != nil { t.Fatal(err) } @@ -52,12 +38,8 @@ func TestFactoryNew(t *testing.T) { } func TestFactoryNewIntelRdt(t *testing.T) { - root, rerr := newTestRoot() - if rerr != nil { - t.Fatal(rerr) - } - defer os.RemoveAll(root) - factory, err := New(root, Cgroupfs, IntelRdtFs) + root := t.TempDir() + factory, err := New(root, IntelRdtFs) if err != nil { 
t.Fatal(err) } @@ -78,13 +60,8 @@ func TestFactoryNewIntelRdt(t *testing.T) { } func TestFactoryNewTmpfs(t *testing.T) { -t.Skip("DM - skipping privileged test") - root, rerr := newTestRoot() - if rerr != nil { - t.Fatal(rerr) - } - defer os.RemoveAll(root) - factory, err := New(root, Cgroupfs, TmpfsRoot) + root := t.TempDir() + factory, err := New(root, TmpfsRoot) if err != nil { t.Fatal(err) } @@ -102,42 +79,35 @@ t.Skip("DM - skipping privileged test") if factory.Type() != "libcontainer" { t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer") } - mounted, err := mount.Mounted(lfactory.Root) + mounted, err := mountinfo.Mounted(lfactory.Root) if err != nil { t.Fatal(err) } if !mounted { t.Fatalf("Factory Root is not mounted") } - mounts, err := mount.GetMounts() + mounts, err := mountinfo.GetMounts(mountinfo.SingleEntryFilter(lfactory.Root)) if err != nil { t.Fatal(err) } - var found bool - for _, m := range mounts { - if m.Mountpoint == lfactory.Root { - if m.Fstype != "tmpfs" { - t.Fatalf("Fstype of root: %s, expected %s", m.Fstype, "tmpfs") - } - if m.Source != "tmpfs" { - t.Fatalf("Source of root: %s, expected %s", m.Source, "tmpfs") - } - found = true - } - } - if !found { + if len(mounts) != 1 { t.Fatalf("Factory Root is not listed in mounts list") } - defer unix.Unmount(root, unix.MNT_DETACH) + m := mounts[0] + if m.FSType != "tmpfs" { + t.Fatalf("FSType of root: %s, expected %s", m.FSType, "tmpfs") + } + if m.Source != "tmpfs" { + t.Fatalf("Source of root: %s, expected %s", m.Source, "tmpfs") + } + err = unix.Unmount(root, unix.MNT_DETACH) + if err != nil { + t.Error("failed to unmount root:", err) + } } func TestFactoryLoadNotExists(t *testing.T) { - root, rerr := newTestRoot() - if rerr != nil { - t.Fatal(rerr) - } - defer os.RemoveAll(root) - factory, err := New(root, Cgroupfs) + factory, err := New(t.TempDir()) if err != nil { t.Fatal(err) } @@ -145,32 +115,24 @@ func TestFactoryLoadNotExists(t *testing.T) { if err == nil 
{ t.Fatal("expected nil error loading non-existing container") } - lerr, ok := err.(Error) - if !ok { - t.Fatal("expected libcontainer error type") - } - if lerr.Code() != ContainerNotExists { - t.Fatalf("expected error code %s but received %s", ContainerNotExists, lerr.Code()) + if !errors.Is(err, ErrNotExist) { + t.Fatalf("expected ErrNotExist, got %v", err) } } func TestFactoryLoadContainer(t *testing.T) { - root, err := newTestRoot() - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(root) + root := t.TempDir() // setup default container config and state for mocking var ( id = "1" - expectedHooks = &configs.Hooks{ - Prestart: []configs.Hook{ + expectedHooks = configs.Hooks{ + configs.Prestart: configs.HookList{ configs.CommandHook{Command: configs.Command{Path: "prestart-hook"}}, }, - Poststart: []configs.Hook{ + configs.Poststart: configs.HookList{ configs.CommandHook{Command: configs.Command{Path: "poststart-hook"}}, }, - Poststop: []configs.Hook{ + configs.Poststop: configs.HookList{ unserializableHook{}, configs.CommandHook{Command: configs.Command{Path: "poststop-hook"}}, }, @@ -178,6 +140,9 @@ func TestFactoryLoadContainer(t *testing.T) { expectedConfig = &configs.Config{ Rootfs: "/mycontainer/root", Hooks: expectedHooks, + Cgroups: &configs.Cgroup{ + Resources: &configs.Resources{}, + }, } expectedState = &State{ BaseState: BaseState{ @@ -186,13 +151,13 @@ func TestFactoryLoadContainer(t *testing.T) { }, } ) - if err := os.Mkdir(filepath.Join(root, id), 0700); err != nil { + if err := os.Mkdir(filepath.Join(root, id), 0o700); err != nil { t.Fatal(err) } if err := marshal(filepath.Join(root, id, stateFilename), expectedState); err != nil { t.Fatal(err) } - factory, err := New(root, Cgroupfs, IntelRdtFs) + factory, err := New(root, IntelRdtFs) if err != nil { t.Fatal(err) } @@ -207,7 +172,7 @@ func TestFactoryLoadContainer(t *testing.T) { if config.Rootfs != expectedConfig.Rootfs { t.Fatalf("expected rootfs %q but received %q", 
expectedConfig.Rootfs, config.Rootfs) } - expectedHooks.Poststop = expectedHooks.Poststop[1:] // expect unserializable hook to be skipped + expectedHooks[configs.Poststop] = expectedHooks[configs.Poststop][1:] // expect unserializable hook to be skipped if !reflect.DeepEqual(config.Hooks, expectedHooks) { t.Fatalf("expects hooks %q but received %q", expectedHooks, config.Hooks) } @@ -225,7 +190,7 @@ func marshal(path string, v interface{}) error { if err != nil { return err } - defer f.Close() + defer f.Close() //nolint: errcheck return utils.WriteJSON(f, v) } diff --git a/libcontainer/generic_error.go b/libcontainer/generic_error.go deleted file mode 100644 index 6e7de2f..0000000 --- a/libcontainer/generic_error.go +++ /dev/null @@ -1,92 +0,0 @@ -package libcontainer - -import ( - "fmt" - "io" - "text/template" - "time" - - "github.com/opencontainers/runc/libcontainer/stacktrace" -) - -var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} -Code: {{.ECode}} -{{if .Message }} -Message: {{.Message}} -{{end}} -Frames:{{range $i, $frame := .Stack.Frames}} ---- -{{$i}}: {{$frame.Function}} -Package: {{$frame.Package}} -File: {{$frame.File}}@{{$frame.Line}}{{end}} -`)) - -func newGenericError(err error, c ErrorCode) Error { - if le, ok := err.(Error); ok { - return le - } - gerr := &genericError{ - Timestamp: time.Now(), - Err: err, - ECode: c, - Stack: stacktrace.Capture(1), - } - if err != nil { - gerr.Message = err.Error() - } - return gerr -} - -func newSystemError(err error) Error { - return createSystemError(err, "") -} - -func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error { - return createSystemError(err, fmt.Sprintf(cause, v...)) -} - -func newSystemErrorWithCause(err error, cause string) Error { - return createSystemError(err, cause) -} - -// createSystemError creates the specified error with the correct number of -// stack frames skipped. 
This is only to be called by the other functions for -// formatting the error. -func createSystemError(err error, cause string) Error { - gerr := &genericError{ - Timestamp: time.Now(), - Err: err, - ECode: SystemError, - Cause: cause, - Stack: stacktrace.Capture(2), - } - if err != nil { - gerr.Message = err.Error() - } - return gerr -} - -type genericError struct { - Timestamp time.Time - ECode ErrorCode - Err error `json:"-"` - Cause string - Message string - Stack stacktrace.Stacktrace -} - -func (e *genericError) Error() string { - if e.Cause == "" { - return e.Message - } - frame := e.Stack.Frames[0] - return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message) -} - -func (e *genericError) Code() ErrorCode { - return e.ECode -} - -func (e *genericError) Detail(w io.Writer) error { - return errorTemplate.Execute(w, e) -} diff --git a/libcontainer/generic_error_test.go b/libcontainer/generic_error_test.go deleted file mode 100644 index 8fbdd4d..0000000 --- a/libcontainer/generic_error_test.go +++ /dev/null @@ -1,49 +0,0 @@ -package libcontainer - -import ( - "fmt" - "io/ioutil" - "testing" -) - -func TestErrorDetail(t *testing.T) { - err := newGenericError(fmt.Errorf("test error"), SystemError) - if derr := err.Detail(ioutil.Discard); derr != nil { - t.Fatal(derr) - } -} - -func TestErrorWithCode(t *testing.T) { - err := newGenericError(fmt.Errorf("test error"), SystemError) - if code := err.Code(); code != SystemError { - t.Fatalf("expected err code %q but %q", SystemError, code) - } -} - -func TestErrorWithError(t *testing.T) { - cc := []struct { - errmsg string - cause string - }{ - { - errmsg: "test error", - }, - { - errmsg: "test error", - cause: "test", - }, - } - - for _, v := range cc { - err := newSystemErrorWithCause(fmt.Errorf(v.errmsg), v.cause) - - msg := err.Error() - if v.cause == "" && msg != v.errmsg { - t.Fatalf("expected err(%q) equal errmsg(%q)", msg, v.errmsg) - } - if v.cause != "" && msg == v.errmsg { - 
t.Fatalf("unexpected err(%q) equal errmsg(%q)", msg, v.errmsg) - } - - } -} diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index c1b1560..cb862a6 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -1,29 +1,29 @@ -// +build linux - package libcontainer import ( + "bytes" "encoding/json" + "errors" "fmt" "io" - "io/ioutil" "net" "os" + "strconv" "strings" - "syscall" // only for Errno "unsafe" + "github.com/containerd/console" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "github.com/vishvananda/netlink" "golang.org/x/sys/unix" - "github.com/containerd/console" + "github.com/opencontainers/runc/libcontainer/capabilities" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/user" "github.com/opencontainers/runc/libcontainer/utils" - "github.com/pkg/errors" - "github.com/sirupsen/logrus" - "github.com/vishvananda/netlink" ) type initType string @@ -34,8 +34,8 @@ const ( ) type pid struct { - Pid int `json:"pid"` - PidFirstChild int `json:"pid_first"` + Pid int `json:"stage2_pid"` + PidFirstChild int `json:"stage1_pid"` } // network is an internal struct used to setup container networks. 
@@ -68,13 +68,15 @@ type initConfig struct { ConsoleHeight uint16 `json:"console_height"` RootlessEUID bool `json:"rootless_euid,omitempty"` RootlessCgroups bool `json:"rootless_cgroups,omitempty"` + SpecState *specs.State `json:"spec_state,omitempty"` + Cgroup2Path string `json:"cgroup2_path,omitempty"` } type initer interface { Init() error } -func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) { +func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds []int) (initer, error) { var config *initConfig if err := json.NewDecoder(pipe).Decode(&config); err != nil { return nil, err @@ -84,10 +86,16 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd } switch t { case initSetns: + // mountFds must be nil in this case. We don't mount while doing runc exec. + if mountFds != nil { + return nil, errors.New("mountFds must be nil. Can't mount while doing runc exec.") + } + return &linuxSetnsInit{ pipe: pipe, consoleSocket: consoleSocket, config: config, + logFd: logFd, }, nil case initStandard: return &linuxStandardInit{ @@ -96,6 +104,8 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd parentPid: unix.Getppid(), config: config, fifoFd: fifoFd, + logFd: logFd, + mountFds: mountFds, }, nil } return nil, fmt.Errorf("unknown init type %q", t) @@ -107,9 +117,19 @@ func populateProcessEnvironment(env []string) error { for _, pair := range env { p := strings.SplitN(pair, "=", 2) if len(p) < 2 { - return fmt.Errorf("invalid environment '%v'", pair) + return fmt.Errorf("invalid environment variable: %q", pair) } - if err := os.Setenv(p[0], p[1]); err != nil { + name, val := p[0], p[1] + if name == "" { + return fmt.Errorf("environment variable name can't be empty: %q", pair) + } + if strings.IndexByte(name, 0) >= 0 { + return fmt.Errorf("environment variable name can't contain null(\\x00): %q", pair) + } + if 
strings.IndexByte(val, 0) >= 0 { + return fmt.Errorf("environment variable value can't contain null(\\x00): %q", pair) + } + if err := os.Setenv(name, val); err != nil { return err } } @@ -124,41 +144,61 @@ func finalizeNamespace(config *initConfig) error { // inherited are marked close-on-exec so they stay out of the // container if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil { - return errors.Wrap(err, "close exec fds") + return fmt.Errorf("error closing exec fds: %w", err) } - if config.Cwd != "" { - if err := unix.Chdir(config.Cwd); err != nil { - return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err) + // we only do chdir if it's specified + doChdir := config.Cwd != "" + if doChdir { + // First, attempt the chdir before setting up the user. + // This could allow us to access a directory that the user running runc can access + // but the container user cannot. + err := unix.Chdir(config.Cwd) + switch { + case err == nil: + doChdir = false + case os.IsPermission(err): + // If we hit an EPERM, we should attempt again after setting up user. + // This will allow us to successfully chdir if the container user has access + // to the directory, but the user running runc does not. + // This is useful in cases where the cwd is also a volume that's been chowned to the container user. 
+ default: + return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err) } } - capabilities := &configs.Capabilities{} + caps := &configs.Capabilities{} if config.Capabilities != nil { - capabilities = config.Capabilities + caps = config.Capabilities } else if config.Config.Capabilities != nil { - capabilities = config.Config.Capabilities + caps = config.Config.Capabilities } - w, err := newContainerCapList(capabilities) + w, err := capabilities.New(caps) if err != nil { return err } // drop capabilities in bounding set before changing user if err := w.ApplyBoundingSet(); err != nil { - return errors.Wrap(err, "apply bounding set") + return fmt.Errorf("unable to apply bounding set: %w", err) } // preserve existing capabilities while we change users if err := system.SetKeepCaps(); err != nil { - return errors.Wrap(err, "set keep caps") + return fmt.Errorf("unable to set keep caps: %w", err) } if err := setupUser(config); err != nil { - return errors.Wrap(err, "setup user") + return fmt.Errorf("unable to setup user: %w", err) + } + // Change working directory AFTER the user has been set up, if we haven't done it yet. + if doChdir { + if err := unix.Chdir(config.Cwd); err != nil { + return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err) + } } if err := system.ClearKeepCaps(); err != nil { - return errors.Wrap(err, "clear keep caps") + return fmt.Errorf("unable to clear keep caps: %w", err) } if err := w.ApplyCaps(); err != nil { - return errors.Wrap(err, "apply caps") + return fmt.Errorf("unable to apply caps: %w", err) } return nil } @@ -183,6 +223,9 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { return err } + // After we return from here, we don't need the console anymore. 
+ defer pty.Close() + if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 { err = pty.Resize(console.WinSize{ Height: config.ConsoleHeight, @@ -194,9 +237,6 @@ func setupConsole(socket *os.File, config *initConfig, mount bool) error { } } - // After we return from here, we don't need the console anymore. - defer pty.Close() - // Mount the console inside our rootfs. if mount { if err := mountConsole(slavePath); err != nil { @@ -237,6 +277,36 @@ func syncParentHooks(pipe io.ReadWriter) error { return readSync(pipe, procResume) } +// syncParentSeccomp sends to the given pipe a JSON payload which +// indicates that the parent should pick up the seccomp fd with pidfd_getfd() +// and send it to the seccomp agent over a unix socket. It then waits for +// the parent to indicate that it is cleared to resume and closes the seccompFd. +// If the seccompFd is -1, there isn't anything to sync with the parent, so it +// returns no error. +func syncParentSeccomp(pipe io.ReadWriter, seccompFd int) error { + if seccompFd == -1 { + return nil + } + + // Tell parent. + if err := writeSyncWithFd(pipe, procSeccomp, seccompFd); err != nil { + unix.Close(seccompFd) + return err + } + + // Wait for parent to give the all-clear. + if err := readSync(pipe, procSeccompDone); err != nil { + unix.Close(seccompFd) + return fmt.Errorf("sync parent seccomp: %w", err) + } + + if err := unix.Close(seccompFd); err != nil { + return fmt.Errorf("close seccomp fd: %w", err) + } + + return nil +} + // setupUser changes the groups, gid, and uid for the user inside the container func setupUser(config *initConfig) error { // Set up defaults. @@ -272,10 +342,10 @@ func setupUser(config *initConfig) error { // Rather than just erroring out later in setuid(2) and setgid(2), check // that the user is mapped here. 
if _, err := config.Config.HostUID(execUser.Uid); err != nil { - return fmt.Errorf("cannot set uid to unmapped user in user namespace") + return errors.New("cannot set uid to unmapped user in user namespace") } if _, err := config.Config.HostGID(execUser.Gid); err != nil { - return fmt.Errorf("cannot set gid to unmapped user in user namespace") + return errors.New("cannot set gid to unmapped user in user namespace") } if config.RootlessEUID { @@ -284,17 +354,17 @@ func setupUser(config *initConfig) error { // this check earlier, but if libcontainer.Process.User was typesafe // this might work. if len(addGroups) > 0 { - return fmt.Errorf("cannot set any additional groups in a rootless container") + return errors.New("cannot set any additional groups in a rootless container") } } // Before we change to the container's user make sure that the processes // STDIO is correctly owned by the user that we are switching to. - if err := fixStdioPermissions(config, execUser); err != nil { + if err := fixStdioPermissions(execUser); err != nil { return err } - setgroups, err := ioutil.ReadFile("/proc/self/setgroups") + setgroups, err := os.ReadFile("/proc/self/setgroups") if err != nil && !os.IsNotExist(err) { return err } @@ -303,12 +373,12 @@ func setupUser(config *initConfig) error { // There's nothing we can do about /etc/group entries, so we silently // ignore setting groups here (since the user didn't explicitly ask us to // set the group). - allowSupGroups := !config.RootlessEUID && strings.TrimSpace(string(setgroups)) != "deny" + allowSupGroups := !config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny" if allowSupGroups { suppGroups := append(execUser.Sgids, addGroups...) 
if err := unix.Setgroups(suppGroups); err != nil { - return err + return &os.SyscallError{Syscall: "setgroups", Err: err} } } @@ -331,10 +401,10 @@ func setupUser(config *initConfig) error { // fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user. // The ownership needs to match because it is created outside of the container and needs to be // localized. -func fixStdioPermissions(config *initConfig, u *user.ExecUser) error { +func fixStdioPermissions(u *user.ExecUser) error { var null unix.Stat_t if err := unix.Stat("/dev/null", &null); err != nil { - return err + return &os.PathError{Op: "stat", Path: "/dev/null", Err: err} } for _, fd := range []uintptr{ os.Stdin.Fd(), @@ -343,7 +413,7 @@ func fixStdioPermissions(config *initConfig, u *user.ExecUser) error { } { var s unix.Stat_t if err := unix.Fstat(int(fd), &s); err != nil { - return err + return &os.PathError{Op: "fstat", Path: "fd " + strconv.Itoa(int(fd)), Err: err} } // Skip chown of /dev/null if it was used as one of the STDIO fds. @@ -364,10 +434,12 @@ func fixStdioPermissions(config *initConfig, u *user.ExecUser) error { // privileged_wrt_inode_uidgid() has failed). In either case, we // are in a configuration where it's better for us to just not // touch the stdio rather than bail at this point. 
+ + // nolint:errorlint // unix errors are bare if err == unix.EINVAL || err == unix.EPERM { continue } - return err + return &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err} } } return nil @@ -421,8 +493,8 @@ func setupRoute(config *configs.Config) error { func setupRlimits(limits []configs.Rlimit, pid int) error { for _, rlimit := range limits { - if err := system.Prlimit(pid, rlimit.Type, unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil { - return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err) + if err := unix.Prlimit(pid, rlimit.Type, &unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}, nil); err != nil { + return fmt.Errorf("error setting rlimit type %v: %w", rlimit.Type, err) } } return nil @@ -430,6 +502,7 @@ func setupRlimits(limits []configs.Rlimit, pid int) error { const _P_PID = 1 +//nolint:structcheck,unused type siginfo struct { si_signo int32 si_errno int32 @@ -446,27 +519,12 @@ func isWaitable(pid int) (bool, error) { si := &siginfo{} _, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0) if e != 0 { - return false, os.NewSyscallError("waitid", e) + return false, &os.SyscallError{Syscall: "waitid", Err: e} } return si.si_pid != 0, nil } -// isNoChildren returns true if err represents a unix.ECHILD (formerly syscall.ECHILD) false otherwise -func isNoChildren(err error) bool { - switch err := err.(type) { - case syscall.Errno: - if err == unix.ECHILD { - return true - } - case *os.SyscallError: - if err.Err == unix.ECHILD { - return true - } - } - return false -} - // signalAllProcesses freezes then iterates over all the processes inside the // manager's cgroups sending the signal s to them. // If s is SIGKILL then it will wait for each process to exit. 
@@ -479,7 +537,9 @@ func signalAllProcesses(m cgroups.Manager, s os.Signal) error { } pids, err := m.GetAllPids() if err != nil { - m.Freeze(configs.Thawed) + if err := m.Freeze(configs.Thawed); err != nil { + logrus.Warn(err) + } return err } for _, pid := range pids { @@ -510,7 +570,7 @@ func signalAllProcesses(m cgroups.Manager, s os.Signal) error { for _, p := range procs { if s != unix.SIGKILL { if ok, err := isWaitable(p.Pid); err != nil { - if !isNoChildren(err) { + if !errors.Is(err, unix.ECHILD) { logrus.Warn("signalAllProcesses: ", p.Pid, err) } continue @@ -527,7 +587,7 @@ func signalAllProcesses(m cgroups.Manager, s os.Signal) error { // to retrieve its exit code. if subreaper == 0 { if _, err := p.Wait(); err != nil { - if !isNoChildren(err) { + if !errors.Is(err, unix.ECHILD) { logrus.Warn("wait: ", err) } } diff --git a/libcontainer/integration/checkpoint_test.go b/libcontainer/integration/checkpoint_test.go index cdb6810..c86ec8a 100644 --- a/libcontainer/integration/checkpoint_test.go +++ b/libcontainer/integration/checkpoint_test.go @@ -3,7 +3,6 @@ package integration import ( "bufio" "bytes" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -11,21 +10,19 @@ import ( "testing" "github.com/opencontainers/runc/libcontainer" - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" - "golang.org/x/sys/unix" ) -func showFile(t *testing.T, fname string) error { +func showFile(t *testing.T, fname string) { + t.Helper() t.Logf("=== %s ===\n", fname) f, err := os.Open(fname) if err != nil { t.Log(err) - return err + return } - defer f.Close() + defer f.Close() //nolint: errcheck scanner := bufio.NewScanner(f) for scanner.Scan() { @@ -33,18 +30,16 @@ func showFile(t *testing.T, fname string) error { } if err := scanner.Err(); err != nil { - return err + t.Log(err) + return } t.Logf("=== END ===\n") - - return nil } func TestUsernsCheckpoint(t *testing.T) { - t.Skip("Ubuntu kernel is broken to run criu 
(#2196, #2198)") if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { - t.Skip("userns is unsupported") + t.Skip("Test requires userns.") } cmd := exec.Command("criu", "check", "--feature", "userns") if err := cmd.Run(); err != nil { @@ -54,7 +49,6 @@ func TestUsernsCheckpoint(t *testing.T) { } func TestCheckpoint(t *testing.T) { - t.Skip("Ubuntu kernel is broken to run criu (#2196, #2198)") testCheckpoint(t, false) } @@ -62,52 +56,21 @@ func testCheckpoint(t *testing.T, userns bool) { if testing.Short() { return } - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") + + if _, err := exec.LookPath("criu"); err != nil { + t.Skipf("criu binary not found: %v", err) } - root, err := newTestRoot() - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(root) - - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - - config.Mounts = append(config.Mounts, &configs.Mount{ - Destination: "/sys/fs/cgroup", - Device: "cgroup", - Flags: defaultMountFlags | unix.MS_RDONLY, - }) - - if userns { - config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) - } - - factory, err := libcontainer.New(root, libcontainer.Cgroupfs) - - if err != nil { - t.Fatal(err) - } + config := newTemplateConfig(t, &tParam{userns: userns}) + factory, err := libcontainer.New(t.TempDir()) + ok(t, err) container, err := factory.Create("test", config) - if err != nil { - t.Fatal(err) - } - defer container.Destroy() + ok(t, err) + defer destroyContainer(container) stdinR, stdinW, err := os.Pipe() - if err != nil { - t.Fatal(err) - } + ok(t, err) var stdout bytes.Buffer @@ -121,28 +84,19 @@ func testCheckpoint(t *testing.T, userns bool) { } err = container.Run(&pconfig) - stdinR.Close() - defer stdinW.Close() 
- if err != nil { - t.Fatal(err) - } + _ = stdinR.Close() + defer stdinW.Close() //nolint: errcheck + ok(t, err) pid, err := pconfig.Pid() - if err != nil { - t.Fatal(err) - } + ok(t, err) process, err := os.FindProcess(pid) - if err != nil { - t.Fatal(err) - } + ok(t, err) - parentDir, err := ioutil.TempDir("", "criu-parent") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(parentDir) + tmp := t.TempDir() + parentDir := filepath.Join(tmp, "criu-parent") preDumpOpts := &libcontainer.CriuOpts{ ImagesDirectory: parentDir, WorkDirectory: parentDir, @@ -156,19 +110,13 @@ func testCheckpoint(t *testing.T, userns bool) { } state, err := container.Status() - if err != nil { - t.Fatal(err) - } + ok(t, err) if state != libcontainer.Running { t.Fatal("Unexpected preDump state: ", state) } - imagesDir, err := ioutil.TempDir("", "criu") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(imagesDir) + imagesDir := filepath.Join(tmp, "criu") checkpointOpts := &libcontainer.CriuOpts{ ImagesDirectory: imagesDir, @@ -184,80 +132,58 @@ func testCheckpoint(t *testing.T, userns bool) { } state, err = container.Status() - if err != nil { - t.Fatal(err) - } + ok(t, err) if state != libcontainer.Stopped { t.Fatal("Unexpected state checkpoint: ", state) } - stdinW.Close() + _ = stdinW.Close() _, err = process.Wait() - if err != nil { - t.Fatal(err) - } + ok(t, err) // reload the container container, err = factory.Load("test") - if err != nil { - t.Fatal(err) - } + ok(t, err) restoreStdinR, restoreStdinW, err := os.Pipe() - if err != nil { - t.Fatal(err) - } + ok(t, err) + var restoreStdout bytes.Buffer restoreProcessConfig := &libcontainer.Process{ Cwd: "/", Stdin: restoreStdinR, - Stdout: &stdout, + Stdout: &restoreStdout, Init: true, } err = container.Restore(restoreProcessConfig, checkpointOpts) - restoreStdinR.Close() - defer restoreStdinW.Close() + _ = restoreStdinR.Close() + defer restoreStdinW.Close() //nolint: errcheck if err != nil { showFile(t, restoreLog) 
t.Fatal(err) } state, err = container.Status() - if err != nil { - t.Fatal(err) - } + ok(t, err) if state != libcontainer.Running { t.Fatal("Unexpected restore state: ", state) } pid, err = restoreProcessConfig.Pid() - if err != nil { - t.Fatal(err) - } + ok(t, err) - process, err = os.FindProcess(pid) - if err != nil { - t.Fatal(err) - } + err = unix.Kill(pid, 0) + ok(t, err) _, err = restoreStdinW.WriteString("Hello!") - if err != nil { - t.Fatal(err) - } + ok(t, err) - restoreStdinW.Close() - s, err := process.Wait() - if err != nil { - t.Fatal(err) - } + _ = restoreStdinW.Close() + waitProcess(restoreProcessConfig, t) - if !s.Success() { - t.Fatal(s.String(), pid) - } - - output := string(stdout.Bytes()) + output := restoreStdout.String() if !strings.Contains(output, "Hello!") { t.Fatal("Did not restore the pipe correctly:", output) } diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 7822fa8..ada4f85 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -3,14 +3,15 @@ package integration import ( "bytes" "encoding/json" + "errors" "fmt" - "io/ioutil" "os" "os/exec" "path/filepath" "reflect" "strconv" "strings" + "syscall" "testing" "github.com/opencontainers/runc/libcontainer" @@ -28,7 +29,7 @@ func TestExecPS(t *testing.T) { func TestUsernsExecPS(t *testing.T) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { - t.Skip("userns is unsupported") + t.Skip("Test requires userns.") } testExecPS(t, true) } @@ -37,17 +38,9 @@ func testExecPS(t *testing.T, userns bool) { if testing.Short() { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - config := newTemplateConfig(rootfs) - if userns { - config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) - 
} + config := newTemplateConfig(t, &tParam{userns: userns}) - buffers, exitCode, err := runContainer(config, "", "ps", "-o", "pid,user,comm") + buffers, exitCode, err := runContainer(t, config, "ps", "-o", "pid,user,comm") if err != nil { t.Fatalf("%s: %s", buffers, err) } @@ -70,15 +63,11 @@ func TestIPCPrivate(t *testing.T) { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - l, err := os.Readlink("/proc/1/ns/ipc") ok(t, err) - config := newTemplateConfig(rootfs) - buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc") + config := newTemplateConfig(t, nil) + buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/ipc") ok(t, err) if exitCode != 0 { @@ -95,16 +84,12 @@ func TestIPCHost(t *testing.T) { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - l, err := os.Readlink("/proc/1/ns/ipc") ok(t, err) - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Namespaces.Remove(configs.NEWIPC) - buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc") + buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/ipc") ok(t, err) if exitCode != 0 { @@ -121,17 +106,13 @@ func TestIPCJoinPath(t *testing.T) { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - l, err := os.Readlink("/proc/1/ns/ipc") ok(t, err) - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipc") - buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc") + buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/ipc") ok(t, err) if exitCode != 0 { @@ -148,15 +129,10 @@ func TestIPCBadPath(t *testing.T) { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Namespaces.Add(configs.NEWIPC, 
"/proc/1/ns/ipcc") - _, _, err = runContainer(config, "", "true") - if err == nil { + if _, _, err := runContainer(t, config, "true"); err == nil { t.Fatal("container succeeded with bad ipc path") } } @@ -167,7 +143,7 @@ func TestRlimit(t *testing.T) { func TestUsernsRlimit(t *testing.T) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { - t.Skip("userns is unsupported") + t.Skip("Test requires userns.") } testRlimit(t, true) @@ -178,16 +154,7 @@ func testRlimit(t *testing.T, userns bool) { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - if userns { - config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) - } + config := newTemplateConfig(t, &tParam{userns: userns}) // ensure limit is lower than what the config requests to test that in a user namespace // the Setrlimit call happens early enough that we still have permissions to raise the limit. 
@@ -196,7 +163,7 @@ func testRlimit(t *testing.T, userns bool) { Cur: 1024, })) - out, _, err := runContainer(config, "", "/bin/sh", "-c", "ulimit -n") + out, _, err := runContainer(t, config, "/bin/sh", "-c", "ulimit -n") ok(t, err) if limit := strings.TrimSpace(out.Stdout.String()); limit != "1025" { t.Fatalf("expected rlimit to be 1025, got %s", limit) @@ -207,19 +174,12 @@ func TestEnter(t *testing.T) { if testing.Short() { return } - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") - } - rootfs, err := newRootfs() + config := newTemplateConfig(t, nil) + + container, err := newContainer(t, config) ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - - container, err := newContainerWithName("test", config) - ok(t, err) - defer container.Destroy() + defer destroyContainer(container) // Execute a first process in the container stdinR, stdinW, err := os.Pipe() @@ -236,8 +196,8 @@ func TestEnter(t *testing.T) { Init: true, } err = container.Run(&pconfig) - stdinR.Close() - defer stdinW.Close() + _ = stdinR.Close() + defer stdinW.Close() //nolint: errcheck ok(t, err) pid, err := pconfig.Pid() ok(t, err) @@ -254,8 +214,8 @@ func TestEnter(t *testing.T) { pconfig2.Stdout = &stdout2 err = container.Run(&pconfig2) - stdinR2.Close() - defer stdinW2.Close() + _ = stdinR2.Close() + defer stdinW2.Close() //nolint: errcheck ok(t, err) pid2, err := pconfig2.Pid() @@ -275,17 +235,17 @@ func TestEnter(t *testing.T) { } // Wait processes - stdinW2.Close() + _ = stdinW2.Close() waitProcess(&pconfig2, t) - stdinW.Close() + _ = stdinW.Close() waitProcess(&pconfig, t) // Check that both processes live in the same pidns - pidns := string(stdout.Bytes()) + pidns := stdout.String() ok(t, err) - pidns2 := string(stdout2.Bytes()) + pidns2 := stdout2.String() ok(t, err) if pidns != pidns2 { @@ -298,15 +258,10 @@ func TestProcessEnv(t *testing.T) { return } - rootfs, err := newRootfs() + config := newTemplateConfig(t, nil) + container, err := 
newContainer(t, config) ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - - container, err := newContainerWithName("test", config) - ok(t, err) - defer container.Destroy() + defer destroyContainer(container) var stdout bytes.Buffer pconfig := libcontainer.Process{ @@ -328,7 +283,7 @@ func TestProcessEnv(t *testing.T) { // Wait for process waitProcess(&pconfig, t) - outputEnv := string(stdout.Bytes()) + outputEnv := stdout.String() // Check that the environment has the key/value pair we added if !strings.Contains(outputEnv, "FOO=BAR") { @@ -346,16 +301,12 @@ func TestProcessEmptyCaps(t *testing.T) { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Capabilities = nil - container, err := newContainerWithName("test", config) + container, err := newContainer(t, config) ok(t, err) - defer container.Destroy() + defer destroyContainer(container) var stdout bytes.Buffer pconfig := libcontainer.Process{ @@ -372,7 +323,7 @@ func TestProcessEmptyCaps(t *testing.T) { // Wait for process waitProcess(&pconfig, t) - outputStatus := string(stdout.Bytes()) + outputStatus := stdout.String() lines := strings.Split(outputStatus, "\n") @@ -395,15 +346,10 @@ func TestProcessCaps(t *testing.T) { return } - rootfs, err := newRootfs() + config := newTemplateConfig(t, nil) + container, err := newContainer(t, config) ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - - container, err := newContainerWithName("test", config) - ok(t, err) - defer container.Destroy() + defer destroyContainer(container) var stdout bytes.Buffer pconfig := libcontainer.Process{ @@ -425,7 +371,7 @@ func TestProcessCaps(t *testing.T) { // Wait for process waitProcess(&pconfig, t) - outputStatus := string(stdout.Bytes()) + outputStatus := stdout.String() lines := strings.Split(outputStatus, "\n") @@ -450,10 +396,7 @@ func TestProcessCaps(t *testing.T) { 
t.Fatal("Could not parse effective caps", err) } - var netAdminMask uint64 - var netAdminBit uint - netAdminBit = 12 // from capability.h - netAdminMask = 1 << netAdminBit + const netAdminMask = 1 << unix.CAP_NET_ADMIN if effectiveCaps&netAdminMask != netAdminMask { t.Fatal("CAP_NET_ADMIN is not set as expected") } @@ -463,15 +406,11 @@ func TestAdditionalGroups(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - config := newTemplateConfig(rootfs) - - container, err := newContainerWithName("test", config) + config := newTemplateConfig(t, nil) + container, err := newContainer(t, config) ok(t, err) - defer container.Destroy() + defer destroyContainer(container) var stdout bytes.Buffer pconfig := libcontainer.Process{ @@ -489,7 +428,7 @@ func TestAdditionalGroups(t *testing.T) { // Wait for process waitProcess(&pconfig, t) - outputGroups := string(stdout.Bytes()) + outputGroups := stdout.String() // Check that the groups output has the groups that we specified if !strings.Contains(outputGroups, "audio") { @@ -502,32 +441,38 @@ func TestAdditionalGroups(t *testing.T) { } func TestFreeze(t *testing.T) { - testFreeze(t, false) -} - -func TestSystemdFreeze(t *testing.T) { - if !systemd.UseSystemd() { - t.Skip("Systemd is unsupported") + for _, systemd := range []bool{true, false} { + for _, set := range []bool{true, false} { + name := "" + if systemd { + name += "Systemd" + } else { + name += "FS" + } + if set { + name += "ViaSet" + } else { + name += "ViaPauseResume" + } + t.Run(name, func(t *testing.T) { + testFreeze(t, systemd, set) + }) + } } - testFreeze(t, true) } -func testFreeze(t *testing.T, systemd bool) { +func testFreeze(t *testing.T, withSystemd bool, useSet bool) { if testing.Short() { return } - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") + if withSystemd && !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") } - rootfs, err := newRootfs() + config := 
newTemplateConfig(t, &tParam{systemd: withSystemd}) + container, err := newContainer(t, config) ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - container, err := newContainerWithName("test", config) - ok(t, err) - defer container.Destroy() + defer destroyContainer(container) stdinR, stdinW, err := os.Pipe() ok(t, err) @@ -540,21 +485,33 @@ func testFreeze(t *testing.T, systemd bool) { Init: true, } err = container.Run(pconfig) - stdinR.Close() - defer stdinW.Close() + _ = stdinR.Close() + defer stdinW.Close() //nolint: errcheck ok(t, err) - err = container.Pause() + if !useSet { + err = container.Pause() + } else { + config.Cgroups.Resources.Freezer = configs.Frozen + err = container.Set(*config) + } ok(t, err) + state, err := container.Status() ok(t, err) - err = container.Resume() - ok(t, err) if state != libcontainer.Paused { t.Fatal("Unexpected state: ", state) } - stdinW.Close() + if !useSet { + err = container.Resume() + } else { + config.Cgroups.Resources.Freezer = configs.Thawed + err = container.Set(*config) + } + ok(t, err) + + _ = stdinW.Close() waitProcess(pconfig, t) } @@ -563,8 +520,8 @@ func TestCpuShares(t *testing.T) { } func TestCpuSharesSystemd(t *testing.T) { - if !systemd.UseSystemd() { - t.Skip("Systemd is unsupported") + if !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") } testCpuShares(t, true) } @@ -574,21 +531,13 @@ func testCpuShares(t *testing.T, systemd bool) { return } if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") + t.Skip("cgroup v2 does not support CpuShares") } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - if systemd { - config.Cgroups.Parent = "system.slice" - } + config := newTemplateConfig(t, &tParam{systemd: systemd}) config.Cgroups.Resources.CpuShares = 1 - _, _, err = runContainer(config, "", "ps") - if err == nil { + if _, _, err := runContainer(t, config, "ps"); err == nil { 
t.Fatalf("runContainer should failed with invalid CpuShares") } } @@ -598,8 +547,8 @@ func TestPids(t *testing.T) { } func TestPidsSystemd(t *testing.T) { - if !systemd.UseSystemd() { - t.Skip("Systemd is unsupported") + if !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") } testPids(t, true) } @@ -608,25 +557,12 @@ func testPids(t *testing.T, systemd bool) { if testing.Short() { return } - if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") - } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - if systemd { - config.Cgroups.Parent = "system.slice" - } + config := newTemplateConfig(t, &tParam{systemd: systemd}) config.Cgroups.Resources.PidsLimit = -1 // Running multiple processes. - _, ret, err := runContainer(config, "", "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true") - if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { - t.Skip("PIDs cgroup is unsupported") - } + _, ret, err := runContainer(t, config, "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true") ok(t, err) if ret != 0 { @@ -636,14 +572,11 @@ func testPids(t *testing.T, systemd bool) { // Enforce a permissive limit. This needs to be fairly hand-wavey due to the // issues with running Go binaries with pids restrictions (see below). 
config.Cgroups.Resources.PidsLimit = 64 - _, ret, err = runContainer(config, "", "/bin/sh", "-c", ` + _, ret, err = runContainer(t, config, "/bin/sh", "-c", ` /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true`) - if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { - t.Skip("PIDs cgroup is unsupported") - } ok(t, err) if ret != 0 { @@ -653,7 +586,7 @@ func testPids(t *testing.T, systemd bool) { // Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause this // to fail reliability. config.Cgroups.Resources.PidsLimit = 64 - out, _, err := runContainer(config, "", "/bin/sh", "-c", ` + out, _, err := runContainer(t, config, "/bin/sh", "-c", ` /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | @@ -662,15 +595,12 @@ func testPids(t *testing.T, systemd bool) { /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true`) - if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { - t.Skip("PIDs cgroup is unsupported") - } if err != nil && !strings.Contains(out.String(), "sh: can't fork") { - ok(t, err) + t.Fatal(err) } if err == nil { - t.Fatalf("expected fork() to fail with restrictive pids limit") + t.Fatal("expected fork() to fail 
with restrictive pids limit") } // Minimal restrictions are not really supported, due to quirks in using Go @@ -679,38 +609,135 @@ func testPids(t *testing.T, systemd bool) { // As such, we don't test that case. YMMV. } -func TestRunWithKernelMemory(t *testing.T) { - testRunWithKernelMemory(t, false) +func TestCgroupResourcesUnifiedErrorOnV1(t *testing.T) { + testCgroupResourcesUnifiedErrorOnV1(t, false) } -func TestRunWithKernelMemorySystemd(t *testing.T) { - if !systemd.UseSystemd() { - t.Skip("Systemd is unsupported") +func TestCgroupResourcesUnifiedErrorOnV1Systemd(t *testing.T) { + if !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") } - testRunWithKernelMemory(t, true) + testCgroupResourcesUnifiedErrorOnV1(t, true) } -func testRunWithKernelMemory(t *testing.T, systemd bool) { +func testCgroupResourcesUnifiedErrorOnV1(t *testing.T, systemd bool) { if testing.Short() { return } if cgroups.IsCgroup2UnifiedMode() { - t.Skip("cgroup v1 is not supported") + t.Skip("requires cgroup v1") } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - if systemd { - config.Cgroups.Parent = "system.slice" + config := newTemplateConfig(t, &tParam{systemd: systemd}) + config.Cgroups.Resources.Unified = map[string]string{ + "memory.min": "10240", } - config.Cgroups.Resources.KernelMemory = 52428800 + _, _, err := runContainer(t, config, "true") + if !strings.Contains(err.Error(), cgroups.ErrV1NoUnified.Error()) { + t.Fatalf("expected error to contain %v, got %v", cgroups.ErrV1NoUnified, err) + } +} - _, _, err = runContainer(config, "", "ps") - if err != nil { - t.Fatalf("runContainer failed with kernel memory limit: %v", err) +func TestCgroupResourcesUnified(t *testing.T) { + testCgroupResourcesUnified(t, false) +} + +func TestCgroupResourcesUnifiedSystemd(t *testing.T) { + if !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + testCgroupResourcesUnified(t, true) +} + +func 
testCgroupResourcesUnified(t *testing.T, systemd bool) { + if testing.Short() { + return + } + if !cgroups.IsCgroup2UnifiedMode() { + t.Skip("requires cgroup v2") + } + + config := newTemplateConfig(t, &tParam{systemd: systemd}) + config.Cgroups.Resources.Memory = 536870912 // 512M + config.Cgroups.Resources.MemorySwap = 536870912 // 512M, i.e. no swap + config.Namespaces.Add(configs.NEWCGROUP, "") + + testCases := []struct { + name string + cfg map[string]string + expError string + cmd []string + exp string + }{ + { + name: "dummy", + cmd: []string{"true"}, + exp: "", + }, + { + name: "set memory.min", + cfg: map[string]string{"memory.min": "131072"}, + cmd: []string{"cat", "/sys/fs/cgroup/memory.min"}, + exp: "131072\n", + }, + { + name: "check memory.max", + cmd: []string{"cat", "/sys/fs/cgroup/memory.max"}, + exp: strconv.Itoa(int(config.Cgroups.Resources.Memory)) + "\n", + }, + + { + name: "overwrite memory.max", + cfg: map[string]string{"memory.max": "268435456"}, + cmd: []string{"cat", "/sys/fs/cgroup/memory.max"}, + exp: "268435456\n", + }, + { + name: "no such controller error", + cfg: map[string]string{"privet.vsem": "vam"}, + expError: "controller \"privet\" not available", + }, + { + name: "slash in key error", + cfg: map[string]string{"bad/key": "val"}, + expError: "must be a file name (no slashes)", + }, + { + name: "no dot in key error", + cfg: map[string]string{"badkey": "val"}, + expError: "must be in the form CONTROLLER.PARAMETER", + }, + { + name: "read-only parameter", + cfg: map[string]string{"pids.current": "42"}, + expError: "failed to write", + }, + } + + for _, tc := range testCases { + config.Cgroups.Resources.Unified = tc.cfg + buffers, ret, err := runContainer(t, config, tc.cmd...) 
+ if tc.expError != "" { + if err == nil { + t.Errorf("case %q failed: expected error, got nil", tc.name) + continue + } + if !strings.Contains(err.Error(), tc.expError) { + t.Errorf("case %q failed: expected error to contain %q, got %q", tc.name, tc.expError, err) + } + continue + } + if err != nil { + t.Errorf("case %q failed: expected no error, got %v (command: %v, status: %d, stderr: %q)", + tc.name, err, tc.cmd, ret, buffers.Stderr.String()) + continue + } + if tc.exp != "" { + out := buffers.Stdout.String() + if out != tc.exp { + t.Errorf("expected %q, got %q", tc.exp, out) + } + } } } @@ -719,18 +746,10 @@ func TestContainerState(t *testing.T) { return } - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) - l, err := os.Readlink("/proc/1/ns/ipc") - if err != nil { - t.Fatal(err) - } + ok(t, err) - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Namespaces = configs.Namespaces([]configs.Namespace{ {Type: configs.NEWNS}, {Type: configs.NEWUTS}, @@ -740,16 +759,13 @@ func TestContainerState(t *testing.T) { {Type: configs.NEWNET}, }) - container, err := newContainerWithName("test", config) - if err != nil { - t.Fatal(err) - } - defer container.Destroy() + container, err := newContainer(t, config) + ok(t, err) + defer destroyContainer(container) stdinR, stdinW, err := os.Pipe() - if err != nil { - t.Fatal(err) - } + ok(t, err) + p := &libcontainer.Process{ Cwd: "/", Args: []string{"cat"}, @@ -758,25 +774,19 @@ func TestContainerState(t *testing.T) { Init: true, } err = container.Run(p) - if err != nil { - t.Fatal(err) - } - stdinR.Close() - defer stdinW.Close() + ok(t, err) + _ = stdinR.Close() + defer stdinW.Close() //nolint: errcheck st, err := container.State() - if err != nil { - t.Fatal(err) - } + ok(t, err) l1, err := os.Readlink(st.NamespacePaths[configs.NEWIPC]) - if err != nil { - t.Fatal(err) - } + ok(t, err) if l1 != l { t.Fatal("Container using non-host ipc namespace") } - 
stdinW.Close() + _ = stdinW.Close() waitProcess(p, t) } @@ -785,29 +795,16 @@ func TestPassExtraFiles(t *testing.T) { return } - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - - container, err := newContainerWithName("test", config) - if err != nil { - t.Fatal(err) - } - defer container.Destroy() + config := newTemplateConfig(t, nil) + container, err := newContainer(t, config) + ok(t, err) + defer destroyContainer(container) var stdout bytes.Buffer pipeout1, pipein1, err := os.Pipe() - if err != nil { - t.Fatal(err) - } + ok(t, err) pipeout2, pipein2, err := os.Pipe() - if err != nil { - t.Fatal(err) - } + ok(t, err) process := libcontainer.Process{ Cwd: "/", Args: []string{"sh", "-c", "cd /proc/$$/fd; echo -n *; echo -n 1 >3; echo -n 2 >4"}, @@ -818,31 +815,25 @@ func TestPassExtraFiles(t *testing.T) { Init: true, } err = container.Run(&process) - if err != nil { - t.Fatal(err) - } + ok(t, err) waitProcess(&process, t) - out := string(stdout.Bytes()) + out := stdout.String() // fd 5 is the directory handle for /proc/$$/fd if out != "0 1 2 3 4 5" { t.Fatalf("expected to have the file descriptors '0 1 2 3 4 5' passed to init, got '%s'", out) } - var buf = []byte{0} + buf := []byte{0} _, err = pipeout1.Read(buf) - if err != nil { - t.Fatal(err) - } + ok(t, err) out1 := string(buf) if out1 != "1" { t.Fatalf("expected first pipe to receive '1', got '%s'", out1) } _, err = pipeout2.Read(buf) - if err != nil { - t.Fatal(err) - } + ok(t, err) out2 := string(buf) if out2 != "2" { t.Fatalf("expected second pipe to receive '2', got '%s'", out2) @@ -854,19 +845,9 @@ func TestMountCmds(t *testing.T) { return } - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) - - tmpDir, err := ioutil.TempDir("", "tmpdir") - if err != nil { - t.Fatal(err) - } - defer os.RemoveAll(tmpDir) - - config := newTemplateConfig(rootfs) + tmpDir := t.TempDir() + config := 
newTemplateConfig(t, nil) + rootfs := config.Rootfs config.Mounts = append(config.Mounts, &configs.Mount{ Source: tmpDir, Destination: "/tmp", @@ -882,11 +863,9 @@ func TestMountCmds(t *testing.T) { }, }) - container, err := newContainerWithName("test", config) - if err != nil { - t.Fatal(err) - } - defer container.Destroy() + container, err := newContainer(t, config) + ok(t, err) + defer destroyContainer(container) pconfig := libcontainer.Process{ Cwd: "/", @@ -895,17 +874,13 @@ func TestMountCmds(t *testing.T) { Init: true, } err = container.Run(&pconfig) - if err != nil { - t.Fatal(err) - } + ok(t, err) // Wait for process waitProcess(&pconfig, t) - entries, err := ioutil.ReadDir(tmpDir) - if err != nil { - t.Fatal(err) - } + entries, err := os.ReadDir(tmpDir) + ok(t, err) expected := []string{"hello", "hello-backup", "world", "world-backup"} for i, e := range entries { if e.Name() != expected[i] { @@ -919,23 +894,24 @@ func TestSysctl(t *testing.T) { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Sysctl = map[string]string{ "kernel.shmmni": "8192", + "kernel/shmmax": "4194304", } + const ( + cmd = "cat shmmni shmmax" + exp = "8192\n4194304\n" + ) - container, err := newContainerWithName("test", config) + container, err := newContainer(t, config) ok(t, err) - defer container.Destroy() + defer destroyContainer(container) var stdout bytes.Buffer pconfig := libcontainer.Process{ - Cwd: "/", - Args: []string{"sh", "-c", "cat /proc/sys/kernel/shmmni"}, + Cwd: "/proc/sys/kernel", + Args: []string{"sh", "-c", cmd}, Env: standardEnvironment, Stdin: nil, Stdout: &stdout, @@ -947,9 +923,9 @@ func TestSysctl(t *testing.T) { // Wait for process waitProcess(&pconfig, t) - shmmniOutput := strings.TrimSpace(string(stdout.Bytes())) - if shmmniOutput != "8192" { - t.Fatalf("kernel.shmmni property expected to be 8192, but is %s", shmmniOutput) + out := stdout.String() + 
if out != exp { + t.Fatalf("expected %s, got %s", exp, out) } } @@ -957,18 +933,8 @@ func TestMountCgroupRO(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - config := newTemplateConfig(rootfs) - - config.Mounts = append(config.Mounts, &configs.Mount{ - Destination: "/sys/fs/cgroup", - Device: "cgroup", - Flags: defaultMountFlags | unix.MS_RDONLY, - }) - - buffers, exitCode, err := runContainer(config, "", "mount") + config := newTemplateConfig(t, nil) + buffers, exitCode, err := runContainer(t, config, "mount") if err != nil { t.Fatalf("%s: %s", buffers, err) } @@ -1006,18 +972,16 @@ func TestMountCgroupRW(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) + // clear the RO flag from cgroup mount + for _, m := range config.Mounts { + if m.Device == "cgroup" { + m.Flags = defaultMountFlags + break + } + } - config.Mounts = append(config.Mounts, &configs.Mount{ - Destination: "/sys/fs/cgroup", - Device: "cgroup", - Flags: defaultMountFlags, - }) - - buffers, exitCode, err := runContainer(config, "", "mount") + buffers, exitCode, err := runContainer(t, config, "mount") if err != nil { t.Fatalf("%s: %s", buffers, err) } @@ -1056,16 +1020,12 @@ func TestOomScoreAdj(t *testing.T) { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.OomScoreAdj = ptrInt(200) - container, err := newContainerWithName("test", config) + container, err := newContainer(t, config) ok(t, err) - defer container.Destroy() + defer destroyContainer(container) var stdout bytes.Buffer pconfig := libcontainer.Process{ @@ -1081,7 +1041,7 @@ func TestOomScoreAdj(t *testing.T) { // Wait for process waitProcess(&pconfig, t) - outputOomScoreAdj := strings.TrimSpace(string(stdout.Bytes())) + 
outputOomScoreAdj := strings.TrimSpace(stdout.String()) // Check that the oom_score_adj matches the value that was set as part of config. if outputOomScoreAdj != strconv.Itoa(*config.OomScoreAdj) { @@ -1094,17 +1054,9 @@ func TestHook(t *testing.T) { return } - bundle, err := newTestBundle() - ok(t, err) - defer remove(bundle) - - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - expectedBundle := bundle - config.Labels = append(config.Labels, fmt.Sprintf("bundle=%s", expectedBundle)) + config := newTemplateConfig(t, nil) + expectedBundle := t.TempDir() + config.Labels = append(config.Labels, "bundle="+expectedBundle) getRootfsFromBundle := func(bundle string) (string, error) { f, err := os.Open(filepath.Join(bundle, "config.json")) @@ -1118,39 +1070,67 @@ func TestHook(t *testing.T) { } return config.Rootfs, nil } + createFileFromBundle := func(filename, bundle string) error { + root, err := getRootfsFromBundle(bundle) + if err != nil { + return err + } - config.Hooks = &configs.Hooks{ - Prestart: []configs.Hook{ + f, err := os.Create(filepath.Join(root, filename)) + if err != nil { + return err + } + return f.Close() + } + + // Note FunctionHooks can't be serialized to json this means they won't be passed down to the container + // For CreateContainer and StartContainer which run in the container namespace, this means we need to pass Command Hooks. 
+ hookFiles := map[configs.HookName]string{ + configs.Prestart: "prestart", + configs.CreateRuntime: "createRuntime", + configs.CreateContainer: "createContainer", + configs.StartContainer: "startContainer", + configs.Poststart: "poststart", + } + + config.Hooks = configs.Hooks{ + configs.Prestart: configs.HookList{ configs.NewFunctionHook(func(s *specs.State) error { if s.Bundle != expectedBundle { t.Fatalf("Expected prestart hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle) } - - root, err := getRootfsFromBundle(s.Bundle) - if err != nil { - return err - } - f, err := os.Create(filepath.Join(root, "test")) - if err != nil { - return err - } - return f.Close() + return createFileFromBundle(hookFiles[configs.Prestart], s.Bundle) }), }, - Poststart: []configs.Hook{ + configs.CreateRuntime: configs.HookList{ + configs.NewFunctionHook(func(s *specs.State) error { + if s.Bundle != expectedBundle { + t.Fatalf("Expected createRuntime hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle) + } + return createFileFromBundle(hookFiles[configs.CreateRuntime], s.Bundle) + }), + }, + configs.CreateContainer: configs.HookList{ + configs.NewCommandHook(configs.Command{ + Path: "/bin/bash", + Args: []string{"/bin/bash", "-c", fmt.Sprintf("touch ./%s", hookFiles[configs.CreateContainer])}, + }), + }, + configs.StartContainer: configs.HookList{ + configs.NewCommandHook(configs.Command{ + Path: "/bin/sh", + Args: []string{"/bin/sh", "-c", fmt.Sprintf("touch /%s", hookFiles[configs.StartContainer])}, + }), + }, + configs.Poststart: configs.HookList{ configs.NewFunctionHook(func(s *specs.State) error { if s.Bundle != expectedBundle { t.Fatalf("Expected poststart hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle) } - - root, err := getRootfsFromBundle(s.Bundle) - if err != nil { - return err - } - return ioutil.WriteFile(filepath.Join(root, "test"), []byte("hello world"), 0755) + return createFileFromBundle(hookFiles[configs.Poststart], s.Bundle) }), }, - Poststop: 
[]configs.Hook{ + configs.Poststop: configs.HookList{ configs.NewFunctionHook(func(s *specs.State) error { if s.Bundle != expectedBundle { t.Fatalf("Expected poststop hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle) @@ -1160,23 +1140,35 @@ func TestHook(t *testing.T) { if err != nil { return err } - return os.RemoveAll(filepath.Join(root, "test")) + + for _, hook := range hookFiles { + if err = os.RemoveAll(filepath.Join(root, hook)); err != nil { + return err + } + } + return nil }), }, } // write config of json format into config.json under bundle - f, err := os.OpenFile(filepath.Join(bundle, "config.json"), os.O_CREATE|os.O_RDWR, 0644) + f, err := os.OpenFile(filepath.Join(expectedBundle, "config.json"), os.O_CREATE|os.O_RDWR, 0o644) ok(t, err) ok(t, json.NewEncoder(f).Encode(config)) - container, err := newContainerWithName("test", config) + container, err := newContainer(t, config) ok(t, err) + // e.g: 'ls /prestart ...' + cmd := "ls " + for _, hook := range hookFiles { + cmd += "/" + hook + " " + } + var stdout bytes.Buffer pconfig := libcontainer.Process{ Cwd: "/", - Args: []string{"sh", "-c", "ls /test"}, + Args: []string{"sh", "-c", cmd}, Env: standardEnvironment, Stdin: nil, Stdout: &stdout, @@ -1188,30 +1180,15 @@ func TestHook(t *testing.T) { // Wait for process waitProcess(&pconfig, t) - outputLs := string(stdout.Bytes()) - - // Check that the ls output has the expected file touched by the prestart hook - if !strings.Contains(outputLs, "/test") { - container.Destroy() - t.Fatalf("ls output doesn't have the expected file: %s", outputLs) - } - - // Check that the file is written by the poststart hook - testFilePath := filepath.Join(rootfs, "test") - contents, err := ioutil.ReadFile(testFilePath) - if err != nil { - t.Fatalf("cannot read file '%s': %s", testFilePath, err) - } - if string(contents) != "hello world" { - t.Fatalf("Expected test file to contain 'hello world'; got '%s'", string(contents)) - } - if err := container.Destroy(); err != 
nil { t.Fatalf("container destroy %s", err) } - fi, err := os.Stat(filepath.Join(rootfs, "test")) - if err == nil || !os.IsNotExist(err) { - t.Fatalf("expected file to not exist, got %s", fi.Name()) + + for _, hook := range []string{"prestart", "createRuntime", "poststart"} { + fi, err := os.Stat(filepath.Join(config.Rootfs, hook)) + if err == nil || !os.IsNotExist(err) { + t.Fatalf("expected file '%s to not exists, but it does", fi.Name()) + } } } @@ -1220,11 +1197,8 @@ func TestSTDIOPermissions(t *testing.T) { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - config := newTemplateConfig(rootfs) - buffers, exitCode, err := runContainer(config, "", "sh", "-c", "echo hi > /dev/stderr") + config := newTemplateConfig(t, nil) + buffers, exitCode, err := runContainer(t, config, "sh", "-c", "echo hi > /dev/stderr") ok(t, err) if exitCode != 0 { t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) @@ -1235,8 +1209,8 @@ func TestSTDIOPermissions(t *testing.T) { } } -func unmountOp(path string) error { - return unix.Unmount(path, unix.MNT_DETACH) +func unmountOp(path string) { + _ = unix.Unmount(path, unix.MNT_DETACH) } // Launch container with rootfsPropagation in rslave mode. Also @@ -1253,21 +1227,15 @@ func TestRootfsPropagationSlaveMount(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - config := newTemplateConfig(rootfs) - + config := newTemplateConfig(t, nil) config.RootPropagation = unix.MS_SLAVE | unix.MS_REC - // Bind mount a volume - dir1host, err := ioutil.TempDir("", "mnt1host") - ok(t, err) - defer os.RemoveAll(dir1host) + // Bind mount a volume. + dir1host := t.TempDir() // Make this dir a "shared" mount point. This will make sure a // slave relationship can be established in container. 
- err = unix.Mount(dir1host, dir1host, "bind", unix.MS_BIND|unix.MS_REC, "") + err := unix.Mount(dir1host, dir1host, "bind", unix.MS_BIND|unix.MS_REC, "") ok(t, err) err = unix.Mount("", dir1host, "", unix.MS_SHARED|unix.MS_REC, "") ok(t, err) @@ -1277,11 +1245,12 @@ func TestRootfsPropagationSlaveMount(t *testing.T) { Source: dir1host, Destination: dir1cont, Device: "bind", - Flags: unix.MS_BIND | unix.MS_REC}) + Flags: unix.MS_BIND | unix.MS_REC, + }) - container, err := newContainerWithName("testSlaveMount", config) + container, err := newContainer(t, config) ok(t, err) - defer container.Destroy() + defer destroyContainer(container) stdinR, stdinW, err := os.Pipe() ok(t, err) @@ -1295,15 +1264,16 @@ func TestRootfsPropagationSlaveMount(t *testing.T) { } err = container.Run(pconfig) - stdinR.Close() - defer stdinW.Close() + _ = stdinR.Close() + defer stdinW.Close() //nolint: errcheck ok(t, err) - // Create mnt1host/mnt2host and bind mount itself on top of it. This - // should be visible in container. - dir2host, err := ioutil.TempDir(dir1host, "mnt2host") + // Create mnt2host under dir1host and bind mount itself on top of it. + // This should be visible in container. 
+ dir2host := filepath.Join(dir1host, "mnt2host") + err = os.Mkdir(dir2host, 0o700) ok(t, err) - defer os.RemoveAll(dir2host) + defer remove(dir2host) err = unix.Mount(dir2host, dir2host, "bind", unix.MS_BIND, "") defer unmountOp(dir2host) @@ -1324,19 +1294,19 @@ func TestRootfsPropagationSlaveMount(t *testing.T) { } err = container.Run(pconfig2) - stdinR2.Close() - defer stdinW2.Close() + _ = stdinR2.Close() + defer stdinW2.Close() //nolint: errcheck ok(t, err) - stdinW2.Close() + _ = stdinW2.Close() waitProcess(pconfig2, t) - stdinW.Close() + _ = stdinW.Close() waitProcess(pconfig, t) mountPropagated = false dir2cont = filepath.Join(dir1cont, filepath.Base(dir2host)) - propagationInfo := string(stdout2.Bytes()) + propagationInfo := stdout2.String() lines := strings.Split(propagationInfo, "\n") for _, l := range lines { linefields := strings.Split(l, " ") @@ -1369,20 +1339,15 @@ func TestRootfsPropagationSharedMount(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.RootPropagation = unix.MS_PRIVATE - // Bind mount a volume - dir1host, err := ioutil.TempDir("", "mnt1host") - ok(t, err) - defer os.RemoveAll(dir1host) + // Bind mount a volume. + dir1host := t.TempDir() // Make this dir a "shared" mount point. This will make sure a // shared relationship can be established in container. 
- err = unix.Mount(dir1host, dir1host, "bind", unix.MS_BIND|unix.MS_REC, "") + err := unix.Mount(dir1host, dir1host, "bind", unix.MS_BIND|unix.MS_REC, "") ok(t, err) err = unix.Mount("", dir1host, "", unix.MS_SHARED|unix.MS_REC, "") ok(t, err) @@ -1392,11 +1357,12 @@ func TestRootfsPropagationSharedMount(t *testing.T) { Source: dir1host, Destination: dir1cont, Device: "bind", - Flags: unix.MS_BIND | unix.MS_REC}) + Flags: unix.MS_BIND | unix.MS_REC, + }) - container, err := newContainerWithName("testSharedMount", config) + container, err := newContainer(t, config) ok(t, err) - defer container.Destroy() + defer destroyContainer(container) stdinR, stdinW, err := os.Pipe() ok(t, err) @@ -1410,16 +1376,17 @@ func TestRootfsPropagationSharedMount(t *testing.T) { } err = container.Run(pconfig) - stdinR.Close() - defer stdinW.Close() + _ = stdinR.Close() + defer stdinW.Close() //nolint: errcheck ok(t, err) - // Create mnt1host/mnt2cont. This will become visible inside container + // Create mnt2cont under dir1host. This will become visible inside container // at mnt1cont/mnt2cont. Bind mount itself on top of it. This // should be visible on host now. - dir2host, err := ioutil.TempDir(dir1host, "mnt2cont") + dir2host := filepath.Join(dir1host, "mnt2cont") + err = os.Mkdir(dir2host, 0o700) ok(t, err) - defer os.RemoveAll(dir2host) + defer remove(dir2host) dir2cont = filepath.Join(dir1cont, filepath.Base(dir2host)) @@ -1445,26 +1412,26 @@ func TestRootfsPropagationSharedMount(t *testing.T) { pconfig2.Capabilities.Inheritable = append(config.Capabilities.Inheritable, "CAP_SYS_ADMIN") err = container.Run(pconfig2) - stdinR2.Close() - defer stdinW2.Close() + _ = stdinR2.Close() + defer stdinW2.Close() //nolint: errcheck ok(t, err) // Wait for process - stdinW2.Close() + _ = stdinW2.Close() waitProcess(pconfig2, t) - stdinW.Close() + _ = stdinW.Close() waitProcess(pconfig, t) defer unmountOp(dir2host) // Check if mount is visible on host or not. 
out, err := exec.Command("findmnt", "-n", "-f", "-oTARGET", dir2host).CombinedOutput() - outtrim := strings.TrimSpace(string(out)) + outtrim := string(bytes.TrimSpace(out)) if err != nil { t.Logf("findmnt error %q: %q", err, outtrim) } - if string(outtrim) != dir2host { + if outtrim != dir2host { t.Fatalf("Mount in container on %s did not propagate to host on %s. finmnt output=%s", dir2cont, dir2host, outtrim) } } @@ -1474,16 +1441,12 @@ func TestPIDHost(t *testing.T) { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - l, err := os.Readlink("/proc/1/ns/pid") ok(t, err) - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Namespaces.Remove(configs.NEWPID) - buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/pid") + buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/pid") ok(t, err) if exitCode != 0 { @@ -1495,18 +1458,64 @@ func TestPIDHost(t *testing.T) { } } +func TestPIDHostInitProcessWait(t *testing.T) { + if testing.Short() { + return + } + + pidns := "/proc/1/ns/pid" + + // Run a container with two long-running processes. + config := newTemplateConfig(t, nil) + config.Namespaces.Add(configs.NEWPID, pidns) + container, err := newContainer(t, config) + ok(t, err) + defer func() { + _ = container.Destroy() + }() + + process1 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"sleep", "100"}, + Env: standardEnvironment, + Init: true, + } + err = container.Run(process1) + ok(t, err) + + process2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"sleep", "100"}, + Env: standardEnvironment, + Init: false, + } + err = container.Run(process2) + ok(t, err) + + // Kill the init process and Wait for it. + err = process1.Signal(syscall.SIGKILL) + ok(t, err) + _, err = process1.Wait() + if err == nil { + t.Fatal("expected Wait to indicate failure") + } + + // The non-init process must've been killed. 
+ err = process2.Signal(syscall.Signal(0)) + if err == nil || err.Error() != "no such process" { + t.Fatalf("expected process to have been killed: %v", err) + } +} + func TestInitJoinPID(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - // Execute a long-running container - container1, err := newContainer(newTemplateConfig(rootfs)) + config1 := newTemplateConfig(t, nil) + container1, err := newContainer(t, config1) ok(t, err) - defer container1.Destroy() + defer destroyContainer(container1) stdinR1, stdinW1, err := os.Pipe() ok(t, err) @@ -1518,8 +1527,8 @@ func TestInitJoinPID(t *testing.T) { Init: true, } err = container1.Run(init1) - stdinR1.Close() - defer stdinW1.Close() + _ = stdinR1.Close() + defer stdinW1.Close() //nolint: errcheck ok(t, err) // get the state of the first container @@ -1528,12 +1537,12 @@ func TestInitJoinPID(t *testing.T) { pidns1 := state1.NamespacePaths[configs.NEWPID] // Run a container inside the existing pidns but with different cgroups - config2 := newTemplateConfig(rootfs) + config2 := newTemplateConfig(t, nil) config2.Namespaces.Add(configs.NEWPID, pidns1) config2.Cgroups.Path = "integration/test2" - container2, err := newContainerWithName("testCT2", config2) + container2, err := newContainer(t, config2) ok(t, err) - defer container2.Destroy() + defer destroyContainer(container2) stdinR2, stdinW2, err := os.Pipe() ok(t, err) @@ -1545,8 +1554,8 @@ func TestInitJoinPID(t *testing.T) { Init: true, } err = container2.Run(init2) - stdinR2.Close() - defer stdinW2.Close() + _ = stdinR2.Close() + defer stdinW2.Close() //nolint: errcheck ok(t, err) // get the state of the second container state2, err := container2.State() @@ -1580,9 +1589,9 @@ func TestInitJoinPID(t *testing.T) { // Stop init processes one by one. Stop the second container should // not stop the first. 
- stdinW2.Close() + _ = stdinW2.Close() waitProcess(init2, t) - stdinW1.Close() + _ = stdinW1.Close() waitProcess(init1, t) out := strings.TrimSpace(buffers.Stdout.String()) @@ -1597,23 +1606,17 @@ func TestInitJoinPID(t *testing.T) { func TestInitJoinNetworkAndUser(t *testing.T) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { - t.Skip("userns is unsupported") + t.Skip("Test requires userns.") } if testing.Short() { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) // Execute a long-running container - config1 := newTemplateConfig(rootfs) - config1.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config1.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config1.Namespaces = append(config1.Namespaces, configs.Namespace{Type: configs.NEWUSER}) - container1, err := newContainer(config1) + config1 := newTemplateConfig(t, &tParam{userns: true}) + container1, err := newContainer(t, config1) ok(t, err) - defer container1.Destroy() + defer destroyContainer(container1) stdinR1, stdinW1, err := os.Pipe() ok(t, err) @@ -1625,8 +1628,8 @@ func TestInitJoinNetworkAndUser(t *testing.T) { Init: true, } err = container1.Run(init1) - stdinR1.Close() - defer stdinW1.Close() + _ = stdinR1.Close() + defer stdinW1.Close() //nolint: errcheck ok(t, err) // get the state of the first container @@ -1635,20 +1638,14 @@ func TestInitJoinNetworkAndUser(t *testing.T) { netns1 := state1.NamespacePaths[configs.NEWNET] userns1 := state1.NamespacePaths[configs.NEWUSER] - // Run a container inside the existing pidns but with different cgroups - rootfs2, err := newRootfs() - ok(t, err) - defer remove(rootfs2) - - config2 := newTemplateConfig(rootfs2) - config2.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config2.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + // Run a container inside the existing pidns but with different cgroups. 
+ config2 := newTemplateConfig(t, &tParam{userns: true}) config2.Namespaces.Add(configs.NEWNET, netns1) config2.Namespaces.Add(configs.NEWUSER, userns1) config2.Cgroups.Path = "integration/test2" - container2, err := newContainerWithName("testCT2", config2) + container2, err := newContainer(t, config2) ok(t, err) - defer container2.Destroy() + defer destroyContainer(container2) stdinR2, stdinW2, err := os.Pipe() ok(t, err) @@ -1660,8 +1657,8 @@ func TestInitJoinNetworkAndUser(t *testing.T) { Init: true, } err = container2.Run(init2) - stdinR2.Close() - defer stdinW2.Close() + _ = stdinR2.Close() + defer stdinW2.Close() //nolint: errcheck ok(t, err) // get the state of the second container @@ -1685,9 +1682,9 @@ func TestInitJoinNetworkAndUser(t *testing.T) { } // Stop init processes one by one. Stop the second container should // not stop the first. - stdinW2.Close() + _ = stdinW2.Close() waitProcess(init2, t) - stdinW1.Close() + _ = stdinW1.Close() waitProcess(init1, t) } @@ -1696,12 +1693,7 @@ func TestTmpfsCopyUp(t *testing.T) { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - + config := newTemplateConfig(t, nil) config.Mounts = append(config.Mounts, &configs.Mount{ Source: "tmpfs", Destination: "/etc", @@ -1709,9 +1701,9 @@ func TestTmpfsCopyUp(t *testing.T) { Extensions: configs.EXT_COPYUP, }) - container, err := newContainerWithName("test", config) + container, err := newContainer(t, config) ok(t, err) - defer container.Destroy() + defer destroyContainer(container) var stdout bytes.Buffer pconfig := libcontainer.Process{ @@ -1727,7 +1719,7 @@ func TestTmpfsCopyUp(t *testing.T) { // Wait for process waitProcess(&pconfig, t) - outputLs := string(stdout.Bytes()) + outputLs := stdout.String() // Check that the ls output has /etc/passwd if !strings.Contains(outputLs, "/etc/passwd") { @@ -1737,22 +1729,18 @@ func TestTmpfsCopyUp(t *testing.T) { func TestCGROUPPrivate(t *testing.T) { if _, err := 
os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) { - t.Skip("cgroupns is unsupported") + t.Skip("Test requires cgroupns.") } if testing.Short() { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - l, err := os.Readlink("/proc/1/ns/cgroup") ok(t, err) - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Namespaces.Add(configs.NEWCGROUP, "") - buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup") + buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/cgroup") ok(t, err) if exitCode != 0 { @@ -1766,21 +1754,17 @@ func TestCGROUPPrivate(t *testing.T) { func TestCGROUPHost(t *testing.T) { if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) { - t.Skip("cgroupns is unsupported") + t.Skip("Test requires cgroupns.") } if testing.Short() { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - l, err := os.Readlink("/proc/1/ns/cgroup") ok(t, err) - config := newTemplateConfig(rootfs) - buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup") + config := newTemplateConfig(t, nil) + buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/cgroup") ok(t, err) if exitCode != 0 { @@ -1791,3 +1775,134 @@ func TestCGROUPHost(t *testing.T) { t.Fatalf("cgroup link not equal to host link %q %q", actual, l) } } + +func TestFdLeaks(t *testing.T) { + testFdLeaks(t, false) +} + +func TestFdLeaksSystemd(t *testing.T) { + if !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + testFdLeaks(t, true) +} + +func testFdLeaks(t *testing.T, systemd bool) { + if testing.Short() { + return + } + + pfd, err := os.Open("/proc/self/fd") + ok(t, err) + defer pfd.Close() + fds0, err := pfd.Readdirnames(0) + ok(t, err) + _, err = pfd.Seek(0, 0) + ok(t, err) + + config := newTemplateConfig(t, &tParam{systemd: systemd}) + buffers, exitCode, err := runContainer(t, config, "true") + ok(t, err) + 
+ if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + + fds1, err := pfd.Readdirnames(0) + ok(t, err) + + if len(fds1) == len(fds0) { + return + } + // Show the extra opened files. + + excludedPaths := []string{ + "/sys/fs/cgroup", // opened once, see prepareOpenat2 + "anon_inode:bpf-prog", // FIXME: see https://github.com/opencontainers/runc/issues/2366#issuecomment-776411392 + } + + count := 0 +next_fd: + for _, fd1 := range fds1 { + for _, fd0 := range fds0 { + if fd0 == fd1 { + continue next_fd + } + } + dst, _ := os.Readlink("/proc/self/fd/" + fd1) + for _, ex := range excludedPaths { + if ex == dst { + continue next_fd + } + } + + count++ + t.Logf("extra fd %s -> %s", fd1, dst) + } + if count > 0 { + t.Fatalf("found %d extra fds after container.Run", count) + } +} + +// Test that a container using user namespaces is able to bind mount a folder +// that does not have permissions for group/others. +func TestBindMountAndUser(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); errors.Is(err, os.ErrNotExist) { + t.Skip("userns is unsupported") + } + + if testing.Short() { + return + } + + temphost := t.TempDir() + dirhost := filepath.Join(temphost, "inaccessible", "dir") + + err := os.MkdirAll(dirhost, 0o755) + ok(t, err) + + err = os.WriteFile(filepath.Join(dirhost, "foo.txt"), []byte("Hello"), 0o755) + ok(t, err) + + // Make this dir inaccessible to "group,others". + err = os.Chmod(filepath.Join(temphost, "inaccessible"), 0o700) + ok(t, err) + + config := newTemplateConfig(t, &tParam{ + userns: true, + }) + + // Set HostID to 1000 to avoid DAC_OVERRIDE bypassing the purpose of this test. + config.UidMappings[0].HostID = 1000 + config.GidMappings[0].HostID = 1000 + + // Set the owner of rootfs to the effective IDs in the host to avoid errors + // while creating the folders to perform the mounts. 
+ err = os.Chown(config.Rootfs, 1000, 1000) + ok(t, err) + + config.Mounts = append(config.Mounts, &configs.Mount{ + Source: dirhost, + Destination: "/tmp/mnt1cont", + Device: "bind", + Flags: unix.MS_BIND | unix.MS_REC, + }) + + container, err := newContainer(t, config) + ok(t, err) + defer container.Destroy() //nolint: errcheck + + var stdout bytes.Buffer + + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "stat /tmp/mnt1cont/foo.txt"}, + Env: standardEnvironment, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + waitProcess(&pconfig, t) +} diff --git a/libcontainer/integration/execin_test.go b/libcontainer/integration/execin_test.go index 14f8a59..f8a6a9c 100644 --- a/libcontainer/integration/execin_test.go +++ b/libcontainer/integration/execin_test.go @@ -22,13 +22,10 @@ func TestExecIn(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() + config := newTemplateConfig(t, nil) + container, err := newContainer(t, config) ok(t, err) - defer remove(rootfs) - config := newTemplateConfig(rootfs) - container, err := newContainer(config) - ok(t, err) - defer container.Destroy() + defer destroyContainer(container) // Execute a first process in the container stdinR, stdinW, err := os.Pipe() @@ -41,8 +38,8 @@ func TestExecIn(t *testing.T) { Init: true, } err = container.Run(process) - stdinR.Close() - defer stdinW.Close() + _ = stdinR.Close() + defer stdinW.Close() //nolint: errcheck ok(t, err) buffers := newStdBuffers() @@ -58,7 +55,7 @@ func TestExecIn(t *testing.T) { err = container.Run(ps) ok(t, err) waitProcess(ps, t) - stdinW.Close() + _ = stdinW.Close() waitProcess(process, t) out := buffers.Stdout.String() @@ -66,13 +63,13 @@ func TestExecIn(t *testing.T) { t.Fatalf("unexpected running process, output %q", out) } if strings.Contains(out, "\r") { - t.Fatalf("unexpected carriage-return in output") + t.Fatalf("unexpected carriage-return in output %q", out) } } func 
TestExecInUsernsRlimit(t *testing.T) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { - t.Skip("userns is unsupported") + t.Skip("Test requires userns.") } testExecInRlimit(t, true) @@ -87,20 +84,10 @@ func testExecInRlimit(t *testing.T, userns bool) { return } - rootfs, err := newRootfs() + config := newTemplateConfig(t, &tParam{userns: userns}) + container, err := newContainer(t, config) ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - if userns { - config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) - } - - container, err := newContainer(config) - ok(t, err) - defer container.Destroy() + defer destroyContainer(container) stdinR, stdinW, err := os.Pipe() ok(t, err) @@ -112,8 +99,8 @@ func testExecInRlimit(t *testing.T, userns bool) { Init: true, } err = container.Run(process) - stdinR.Close() - defer stdinW.Close() + _ = stdinR.Close() + defer stdinW.Close() //nolint: errcheck ok(t, err) buffers := newStdBuffers() @@ -134,7 +121,7 @@ func testExecInRlimit(t *testing.T, userns bool) { ok(t, err) waitProcess(ps, t) - stdinW.Close() + _ = stdinW.Close() waitProcess(process, t) out := buffers.Stdout.String() @@ -148,14 +135,10 @@ func TestExecInAdditionalGroups(t *testing.T) { return } - rootfs, err := newRootfs() + config := newTemplateConfig(t, nil) + container, err := newContainer(t, config) ok(t, err) - defer remove(rootfs) - - config := newTemplateConfig(rootfs) - container, err := newContainer(config) - ok(t, err) - defer container.Destroy() + defer destroyContainer(container) // Execute a first process in the container stdinR, stdinW, err := os.Pipe() @@ -168,8 +151,8 @@ func TestExecInAdditionalGroups(t *testing.T) { Init: true, } err = container.Run(process) - stdinR.Close() - defer stdinW.Close() + _ = stdinR.Close() + 
defer stdinW.Close() //nolint: errcheck ok(t, err) var stdout bytes.Buffer @@ -187,10 +170,10 @@ func TestExecInAdditionalGroups(t *testing.T) { // Wait for process waitProcess(&pconfig, t) - stdinW.Close() + _ = stdinW.Close() waitProcess(process, t) - outputGroups := string(stdout.Bytes()) + outputGroups := stdout.String() // Check that the groups output has the groups that we specified if !strings.Contains(outputGroups, "audio") { @@ -206,13 +189,10 @@ func TestExecInError(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() + config := newTemplateConfig(t, nil) + container, err := newContainer(t, config) ok(t, err) - defer remove(rootfs) - config := newTemplateConfig(rootfs) - container, err := newContainer(config) - ok(t, err) - defer container.Destroy() + defer destroyContainer(container) // Execute a first process in the container stdinR, stdinW, err := os.Pipe() @@ -225,9 +205,9 @@ func TestExecInError(t *testing.T) { Init: true, } err = container.Run(process) - stdinR.Close() + _ = stdinR.Close() defer func() { - stdinW.Close() + _ = stdinW.Close() if _, err := process.Wait(); err != nil { t.Log(err) } @@ -259,13 +239,11 @@ func TestExecInTTY(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() + t.Skip("racy; see https://github.com/opencontainers/runc/issues/2425") + config := newTemplateConfig(t, nil) + container, err := newContainer(t, config) ok(t, err) - defer remove(rootfs) - config := newTemplateConfig(rootfs) - container, err := newContainer(config) - ok(t, err) - defer container.Destroy() + defer destroyContainer(container) // Execute a first process in the container stdinR, stdinW, err := os.Pipe() @@ -278,77 +256,74 @@ func TestExecInTTY(t *testing.T) { Init: true, } err = container.Run(process) - stdinR.Close() - defer stdinW.Close() + _ = stdinR.Close() + defer func() { + _ = stdinW.Close() + if _, err := process.Wait(); err != nil { + t.Log(err) + } + }() ok(t, err) - var stdout bytes.Buffer ps := 
&libcontainer.Process{ Cwd: "/", Args: []string{"ps"}, Env: standardEnvironment, } - parent, child, err := utils.NewSockPair("console") - if err != nil { + + // Repeat to increase chances to catch a race; see + // https://github.com/opencontainers/runc/issues/2425. + for i := 0; i < 300; i++ { + var stdout bytes.Buffer + + parent, child, err := utils.NewSockPair("console") ok(t, err) - } - defer parent.Close() - defer child.Close() - ps.ConsoleSocket = child - type cdata struct { - c console.Console - err error - } - dc := make(chan *cdata, 1) - go func() { - f, err := utils.RecvFd(parent) - if err != nil { - dc <- &cdata{ - err: err, - } - return - } - c, err := console.ConsoleFromFile(f) - if err != nil { - dc <- &cdata{ - err: err, - } - return - } - console.ClearONLCR(c.Fd()) - dc <- &cdata{ - c: c, - } - }() - err = container.Run(ps) - ok(t, err) - data := <-dc - if data.err != nil { - ok(t, data.err) - } - console := data.c - copy := make(chan struct{}) - go func() { - io.Copy(&stdout, console) - close(copy) - }() - ok(t, err) - select { - case <-time.After(5 * time.Second): - t.Fatal("Waiting for copy timed out") - case <-copy: - } - waitProcess(ps, t) + ps.ConsoleSocket = child - stdinW.Close() - waitProcess(process, t) + done := make(chan (error)) + go func() { + f, err := utils.RecvFd(parent) + if err != nil { + done <- fmt.Errorf("RecvFd: %w", err) + return + } + c, err := console.ConsoleFromFile(f) + if err != nil { + done <- fmt.Errorf("ConsoleFromFile: %w", err) + return + } + err = console.ClearONLCR(c.Fd()) + if err != nil { + done <- fmt.Errorf("ClearONLCR: %w", err) + return + } + // An error from io.Copy is expected once the terminal + // is gone, so we deliberately ignore it. 
+ _, _ = io.Copy(&stdout, c) + done <- nil + }() - out := stdout.String() - if !strings.Contains(out, "cat") || !strings.Contains(out, "ps") { - t.Fatalf("unexpected running process, output %q", out) - } - if strings.Contains(out, "\r") { - t.Fatalf("unexpected carriage-return in output") + err = container.Run(ps) + ok(t, err) + + select { + case <-time.After(5 * time.Second): + t.Fatal("Waiting for copy timed out") + case err := <-done: + ok(t, err) + } + + waitProcess(ps, t) + _ = parent.Close() + _ = child.Close() + + out := stdout.String() + if !strings.Contains(out, "cat") || !strings.Contains(out, "ps") { + t.Fatalf("unexpected running process, output %q", out) + } + if strings.Contains(out, "\r") { + t.Fatalf("unexpected carriage-return in output %q", out) + } } } @@ -356,13 +331,10 @@ func TestExecInEnvironment(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() + config := newTemplateConfig(t, nil) + container, err := newContainer(t, config) ok(t, err) - defer remove(rootfs) - config := newTemplateConfig(rootfs) - container, err := newContainer(config) - ok(t, err) - defer container.Destroy() + defer destroyContainer(container) // Execute a first process in the container stdinR, stdinW, err := os.Pipe() @@ -375,8 +347,8 @@ func TestExecInEnvironment(t *testing.T) { Init: true, } err = container.Run(process) - stdinR.Close() - defer stdinW.Close() + _ = stdinR.Close() + defer stdinW.Close() //nolint: errcheck ok(t, err) buffers := newStdBuffers() @@ -398,7 +370,7 @@ func TestExecInEnvironment(t *testing.T) { ok(t, err) waitProcess(process2, t) - stdinW.Close() + _ = stdinW.Close() waitProcess(process, t) out := buffers.Stdout.String() @@ -416,23 +388,14 @@ func TestExecinPassExtraFiles(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) - config := newTemplateConfig(rootfs) - container, err := newContainer(config) - if err != nil { - t.Fatal(err) - } - 
defer container.Destroy() + config := newTemplateConfig(t, nil) + container, err := newContainer(t, config) + ok(t, err) + defer destroyContainer(container) // Execute a first process in the container stdinR, stdinW, err := os.Pipe() - if err != nil { - t.Fatal(err) - } + ok(t, err) process := &libcontainer.Process{ Cwd: "/", Args: []string{"cat"}, @@ -441,21 +404,15 @@ func TestExecinPassExtraFiles(t *testing.T) { Init: true, } err = container.Run(process) - stdinR.Close() - defer stdinW.Close() - if err != nil { - t.Fatal(err) - } + _ = stdinR.Close() + defer stdinW.Close() //nolint: errcheck + ok(t, err) var stdout bytes.Buffer pipeout1, pipein1, err := os.Pipe() - if err != nil { - t.Fatal(err) - } + ok(t, err) pipeout2, pipein2, err := os.Pipe() - if err != nil { - t.Fatal(err) - } + ok(t, err) inprocess := &libcontainer.Process{ Cwd: "/", Args: []string{"sh", "-c", "cd /proc/$$/fd; echo -n *; echo -n 1 >3; echo -n 2 >4"}, @@ -465,33 +422,27 @@ func TestExecinPassExtraFiles(t *testing.T) { Stdout: &stdout, } err = container.Run(inprocess) - if err != nil { - t.Fatal(err) - } + ok(t, err) waitProcess(inprocess, t) - stdinW.Close() + _ = stdinW.Close() waitProcess(process, t) - out := string(stdout.Bytes()) + out := stdout.String() // fd 5 is the directory handle for /proc/$$/fd if out != "0 1 2 3 4 5" { t.Fatalf("expected to have the file descriptors '0 1 2 3 4 5' passed to exec, got '%s'", out) } - var buf = []byte{0} + buf := []byte{0} _, err = pipeout1.Read(buf) - if err != nil { - t.Fatal(err) - } + ok(t, err) out1 := string(buf) if out1 != "1" { t.Fatalf("expected first pipe to receive '1', got '%s'", out1) } _, err = pipeout2.Read(buf) - if err != nil { - t.Fatal(err) - } + ok(t, err) out2 := string(buf) if out2 != "2" { t.Fatalf("expected second pipe to receive '2', got '%s'", out2) @@ -502,14 +453,11 @@ func TestExecInOomScoreAdj(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() - ok(t, err) - defer remove(rootfs) - config := 
newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.OomScoreAdj = ptrInt(200) - container, err := newContainer(config) + container, err := newContainer(t, config) ok(t, err) - defer container.Destroy() + defer destroyContainer(container) stdinR, stdinW, err := os.Pipe() ok(t, err) @@ -521,8 +469,8 @@ func TestExecInOomScoreAdj(t *testing.T) { Init: true, } err = container.Run(process) - stdinR.Close() - defer stdinW.Close() + _ = stdinR.Close() + defer stdinW.Close() //nolint: errcheck ok(t, err) buffers := newStdBuffers() @@ -538,7 +486,7 @@ func TestExecInOomScoreAdj(t *testing.T) { ok(t, err) waitProcess(ps, t) - stdinW.Close() + _ = stdinW.Close() waitProcess(process, t) out := buffers.Stdout.String() @@ -549,21 +497,15 @@ func TestExecInOomScoreAdj(t *testing.T) { func TestExecInUserns(t *testing.T) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { - t.Skip("userns is unsupported") + t.Skip("Test requires userns.") } if testing.Short() { return } - rootfs, err := newRootfs() + config := newTemplateConfig(t, &tParam{userns: true}) + container, err := newContainer(t, config) ok(t, err) - defer remove(rootfs) - config := newTemplateConfig(rootfs) - config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} - config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) - container, err := newContainer(config) - ok(t, err) - defer container.Destroy() + defer destroyContainer(container) // Execute a first process in the container stdinR, stdinW, err := os.Pipe() @@ -577,8 +519,8 @@ func TestExecInUserns(t *testing.T) { Init: true, } err = container.Run(process) - stdinR.Close() - defer stdinW.Close() + _ = stdinR.Close() + defer stdinW.Close() //nolint: errcheck ok(t, err) initPID, err := process.Pid() @@ -599,7 +541,7 @@ func TestExecInUserns(t *testing.T) { err = container.Run(process2) ok(t, err) 
waitProcess(process2, t) - stdinW.Close() + _ = stdinW.Close() waitProcess(process, t) if out := strings.TrimSpace(buffers.Stdout.String()); out != initUserns { diff --git a/libcontainer/integration/init_test.go b/libcontainer/integration/init_test.go index f5180ea..effcde0 100644 --- a/libcontainer/integration/init_test.go +++ b/libcontainer/integration/init_test.go @@ -28,19 +28,10 @@ func init() { } } -var testRoots []string - func TestMain(m *testing.M) { logrus.SetOutput(os.Stderr) logrus.SetLevel(logrus.InfoLevel) - // Clean up roots after running everything. - defer func() { - for _, root := range testRoots { - os.RemoveAll(root) - } - }() - ret := m.Run() os.Exit(ret) } diff --git a/libcontainer/integration/seccomp_test.go b/libcontainer/integration/seccomp_test.go index 77f1a8d..a7eeefb 100644 --- a/libcontainer/integration/seccomp_test.go +++ b/libcontainer/integration/seccomp_test.go @@ -1,3 +1,4 @@ +//go:build linux && cgo && seccomp // +build linux,cgo,seccomp package integration @@ -12,33 +13,28 @@ import ( libseccomp "github.com/seccomp/libseccomp-golang" ) -func TestSeccompDenyGetcwd(t *testing.T) { +func TestSeccompDenyGetcwdWithErrno(t *testing.T) { if testing.Short() { return } - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) + errnoRet := uint(syscall.ESRCH) - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Seccomp = &configs.Seccomp{ DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ { - Name: "getcwd", - Action: configs.Errno, + Name: "getcwd", + Action: configs.Errno, + ErrnoRet: &errnoRet, }, }, } - container, err := newContainer(config) - if err != nil { - t.Fatal(err) - } - defer container.Destroy() + container, err := newContainer(t, config) + ok(t, err) + defer container.Destroy() //nolint:errcheck buffers := newStdBuffers() pwd := &libcontainer.Process{ @@ -52,9 +48,66 @@ func TestSeccompDenyGetcwd(t *testing.T) { } err = container.Run(pwd) - if err != nil 
{ - t.Fatal(err) + ok(t, err) + ps, err := pwd.Wait() + if err == nil { + t.Fatal("Expecting error (negative return code); instead exited cleanly!") } + + var exitCode int + status := ps.Sys().(syscall.WaitStatus) + if status.Exited() { + exitCode = status.ExitStatus() + } else if status.Signaled() { + exitCode = -int(status.Signal()) + } else { + t.Fatalf("Unrecognized exit reason!") + } + + if exitCode == 0 { + t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode) + } + + expected := "pwd: getcwd: No such process" + actual := strings.Trim(buffers.Stderr.String(), "\n") + if actual != expected { + t.Fatalf("Expected output %s but got %s\n", expected, actual) + } +} + +func TestSeccompDenyGetcwd(t *testing.T) { + if testing.Short() { + return + } + + config := newTemplateConfig(t, nil) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "getcwd", + Action: configs.Errno, + }, + }, + } + + container, err := newContainer(t, config) + ok(t, err) + defer container.Destroy() //nolint:errcheck + + buffers := newStdBuffers() + pwd := &libcontainer.Process{ + Cwd: "/", + Args: []string{"pwd"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Init: true, + } + + err = container.Run(pwd) + ok(t, err) ps, err := pwd.Wait() if err == nil { t.Fatal("Expecting error (negative return code); instead exited cleanly!") @@ -86,13 +139,7 @@ func TestSeccompPermitWriteConditional(t *testing.T) { return } - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) - - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Seccomp = &configs.Seccomp{ DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ @@ -110,11 +157,9 @@ func TestSeccompPermitWriteConditional(t *testing.T) { }, } - container, err := newContainer(config) - if err != nil { - t.Fatal(err) - } - defer 
container.Destroy() + container, err := newContainer(t, config) + ok(t, err) + defer container.Destroy() //nolint:errcheck buffers := newStdBuffers() dmesg := &libcontainer.Process{ @@ -128,9 +173,7 @@ func TestSeccompPermitWriteConditional(t *testing.T) { } err = container.Run(dmesg) - if err != nil { - t.Fatal(err) - } + ok(t, err) if _, err := dmesg.Wait(); err != nil { t.Fatalf("%s: %s", err, buffers.Stderr) } @@ -148,13 +191,7 @@ func TestSeccompDenyWriteConditional(t *testing.T) { return } - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) - - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Seccomp = &configs.Seccomp{ DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ @@ -172,11 +209,9 @@ func TestSeccompDenyWriteConditional(t *testing.T) { }, } - container, err := newContainer(config) - if err != nil { - t.Fatal(err) - } - defer container.Destroy() + container, err := newContainer(t, config) + ok(t, err) + defer container.Destroy() //nolint:errcheck buffers := newStdBuffers() dmesg := &libcontainer.Process{ @@ -190,9 +225,7 @@ func TestSeccompDenyWriteConditional(t *testing.T) { } err = container.Run(dmesg) - if err != nil { - t.Fatal(err) - } + ok(t, err) ps, err := dmesg.Wait() if err == nil { @@ -226,13 +259,7 @@ func TestSeccompPermitWriteMultipleConditions(t *testing.T) { return } - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) - - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Seccomp = &configs.Seccomp{ DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ @@ -255,7 +282,7 @@ func TestSeccompPermitWriteMultipleConditions(t *testing.T) { }, } - buffers, exitCode, err := runContainer(config, "", "ls", "/") + buffers, exitCode, err := runContainer(t, config, "ls", "/") if err != nil { t.Fatalf("%s: %s", buffers, err) } @@ -281,13 +308,7 @@ func TestSeccompDenyWriteMultipleConditions(t *testing.T) 
{ return } - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) - - config := newTemplateConfig(rootfs) + config := newTemplateConfig(t, nil) config.Seccomp = &configs.Seccomp{ DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ @@ -310,7 +331,7 @@ func TestSeccompDenyWriteMultipleConditions(t *testing.T) { }, } - buffers, exitCode, err := runContainer(config, "", "ls", "/does_not_exist") + buffers, exitCode, err := runContainer(t, config, "ls", "/does_not_exist") if err == nil { t.Fatalf("Expecting error return, instead got 0") } @@ -330,14 +351,8 @@ func TestSeccompMultipleConditionSameArgDeniesStdout(t *testing.T) { return } - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) - - // Prevent writing to both stdout and stderr - config := newTemplateConfig(rootfs) + // Prevent writing to both stdout and stderr. + config := newTemplateConfig(t, nil) config.Seccomp = &configs.Seccomp{ DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ @@ -360,7 +375,7 @@ func TestSeccompMultipleConditionSameArgDeniesStdout(t *testing.T) { }, } - buffers, exitCode, err := runContainer(config, "", "ls", "/") + buffers, exitCode, err := runContainer(t, config, "ls", "/") if err != nil { t.Fatalf("%s: %s", buffers, err) } @@ -378,14 +393,8 @@ func TestSeccompMultipleConditionSameArgDeniesStderr(t *testing.T) { return } - rootfs, err := newRootfs() - if err != nil { - t.Fatal(err) - } - defer remove(rootfs) - - // Prevent writing to both stdout and stderr - config := newTemplateConfig(rootfs) + // Prevent writing to both stdout and stderr. 
+ config := newTemplateConfig(t, nil) config.Seccomp = &configs.Seccomp{ DefaultAction: configs.Allow, Syscalls: []*configs.Syscall{ @@ -408,7 +417,7 @@ func TestSeccompMultipleConditionSameArgDeniesStderr(t *testing.T) { }, } - buffers, exitCode, err := runContainer(config, "", "ls", "/does_not_exist") + buffers, exitCode, err := runContainer(t, config, "ls", "/does_not_exist") if err == nil { t.Fatalf("Expecting error return, instead got 0") } diff --git a/libcontainer/integration/template_test.go b/libcontainer/integration/template_test.go index 5f7cab5..f56db89 100644 --- a/libcontainer/integration/template_test.go +++ b/libcontainer/integration/template_test.go @@ -1,8 +1,14 @@ package integration import ( - "github.com/opencontainers/runc/libcontainer/configs" + "strconv" + "strings" + "testing" + "time" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/specconv" "golang.org/x/sys/unix" ) @@ -15,14 +21,27 @@ var standardEnvironment = []string{ const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV -// newTemplateConfig returns a base template for running a container +type tParam struct { + userns bool + systemd bool +} + +// newTemplateConfig returns a base template for running a container. // -// it uses a network strategy of just setting a loopback interface -// and the default setup for devices -func newTemplateConfig(rootfs string) *configs.Config { - allowAllDevices := false - return &configs.Config{ - Rootfs: rootfs, +// It uses a network strategy of just setting a loopback interface +// and the default setup for devices. +// +// If p is nil, a default container is created. 
+func newTemplateConfig(t *testing.T, p *tParam) *configs.Config { + var allowedDevices []*devices.Rule + for _, device := range specconv.AllowedDevices { + allowedDevices = append(allowedDevices, &device.Rule) + } + if p == nil { + p = &tParam{} + } + config := &configs.Config{ + Rootfs: newRootfs(t), Capabilities: &configs.Capabilities{ Bounding: []string{ "CAP_CHOWN", @@ -113,11 +132,10 @@ func newTemplateConfig(rootfs string) *configs.Config { {Type: configs.NEWNET}, }), Cgroups: &configs.Cgroup{ - Path: "integration/test", + Systemd: p.systemd, Resources: &configs.Resources{ MemorySwappiness: nil, - AllowAllDevices: &allowAllDevices, - AllowedDevices: configs.DefaultAllowedDevices, + Devices: allowedDevices, }, }, MaskPaths: []string{ @@ -127,7 +145,7 @@ func newTemplateConfig(rootfs string) *configs.Config { ReadonlyPaths: []string{ "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", }, - Devices: configs.DefaultAutoCreatedDevices, + Devices: specconv.AllowedDevices, Hostname: "integration", Mounts: []*configs.Mount{ { @@ -188,4 +206,27 @@ func newTemplateConfig(rootfs string) *configs.Config { }, }, } + + if p.userns { + config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + } else { + config.Mounts = append(config.Mounts, &configs.Mount{ + Destination: "/sys/fs/cgroup", + Device: "cgroup", + Flags: defaultMountFlags | unix.MS_RDONLY, + }) + } + + if p.systemd { + id := strconv.FormatInt(-int64(time.Now().Nanosecond()), 36) + config.Cgroups.Name = strings.ReplaceAll(t.Name(), "/", "_") + id + config.Cgroups.Parent = "system.slice" + config.Cgroups.ScopePrefix = "runc-test" + } else { + config.Cgroups.Path = "/test/integration" + } + + return config } diff --git a/libcontainer/integration/update_test.go b/libcontainer/integration/update_test.go new file mode 
100644 index 0000000..5678b6f --- /dev/null +++ b/libcontainer/integration/update_test.go @@ -0,0 +1,98 @@ +package integration + +import ( + "bytes" + "os" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/devices" +) + +func testUpdateDevices(t *testing.T, systemd bool) { + if testing.Short() { + return + } + config := newTemplateConfig(t, &tParam{systemd: systemd}) + container, err := newContainer(t, config) + ok(t, err) + defer destroyContainer(container) + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + _ = stdinR.Close() + defer func() { + _ = stdinW.Close() + if _, err := process.Wait(); err != nil { + t.Log(err) + } + }() + ok(t, err) + + var buf bytes.Buffer + devCheck := &libcontainer.Process{ + Cwd: "/", + Args: []string{"/bin/sh", "-c", "echo > /dev/full; cat /dev/null; true"}, + Env: standardEnvironment, + Stderr: &buf, + } + isAllowed := true + expected := map[bool][]string{ + true: { + "write error: No space left on device", // from write to /dev/full + // no error from cat /dev/null + }, + false: { + "/dev/full: Operation not permitted", + `cat: can't open '/dev/null': Operation not permitted`, + }, + } + defaultDevices := config.Cgroups.Resources.Devices + + for i := 0; i < 300; i++ { + // Check the access + buf.Reset() + err = container.Run(devCheck) + ok(t, err) + waitProcess(devCheck, t) + + for _, exp := range expected[isAllowed] { + if !strings.Contains(buf.String(), exp) { + t.Fatalf("[%d] expected %q, got %q", i, exp, buf.String()) + } + } + + // Now flip the access permission + isAllowed = !isAllowed + if isAllowed { + config.Cgroups.Resources.Devices = defaultDevices + } else { + 
config.Cgroups.Resources.Devices = []*devices.Rule{} + } + if err := container.Set(*config); err != nil { + t.Fatal(err) + } + } +} + +func TestUpdateDevices(t *testing.T) { + testUpdateDevices(t, false) +} + +func TestUpdateDevicesSystemd(t *testing.T) { + if !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + testUpdateDevices(t, true) +} diff --git a/libcontainer/integration/utils_test.go b/libcontainer/integration/utils_test.go index 8b2d714..def29fc 100644 --- a/libcontainer/integration/utils_test.go +++ b/libcontainer/integration/utils_test.go @@ -2,14 +2,13 @@ package integration import ( "bytes" - "crypto/md5" - "encoding/hex" "fmt" - "io/ioutil" "os" "os/exec" "path/filepath" + "regexp" "runtime" + "strconv" "strings" "syscall" "testing" @@ -19,6 +18,36 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" ) +var busyboxTar string + +// init makes sure the container images are downloaded, +// and initializes busyboxTar. If images can't be downloaded, +// we are unable to run any tests, so panic. +func init() { + // Figure out path to get-images.sh. Note it won't work + // in case the compiled test binary is moved elsewhere. + _, ex, _, _ := runtime.Caller(0) + getImages, err := filepath.Abs(filepath.Join(filepath.Dir(ex), "..", "..", "tests", "integration", "get-images.sh")) + if err != nil { + panic(err) + } + // Call it to make sure images are downloaded, and to get the paths. + out, err := exec.Command(getImages).CombinedOutput() + if err != nil { + panic(fmt.Errorf("getImages error %w (output: %s)", err, out)) + } + // Extract the value of BUSYBOX_IMAGE. 
+ found := regexp.MustCompile(`(?m)^BUSYBOX_IMAGE=(.*)$`).FindSubmatchIndex(out) + if len(found) < 4 { + panic(fmt.Errorf("unable to find BUSYBOX_IMAGE= in %q", out)) + } + busyboxTar = string(out[found[2]:found[3]]) + // Finally, check the file is present + if _, err := os.Stat(busyboxTar); err != nil { + panic(err) + } +} + func ptrInt(v int) *int { return &v } @@ -50,99 +79,100 @@ func (b *stdBuffers) String() string { // ok fails the test if an err is not nil. func ok(t testing.TB, err error) { + t.Helper() if err != nil { - _, file, line, _ := runtime.Caller(1) - t.Fatalf("%s:%d: unexpected error: %s\n\n", filepath.Base(file), line, err.Error()) + t.Fatalf("unexpected error: %v", err) } } func waitProcess(p *libcontainer.Process, t *testing.T) { - _, file, line, _ := runtime.Caller(1) + t.Helper() status, err := p.Wait() - if err != nil { - t.Fatalf("%s:%d: unexpected error: %s\n\n", filepath.Base(file), line, err.Error()) + t.Fatalf("unexpected error: %v", err) } if !status.Success() { - t.Fatalf("%s:%d: unexpected status: %s\n\n", filepath.Base(file), line, status.String()) + t.Fatalf("unexpected status: %v", status) } } -func newTestRoot() (string, error) { - dir, err := ioutil.TempDir("", "libcontainer") - if err != nil { - return "", err - } - if err := os.MkdirAll(dir, 0700); err != nil { - return "", err - } - testRoots = append(testRoots, dir) - return dir, nil -} - -func newTestBundle() (string, error) { - dir, err := ioutil.TempDir("", "bundle") - if err != nil { - return "", err - } - if err := os.MkdirAll(dir, 0700); err != nil { - return "", err - } - return dir, nil -} - -// newRootfs creates a new tmp directory and copies the busybox root filesystem -func newRootfs() (string, error) { - dir, err := ioutil.TempDir("", "") - if err != nil { - return "", err - } - if err := os.MkdirAll(dir, 0700); err != nil { - return "", err - } +// newRootfs creates a new tmp directory and copies the busybox root +// filesystem to it. 
+func newRootfs(t *testing.T) string { + t.Helper() + dir := t.TempDir() if err := copyBusybox(dir); err != nil { - return "", err + t.Fatal(err) } - return dir, nil + + // Make sure others can read+exec, so all tests (inside userns too) can + // read the rootfs. + if err := traversePath(dir); err != nil { + t.Fatalf("Error making newRootfs path traversable by others: %v", err) + } + + return dir +} + +// traversePath gives read+execute permissions to others for all elements in tPath below +// os.TempDir() and errors out if elements above it don't have read+exec permissions for others. +// tPath MUST be a descendant of os.TempDir(). The path returned by testing.TempDir() usually is. +func traversePath(tPath string) error { + // Check the assumption that the argument is under os.TempDir(). + tempBase := os.TempDir() + if !strings.HasPrefix(tPath, tempBase) { + return fmt.Errorf("traversePath: %q is not a descendant of %q", tPath, tempBase) + } + + var path string + for _, p := range strings.SplitAfter(tPath, "/") { + path = path + p + stats, err := os.Stat(path) + if err != nil { + return err + } + + perm := stats.Mode().Perm() + + if perm&0o5 == 0o5 { + continue + } + + if strings.HasPrefix(tempBase, path) { + return fmt.Errorf("traversePath: directory %q MUST have read+exec permissions for others", path) + } + + if err := os.Chmod(path, perm|0o5); err != nil { + return err + } + } + + return nil } func remove(dir string) { - os.RemoveAll(dir) + _ = os.RemoveAll(dir) } // copyBusybox copies the rootfs for a busybox container created for the test image // into the new directory for the specific test func copyBusybox(dest string) error { - out, err := exec.Command("sh", "-c", fmt.Sprintf("cp -a /busybox/* %s/", dest)).CombinedOutput() + out, err := exec.Command("sh", "-c", fmt.Sprintf("tar --exclude './dev/*' -C %q -xf %q", dest, busyboxTar)).CombinedOutput() if err != nil { - return fmt.Errorf("copy error %q: %q", err, out) + return fmt.Errorf("untar error %w: %q", 
err, out) } return nil } -func newContainer(config *configs.Config) (libcontainer.Container, error) { - h := md5.New() - h.Write([]byte(time.Now().String())) - return newContainerWithName(hex.EncodeToString(h.Sum(nil)), config) -} +func newContainer(t *testing.T, config *configs.Config) (libcontainer.Container, error) { + name := strings.ReplaceAll(t.Name(), "/", "_") + strconv.FormatInt(-int64(time.Now().Nanosecond()), 35) + root := t.TempDir() -func newContainerWithName(name string, config *configs.Config) (libcontainer.Container, error) { - root, err := newTestRoot() + f, err := libcontainer.New(root) if err != nil { return nil, err } - - f, err := libcontainer.New(root, libcontainer.Cgroupfs) - if err != nil { - return nil, err - } - if config.Cgroups != nil && config.Cgroups.Parent == "system.slice" { - f, err = libcontainer.New(root, libcontainer.SystemdCgroups) - if err != nil { - return nil, err - } - } return f.Create(name, config) } @@ -150,12 +180,12 @@ func newContainerWithName(name string, config *configs.Config) (libcontainer.Con // // buffers are returned containing the STDOUT and STDERR output for the run // along with the exit code and any go error -func runContainer(config *configs.Config, console string, args ...string) (buffers *stdBuffers, exitCode int, err error) { - container, err := newContainer(config) +func runContainer(t *testing.T, config *configs.Config, args ...string) (buffers *stdBuffers, exitCode int, err error) { + container, err := newContainer(t, config) if err != nil { return nil, -1, err } - defer container.Destroy() + defer destroyContainer(container) buffers = newStdBuffers() process := &libcontainer.Process{ Cwd: "/", @@ -185,3 +215,7 @@ func runContainer(config *configs.Config, console string, args ...string) (buffe } return } + +func destroyContainer(container libcontainer.Container) { + _ = container.Destroy() +} diff --git a/libcontainer/intelrdt/cmt.go b/libcontainer/intelrdt/cmt.go new file mode 100644 index 
0000000..6480a13 --- /dev/null +++ b/libcontainer/intelrdt/cmt.go @@ -0,0 +1,23 @@ +package intelrdt + +var cmtEnabled bool + +// Check if Intel RDT/CMT is enabled. +func IsCMTEnabled() bool { + featuresInit() + return cmtEnabled +} + +func getCMTNumaNodeStats(numaPath string) (*CMTNumaNodeStats, error) { + stats := &CMTNumaNodeStats{} + + if enabledMonFeatures.llcOccupancy { + llcOccupancy, err := getIntelRdtParamUint(numaPath, "llc_occupancy") + if err != nil { + return nil, err + } + stats.LLCOccupancy = llcOccupancy + } + + return stats, nil +} diff --git a/libcontainer/intelrdt/cmt_test.go b/libcontainer/intelrdt/cmt_test.go new file mode 100644 index 0000000..3bd43ad --- /dev/null +++ b/libcontainer/intelrdt/cmt_test.go @@ -0,0 +1,44 @@ +package intelrdt + +import ( + "path/filepath" + "testing" +) + +func TestGetCMTNumaNodeStats(t *testing.T) { + mocksNUMANodesToCreate := []string{"mon_l3_00", "mon_l3_01"} + + mocksFilesToCreate := map[string]uint64{ + "llc_occupancy": 9123911, + } + + mockedL3_MON := mockResctrlL3_MON(t, mocksNUMANodesToCreate, mocksFilesToCreate) + + t.Run("Gather mbm", func(t *testing.T) { + enabledMonFeatures.llcOccupancy = true + + stats := make([]CMTNumaNodeStats, 0, len(mocksNUMANodesToCreate)) + for _, numa := range mocksNUMANodesToCreate { + other, err := getCMTNumaNodeStats(filepath.Join(mockedL3_MON, "mon_data", numa)) + if err != nil { + t.Fatal(err) + } + stats = append(stats, *other) + } + + expectedStats := CMTNumaNodeStats{ + LLCOccupancy: mocksFilesToCreate["llc_occupancy"], + } + + checkCMTStatCorrection(stats[0], expectedStats, t) + checkCMTStatCorrection(stats[1], expectedStats, t) + }) +} + +func checkCMTStatCorrection(got CMTNumaNodeStats, expected CMTNumaNodeStats, t *testing.T) { + if got.LLCOccupancy != expected.LLCOccupancy { + t.Fatalf("Wrong value of `llc_occupancy`. 
Expected: %v but got: %v", + expected.LLCOccupancy, + got.LLCOccupancy) + } +} diff --git a/libcontainer/intelrdt/intelrdt.go b/libcontainer/intelrdt/intelrdt.go index 0071ce7..8b6bf3e 100644 --- a/libcontainer/intelrdt/intelrdt.go +++ b/libcontainer/intelrdt/intelrdt.go @@ -1,17 +1,19 @@ -// +build linux - package intelrdt import ( "bufio" + "bytes" + "errors" "fmt" - "io/ioutil" + "io" "os" "path/filepath" "strconv" "strings" "sync" + "github.com/moby/sys/mountinfo" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" ) @@ -55,6 +57,10 @@ import ( * | | |-- cbm_mask * | | |-- min_cbm_bits * | | |-- num_closids + * | |-- L3_MON + * | | |-- max_threshold_occupancy + * | | |-- mon_features + * | | |-- num_rmids * | |-- MB * | |-- bandwidth_gran * | |-- delay_linear @@ -63,7 +69,7 @@ import ( * |-- ... * |-- schemata * |-- tasks - * |-- + * |-- * |-- ... * |-- schemata * |-- tasks @@ -146,7 +152,7 @@ type Manager interface { // Returns statistics for Intel RDT GetStats() (*Stats, error) - // Destroys the Intel RDT 'container_id' group + // Destroys the Intel RDT container-specific 'container_id' group Destroy() error // Returns Intel RDT path to save in a state file and to be able to @@ -158,125 +164,137 @@ type Manager interface { } // This implements interface Manager -type IntelRdtManager struct { +type intelRdtManager struct { mu sync.Mutex - Config *configs.Config - Id string - Path string + config *configs.Config + id string + path string +} + +func NewManager(config *configs.Config, id string, path string) Manager { + return &intelRdtManager{ + config: config, + id: id, + path: path, + } } const ( - IntelRdtTasks = "tasks" + intelRdtTasks = "tasks" ) var ( - // The absolute root path of the Intel RDT "resource control" filesystem - intelRdtRoot string - intelRdtRootLock sync.Mutex - // The flag to indicate if Intel RDT/CAT is enabled - isCatEnabled bool + catEnabled bool // The flag to indicate 
if Intel RDT/MBA is enabled - isMbaEnabled bool + mbaEnabled bool // The flag to indicate if Intel RDT/MBA Software Controller is enabled - isMbaScEnabled bool + mbaScEnabled bool + + // For Intel RDT initialization + initOnce sync.Once + + errNotFound = errors.New("Intel RDT resctrl mount point not found") ) -type intelRdtData struct { - root string - config *configs.Config - pid int -} - -// Check if Intel RDT sub-features are enabled in init() -func init() { - // 1. Check if hardware and kernel support Intel RDT sub-features - // "cat_l3" flag for CAT and "mba" flag for MBA - isCatFlagSet, isMbaFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") - if err != nil { - return - } - - // 2. Check if Intel RDT "resource control" filesystem is mounted - // The user guarantees to mount the filesystem - if !isIntelRdtMounted() { - return - } - - // 3. Double check if Intel RDT sub-features are available in - // "resource control" filesystem. Intel RDT sub-features can be - // selectively disabled or enabled by kernel command line - // (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel - if isCatFlagSet { - if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil { - isCatEnabled = true +// Check if Intel RDT sub-features are enabled in featuresInit() +func featuresInit() { + initOnce.Do(func() { + // 1. Check if hardware and kernel support Intel RDT sub-features + flagsSet, err := parseCpuInfoFile("/proc/cpuinfo") + if err != nil { + return } - } - if isMbaScEnabled { - // We confirm MBA Software Controller is enabled in step 2, - // MBA should be enabled because MBA Software Controller - // depends on MBA - isMbaEnabled = true - } else if isMbaFlagSet { - if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil { - isMbaEnabled = true + + // 2. Check if Intel RDT "resource control" filesystem is available. + // The user guarantees to mount the filesystem. + root, err := Root() + if err != nil { + return } - } + + // 3. 
Double check if Intel RDT sub-features are available in + // "resource control" filesystem. Intel RDT sub-features can be + // selectively disabled or enabled by kernel command line + // (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel + if flagsSet.CAT { + if _, err := os.Stat(filepath.Join(root, "info", "L3")); err == nil { + catEnabled = true + } + } + if mbaScEnabled { + // We confirm MBA Software Controller is enabled in step 2, + // MBA should be enabled because MBA Software Controller + // depends on MBA + mbaEnabled = true + } else if flagsSet.MBA { + if _, err := os.Stat(filepath.Join(root, "info", "MB")); err == nil { + mbaEnabled = true + } + } + if flagsSet.MBMTotal || flagsSet.MBMLocal || flagsSet.CMT { + if _, err := os.Stat(filepath.Join(root, "info", "L3_MON")); err != nil { + return + } + enabledMonFeatures, err = getMonFeatures(root) + if err != nil { + return + } + if enabledMonFeatures.mbmTotalBytes || enabledMonFeatures.mbmLocalBytes { + mbmEnabled = true + } + if enabledMonFeatures.llcOccupancy { + cmtEnabled = true + } + } + }) } // Return the mount point path of Intel RDT "resource control" filesysem -func findIntelRdtMountpointDir() (string, error) { - f, err := os.Open("/proc/self/mountinfo") +func findIntelRdtMountpointDir(f io.Reader) (string, error) { + mi, err := mountinfo.GetMountsFromReader(f, func(m *mountinfo.Info) (bool, bool) { + // similar to mountinfo.FSTypeFilter but stops after the first match + if m.FSType == "resctrl" { + return false, true // don't skip, stop + } + return true, false // skip, keep going + }) if err != nil { return "", err } - defer f.Close() - - s := bufio.NewScanner(f) - for s.Scan() { - text := s.Text() - fields := strings.Split(text, " ") - // Safe as mountinfo encodes mountpoints with spaces as \040. 
- index := strings.Index(text, " - ") - postSeparatorFields := strings.Fields(text[index+3:]) - numPostFields := len(postSeparatorFields) - - // This is an error as we can't detect if the mount is for "Intel RDT" - if numPostFields == 0 { - return "", fmt.Errorf("Found no fields post '-' in %q", text) - } - - if postSeparatorFields[0] == "resctrl" { - // Check that the mount is properly formatted. - if numPostFields < 3 { - return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) - } - - // Check if MBA Software Controller is enabled through mount option "-o mba_MBps" - if strings.Contains(postSeparatorFields[2], "mba_MBps") { - isMbaScEnabled = true - } - - return fields[4], nil - } - } - if err := s.Err(); err != nil { - return "", err + if len(mi) < 1 { + return "", errNotFound } - return "", NewNotFoundError("Intel RDT") + // Check if MBA Software Controller is enabled through mount option "-o mba_MBps" + if strings.Contains(","+mi[0].VFSOptions+",", ",mba_MBps,") { + mbaScEnabled = true + } + + return mi[0].Mountpoint, nil } -// Gets the root path of Intel RDT "resource control" filesystem -func getIntelRdtRoot() (string, error) { - intelRdtRootLock.Lock() - defer intelRdtRootLock.Unlock() +// For Root() use only. +var ( + intelRdtRoot string + rootMu sync.Mutex +) + +// Root returns the Intel RDT "resource control" filesystem mount point. 
+func Root() (string, error) { + rootMu.Lock() + defer rootMu.Unlock() if intelRdtRoot != "" { return intelRdtRoot, nil } - root, err := findIntelRdtMountpointDir() + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + root, err := findIntelRdtMountpointDir(f) + f.Close() if err != nil { return "", err } @@ -289,78 +307,67 @@ func getIntelRdtRoot() (string, error) { return intelRdtRoot, nil } -func isIntelRdtMounted() bool { - _, err := getIntelRdtRoot() - if err != nil { - return false - } +type cpuInfoFlags struct { + CAT bool // Cache Allocation Technology + MBA bool // Memory Bandwidth Allocation - return true + // Memory Bandwidth Monitoring related. + MBMTotal bool + MBMLocal bool + + CMT bool // Cache Monitoring Technology } -func parseCpuInfoFile(path string) (bool, bool, error) { - isCatFlagSet := false - isMbaFlagSet := false +func parseCpuInfoFile(path string) (cpuInfoFlags, error) { + infoFlags := cpuInfoFlags{} f, err := os.Open(path) if err != nil { - return false, false, err + return infoFlags, err } defer f.Close() s := bufio.NewScanner(f) for s.Scan() { - if err := s.Err(); err != nil { - return false, false, err - } - line := s.Text() // Search "cat_l3" and "mba" flags in first "flags" line - if strings.Contains(line, "flags") { + if strings.HasPrefix(line, "flags") { flags := strings.Split(line, " ") // "cat_l3" flag for CAT and "mba" flag for MBA for _, flag := range flags { switch flag { case "cat_l3": - isCatFlagSet = true + infoFlags.CAT = true case "mba": - isMbaFlagSet = true + infoFlags.MBA = true + case "cqm_mbm_total": + infoFlags.MBMTotal = true + case "cqm_mbm_local": + infoFlags.MBMLocal = true + case "cqm_occup_llc": + infoFlags.CMT = true } } - return isCatFlagSet, isMbaFlagSet, nil + return infoFlags, nil } } - return isCatFlagSet, isMbaFlagSet, nil -} - -func parseUint(s string, base, bitSize int) (uint64, error) { - value, err := strconv.ParseUint(s, base, bitSize) - if err != nil { - intValue, 
intErr := strconv.ParseInt(s, base, bitSize) - // 1. Handle negative values greater than MinInt64 (and) - // 2. Handle negative values lesser than MinInt64 - if intErr == nil && intValue < 0 { - return 0, nil - } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { - return 0, nil - } - - return value, err + if err := s.Err(); err != nil { + return infoFlags, err } - return value, nil + return infoFlags, nil } // Gets a single uint64 value from the specified file. func getIntelRdtParamUint(path, file string) (uint64, error) { fileName := filepath.Join(path, file) - contents, err := ioutil.ReadFile(fileName) + contents, err := os.ReadFile(fileName) if err != nil { return 0, err } - res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64) + res, err := fscommon.ParseUint(string(bytes.TrimSpace(contents)), 10, 64) if err != nil { return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName) } @@ -369,41 +376,29 @@ func getIntelRdtParamUint(path, file string) (uint64, error) { // Gets a string value from the specified file func getIntelRdtParamString(path, file string) (string, error) { - contents, err := ioutil.ReadFile(filepath.Join(path, file)) + contents, err := os.ReadFile(filepath.Join(path, file)) if err != nil { return "", err } - return strings.TrimSpace(string(contents)), nil + return string(bytes.TrimSpace(contents)), nil } func writeFile(dir, file, data string) error { if dir == "" { return fmt.Errorf("no such directory for %s", file) } - if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0700); err != nil { - return fmt.Errorf("failed to write %v to %v: %v", data, file, err) + if err := os.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0o600); err != nil { + return newLastCmdError(fmt.Errorf("intelrdt: unable to write %v: %w", data, err)) } return nil } -func getIntelRdtData(c *configs.Config, pid int) (*intelRdtData, error) { - rootPath, err 
:= getIntelRdtRoot() - if err != nil { - return nil, err - } - return &intelRdtData{ - root: rootPath, - config: c, - pid: pid, - }, nil -} - // Get the read-only L3 cache information func getL3CacheInfo() (*L3CacheInfo, error) { l3CacheInfo := &L3CacheInfo{} - rootPath, err := getIntelRdtRoot() + rootPath, err := Root() if err != nil { return l3CacheInfo, err } @@ -433,7 +428,7 @@ func getL3CacheInfo() (*L3CacheInfo, error) { func getMemBwInfo() (*MemBwInfo, error) { memBwInfo := &MemBwInfo{} - rootPath, err := getIntelRdtRoot() + rootPath, err := Root() if err != nil { return memBwInfo, err } @@ -466,7 +461,7 @@ func getMemBwInfo() (*MemBwInfo, error) { // Get diagnostics for last filesystem operation error from file info/last_cmd_status func getLastCmdStatus() (string, error) { - rootPath, err := getIntelRdtRoot() + rootPath, err := Root() if err != nil { return "", err } @@ -483,98 +478,123 @@ func getLastCmdStatus() (string, error) { // WriteIntelRdtTasks writes the specified pid into the "tasks" file func WriteIntelRdtTasks(dir string, pid int) error { if dir == "" { - return fmt.Errorf("no such directory for %s", IntelRdtTasks) + return fmt.Errorf("no such directory for %s", intelRdtTasks) } // Don't attach any pid if -1 is specified as a pid if pid != -1 { - if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil { - return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err) + if err := os.WriteFile(filepath.Join(dir, intelRdtTasks), []byte(strconv.Itoa(pid)), 0o600); err != nil { + return newLastCmdError(fmt.Errorf("intelrdt: unable to add pid %d: %w", pid, err)) } } return nil } // Check if Intel RDT/CAT is enabled -func IsCatEnabled() bool { - return isCatEnabled +func IsCATEnabled() bool { + featuresInit() + return catEnabled } // Check if Intel RDT/MBA is enabled -func IsMbaEnabled() bool { - return isMbaEnabled +func IsMBAEnabled() bool { + featuresInit() + return mbaEnabled } // Check 
if Intel RDT/MBA Software Controller is enabled -func IsMbaScEnabled() bool { - return isMbaScEnabled +func IsMBAScEnabled() bool { + featuresInit() + return mbaScEnabled } -// Get the 'container_id' path in Intel RDT "resource control" filesystem -func GetIntelRdtPath(id string) (string, error) { - rootPath, err := getIntelRdtRoot() +// Get the path of the clos group in "resource control" filesystem that the container belongs to +func (m *intelRdtManager) getIntelRdtPath() (string, error) { + rootPath, err := Root() if err != nil { return "", err } - path := filepath.Join(rootPath, id) - return path, nil + clos := m.id + if m.config.IntelRdt != nil && m.config.IntelRdt.ClosID != "" { + clos = m.config.IntelRdt.ClosID + } + + return filepath.Join(rootPath, clos), nil } // Applies Intel RDT configuration to the process with the specified pid -func (m *IntelRdtManager) Apply(pid int) (err error) { +func (m *intelRdtManager) Apply(pid int) (err error) { // If intelRdt is not specified in config, we do nothing - if m.Config.IntelRdt == nil { + if m.config.IntelRdt == nil { return nil } - d, err := getIntelRdtData(m.Config, pid) - if err != nil && !IsNotFound(err) { - return err - } - m.mu.Lock() - defer m.mu.Unlock() - path, err := d.join(m.Id) + path, err := m.getIntelRdtPath() if err != nil { return err } - m.Path = path + m.mu.Lock() + defer m.mu.Unlock() + + if m.config.IntelRdt.ClosID != "" && m.config.IntelRdt.L3CacheSchema == "" && m.config.IntelRdt.MemBwSchema == "" { + // Check that the CLOS exists, i.e. 
it has been pre-configured to + // conform with the runtime spec + if _, err := os.Stat(path); err != nil { + return fmt.Errorf("clos dir not accessible (must be pre-created when l3CacheSchema and memBwSchema are empty): %w", err) + } + } + + if err := os.MkdirAll(path, 0o755); err != nil { + return newLastCmdError(err) + } + + if err := WriteIntelRdtTasks(path, pid); err != nil { + return newLastCmdError(err) + } + + m.path = path return nil } -// Destroys the Intel RDT 'container_id' group -func (m *IntelRdtManager) Destroy() error { - m.mu.Lock() - defer m.mu.Unlock() - if err := os.RemoveAll(m.GetPath()); err != nil { - return err +// Destroys the Intel RDT container-specific 'container_id' group +func (m *intelRdtManager) Destroy() error { + // Don't remove resctrl group if closid has been explicitly specified. The + // group is likely externally managed, i.e. by some other entity than us. + // There are probably other containers/tasks sharing the same group. + if m.config.IntelRdt == nil || m.config.IntelRdt.ClosID == "" { + m.mu.Lock() + defer m.mu.Unlock() + if err := os.RemoveAll(m.GetPath()); err != nil { + return err + } + m.path = "" } - m.Path = "" return nil } // Returns Intel RDT path to save in a state file and to be able to // restore the object later -func (m *IntelRdtManager) GetPath() string { - if m.Path == "" { - m.Path, _ = GetIntelRdtPath(m.Id) +func (m *intelRdtManager) GetPath() string { + if m.path == "" { + m.path, _ = m.getIntelRdtPath() } - return m.Path + return m.path } // Returns statistics for Intel RDT -func (m *IntelRdtManager) GetStats() (*Stats, error) { +func (m *intelRdtManager) GetStats() (*Stats, error) { // If intelRdt is not specified in config - if m.Config.IntelRdt == nil { + if m.config.IntelRdt == nil { return nil, nil } m.mu.Lock() defer m.mu.Unlock() - stats := NewStats() + stats := newStats() - rootPath, err := getIntelRdtRoot() + rootPath, err := Root() if err != nil { return nil, err } @@ -585,14 +605,15 @@ func 
(m *IntelRdtManager) GetStats() (*Stats, error) { } schemaRootStrings := strings.Split(tmpRootStrings, "\n") - // The L3 cache and memory bandwidth schemata in 'container_id' group - tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata") + // The L3 cache and memory bandwidth schemata in container's clos group + containerPath := m.GetPath() + tmpStrings, err := getIntelRdtParamString(containerPath, "schemata") if err != nil { return nil, err } schemaStrings := strings.Split(tmpStrings, "\n") - if IsCatEnabled() { + if IsCATEnabled() { // The read-only L3 cache information l3CacheInfo, err := getL3CacheInfo() if err != nil { @@ -607,7 +628,7 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) { } } - // The L3 cache schema in 'container_id' group + // The L3 cache schema in container's clos group for _, schema := range schemaStrings { if strings.Contains(schema, "L3") { stats.L3CacheSchema = strings.TrimSpace(schema) @@ -615,7 +636,7 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) { } } - if IsMbaEnabled() { + if IsMBAEnabled() { // The read-only memory bandwidth information memBwInfo, err := getMemBwInfo() if err != nil { @@ -630,7 +651,7 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) { } } - // The memory bandwidth schema in 'container_id' group + // The memory bandwidth schema in container's clos group for _, schema := range schemaStrings { if strings.Contains(schema, "MB") { stats.MemBwSchema = strings.TrimSpace(schema) @@ -638,11 +659,18 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) { } } + if IsMBMEnabled() || IsCMTEnabled() { + err = getMonitoringStats(containerPath, stats) + if err != nil { + return nil, err + } + } + return stats, nil } // Set Intel RDT "resource control" filesystem as configured. 
-func (m *IntelRdtManager) Set(container *configs.Config) error { +func (m *intelRdtManager) Set(container *configs.Config) error { // About L3 cache schema: // It has allocation bitmasks/values for L3 cache on each socket, // which contains L3 cache id and capacity bitmask (CBM). @@ -693,24 +721,30 @@ func (m *IntelRdtManager) Set(container *configs.Config) error { l3CacheSchema := container.IntelRdt.L3CacheSchema memBwSchema := container.IntelRdt.MemBwSchema + // TODO: verify that l3CacheSchema and/or memBwSchema match the + // existing schemata if ClosID has been specified. This is a more + // involved than reading the file and doing plain string comparison as + // the value written in does not necessarily match what gets read out + // (leading zeros, cache id ordering etc). + // Write a single joint schema string to schemata file if l3CacheSchema != "" && memBwSchema != "" { if err := writeFile(path, "schemata", l3CacheSchema+"\n"+memBwSchema); err != nil { - return NewLastCmdError(err) + return err } } // Write only L3 cache schema string to schemata file if l3CacheSchema != "" && memBwSchema == "" { if err := writeFile(path, "schemata", l3CacheSchema); err != nil { - return NewLastCmdError(err) + return err } } // Write only memory bandwidth schema string to schemata file if l3CacheSchema == "" && memBwSchema != "" { if err := writeFile(path, "schemata", memBwSchema); err != nil { - return NewLastCmdError(err) + return err } } } @@ -718,56 +752,10 @@ func (m *IntelRdtManager) Set(container *configs.Config) error { return nil } -func (raw *intelRdtData) join(id string) (string, error) { - path := filepath.Join(raw.root, id) - if err := os.MkdirAll(path, 0755); err != nil { - return "", NewLastCmdError(err) - } - - if err := WriteIntelRdtTasks(path, raw.pid); err != nil { - return "", NewLastCmdError(err) - } - return path, nil -} - -type NotFoundError struct { - ResourceControl string -} - -func (e *NotFoundError) Error() string { - return 
fmt.Sprintf("mountpoint for %s not found", e.ResourceControl) -} - -func NewNotFoundError(res string) error { - return &NotFoundError{ - ResourceControl: res, - } -} - -func IsNotFound(err error) bool { - if err == nil { - return false - } - _, ok := err.(*NotFoundError) - return ok -} - -type LastCmdError struct { - LastCmdStatus string - Err error -} - -func (e *LastCmdError) Error() string { - return fmt.Sprintf(e.Err.Error() + ", last_cmd_status: " + e.LastCmdStatus) -} - -func NewLastCmdError(err error) error { - lastCmdStatus, err1 := getLastCmdStatus() +func newLastCmdError(err error) error { + status, err1 := getLastCmdStatus() if err1 == nil { - return &LastCmdError{ - LastCmdStatus: lastCmdStatus, - Err: err, - } + return fmt.Errorf("%w, last_cmd_status: %s", err, status) } return err } diff --git a/libcontainer/intelrdt/intelrdt_test.go b/libcontainer/intelrdt/intelrdt_test.go index a19b961..3534b43 100644 --- a/libcontainer/intelrdt/intelrdt_test.go +++ b/libcontainer/intelrdt/intelrdt_test.go @@ -1,19 +1,16 @@ -// +build linux - package intelrdt import ( + "errors" + "io" + "os" + "path/filepath" "strings" "testing" ) func TestIntelRdtSetL3CacheSchema(t *testing.T) { - if !IsCatEnabled() { - return - } - helper := NewIntelRdtTestUtil(t) - defer helper.cleanup() const ( l3CacheSchemaBefore = "L3:0=f;1=f0" @@ -24,12 +21,9 @@ func TestIntelRdtSetL3CacheSchema(t *testing.T) { "schemata": l3CacheSchemaBefore + "\n", }) - helper.IntelRdtData.config.IntelRdt.L3CacheSchema = l3CacheSchemeAfter - intelrdt := &IntelRdtManager{ - Config: helper.IntelRdtData.config, - Path: helper.IntelRdtPath, - } - if err := intelrdt.Set(helper.IntelRdtData.config); err != nil { + helper.config.IntelRdt.L3CacheSchema = l3CacheSchemeAfter + intelrdt := NewManager(helper.config, "", helper.IntelRdtPath) + if err := intelrdt.Set(helper.config); err != nil { t.Fatal(err) } @@ -46,12 +40,7 @@ func TestIntelRdtSetL3CacheSchema(t *testing.T) { } func TestIntelRdtSetMemBwSchema(t 
*testing.T) { - if !IsMbaEnabled() { - return - } - helper := NewIntelRdtTestUtil(t) - defer helper.cleanup() const ( memBwSchemaBefore = "MB:0=20;1=70" @@ -62,12 +51,9 @@ func TestIntelRdtSetMemBwSchema(t *testing.T) { "schemata": memBwSchemaBefore + "\n", }) - helper.IntelRdtData.config.IntelRdt.MemBwSchema = memBwSchemeAfter - intelrdt := &IntelRdtManager{ - Config: helper.IntelRdtData.config, - Path: helper.IntelRdtPath, - } - if err := intelrdt.Set(helper.IntelRdtData.config); err != nil { + helper.config.IntelRdt.MemBwSchema = memBwSchemeAfter + intelrdt := NewManager(helper.config, "", helper.IntelRdtPath) + if err := intelrdt.Set(helper.config); err != nil { t.Fatal(err) } @@ -84,12 +70,7 @@ func TestIntelRdtSetMemBwSchema(t *testing.T) { } func TestIntelRdtSetMemBwScSchema(t *testing.T) { - if !IsMbaScEnabled() { - return - } - helper := NewIntelRdtTestUtil(t) - defer helper.cleanup() const ( memBwScSchemaBefore = "MB:0=5000;1=7000" @@ -100,12 +81,9 @@ func TestIntelRdtSetMemBwScSchema(t *testing.T) { "schemata": memBwScSchemaBefore + "\n", }) - helper.IntelRdtData.config.IntelRdt.MemBwSchema = memBwScSchemeAfter - intelrdt := &IntelRdtManager{ - Config: helper.IntelRdtData.config, - Path: helper.IntelRdtPath, - } - if err := intelrdt.Set(helper.IntelRdtData.config); err != nil { + helper.config.IntelRdt.MemBwSchema = memBwScSchemeAfter + intelrdt := NewManager(helper.config, "", helper.IntelRdtPath) + if err := intelrdt.Set(helper.config); err != nil { t.Fatal(err) } @@ -120,3 +98,170 @@ func TestIntelRdtSetMemBwScSchema(t *testing.T) { t.Fatal("Got the wrong value, set 'schemata' failed.") } } + +func TestApply(t *testing.T) { + helper := NewIntelRdtTestUtil(t) + + const closID = "test-clos" + + helper.config.IntelRdt.ClosID = closID + intelrdt := NewManager(helper.config, "", helper.IntelRdtPath) + if err := intelrdt.Apply(1234); err == nil { + t.Fatal("unexpected success when applying pid") + } + if _, err := os.Stat(filepath.Join(helper.IntelRdtPath, 
closID)); err == nil { + t.Fatal("closid dir should not exist") + } + + // Dir should be created if some schema has been specified + intelrdt.(*intelRdtManager).config.IntelRdt.L3CacheSchema = "L3:0=f" + if err := intelrdt.Apply(1235); err != nil { + t.Fatalf("Apply() failed: %v", err) + } + + pids, err := getIntelRdtParamString(intelrdt.GetPath(), "tasks") + if err != nil { + t.Fatalf("failed to read tasks file: %v", err) + } + if pids != "1235" { + t.Fatalf("unexpected tasks file, expected '1235', got %q", pids) + } +} + +const ( + mountinfoValid = `18 40 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw +19 40 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw +20 40 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=131927256k,nr_inodes=32981814,mode=755 +21 18 0:17 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw +22 20 0:19 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw +23 20 0:12 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,gid=5,mode=620,ptmxmode=000 +24 40 0:20 / /run rw,nosuid,nodev shared:22 - tmpfs tmpfs rw,mode=755 +25 18 0:21 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,mode=755 +26 25 0:22 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd +27 18 0:23 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw +28 25 0:24 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,perf_event +29 25 0:25 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuacct,cpu +30 25 0:26 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,memory +31 25 0:27 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,devices +32 25 0:28 / /sys/fs/cgroup/hugetlb 
rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,hugetlb +33 25 0:29 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,blkio +34 25 0:30 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,pids +35 25 0:31 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset +36 25 0:32 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,freezer +37 25 0:33 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,net_prio,net_cls +38 18 0:34 / /sys/kernel/config rw,relatime shared:21 - configfs configfs rw +40 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered +16 18 0:6 / /sys/kernel/debug rw,relatime shared:23 - debugfs debugfs rw +41 18 0:16 / /sys/fs/resctrl rw,relatime shared:24 - resctrl resctrl rw +42 20 0:36 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw +43 19 0:37 / /proc/sys/fs/binfmt_misc rw,relatime shared:26 - autofs systemd-1 rw,fd=32,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=35492 +44 20 0:15 / /dev/mqueue rw,relatime shared:27 - mqueue mqueue rw +45 40 8:1 / /boot rw,relatime shared:28 - ext4 /dev/sda1 rw,stripe=4,data=ordered +46 40 253:1 / /home rw,relatime shared:29 - ext4 /dev/mapper/vvhg-vvhg rw,data=ordered +47 40 0:38 / /var/lib/nfs/rpc_pipefs rw,relatime shared:30 - rpc_pipefs sunrpc rw +125 24 0:20 /mesos/containers /run/mesos/containers rw,nosuid shared:22 - tmpfs tmpfs rw,mode=755 +123 40 253:0 /var/lib/docker/containers /var/lib/docker/containers rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered +129 40 253:0 /var/lib/docker/overlay2 /var/lib/docker/overlay2 rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered +119 24 0:39 / /run/user/1009 rw,nosuid,nodev,relatime shared:100 - tmpfs tmpfs rw,size=26387788k,mode=700,uid=1009,gid=1009` + + mountinfoMbaSc = `18 40 0:18 / /sys 
rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw +19 40 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw +20 40 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=131927256k,nr_inodes=32981814,mode=755 +21 18 0:17 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw +22 20 0:19 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw +23 20 0:12 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,gid=5,mode=620,ptmxmode=000 +24 40 0:20 / /run rw,nosuid,nodev shared:22 - tmpfs tmpfs rw,mode=755 +25 18 0:21 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,mode=755 +26 25 0:22 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd +27 18 0:23 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw +28 25 0:24 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,perf_event +29 25 0:25 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuacct,cpu +30 25 0:26 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,memory +31 25 0:27 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,devices +32 25 0:28 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,hugetlb +33 25 0:29 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,blkio +34 25 0:30 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,pids +35 25 0:31 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset +36 25 0:32 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,freezer +37 25 0:33 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup 
rw,net_prio,net_cls +38 18 0:34 / /sys/kernel/config rw,relatime shared:21 - configfs configfs rw +40 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered +16 18 0:6 / /sys/kernel/debug rw,relatime shared:23 - debugfs debugfs rw +41 18 0:16 / /sys/fs/resctrl rw,relatime shared:24 - resctrl resctrl rw,mba_MBps +42 20 0:36 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw +43 19 0:37 / /proc/sys/fs/binfmt_misc rw,relatime shared:26 - autofs systemd-1 rw,fd=32,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=35492 +44 20 0:15 / /dev/mqueue rw,relatime shared:27 - mqueue mqueue rw +45 40 8:1 / /boot rw,relatime shared:28 - ext4 /dev/sda1 rw,stripe=4,data=ordered +46 40 253:1 / /home rw,relatime shared:29 - ext4 /dev/mapper/vvhg-vvhg rw,data=ordered +47 40 0:38 / /var/lib/nfs/rpc_pipefs rw,relatime shared:30 - rpc_pipefs sunrpc rw +125 24 0:20 /mesos/containers /run/mesos/containers rw,nosuid shared:22 - tmpfs tmpfs rw,mode=755 +123 40 253:0 /var/lib/docker/containers /var/lib/docker/containers rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered +129 40 253:0 /var/lib/docker/overlay2 /var/lib/docker/overlay2 rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered +119 24 0:39 / /run/user/1009 rw,nosuid,nodev,relatime shared:100 - tmpfs tmpfs rw,size=26387788k,mode=700,uid=1009,gid=1009` +) + +func TestFindIntelRdtMountpointDir(t *testing.T) { + testCases := []struct { + name string + input io.Reader + isNotFoundError bool + isError bool + mbaScEnabled bool + mountpoint string + }{ + { + name: "Valid mountinfo with MBA Software Controller disabled", + input: strings.NewReader(mountinfoValid), + mountpoint: "/sys/fs/resctrl", + }, + { + name: "Valid mountinfo with MBA Software Controller enabled", + input: strings.NewReader(mountinfoMbaSc), + mbaScEnabled: true, + mountpoint: "/sys/fs/resctrl", + }, + { + name: "Empty mountinfo", + input: strings.NewReader(""), + isNotFoundError: true, + }, + { + name: "Broken mountinfo", 
+ input: strings.NewReader("baa"), + isError: true, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + mbaScEnabled = false + mp, err := findIntelRdtMountpointDir(tc.input) + if tc.isNotFoundError { + if !errors.Is(err, errNotFound) { + t.Errorf("expected errNotFound error, got %+v", err) + } + return + } + if tc.isError { + if err == nil { + t.Error("expected error, got nil") + } + return + } + if err != nil { + t.Errorf("expected nil, got %+v", err) + return + } + // no errors, check the results + if tc.mbaScEnabled != mbaScEnabled { + t.Errorf("expected mbaScEnabled=%v, got %v", + tc.mbaScEnabled, mbaScEnabled) + } + if tc.mountpoint != mp { + t.Errorf("expected mountpoint=%q, got %q", + tc.mountpoint, mp) + } + }) + } +} diff --git a/libcontainer/intelrdt/mbm.go b/libcontainer/intelrdt/mbm.go new file mode 100644 index 0000000..13f31ac --- /dev/null +++ b/libcontainer/intelrdt/mbm.go @@ -0,0 +1,31 @@ +package intelrdt + +// The flag to indicate if Intel RDT/MBM is enabled +var mbmEnabled bool + +// Check if Intel RDT/MBM is enabled. 
+func IsMBMEnabled() bool { + featuresInit() + return mbmEnabled +} + +func getMBMNumaNodeStats(numaPath string) (*MBMNumaNodeStats, error) { + stats := &MBMNumaNodeStats{} + if enabledMonFeatures.mbmTotalBytes { + mbmTotalBytes, err := getIntelRdtParamUint(numaPath, "mbm_total_bytes") + if err != nil { + return nil, err + } + stats.MBMTotalBytes = mbmTotalBytes + } + + if enabledMonFeatures.mbmLocalBytes { + mbmLocalBytes, err := getIntelRdtParamUint(numaPath, "mbm_local_bytes") + if err != nil { + return nil, err + } + stats.MBMLocalBytes = mbmLocalBytes + } + + return stats, nil +} diff --git a/libcontainer/intelrdt/mbm_test.go b/libcontainer/intelrdt/mbm_test.go new file mode 100644 index 0000000..4f22cbd --- /dev/null +++ b/libcontainer/intelrdt/mbm_test.go @@ -0,0 +1,53 @@ +package intelrdt + +import ( + "path/filepath" + "testing" +) + +func TestGetMBMNumaNodeStats(t *testing.T) { + mocksNUMANodesToCreate := []string{"mon_l3_00", "mon_l3_01"} + + mocksFilesToCreate := map[string]uint64{ + "mbm_total_bytes": 9123911, + "mbm_local_bytes": 2361361, + } + + mockedL3_MON := mockResctrlL3_MON(t, mocksNUMANodesToCreate, mocksFilesToCreate) + + t.Run("Gather mbm", func(t *testing.T) { + enabledMonFeatures.mbmTotalBytes = true + enabledMonFeatures.mbmLocalBytes = true + + stats := make([]MBMNumaNodeStats, 0, len(mocksNUMANodesToCreate)) + for _, numa := range mocksNUMANodesToCreate { + other, err := getMBMNumaNodeStats(filepath.Join(mockedL3_MON, "mon_data", numa)) + if err != nil { + t.Fatal(err) + } + stats = append(stats, *other) + } + + expectedStats := MBMNumaNodeStats{ + MBMTotalBytes: mocksFilesToCreate["mbm_total_bytes"], + MBMLocalBytes: mocksFilesToCreate["mbm_local_bytes"], + } + + checkMBMStatCorrection(stats[0], expectedStats, t) + checkMBMStatCorrection(stats[1], expectedStats, t) + }) +} + +func checkMBMStatCorrection(got MBMNumaNodeStats, expected MBMNumaNodeStats, t *testing.T) { + if got.MBMTotalBytes != expected.MBMTotalBytes { + t.Fatalf("Wrong 
value of mbm_total_bytes. Expected: %v but got: %v", + expected.MBMTotalBytes, + got.MBMTotalBytes) + } + + if got.MBMLocalBytes != expected.MBMLocalBytes { + t.Fatalf("Wrong value of mbm_local_bytes. Expected: %v but got: %v", + expected.MBMLocalBytes, + got.MBMLocalBytes) + } +} diff --git a/libcontainer/intelrdt/monitoring.go b/libcontainer/intelrdt/monitoring.go new file mode 100644 index 0000000..82e0002 --- /dev/null +++ b/libcontainer/intelrdt/monitoring.go @@ -0,0 +1,83 @@ +package intelrdt + +import ( + "bufio" + "io" + "os" + "path/filepath" + + "github.com/sirupsen/logrus" +) + +var enabledMonFeatures monFeatures + +type monFeatures struct { + mbmTotalBytes bool + mbmLocalBytes bool + llcOccupancy bool +} + +func getMonFeatures(intelRdtRoot string) (monFeatures, error) { + file, err := os.Open(filepath.Join(intelRdtRoot, "info", "L3_MON", "mon_features")) + if err != nil { + return monFeatures{}, err + } + defer file.Close() + return parseMonFeatures(file) +} + +func parseMonFeatures(reader io.Reader) (monFeatures, error) { + scanner := bufio.NewScanner(reader) + + monFeatures := monFeatures{} + + for scanner.Scan() { + switch feature := scanner.Text(); feature { + case "mbm_total_bytes": + monFeatures.mbmTotalBytes = true + case "mbm_local_bytes": + monFeatures.mbmLocalBytes = true + case "llc_occupancy": + monFeatures.llcOccupancy = true + default: + logrus.Warnf("Unsupported Intel RDT monitoring feature: %s", feature) + } + } + + return monFeatures, scanner.Err() +} + +func getMonitoringStats(containerPath string, stats *Stats) error { + numaFiles, err := os.ReadDir(filepath.Join(containerPath, "mon_data")) + if err != nil { + return err + } + + var mbmStats []MBMNumaNodeStats + var cmtStats []CMTNumaNodeStats + + for _, file := range numaFiles { + if file.IsDir() { + numaPath := filepath.Join(containerPath, "mon_data", file.Name()) + if IsMBMEnabled() { + numaMBMStats, err := getMBMNumaNodeStats(numaPath) + if err != nil { + return err + } + mbmStats 
= append(mbmStats, *numaMBMStats) + } + if IsCMTEnabled() { + numaCMTStats, err := getCMTNumaNodeStats(numaPath) + if err != nil { + return err + } + cmtStats = append(cmtStats, *numaCMTStats) + } + } + } + + stats.MBMStats = &mbmStats + stats.CMTStats = &cmtStats + + return err +} diff --git a/libcontainer/intelrdt/monitoring_test.go b/libcontainer/intelrdt/monitoring_test.go new file mode 100644 index 0000000..0a89ef2 --- /dev/null +++ b/libcontainer/intelrdt/monitoring_test.go @@ -0,0 +1,103 @@ +package intelrdt + +import ( + "os" + "path/filepath" + "strconv" + "strings" + "testing" +) + +func TestParseMonFeatures(t *testing.T) { + t.Run("All features available", func(t *testing.T) { + parsedMonFeatures, err := parseMonFeatures( + strings.NewReader("mbm_total_bytes\nmbm_local_bytes\nllc_occupancy")) + if err != nil { + t.Errorf("Error while parsing mon features err = %v", err) + } + + expectedMonFeatures := monFeatures{true, true, true} + + if parsedMonFeatures != expectedMonFeatures { + t.Error("Cannot gather all features!") + } + }) + + t.Run("No features available", func(t *testing.T) { + parsedMonFeatures, err := parseMonFeatures(strings.NewReader("")) + if err != nil { + t.Errorf("Error while parsing mon features err = %v", err) + } + + expectedMonFeatures := monFeatures{false, false, false} + + if parsedMonFeatures != expectedMonFeatures { + t.Error("Expected no features available but there is any!") + } + }) +} + +func mockResctrlL3_MON(t *testing.T, NUMANodes []string, mocks map[string]uint64) string { + t.Helper() + testDir := t.TempDir() + monDataPath := filepath.Join(testDir, "mon_data") + + for _, numa := range NUMANodes { + numaPath := filepath.Join(monDataPath, numa) + err := os.MkdirAll(numaPath, 0o700) + if err != nil { + t.Fatal(err) + } + + for fileName, value := range mocks { + err := os.WriteFile(filepath.Join(numaPath, fileName), []byte(strconv.FormatUint(value, 10)), 0o644) + if err != nil { + t.Fatal(err) + } + } + + } + + return testDir 
+} + +func TestGetMonitoringStats(t *testing.T) { + enabledMonFeatures.mbmTotalBytes = true + enabledMonFeatures.mbmLocalBytes = true + enabledMonFeatures.llcOccupancy = true + mbmEnabled = true + cmtEnabled = true + + mocksNUMANodesToCreate := []string{"mon_l3_00", "mon_l3_01"} + + mocksFilesToCreate := map[string]uint64{ + "mbm_total_bytes": 9123911, + "mbm_local_bytes": 2361361, + "llc_occupancy": 123331, + } + + mockedL3_MON := mockResctrlL3_MON(t, mocksNUMANodesToCreate, mocksFilesToCreate) + + t.Run("Gather monitoring stats", func(t *testing.T) { + var stats Stats + err := getMonitoringStats(mockedL3_MON, &stats) + if err != nil { + t.Fatal(err) + } + + expectedMBMStats := MBMNumaNodeStats{ + MBMTotalBytes: mocksFilesToCreate["mbm_total_bytes"], + MBMLocalBytes: mocksFilesToCreate["mbm_local_bytes"], + } + + expectedCMTStats := CMTNumaNodeStats{LLCOccupancy: mocksFilesToCreate["llc_occupancy"]} + + for _, gotMBMStat := range *stats.MBMStats { + checkMBMStatCorrection(gotMBMStat, expectedMBMStats, t) + } + + for _, gotCMTStat := range *stats.CMTStats { + checkCMTStatCorrection(gotCMTStat, expectedCMTStats, t) + } + }) +} diff --git a/libcontainer/intelrdt/stats.go b/libcontainer/intelrdt/stats.go index df5686f..a5eb254 100644 --- a/libcontainer/intelrdt/stats.go +++ b/libcontainer/intelrdt/stats.go @@ -1,5 +1,3 @@ -// +build linux - package intelrdt type L3CacheInfo struct { @@ -15,6 +13,19 @@ type MemBwInfo struct { NumClosids uint64 `json:"num_closids,omitempty"` } +type MBMNumaNodeStats struct { + // The 'mbm_total_bytes' in 'container_id' group. + MBMTotalBytes uint64 `json:"mbm_total_bytes"` + + // The 'mbm_local_bytes' in 'container_id' group. + MBMLocalBytes uint64 `json:"mbm_local_bytes"` +} + +type CMTNumaNodeStats struct { + // The 'llc_occupancy' in 'container_id' group. 
+ LLCOccupancy uint64 `json:"llc_occupancy"` +} + type Stats struct { // The read-only L3 cache information L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"` @@ -33,8 +44,14 @@ type Stats struct { // The memory bandwidth schema in 'container_id' group MemBwSchema string `json:"mem_bw_schema,omitempty"` + + // The memory bandwidth monitoring statistics from NUMA nodes in 'container_id' group + MBMStats *[]MBMNumaNodeStats `json:"mbm_stats,omitempty"` + + // The cache monitoring technology statistics from NUMA nodes in 'container_id' group + CMTStats *[]CMTNumaNodeStats `json:"cmt_stats,omitempty"` } -func NewStats() *Stats { +func newStats() *Stats { return &Stats{} } diff --git a/libcontainer/intelrdt/util_test.go b/libcontainer/intelrdt/util_test.go index 970b6ce..f1b4244 100644 --- a/libcontainer/intelrdt/util_test.go +++ b/libcontainer/intelrdt/util_test.go @@ -1,5 +1,3 @@ -// +build linux - /* * Utility for testing Intel RDT operations. * Creates a mock of the Intel RDT "resource control" filesystem for the duration of the test. 
@@ -7,7 +5,6 @@ package intelrdt import ( - "io/ioutil" "os" "path/filepath" "testing" @@ -16,44 +13,27 @@ import ( ) type intelRdtTestUtil struct { - // intelRdt data to use in tests - IntelRdtData *intelRdtData + config *configs.Config // Path to the mock Intel RDT "resource control" filesystem directory IntelRdtPath string - // Temporary directory to store mock Intel RDT "resource control" filesystem - tempDir string - t *testing.T + t *testing.T } // Creates a new test util func NewIntelRdtTestUtil(t *testing.T) *intelRdtTestUtil { - d := &intelRdtData{ - config: &configs.Config{ - IntelRdt: &configs.IntelRdt{}, - }, - } - tempDir, err := ioutil.TempDir("", "intelrdt_test") - if err != nil { - t.Fatal(err) - } - d.root = tempDir - testIntelRdtPath := filepath.Join(d.root, "resctrl") - if err != nil { - t.Fatal(err) + config := &configs.Config{ + IntelRdt: &configs.IntelRdt{}, } + intelRdtRoot = t.TempDir() + testIntelRdtPath := filepath.Join(intelRdtRoot, "resctrl") // Ensure the full mock Intel RDT "resource control" filesystem path exists - err = os.MkdirAll(testIntelRdtPath, 0755) - if err != nil { + if err := os.MkdirAll(testIntelRdtPath, 0o755); err != nil { t.Fatal(err) } - return &intelRdtTestUtil{IntelRdtData: d, IntelRdtPath: testIntelRdtPath, tempDir: tempDir, t: t} -} - -func (c *intelRdtTestUtil) cleanup() { - os.RemoveAll(c.tempDir) + return &intelRdtTestUtil{config: config, IntelRdtPath: testIntelRdtPath, t: t} } // Write the specified contents on the mock of the specified Intel RDT "resource control" files diff --git a/libcontainer/keys/keyctl.go b/libcontainer/keys/keyctl.go index 74dedd5..f3a6c53 100644 --- a/libcontainer/keys/keyctl.go +++ b/libcontainer/keys/keyctl.go @@ -1,39 +1,36 @@ -// +build linux - package keys import ( + "errors" "fmt" "strconv" "strings" - "github.com/pkg/errors" - "golang.org/x/sys/unix" ) type KeySerial uint32 func JoinSessionKeyring(name string) (KeySerial, error) { - sessKeyId, err := 
unix.KeyctlJoinSessionKeyring(name) + sessKeyID, err := unix.KeyctlJoinSessionKeyring(name) if err != nil { - return 0, errors.Wrap(err, "create session key") + return 0, fmt.Errorf("unable to create session key: %w", err) } - return KeySerial(sessKeyId), nil + return KeySerial(sessKeyID), nil } // ModKeyringPerm modifies permissions on a keyring by reading the current permissions, // anding the bits with the given mask (clearing permissions) and setting // additional permission bits -func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { - dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringId)) +func ModKeyringPerm(ringID KeySerial, mask, setbits uint32) error { + dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringID)) if err != nil { return err } res := strings.Split(dest, ";") if len(res) < 5 { - return fmt.Errorf("Destination buffer for key description is too small") + return errors.New("Destination buffer for key description is too small") } // parse permissions @@ -44,5 +41,5 @@ func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { perm := (uint32(perm64) & mask) | setbits - return unix.KeyctlSetperm(int(ringId), perm) + return unix.KeyctlSetperm(int(ringID), perm) } diff --git a/libcontainer/logs/logs.go b/libcontainer/logs/logs.go index 1077e7b..95deb0d 100644 --- a/libcontainer/logs/logs.go +++ b/libcontainer/logs/logs.go @@ -3,100 +3,54 @@ package logs import ( "bufio" "encoding/json" - "fmt" "io" - "os" - "strconv" - "sync" "github.com/sirupsen/logrus" ) -var ( - configureMutex = sync.Mutex{} - // loggingConfigured will be set once logging has been configured via invoking `ConfigureLogging`. 
- // Subsequent invocations of `ConfigureLogging` would be no-op - loggingConfigured = false -) +func ForwardLogs(logPipe io.ReadCloser) chan error { + done := make(chan error, 1) + s := bufio.NewScanner(logPipe) -type Config struct { - LogLevel logrus.Level - LogFormat string - LogFilePath string - LogPipeFd string -} - -func ForwardLogs(logPipe io.Reader) { - lineReader := bufio.NewReader(logPipe) - for { - line, err := lineReader.ReadBytes('\n') - if len(line) > 0 { - processEntry(line) - } - if err == io.EOF { - logrus.Debugf("log pipe has been closed: %+v", err) - return - } - if err != nil { - logrus.Errorf("log pipe read error: %+v", err) - } - } -} - -func processEntry(text []byte) { - type jsonLog struct { - Level string `json:"level"` - Msg string `json:"msg"` + logger := logrus.StandardLogger() + if logger.ReportCaller { + // Need a copy of the standard logger, but with ReportCaller + // turned off, as the logs are merely forwarded and their + // true source is not this file/line/function. + logNoCaller := *logrus.StandardLogger() + logNoCaller.ReportCaller = false + logger = &logNoCaller } - var jl jsonLog + go func() { + for s.Scan() { + processEntry(s.Bytes(), logger) + } + if err := logPipe.Close(); err != nil { + logrus.Errorf("error closing log source: %v", err) + } + // The only error we want to return is when reading from + // logPipe has failed. 
+ done <- s.Err() + close(done) + }() + + return done +} + +func processEntry(text []byte, logger *logrus.Logger) { + if len(text) == 0 { + return + } + + var jl struct { + Level logrus.Level `json:"level"` + Msg string `json:"msg"` + } if err := json.Unmarshal(text, &jl); err != nil { - logrus.Errorf("failed to decode %q to json: %+v", text, err) + logrus.Errorf("failed to decode %q to json: %v", text, err) return } - lvl, err := logrus.ParseLevel(jl.Level) - if err != nil { - logrus.Errorf("failed to parse log level %q: %v\n", jl.Level, err) - return - } - logrus.StandardLogger().Logf(lvl, jl.Msg) -} - -func ConfigureLogging(config Config) error { - configureMutex.Lock() - defer configureMutex.Unlock() - - if loggingConfigured { - logrus.Debug("logging has already been configured") - return nil - } - - logrus.SetLevel(config.LogLevel) - - if config.LogPipeFd != "" { - logPipeFdInt, err := strconv.Atoi(config.LogPipeFd) - if err != nil { - return fmt.Errorf("failed to convert _LIBCONTAINER_LOGPIPE environment variable value %q to int: %v", config.LogPipeFd, err) - } - logrus.SetOutput(os.NewFile(uintptr(logPipeFdInt), "logpipe")) - } else if config.LogFilePath != "" { - f, err := os.OpenFile(config.LogFilePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0644) - if err != nil { - return err - } - logrus.SetOutput(f) - } - - switch config.LogFormat { - case "text": - // retain logrus's default. 
- case "json": - logrus.SetFormatter(new(logrus.JSONFormatter)) - default: - return fmt.Errorf("unknown log-format %q", config.LogFormat) - } - - loggingConfigured = true - return nil + logger.Log(jl.Level, jl.Msg) } diff --git a/libcontainer/logs/logs_linux_test.go b/libcontainer/logs/logs_linux_test.go index 83166fa..1264048 100644 --- a/libcontainer/logs/logs_linux_test.go +++ b/libcontainer/logs/logs_linux_test.go @@ -1,160 +1,166 @@ package logs import ( - "errors" - "io/ioutil" + "bytes" + "io" "os" - "strings" "testing" "time" "github.com/sirupsen/logrus" ) +const msgErr = `"level":"error"` + func TestLoggingToFile(t *testing.T) { - logW, logFile, _ := runLogForwarding(t) - defer os.Remove(logFile) - defer logW.Close() + l := runLogForwarding(t) - logToLogWriter(t, logW, `{"level": "info","msg":"kitten"}`) - - logFileContent := waitForLogContent(t, logFile) - if !strings.Contains(string(logFileContent), "kitten") { - t.Fatalf("%s does not contain kitten", string(logFileContent)) - } + msg := `"level":"info","msg":"kitten"` + logToLogWriter(t, l, msg) + finish(t, l) + check(t, l, msg, msgErr) } func TestLogForwardingDoesNotStopOnJsonDecodeErr(t *testing.T) { - logW, logFile, _ := runLogForwarding(t) - defer os.Remove(logFile) - defer logW.Close() + l := runLogForwarding(t) - logToLogWriter(t, logW, "invalid-json-with-kitten") + logToLogWriter(t, l, `"invalid-json-with-kitten"`) + checkWait(t, l, msgErr, "") - logFileContent := waitForLogContent(t, logFile) - if !strings.Contains(string(logFileContent), "failed to decode") { - t.Fatalf("%q does not contain decoding error", string(logFileContent)) - } + truncateLogFile(t, l.file) - truncateLogFile(t, logFile) - - logToLogWriter(t, logW, `{"level": "info","msg":"puppy"}`) - - logFileContent = waitForLogContent(t, logFile) - if !strings.Contains(string(logFileContent), "puppy") { - t.Fatalf("%s does not contain puppy", string(logFileContent)) - } + msg := `"level":"info","msg":"puppy"` + logToLogWriter(t, l, msg) 
+ finish(t, l) + check(t, l, msg, msgErr) } func TestLogForwardingDoesNotStopOnLogLevelParsingErr(t *testing.T) { - logW, logFile, _ := runLogForwarding(t) - defer os.Remove(logFile) - defer logW.Close() + l := runLogForwarding(t) - logToLogWriter(t, logW, `{"level": "alert","msg":"puppy"}`) + msg := `"level":"alert","msg":"puppy"` + logToLogWriter(t, l, msg) + checkWait(t, l, msgErr, msg) - logFileContent := waitForLogContent(t, logFile) - if !strings.Contains(string(logFileContent), "failed to parse log level") { - t.Fatalf("%q does not contain log level parsing error", string(logFileContent)) - } + truncateLogFile(t, l.file) - truncateLogFile(t, logFile) - - logToLogWriter(t, logW, `{"level": "info","msg":"puppy"}`) - - logFileContent = waitForLogContent(t, logFile) - if !strings.Contains(string(logFileContent), "puppy") { - t.Fatalf("%s does not contain puppy", string(logFileContent)) - } + msg = `"level":"info","msg":"puppy"` + logToLogWriter(t, l, msg) + finish(t, l) + check(t, l, msg, msgErr) } func TestLogForwardingStopsAfterClosingTheWriter(t *testing.T) { - logW, logFile, doneForwarding := runLogForwarding(t) - defer os.Remove(logFile) + l := runLogForwarding(t) - logToLogWriter(t, logW, `{"level": "info","msg":"sync"}`) + msg := `"level":"info","msg":"sync"` + logToLogWriter(t, l, msg) - logFileContent := waitForLogContent(t, logFile) - if !strings.Contains(string(logFileContent), "sync") { - t.Fatalf("%q does not contain sync message", string(logFileContent)) - } - - logW.Close() + // Do not use finish() here as we check done pipe ourselves. 
+ l.w.Close() select { - case <-doneForwarding: + case <-l.done: case <-time.After(10 * time.Second): t.Fatal("log forwarding did not stop after closing the pipe") } + + check(t, l, msg, msgErr) } -func logToLogWriter(t *testing.T, logW *os.File, message string) { - _, err := logW.Write([]byte(message + "\n")) +func logToLogWriter(t *testing.T, l *log, message string) { + t.Helper() + _, err := l.w.Write([]byte("{" + message + "}\n")) if err != nil { t.Fatalf("failed to write %q to log writer: %v", message, err) } } -func runLogForwarding(t *testing.T) (*os.File, string, chan struct{}) { +type log struct { + w io.WriteCloser + file *os.File + done chan error +} + +func runLogForwarding(t *testing.T) *log { + t.Helper() logR, logW, err := os.Pipe() if err != nil { t.Fatal(err) } + t.Cleanup(func() { + logR.Close() + logW.Close() + }) - tempFile, err := ioutil.TempFile("", "") + tempFile, err := os.CreateTemp("", "") if err != nil { t.Fatal(err) } - logFile := tempFile.Name() + t.Cleanup(func() { + tempFile.Close() + os.Remove(tempFile.Name()) + }) - logConfig := Config{LogLevel: logrus.InfoLevel, LogFormat: "json", LogFilePath: logFile} - return logW, logFile, startLogForwarding(t, logConfig, logR) + logrus.SetOutput(tempFile) + logrus.SetFormatter(&logrus.JSONFormatter{}) + doneForwarding := ForwardLogs(logR) + + return &log{w: logW, done: doneForwarding, file: tempFile} } -func startLogForwarding(t *testing.T, logConfig Config, logR *os.File) chan struct{} { - loggingConfigured = false - if err := ConfigureLogging(logConfig); err != nil { - t.Fatal(err) +func finish(t *testing.T, l *log) { + t.Helper() + l.w.Close() + if err := <-l.done; err != nil { + t.Fatalf("ForwardLogs: %v", err) } - doneForwarding := make(chan struct{}) - go func() { - ForwardLogs(logR) - close(doneForwarding) - }() - return doneForwarding } -func waitForLogContent(t *testing.T, logFile string) string { - startTime := time.Now() +func truncateLogFile(t *testing.T, file *os.File) { + 
t.Helper() - for { - if time.Now().After(startTime.Add(10 * time.Second)) { - t.Fatal(errors.New("No content in log file after 10 seconds")) - break - } - - fileContent, err := ioutil.ReadFile(logFile) - if err != nil { - t.Fatal(err) - } - if len(fileContent) == 0 { - continue - } - return string(fileContent) - } - - return "" -} - -func truncateLogFile(t *testing.T, logFile string) { - file, err := os.OpenFile(logFile, os.O_RDWR, 0666) - if err != nil { - t.Fatalf("failed to open log file: %v", err) - return - } - defer file.Close() - - err = file.Truncate(0) + err := file.Truncate(0) if err != nil { t.Fatalf("failed to truncate log file: %v", err) } } + +// check checks that the file contains txt and does not contain notxt. +func check(t *testing.T, l *log, txt, notxt string) { + t.Helper() + contents, err := os.ReadFile(l.file.Name()) + if err != nil { + t.Fatal(err) + } + if txt != "" && !bytes.Contains(contents, []byte(txt)) { + t.Fatalf("%s does not contain %s", contents, txt) + } + if notxt != "" && bytes.Contains(contents, []byte(notxt)) { + t.Fatalf("%s does contain %s", contents, notxt) + } +} + +// checkWait is like check, but if the file is empty, +// it waits until it's not. 
+func checkWait(t *testing.T, l *log, txt string, notxt string) { + t.Helper() + const ( + delay = 100 * time.Millisecond + iter = 3 + ) + for i := 0; ; i++ { + st, err := l.file.Stat() + if err != nil { + t.Fatal(err) + } + if st.Size() > 0 { + break + } + if i == iter { + t.Fatalf("waited %s for file %s to be non-empty but it still is", iter*delay, l.file.Name()) + } + time.Sleep(delay) + } + + check(t, l, txt, notxt) +} diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 1d4f503..6d1107e 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -1,8 +1,9 @@ -// +build linux - package libcontainer import ( + "fmt" + "math" + "github.com/vishvananda/netlink/nl" "golang.org/x/sys/unix" ) @@ -20,6 +21,7 @@ const ( RootlessEUIDAttr uint16 = 27287 UidmapPathAttr uint16 = 27288 GidmapPathAttr uint16 = 27289 + MountSourcesAttr uint16 = 27290 ) type Int32msg struct { @@ -54,6 +56,12 @@ type Bytemsg struct { func (msg *Bytemsg) Serialize() []byte { l := msg.Len() + if l > math.MaxUint16 { + // We cannot return nil nor an error here, so we panic with + // a specific type instead, which is handled via recover in + // bootstrapData. + panic(netlinkError{fmt.Errorf("netlink: cannot serialize bytemsg of length %d (larger than UINT16_MAX)", l)}) + } buf := make([]byte, (l+unix.NLA_ALIGNTO-1) & ^(unix.NLA_ALIGNTO-1)) native := nl.NativeEndian() native.PutUint16(buf[0:2], uint16(l)) diff --git a/libcontainer/mount/mount.go b/libcontainer/mount/mount.go deleted file mode 100644 index e8965e0..0000000 --- a/libcontainer/mount/mount.go +++ /dev/null @@ -1,23 +0,0 @@ -package mount - -// GetMounts retrieves a list of mounts for the current running process. 
-func GetMounts() ([]*Info, error) { - return parseMountTable() -} - -// Mounted looks at /proc/self/mountinfo to determine of the specified -// mountpoint has been mounted -func Mounted(mountpoint string) (bool, error) { - entries, err := parseMountTable() - if err != nil { - return false, err - } - - // Search the table for the mountpoint - for _, e := range entries { - if e.Mountpoint == mountpoint { - return true, nil - } - } - return false, nil -} diff --git a/libcontainer/mount/mount_linux.go b/libcontainer/mount/mount_linux.go deleted file mode 100644 index 1e51919..0000000 --- a/libcontainer/mount/mount_linux.go +++ /dev/null @@ -1,82 +0,0 @@ -// +build linux - -package mount - -import ( - "bufio" - "fmt" - "io" - "os" - "strings" -) - -const ( - /* 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue - (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) - - (1) mount ID: unique identifier of the mount (may be reused after umount) - (2) parent ID: ID of parent (or of self for the top of the mount tree) - (3) major:minor: value of st_dev for files on filesystem - (4) root: root of the mount within the filesystem - (5) mount point: mount point relative to the process's root - (6) mount options: per mount options - (7) optional fields: zero or more fields of the form "tag[:value]" - (8) separator: marks the end of the optional fields - (9) filesystem type: name of filesystem of the form "type[.subtype]" - (10) mount source: filesystem specific information or "none" - (11) super options: per super block options*/ - mountinfoFormat = "%d %d %d:%d %s %s %s %s" -) - -// Parse /proc/self/mountinfo because comparing Dev and ino does not work from -// bind mounts -func parseMountTable() ([]*Info, error) { - f, err := os.Open("/proc/self/mountinfo") - if err != nil { - return nil, err - } - defer f.Close() - - return parseInfoFile(f) -} - -func parseInfoFile(r io.Reader) ([]*Info, error) { - var ( - s = bufio.NewScanner(r) - out = []*Info{} - ) - - 
for s.Scan() { - if err := s.Err(); err != nil { - return nil, err - } - - var ( - p = &Info{} - text = s.Text() - optionalFields string - ) - - if _, err := fmt.Sscanf(text, mountinfoFormat, - &p.ID, &p.Parent, &p.Major, &p.Minor, - &p.Root, &p.Mountpoint, &p.Opts, &optionalFields); err != nil { - return nil, fmt.Errorf("Scanning '%s' failed: %s", text, err) - } - // Safe as mountinfo encodes mountpoints with spaces as \040. - index := strings.Index(text, " - ") - postSeparatorFields := strings.Fields(text[index+3:]) - if len(postSeparatorFields) < 3 { - return nil, fmt.Errorf("Error found less than 3 fields post '-' in %q", text) - } - - if optionalFields != "-" { - p.Optional = optionalFields - } - - p.Fstype = postSeparatorFields[0] - p.Source = postSeparatorFields[1] - p.VfsOpts = strings.Join(postSeparatorFields[2:], " ") - out = append(out, p) - } - return out, nil -} diff --git a/libcontainer/mount/mountinfo.go b/libcontainer/mount/mountinfo.go deleted file mode 100644 index e3fc353..0000000 --- a/libcontainer/mount/mountinfo.go +++ /dev/null @@ -1,40 +0,0 @@ -package mount - -// Info reveals information about a particular mounted filesystem. This -// struct is populated from the content in the /proc//mountinfo file. -type Info struct { - // ID is a unique identifier of the mount (may be reused after umount). - ID int - - // Parent indicates the ID of the mount parent (or of self for the top of the - // mount tree). - Parent int - - // Major indicates one half of the device ID which identifies the device class. - Major int - - // Minor indicates one half of the device ID which identifies a specific - // instance of device. - Minor int - - // Root of the mount within the filesystem. - Root string - - // Mountpoint indicates the mount point relative to the process's root. - Mountpoint string - - // Opts represents mount-specific options. - Opts string - - // Optional represents optional fields. 
- Optional string - - // Fstype indicates the type of filesystem, such as EXT3. - Fstype string - - // Source indicates filesystem specific information or "none". - Source string - - // VfsOpts represents per super block options. - VfsOpts string -} diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go new file mode 100644 index 0000000..5f49de9 --- /dev/null +++ b/libcontainer/mount_linux.go @@ -0,0 +1,83 @@ +package libcontainer + +import ( + "strconv" + + "golang.org/x/sys/unix" +) + +// mountError holds an error from a failed mount or unmount operation. +type mountError struct { + op string + source string + target string + procfd string + flags uintptr + data string + err error +} + +// Error provides a string error representation. +func (e *mountError) Error() string { + out := e.op + " " + + if e.source != "" { + out += e.source + ":" + e.target + } else { + out += e.target + } + if e.procfd != "" { + out += " (via " + e.procfd + ")" + } + + if e.flags != uintptr(0) { + out += ", flags: 0x" + strconv.FormatUint(uint64(e.flags), 16) + } + if e.data != "" { + out += ", data: " + e.data + } + + out += ": " + e.err.Error() + return out +} + +// Unwrap returns the underlying error. +// This is a convention used by Go 1.13+ standard library. +func (e *mountError) Unwrap() error { + return e.err +} + +// mount is a simple unix.Mount wrapper. If procfd is not empty, it is used +// instead of target (and the target is only used to add context to an error). +func mount(source, target, procfd, fstype string, flags uintptr, data string) error { + dst := target + if procfd != "" { + dst = procfd + } + if err := unix.Mount(source, dst, fstype, flags, data); err != nil { + return &mountError{ + op: "mount", + source: source, + target: target, + procfd: procfd, + flags: flags, + data: data, + err: err, + } + } + return nil +} + +// unmount is a simple unix.Unmount wrapper. 
+func unmount(target string, flags int) error { + err := unix.Unmount(target, flags) + if err != nil { + return &mountError{ + op: "unmount", + target: target, + flags: uintptr(flags), + err: err, + } + } + return nil +} diff --git a/libcontainer/network_linux.go b/libcontainer/network_linux.go index 938d8ce..8915548 100644 --- a/libcontainer/network_linux.go +++ b/libcontainer/network_linux.go @@ -1,13 +1,11 @@ -// +build linux - package libcontainer import ( + "bytes" "fmt" - "io/ioutil" + "os" "path/filepath" "strconv" - "strings" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/types" @@ -75,16 +73,15 @@ func getNetworkInterfaceStats(interfaceName string) (*types.NetworkInterface, er // Reads the specified statistics available under /sys/class/net//statistics func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) { - data, err := ioutil.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile)) + data, err := os.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile)) if err != nil { return 0, err } - return strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64) + return strconv.ParseUint(string(bytes.TrimSpace(data)), 10, 64) } // loopback is a network strategy that provides a basic loopback device -type loopback struct { -} +type loopback struct{} func (l *loopback) create(n *network, nspid int) error { return nil diff --git a/libcontainer/notify_linux.go b/libcontainer/notify_linux.go index 47a0678..a876284 100644 --- a/libcontainer/notify_linux.go +++ b/libcontainer/notify_linux.go @@ -1,18 +1,14 @@ -// +build linux - package libcontainer import ( + "errors" "fmt" - "io/ioutil" "os" "path/filepath" "golang.org/x/sys/unix" ) -const oomCgroupName = "memory" - type PressureLevel uint const ( @@ -36,7 +32,7 @@ func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct eventControlPath := filepath.Join(cgDir, "cgroup.event_control") 
data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg) - if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil { + if err := os.WriteFile(eventControlPath, []byte(data), 0o700); err != nil { eventfd.Close() evFile.Close() return nil, err @@ -66,19 +62,17 @@ func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct // notifyOnOOM returns channel on which you can expect event about OOM, // if process died without OOM this channel will be closed. -func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) { - dir := paths[oomCgroupName] +func notifyOnOOM(dir string) (<-chan struct{}, error) { if dir == "" { - return nil, fmt.Errorf("path %q missing", oomCgroupName) + return nil, errors.New("memory controller missing") } return registerMemoryEvent(dir, "memory.oom_control", "") } -func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) { - dir := paths[oomCgroupName] +func notifyMemoryPressure(dir string, level PressureLevel) (<-chan struct{}, error) { if dir == "" { - return nil, fmt.Errorf("path %q missing", oomCgroupName) + return nil, errors.New("memory controller missing") } if level > CriticalPressure { diff --git a/libcontainer/notify_linux_test.go b/libcontainer/notify_linux_test.go index 1e15ae2..3967f07 100644 --- a/libcontainer/notify_linux_test.go +++ b/libcontainer/notify_linux_test.go @@ -1,11 +1,8 @@ -// +build linux - package libcontainer import ( "encoding/binary" "fmt" - "io/ioutil" "os" "path/filepath" "testing" @@ -14,30 +11,24 @@ import ( "golang.org/x/sys/unix" ) -type notifyFunc func(paths map[string]string) (<-chan struct{}, error) +type notifyFunc func(path string) (<-chan struct{}, error) func testMemoryNotification(t *testing.T, evName string, notify notifyFunc, targ string) { - memoryPath, err := ioutil.TempDir("", "testmemnotification-"+evName) - if err != nil { - t.Fatal(err) - } + memoryPath := t.TempDir() evFile := 
filepath.Join(memoryPath, evName) eventPath := filepath.Join(memoryPath, "cgroup.event_control") - if err := ioutil.WriteFile(evFile, []byte{}, 0700); err != nil { + if err := os.WriteFile(evFile, []byte{}, 0o700); err != nil { t.Fatal(err) } - if err := ioutil.WriteFile(eventPath, []byte{}, 0700); err != nil { + if err := os.WriteFile(eventPath, []byte{}, 0o700); err != nil { t.Fatal(err) } - paths := map[string]string{ - "memory": memoryPath, - } - ch, err := notify(paths) + ch, err := notify(memoryPath) if err != nil { t.Fatal("expected no error, got:", err) } - data, err := ioutil.ReadFile(eventPath) + data, err := os.ReadFile(eventPath) if err != nil { t.Fatal("couldn't read event control file:", err) } @@ -102,8 +93,8 @@ func testMemoryNotification(t *testing.T, evName string, notify notifyFunc, targ } func TestNotifyOnOOM(t *testing.T) { - f := func(paths map[string]string) (<-chan struct{}, error) { - return notifyOnOOM(paths) + f := func(path string) (<-chan struct{}, error) { + return notifyOnOOM(path) } testMemoryNotification(t, "memory.oom_control", f, "") @@ -117,8 +108,8 @@ func TestNotifyMemoryPressure(t *testing.T) { } for level, arg := range tests { - f := func(paths map[string]string) (<-chan struct{}, error) { - return notifyMemoryPressure(paths, level) + f := func(path string) (<-chan struct{}, error) { + return notifyMemoryPressure(path, level) } testMemoryNotification(t, "memory.pressure_level", f, arg) diff --git a/libcontainer/notify_v2_linux.go b/libcontainer/notify_v2_linux.go new file mode 100644 index 0000000..821536c --- /dev/null +++ b/libcontainer/notify_v2_linux.go @@ -0,0 +1,80 @@ +package libcontainer + +import ( + "fmt" + "path/filepath" + "unsafe" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, error) { + fd, err := unix.InotifyInit() + if err != nil { + return nil, 
fmt.Errorf("unable to init inotify: %w", err) + } + // watching oom kill + evFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, evName), unix.IN_MODIFY) + if err != nil { + unix.Close(fd) + return nil, fmt.Errorf("unable to add inotify watch: %w", err) + } + // Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited + cgFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, cgEvName), unix.IN_MODIFY) + if err != nil { + unix.Close(fd) + return nil, fmt.Errorf("unable to add inotify watch: %w", err) + } + ch := make(chan struct{}) + go func() { + var ( + buffer [unix.SizeofInotifyEvent + unix.PathMax + 1]byte + offset uint32 + ) + defer func() { + unix.Close(fd) + close(ch) + }() + + for { + n, err := unix.Read(fd, buffer[:]) + if err != nil { + logrus.Warnf("unable to read event data from inotify, got error: %v", err) + return + } + if n < unix.SizeofInotifyEvent { + logrus.Warnf("we should read at least %d bytes from inotify, but got %d bytes.", unix.SizeofInotifyEvent, n) + return + } + offset = 0 + for offset <= uint32(n-unix.SizeofInotifyEvent) { + rawEvent := (*unix.InotifyEvent)(unsafe.Pointer(&buffer[offset])) + offset += unix.SizeofInotifyEvent + rawEvent.Len + if rawEvent.Mask&unix.IN_MODIFY != unix.IN_MODIFY { + continue + } + switch int(rawEvent.Wd) { + case evFd: + oom, err := fscommon.GetValueByKey(cgDir, evName, "oom_kill") + if err != nil || oom > 0 { + ch <- struct{}{} + } + case cgFd: + pids, err := fscommon.GetValueByKey(cgDir, cgEvName, "populated") + if err != nil || pids == 0 { + return + } + } + } + } + }() + return ch, nil +} + +// notifyOnOOMV2 returns channel on which you can expect event about OOM, +// if process died without OOM this channel will be closed. 
+func notifyOnOOMV2(path string) (<-chan struct{}, error) { + return registerMemoryEventV2(path, "memory.events", "cgroup.events") +} diff --git a/libcontainer/nsenter/cloned_binary.c b/libcontainer/nsenter/cloned_binary.c index ad10f14..4268ebd 100644 --- a/libcontainer/nsenter/cloned_binary.c +++ b/libcontainer/nsenter/cloned_binary.c @@ -1,7 +1,14 @@ +// SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later /* * Copyright (C) 2019 Aleksa Sarai * Copyright (C) 2019 SUSE LLC * + * This work is dual licensed under the following licenses. You may use, + * redistribute, and/or modify the work under the conditions of either (or + * both) licenses. + * + * === Apache-2.0 === + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -13,6 +20,23 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. + * + * === LGPL-2.1-or-later === + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * . + * */ #define _GNU_SOURCE @@ -35,14 +59,38 @@ #include /* Use our own wrapper for memfd_create. 
*/ -#if !defined(SYS_memfd_create) && defined(__NR_memfd_create) -# define SYS_memfd_create __NR_memfd_create +#ifndef SYS_memfd_create +# ifdef __NR_memfd_create +# define SYS_memfd_create __NR_memfd_create +# else +/* These values come from . */ +# warning "libc is outdated -- using hard-coded SYS_memfd_create" +# if defined(__x86_64__) +# define SYS_memfd_create 319 +# elif defined(__i386__) +# define SYS_memfd_create 356 +# elif defined(__ia64__) +# define SYS_memfd_create 1340 +# elif defined(__arm__) +# define SYS_memfd_create 385 +# elif defined(__aarch64__) +# define SYS_memfd_create 279 +# elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__) +# define SYS_memfd_create 360 +# elif defined(__s390__) || defined(__s390x__) +# define SYS_memfd_create 350 +# else +# warning "unknown architecture -- cannot hard-code SYS_memfd_create" +# endif +# endif #endif + /* memfd_create(2) flags -- copied from . */ #ifndef MFD_CLOEXEC # define MFD_CLOEXEC 0x0001U # define MFD_ALLOW_SEALING 0x0002U #endif + int memfd_create(const char *name, unsigned int flags) { #ifdef SYS_memfd_create @@ -53,7 +101,6 @@ int memfd_create(const char *name, unsigned int flags) #endif } - /* This comes directly from . */ #ifndef F_LINUX_SPECIFIC_BASE # define F_LINUX_SPECIFIC_BASE 1024 @@ -79,7 +126,7 @@ static void *must_realloc(void *ptr, size_t size) void *old = ptr; do { ptr = realloc(old, size); - } while(!ptr); + } while (!ptr); return ptr; } @@ -91,12 +138,14 @@ static void *must_realloc(void *ptr, size_t size) static int is_self_cloned(void) { int fd, ret, is_cloned = 0; - struct stat statbuf = {}; - struct statfs fsbuf = {}; + struct stat statbuf = { }; + struct statfs fsbuf = { }; - fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC); - if (fd < 0) + fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "you have no read access to runc binary file\n"); return -ENOTRECOVERABLE; + } /* * Is the binary a fully-sealed memfd? 
We don't need CLONED_BINARY_ENV for @@ -248,7 +297,7 @@ enum { static int make_execfd(int *fdtype) { int fd = -1; - char template[PATH_MAX] = {0}; + char template[PATH_MAX] = { 0 }; char *prefix = getenv("_LIBCONTAINER_STATEDIR"); if (!prefix || *prefix != '/') @@ -277,7 +326,7 @@ static int make_execfd(int *fdtype) *fdtype = EFD_FILE; fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); if (fd >= 0) { - struct stat statbuf = {}; + struct stat statbuf = { }; bool working_otmpfile = false; /* @@ -322,27 +371,27 @@ static int seal_execfd(int *fd, int fdtype) switch (fdtype) { case EFD_MEMFD: return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS); - case EFD_FILE: { - /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ - int newfd; - char fdpath[PATH_MAX] = {0}; + case EFD_FILE:{ + /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ + int newfd; + char fdpath[PATH_MAX] = { 0 }; - if (fchmod(*fd, 0100) < 0) - return -1; + if (fchmod(*fd, 0100) < 0) + return -1; - if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) - return -1; + if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) + return -1; - newfd = open(fdpath, O_PATH | O_CLOEXEC); - if (newfd < 0) - return -1; + newfd = open(fdpath, O_PATH | O_CLOEXEC); + if (newfd < 0) + return -1; - close(*fd); - *fd = newfd; - return 0; - } + close(*fd); + *fd = newfd; + return 0; + } default: - break; + break; } return -1; } @@ -350,7 +399,7 @@ static int seal_execfd(int *fd, int fdtype) static int try_bindfd(void) { int fd, ret = -1; - char template[PATH_MAX] = {0}; + char template[PATH_MAX] = { 0 }; char *prefix = getenv("_LIBCONTAINER_STATEDIR"); if (!prefix || *prefix != '/') @@ -378,7 +427,6 @@ static int try_bindfd(void) if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0) goto out_umount; - /* Get read-only handle that we're sure can't be made read-write. 
*/ ret = open(template, O_PATH | O_CLOEXEC); @@ -422,7 +470,7 @@ static ssize_t fd_to_fd(int outfd, int infd) if (n < 0) return -1; nwritten += n; - } while(nwritten < nread); + } while (nwritten < nread); total += nwritten; } @@ -433,7 +481,7 @@ static ssize_t fd_to_fd(int outfd, int infd) static int clone_binary(void) { int binfd, execfd; - struct stat statbuf = {}; + struct stat statbuf = { }; size_t sent = 0; int fdtype = EFD_NONE; diff --git a/libcontainer/nsenter/escape.c b/libcontainer/nsenter/escape.c new file mode 100644 index 0000000..78e7e9f --- /dev/null +++ b/libcontainer/nsenter/escape.c @@ -0,0 +1,142 @@ +#include +#include + +#ifdef ESCAPE_TEST +# include +# define test_assert(arg) assert(arg) +#else +# define test_assert(arg) +#endif + +#define DEL '\x7f' + +/* + * Poor man version of itoa with base=16 and input number from 0 to 15, + * represented by a char. Converts it to a single hex digit ('0' to 'f'). + */ +static char hex(char i) +{ + test_assert(i >= 0 && i < 16); + + if (i >= 0 && i < 10) { + return '0' + i; + } + if (i >= 10 && i < 16) { + return 'a' + i - 10; + } + return '?'; +} + +/* + * Given the character, tells how many _extra_ characters are needed + * to JSON-escape it. If 0 is returned, the character does not need to + * be escaped. + */ +static int need_escape(char c) +{ + switch (c) { + case '\\': + case '"': + case '\b': + case '\n': + case '\r': + case '\t': + case '\f': + return 1; + case DEL: // -> \u007f + return 5; + default: + if (c > 0 && c < ' ') { + // ASCII decimal 01 to 31 -> \u00xx + return 5; + } + return 0; + } +} + +/* + * Escape the string so it can be used as a JSON string (per RFC4627, + * section 2.5 minimal requirements, plus the DEL (0x7f) character). + * + * It is expected that the argument is a string allocated via malloc. + * In case no escaping is needed, the original string is returned as is; + * otherwise, the original string is free'd, and the newly allocated + * escaped string is returned. 
Thus, in any case, the value returned + * need to be free'd by the caller. + */ +char *escape_json_string(char *s) +{ + int i, j, len; + char *c, *out; + + /* + * First, check if escaping is at all needed -- if not, we can avoid + * malloc and return the argument as is. While at it, count how much + * extra space is required. + * + * XXX: the counting code must be in sync with the escaping code + * (checked by test_assert()s below). + */ + for (i = j = 0; s[i] != '\0'; i++) { + j += need_escape(s[i]); + } + if (j == 0) { + // nothing to escape + return s; + } + + len = i + j + 1; + out = malloc(len); + if (!out) { + free(s); + // As malloc failed, strdup can fail, too, so in the worst case + // scenario NULL will be returned from here. + return strdup("escape_json_string: out of memory"); + } + for (c = s, j = 0; *c != '\0'; c++) { + switch (*c) { + case '"': + case '\\': + test_assert(need_escape(*c) == 1); + out[j++] = '\\'; + out[j++] = *c; + continue; + } + if ((*c < 0 || *c >= ' ') && (*c != DEL)) { + // no escape needed + test_assert(need_escape(*c) == 0); + out[j++] = *c; + continue; + } + out[j++] = '\\'; + switch (*c) { + case '\b': + out[j++] = 'b'; + break; + case '\n': + out[j++] = 'n'; + break; + case '\r': + out[j++] = 'r'; + break; + case '\t': + out[j++] = 't'; + break; + case '\f': + out[j++] = 'f'; + break; + default: + test_assert(need_escape(*c) == 5); + out[j++] = 'u'; + out[j++] = '0'; + out[j++] = '0'; + out[j++] = hex(*c >> 4); + out[j++] = hex(*c & 0x0f); + } + } + test_assert(j + 1 == len); + out[j] = '\0'; + + free(s); + return out; +} diff --git a/libcontainer/nsenter/nsenter.go b/libcontainer/nsenter/nsenter.go index 07f4d63..2d1f3e1 100644 --- a/libcontainer/nsenter/nsenter.go +++ b/libcontainer/nsenter/nsenter.go @@ -1,3 +1,4 @@ +//go:build linux && !gccgo // +build linux,!gccgo package nsenter diff --git a/libcontainer/nsenter/nsenter_gccgo.go b/libcontainer/nsenter/nsenter_gccgo.go index 63c7a3e..86bad53 100644 --- 
a/libcontainer/nsenter/nsenter_gccgo.go +++ b/libcontainer/nsenter/nsenter_gccgo.go @@ -1,3 +1,4 @@ +//go:build linux && gccgo // +build linux,gccgo package nsenter diff --git a/libcontainer/nsenter/nsenter_test.go b/libcontainer/nsenter/nsenter_test.go index c4d3c86..0cbf0aa 100644 --- a/libcontainer/nsenter/nsenter_test.go +++ b/libcontainer/nsenter/nsenter_test.go @@ -3,9 +3,9 @@ package nsenter import ( "bytes" "encoding/json" + "errors" "fmt" "io" - "io/ioutil" "os" "os/exec" "strings" @@ -13,25 +13,12 @@ import ( "github.com/opencontainers/runc/libcontainer" "github.com/vishvananda/netlink/nl" - "golang.org/x/sys/unix" ) -type pid struct { - Pid int `json:"Pid"` -} - -type logentry struct { - Msg string `json:"msg"` - Level string `json:"level"` -} - func TestNsenterValidPaths(t *testing.T) { args := []string{"nsenter-exec"} - parent, child, err := newPipe() - if err != nil { - t.Fatalf("failed to create pipe %v", err) - } + parent, child := newPipe(t) namespaces := []string{ // join pid ns of the current process @@ -47,8 +34,10 @@ func TestNsenterValidPaths(t *testing.T) { } if err := cmd.Start(); err != nil { - t.Fatalf("nsenter failed to start %v", err) + t.Fatalf("nsenter failed to start: %v", err) } + child.Close() + // write cloneFlags r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) r.AddData(&libcontainer.Int32msg{ @@ -63,36 +52,20 @@ func TestNsenterValidPaths(t *testing.T) { t.Fatal(err) } - decoder := json.NewDecoder(parent) - var pid *pid + initWaiter(t, parent) if err := cmd.Wait(); err != nil { - t.Fatalf("nsenter exits with a non-zero exit status") - } - if err := decoder.Decode(&pid); err != nil { - dir, _ := ioutil.ReadDir(fmt.Sprintf("/proc/%d/ns", os.Getpid())) - for _, d := range dir { - t.Log(d.Name()) - } - t.Fatalf("%v", err) + t.Fatalf("nsenter error: %v", err) } - p, err := os.FindProcess(pid.Pid) - if err != nil { - t.Fatalf("%v", err) - } - p.Wait() + reapChildren(t, parent) } func TestNsenterInvalidPaths(t *testing.T) { args 
:= []string{"nsenter-exec"} - parent, child, err := newPipe() - if err != nil { - t.Fatalf("failed to create pipe %v", err) - } + parent, child := newPipe(t) namespaces := []string{ - // join pid ns of the current process fmt.Sprintf("pid:/proc/%d/ns/pid", -1), } cmd := &exec.Cmd{ @@ -103,8 +76,10 @@ func TestNsenterInvalidPaths(t *testing.T) { } if err := cmd.Start(); err != nil { - t.Fatal(err) + t.Fatalf("nsenter failed to start: %v", err) } + child.Close() + // write cloneFlags r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) r.AddData(&libcontainer.Int32msg{ @@ -119,6 +94,7 @@ func TestNsenterInvalidPaths(t *testing.T) { t.Fatal(err) } + initWaiter(t, parent) if err := cmd.Wait(); err == nil { t.Fatalf("nsenter exits with a zero exit status") } @@ -126,13 +102,9 @@ func TestNsenterInvalidPaths(t *testing.T) { func TestNsenterIncorrectPathType(t *testing.T) { args := []string{"nsenter-exec"} - parent, child, err := newPipe() - if err != nil { - t.Fatalf("failed to create pipe %v", err) - } + parent, child := newPipe(t) namespaces := []string{ - // join pid ns of the current process fmt.Sprintf("net:/proc/%d/ns/pid", os.Getpid()), } cmd := &exec.Cmd{ @@ -143,8 +115,10 @@ func TestNsenterIncorrectPathType(t *testing.T) { } if err := cmd.Start(); err != nil { - t.Fatal(err) + t.Fatalf("nsenter failed to start: %v", err) } + child.Close() + // write cloneFlags r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) r.AddData(&libcontainer.Int32msg{ @@ -159,23 +133,16 @@ func TestNsenterIncorrectPathType(t *testing.T) { t.Fatal(err) } + initWaiter(t, parent) if err := cmd.Wait(); err == nil { - t.Fatalf("nsenter exits with a zero exit status") + t.Fatalf("nsenter error: %v", err) } } func TestNsenterChildLogging(t *testing.T) { args := []string{"nsenter-exec"} - parent, child, err := newPipe() - if err != nil { - t.Fatalf("failed to create exec pipe %v", err) - } - logread, logwrite, err := os.Pipe() - if err != nil { - t.Fatalf("failed to create log pipe 
%v", err) - } - defer logread.Close() - defer logwrite.Close() + parent, child := newPipe(t) + logread, logwrite := newPipe(t) namespaces := []string{ // join pid ns of the current process @@ -191,8 +158,11 @@ func TestNsenterChildLogging(t *testing.T) { } if err := cmd.Start(); err != nil { - t.Fatalf("nsenter failed to start %v", err) + t.Fatalf("nsenter failed to start: %v", err) } + child.Close() + logwrite.Close() + // write cloneFlags r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) r.AddData(&libcontainer.Int32msg{ @@ -207,33 +177,93 @@ func TestNsenterChildLogging(t *testing.T) { t.Fatal(err) } - logsDecoder := json.NewDecoder(logread) - var logentry *logentry - - err = logsDecoder.Decode(&logentry) - if err != nil { - t.Fatalf("child log: %v", err) - } - if logentry.Level == "" || logentry.Msg == "" { - t.Fatalf("child log: empty log fileds: level=\"%s\" msg=\"%s\"", logentry.Level, logentry.Msg) - } + initWaiter(t, parent) + getLogs(t, logread) if err := cmd.Wait(); err != nil { - t.Fatalf("nsenter exits with a non-zero exit status") + t.Fatalf("nsenter error: %v", err) } + + reapChildren(t, parent) } func init() { if strings.HasPrefix(os.Args[0], "nsenter-") { os.Exit(0) } +} + +func newPipe(t *testing.T) (parent *os.File, child *os.File) { + t.Helper() + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + t.Fatal("socketpair failed:", err) + } + parent = os.NewFile(uintptr(fds[1]), "parent") + child = os.NewFile(uintptr(fds[0]), "child") + t.Cleanup(func() { + parent.Close() + child.Close() + }) return } -func newPipe() (parent *os.File, child *os.File, err error) { - fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) - if err != nil { - return nil, nil, err +// initWaiter reads back the initial \0 from runc init +func initWaiter(t *testing.T, r io.Reader) { + inited := make([]byte, 1) + n, err := r.Read(inited) + if err == nil { + if n < 1 { + err = 
errors.New("short read") + } else if inited[0] != 0 { + err = fmt.Errorf("unexpected %d != 0", inited[0]) + } else { + return + } + } + t.Fatalf("waiting for init preliminary setup: %v", err) +} + +func reapChildren(t *testing.T, parent *os.File) { + t.Helper() + decoder := json.NewDecoder(parent) + decoder.DisallowUnknownFields() + var pid struct { + Pid2 int `json:"stage2_pid"` + Pid1 int `json:"stage1_pid"` + } + if err := decoder.Decode(&pid); err != nil { + t.Fatal(err) + } + + // Reap children. + _, _ = unix.Wait4(pid.Pid1, nil, 0, nil) + _, _ = unix.Wait4(pid.Pid2, nil, 0, nil) + + // Sanity check. + if pid.Pid1 == 0 || pid.Pid2 == 0 { + t.Fatal("got pids:", pid) + } +} + +func getLogs(t *testing.T, logread *os.File) { + logsDecoder := json.NewDecoder(logread) + logsDecoder.DisallowUnknownFields() + var logentry struct { + Level string `json:"level"` + Msg string `json:"msg"` + } + + for { + if err := logsDecoder.Decode(&logentry); err != nil { + if errors.Is(err, io.EOF) { + return + } + t.Fatal("init log decoding error:", err) + } + t.Logf("logentry: %+v", logentry) + if logentry.Level == "" || logentry.Msg == "" { + t.Fatalf("init log: empty log entry: %+v", logentry) + } } - return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil } diff --git a/libcontainer/nsenter/nsenter_unsupported.go b/libcontainer/nsenter/nsenter_unsupported.go deleted file mode 100644 index 2459c63..0000000 --- a/libcontainer/nsenter/nsenter_unsupported.go +++ /dev/null @@ -1,3 +0,0 @@ -// +build !linux !cgo - -package nsenter diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 0726568..c53fb3d 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,8 @@ /* Get all of the CLONE_NEW* flags. */ #include "namespace.h" +extern char *escape_json_string(char *str); + /* Synchronisation values. 
*/ enum sync_t { SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ @@ -36,19 +39,19 @@ enum sync_t { SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ - SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ + SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ + SYNC_MOUNTSOURCES_PLS = 0x46, /* Tell parent to send mount sources by SCM_RIGHTS. */ + SYNC_MOUNTSOURCES_ACK = 0x47, /* All mount sources have been sent. */ }; -/* - * Synchronisation value for cgroup namespace setup. - * The same constant is defined in process_linux.go as "createCgroupns". - */ -#define CREATECGROUPNS 0x80 - +#define STAGE_SETUP -1 /* longjmp() arguments. */ -#define JUMP_PARENT 0x00 -#define JUMP_CHILD 0xA0 -#define JUMP_INIT 0xA1 +#define STAGE_PARENT 0 +#define STAGE_CHILD 1 +#define STAGE_INIT 2 + +/* Stores the current stage of nsexec. */ +int current_stage = STAGE_SETUP; /* Assume the stack grows down, so arguments should be above it. */ struct clone_t { @@ -56,7 +59,7 @@ struct clone_t { * Reserve some space for clone() to locate arguments * and retcode in this place */ - char stack[4096] __attribute__ ((aligned(16))); + char stack[4096] __attribute__((aligned(16))); char stack_ptr[0]; /* There's two children. This is used to execute the different code. */ @@ -87,46 +90,58 @@ struct nlconfig_t { size_t uidmappath_len; char *gidmappath; size_t gidmappath_len; + + /* Mount sources opened outside the container userns. */ + char *mountsources; + size_t mountsources_len; }; -#define PANIC "panic" -#define FATAL "fatal" -#define ERROR "error" -#define WARNING "warning" -#define INFO "info" -#define DEBUG "debug" +/* + * Log levels are the same as in logrus. 
+ */ +#define PANIC 0 +#define FATAL 1 +#define ERROR 2 +#define WARNING 3 +#define INFO 4 +#define DEBUG 5 +#define TRACE 6 + +static const char *level_str[] = { "panic", "fatal", "error", "warning", "info", "debug", "trace" }; static int logfd = -1; +static int loglevel = DEBUG; /* * List of netlink message types sent to us as part of bootstrapping the init. * These constants are defined in libcontainer/message_linux.go. */ -#define INIT_MSG 62000 +#define INIT_MSG 62000 #define CLONE_FLAGS_ATTR 27281 #define NS_PATHS_ATTR 27282 -#define UIDMAP_ATTR 27283 -#define GIDMAP_ATTR 27284 +#define UIDMAP_ATTR 27283 +#define GIDMAP_ATTR 27284 #define SETGROUP_ATTR 27285 #define OOM_SCORE_ADJ_ATTR 27286 #define ROOTLESS_EUID_ATTR 27287 -#define UIDMAPPATH_ATTR 27288 -#define GIDMAPPATH_ATTR 27289 +#define UIDMAPPATH_ATTR 27288 +#define GIDMAPPATH_ATTR 27289 +#define MOUNT_SOURCES_ATTR 27290 /* * Use the raw syscall for versions of glibc which don't include a function for * it, namely (glibc 2.12). */ #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 -# define _GNU_SOURCE -# include "syscall.h" -# if !defined(SYS_setns) && defined(__NR_setns) -# define SYS_setns __NR_setns -# endif +# define _GNU_SOURCE +# include "syscall.h" +# if !defined(SYS_setns) && defined(__NR_setns) +# define SYS_setns __NR_setns +# endif -#ifndef SYS_setns -# error "setns(2) syscall not supported by glibc version" -#endif +# ifndef SYS_setns +# error "setns(2) syscall not supported by glibc version" +# endif int setns(int fd, int nstype) { @@ -134,34 +149,63 @@ int setns(int fd, int nstype) } #endif -static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...) +static void write_log(int level, const char *format, ...) 
{ - char message[1024] = {}; - + char *message = NULL, *stage = NULL, *json = NULL; va_list args; + int ret; - if (logfd < 0 || level == NULL) - return; + if (logfd < 0 || level > loglevel) + goto out; va_start(args, format); - if (vsnprintf(message, sizeof(message), format, args) < 0) - goto done; - - dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message); -done: + ret = vasprintf(&message, format, args); va_end(args); -} + if (ret < 0) { + message = NULL; + goto out; + } -#define write_log(level, fmt, ...) \ - write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__) + message = escape_json_string(message); + + if (current_stage == STAGE_SETUP) + stage = strdup("nsexec"); + else + ret = asprintf(&stage, "nsexec-%d", current_stage); + if (ret < 0) { + stage = NULL; + goto out; + } + + ret = asprintf(&json, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", + level_str[level], stage, getpid(), message); + if (ret < 0) { + json = NULL; + goto out; + } + + /* This logging is on a best-effort basis. In case of a short or failed + * write there is nothing we can do, so just ignore write() errors. + */ + ssize_t __attribute__((unused)) __res = write(logfd, json, ret); + +out: + free(message); + free(stage); + free(json); +} /* XXX: This is ugly. */ static int syncfd = -1; -#define bail(fmt, ...) \ - do { \ - write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \ - exit(1); \ +#define bail(fmt, ...) \ + do { \ + if (logfd < 0) \ + fprintf(stderr, "FATAL: " fmt ": %m\n", \ + ##__VA_ARGS__); \ + else \ + write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \ + exit(1); \ } while(0) static int write_file(char *data, size_t data_len, char *pathfmt, ...) @@ -187,7 +231,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...) 
goto out; } - out: +out: close(fd); return ret; } @@ -294,12 +338,14 @@ static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) static void update_uidmap(const char *path, int pid, char *map, size_t map_len) { - if (map == NULL || map_len <= 0) + if (map == NULL || map_len == 0) return; + write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map); if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { if (errno != EPERM) bail("failed to update /proc/%d/uid_map", pid); + write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path); if (try_mapping_tool(path, pid, map, map_len)) bail("failed to use newuid map on %d", pid); } @@ -307,12 +353,14 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len) static void update_gidmap(const char *path, int pid, char *map, size_t map_len) { - if (map == NULL || map_len <= 0) + if (map == NULL || map_len == 0) return; + write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map); if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { if (errno != EPERM) bail("failed to update /proc/%d/gid_map", pid); + write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path); if (try_mapping_tool(path, pid, map, map_len)) bail("failed to use newgid map on %d", pid); } @@ -320,22 +368,23 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len) static void update_oom_score_adj(char *data, size_t len) { - if (data == NULL || len <= 0) + if (data == NULL || len == 0) return; + write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data); if (write_file(data, len, "/proc/self/oom_score_adj") < 0) bail("failed to update /proc/self/oom_score_adj"); } /* A dummy function that just jumps to the given jumpval. 
*/ -static int child_func(void *arg) __attribute__ ((noinline)); +static int child_func(void *arg) __attribute__((noinline)); static int child_func(void *arg) { struct clone_t *ca = (struct clone_t *)arg; longjmp(*ca->env, ca->jmpval); } -static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); +static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline)); static int clone_parent(jmp_buf *env, int jmpval) { struct clone_t ca = { @@ -347,41 +396,55 @@ static int clone_parent(jmp_buf *env, int jmpval) } /* - * Gets the init pipe fd from the environment, which is used to read the - * bootstrap data and tell the parent what the new pid is after we finish - * setting up the environment. + * Returns an environment variable value as a non-negative integer, or -ENOENT + * if the variable was not found or has an empty value. + * + * If the value can not be converted to an integer, or the result is out of + * range, the function bails out. */ -static int initpipe(void) +static int getenv_int(const char *name) { - int pipenum; - char *initpipe, *endptr; + char *val, *endptr; + int ret; - initpipe = getenv("_LIBCONTAINER_INITPIPE"); - if (initpipe == NULL || *initpipe == '\0') - return -1; + val = getenv(name); + /* Treat empty value as unset variable. */ + if (val == NULL || *val == '\0') + return -ENOENT; - pipenum = strtol(initpipe, &endptr, 10); - if (*endptr != '\0') - bail("unable to parse _LIBCONTAINER_INITPIPE"); + ret = strtol(val, &endptr, 10); + if (val == endptr || *endptr != '\0') + bail("unable to parse %s=%s", name, val); + /* + * Sanity check: this must be a small non-negative number. + * Practically, we pass two fds (3 and 4) and a log level, + * for which the maximum is 6 (TRACE). + * */ + if (ret < 0 || ret > TRACE) + bail("bad value for %s=%s (%d)", name, val, ret); - return pipenum; + return ret; } +/* + * Sets up logging by getting log fd and log level from the environment, + * if available. 
+ */ static void setup_logpipe(void) { - char *logpipe, *endptr; + int i; - logpipe = getenv("_LIBCONTAINER_LOGPIPE"); - if (logpipe == NULL || *logpipe == '\0') { + i = getenv_int("_LIBCONTAINER_LOGPIPE"); + if (i < 0) { + /* We are not runc init, or log pipe was not provided. */ return; } + logfd = i; - logfd = strtol(logpipe, &endptr, 10); - if (logpipe == endptr || *endptr != '\0') { - fprintf(stderr, "unable to parse _LIBCONTAINER_LOGPIPE, value: %s\n", logpipe); - /* It is too early to use bail */ - exit(1); - } + i = getenv_int("_LIBCONTAINER_LOGLEVEL"); + if (i < 0) + return; + loglevel = i; } /* Returns the clone(2) flag for a namespace, given the name of a namespace. */ @@ -487,6 +550,10 @@ static void nl_parse(int fd, struct nlconfig_t *config) case SETGROUP_ATTR: config->is_setgroup = readint8(current); break; + case MOUNT_SOURCES_ATTR: + config->mountsources = current; + config->mountsources_len = payload_len; + break; default: bail("unknown netlink message type %d", nlattr->nla_type); } @@ -507,7 +574,6 @@ void join_namespaces(char *nslist) char *namespace = strtok_r(nslist, ",", &saveptr); struct namespace_t { int fd; - int ns; char type[PATH_MAX]; char path[PATH_MAX]; } *namespaces = NULL; @@ -542,7 +608,7 @@ void join_namespaces(char *nslist) bail("failed to open %s", path); ns->fd = fd; - ns->ns = nsflag(namespace); + strncpy(ns->type, namespace, PATH_MAX - 1); strncpy(ns->path, path, PATH_MAX - 1); ns->path[PATH_MAX - 1] = '\0'; } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); @@ -555,12 +621,14 @@ void join_namespaces(char *nslist) */ for (i = 0; i < num; i++) { - struct namespace_t ns = namespaces[i]; + struct namespace_t *ns = &namespaces[i]; + int flag = nsflag(ns->type); - if (setns(ns.fd, ns.ns) < 0) - bail("failed to setns to %s", ns.path); + write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path); + if (setns(ns->fd, flag) < 0) + bail("failed to setns into %s namespace", ns->type); - 
close(ns.fd); + close(ns->fd); } free(namespaces); @@ -569,6 +637,201 @@ void join_namespaces(char *nslist) /* Defined in cloned_binary.c. */ extern int ensure_cloned_binary(void); +static inline int sane_kill(pid_t pid, int signum) +{ + if (pid > 0) + return kill(pid, signum); + else + return 0; +} + +void receive_fd(int sockfd, int new_fd) +{ + int bytes_read; + struct msghdr msg = { }; + struct cmsghdr *cmsg; + struct iovec iov = { }; + char null_byte = '\0'; + int ret; + int fd_count; + int *fd_payload; + + iov.iov_base = &null_byte; + iov.iov_len = 1; + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + msg.msg_controllen = CMSG_SPACE(sizeof(int)); + msg.msg_control = malloc(msg.msg_controllen); + if (msg.msg_control == NULL) { + bail("Can't allocate memory to receive fd."); + } + + memset(msg.msg_control, 0, msg.msg_controllen); + + bytes_read = recvmsg(sockfd, &msg, 0); + if (bytes_read != 1) + bail("failed to receive fd from unix socket %d", sockfd); + if (msg.msg_flags & MSG_CTRUNC) + bail("received truncated control message from unix socket %d", sockfd); + + cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg) + bail("received message from unix socket %d without control message", sockfd); + + if (cmsg->cmsg_level != SOL_SOCKET) + bail("received unknown control message from unix socket %d: cmsg_level=%d", sockfd, cmsg->cmsg_level); + + if (cmsg->cmsg_type != SCM_RIGHTS) + bail("received unknown control message from unix socket %d: cmsg_type=%d", sockfd, cmsg->cmsg_type); + + fd_count = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + if (fd_count != 1) + bail("received control message from unix socket %d with too many fds: %d", sockfd, fd_count); + + fd_payload = (int *)CMSG_DATA(cmsg); + ret = dup3(*fd_payload, new_fd, O_CLOEXEC); + if (ret < 0) + bail("cannot dup3 fd %d to %d", *fd_payload, new_fd); + + free(msg.msg_control); + + ret = close(*fd_payload); + if (ret < 0) + bail("cannot close fd %d", *fd_payload); +} + +void send_fd(int sockfd, int fd) +{ + int 
bytes_written; + struct msghdr msg = { }; + struct cmsghdr *cmsg; + struct iovec iov[1] = { }; + char null_byte = '\0'; + + iov[0].iov_base = &null_byte; + iov[0].iov_len = 1; + + msg.msg_iov = iov; + msg.msg_iovlen = 1; + + /* We send only one fd as specified by cmsg->cmsg_len below, even + * though msg.msg_controllen might have more space due to alignment. */ + msg.msg_controllen = CMSG_SPACE(sizeof(int)); + msg.msg_control = malloc(msg.msg_controllen); + if (msg.msg_control == NULL) { + bail("Can't allocate memory to send fd."); + } + + memset(msg.msg_control, 0, msg.msg_controllen); + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); + memcpy(CMSG_DATA(cmsg), &fd, sizeof(int)); + + bytes_written = sendmsg(sockfd, &msg, 0); + + free(msg.msg_control); + + if (bytes_written != 1) + bail("failed to send fd %d via unix socket %d", fd, sockfd); +} + +void receive_mountsources(int sockfd) +{ + char *mount_fds, *endp; + long new_fd; + + // This env var must be a json array of ints. + mount_fds = getenv("_LIBCONTAINER_MOUNT_FDS"); + + if (mount_fds[0] != '[') { + bail("malformed _LIBCONTAINER_MOUNT_FDS env var: missing '['"); + } + mount_fds++; + + for (endp = mount_fds; *endp != ']'; mount_fds = endp + 1) { + new_fd = strtol(mount_fds, &endp, 10); + if (endp == mount_fds) { + bail("malformed _LIBCONTAINER_MOUNT_FDS env var: not a number"); + } + if (*endp == '\0') { + bail("malformed _LIBCONTAINER_MOUNT_FDS env var: missing ]"); + } + // The list contains -1 when no fd is needed. Ignore them. 
+ if (new_fd == -1) { + continue; + } + + if (new_fd == LONG_MAX || new_fd < 0 || new_fd > INT_MAX) { + bail("malformed _LIBCONTAINER_MOUNT_FDS env var: fds out of range"); + } + + receive_fd(sockfd, new_fd); + } +} + +void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mountsources_len) +{ + char proc_path[PATH_MAX]; + int host_mntns_fd; + int container_mntns_fd; + int fd; + int ret; + + // container_linux.go shouldSendMountSources() decides if mount sources + // should be pre-opened (O_PATH) and passed via SCM_RIGHTS + if (mountsources == NULL) + return; + + host_mntns_fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC); + if (host_mntns_fd == -1) + bail("failed to get current mount namespace"); + + if (snprintf(proc_path, PATH_MAX, "/proc/%d/ns/mnt", child) < 0) + bail("failed to get mount namespace path"); + + container_mntns_fd = open(proc_path, O_RDONLY | O_CLOEXEC); + if (container_mntns_fd == -1) + bail("failed to get container mount namespace"); + + if (setns(container_mntns_fd, CLONE_NEWNS) < 0) + bail("failed to setns to container mntns"); + + char *mountsources_end = mountsources + mountsources_len; + while (mountsources < mountsources_end) { + if (mountsources[0] == '\0') { + mountsources++; + continue; + } + + fd = open(mountsources, O_PATH | O_CLOEXEC); + if (fd < 0) + bail("failed to open mount source %s", mountsources); + + send_fd(sockfd, fd); + + ret = close(fd); + if (ret != 0) + bail("failed to close mount source fd %d", fd); + + mountsources += strlen(mountsources) + 1; + } + + if (setns(host_mntns_fd, CLONE_NEWNS) < 0) + bail("failed to setns to host mntns"); + + ret = close(host_mntns_fd); + if (ret != 0) + bail("failed to close host mount namespace fd %d", host_mntns_fd); + ret = close(container_mntns_fd); + if (ret != 0) + bail("failed to close container mount namespace fd %d", container_mntns_fd); +} + void nsexec(void) { int pipenum; @@ -583,12 +846,15 @@ void nsexec(void) setup_logpipe(); /* - * If we don't have 
an init pipe, just return to the go routine. - * We'll only get an init pipe for start or exec. + * Get the init pipe fd from the environment. The init pipe is used to + * read the bootstrap data and tell the parent what the new pids are + * after the setup is done. */ - pipenum = initpipe(); - if (pipenum == -1) + pipenum = getenv_int("_LIBCONTAINER_INITPIPE"); + if (pipenum < 0) { + /* We are not a runc init. Just return to go runtime. */ return; + } /* * We need to re-exec if we are not in a cloned binary. This is necessary @@ -598,7 +864,14 @@ void nsexec(void) if (ensure_cloned_binary() < 0) bail("could not ensure we are a cloned binary"); - write_log(DEBUG, "nsexec started"); + /* + * Inform the parent we're past initial setup. + * For the other side of this, see initWaiter. + */ + if (write(pipenum, "", 1) != 1) + bail("could not inform the parent we are past initial setup"); + + write_log(DEBUG, "=> nsexec container setup"); /* Parse all of the netlink configuration. */ nl_parse(pipenum, &config); @@ -622,6 +895,7 @@ void nsexec(void) * containers), which is the recommendation from the kernel folks. */ if (config.namespaces) { + write_log(DEBUG, "set process as non-dumpable"); if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) bail("failed to set process as non-dumpable"); } @@ -686,45 +960,50 @@ void nsexec(void) * -- Aleksa "what has my life come to?" Sarai */ - switch (setjmp(env)) { + current_stage = setjmp(env); + switch (current_stage) { /* * Stage 0: We're in the parent. Our job is just to create a new child - * (stage 1: JUMP_CHILD) process and write its uid_map and + * (stage 1: STAGE_CHILD) process and write its uid_map and * gid_map. That process will go on to create a new process, then * it will send us its PID which we will send to the bootstrap * process. 
*/ - case JUMP_PARENT:{ + case STAGE_PARENT:{ int len; - pid_t child, first_child = -1; - bool ready = false; + pid_t stage1_pid = -1, stage2_pid = -1; + bool stage1_complete, stage2_complete; /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); + write_log(DEBUG, "~> nsexec stage-0"); /* Start the process of getting a container. */ - child = clone_parent(&env, JUMP_CHILD); - if (child < 0) - bail("unable to fork: child_func"); + write_log(DEBUG, "spawn stage-1"); + stage1_pid = clone_parent(&env, STAGE_CHILD); + if (stage1_pid < 0) + bail("unable to spawn stage-1"); + + syncfd = sync_child_pipe[1]; + if (close(sync_child_pipe[0]) < 0) + bail("failed to close sync_child_pipe[0] fd"); /* - * State machine for synchronisation with the children. - * - * Father only return when both child and grandchild are - * ready, so we can receive all possible error codes - * generated by children. + * State machine for synchronisation with the children. We only + * return once both the child and grandchild are ready. */ - while (!ready) { + write_log(DEBUG, "-> stage-1 synchronisation loop"); + stage1_complete = false; + while (!stage1_complete) { enum sync_t s; - syncfd = sync_child_pipe[1]; - close(sync_child_pipe[0]); - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with child: next state"); + bail("failed to sync with stage-1: next state"); switch (s) { case SYNC_USERMAP_PLS: + write_log(DEBUG, "stage-1 requested userns mappings"); + /* * Enable setgroups(2) if we've been asked to. But we also * have to explicitly disable setgroups(2) if we're @@ -735,70 +1014,90 @@ void nsexec(void) * For rootless multi-entry mapping, config.is_setgroup shall be true and * newuidmap/newgidmap shall be used. */ - if (config.is_rootless_euid && !config.is_setgroup) - update_setgroups(child, SETGROUPS_DENY); + update_setgroups(stage1_pid, SETGROUPS_DENY); /* Set up mappings. 
*/ - update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len); - update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len); + update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len); + update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len); s = SYNC_USERMAP_ACK; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)"); } break; - case SYNC_RECVPID_PLS:{ - first_child = child; + case SYNC_RECVPID_PLS: + write_log(DEBUG, "stage-1 requested pid to be forwarded"); - /* Get the init_func pid. */ - if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { - kill(first_child, SIGKILL); - bail("failed to sync with child: read(childpid)"); - } + /* Get the stage-2 pid. */ + if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with stage-1: read(stage2_pid)"); + } - /* Send ACK. */ - s = SYNC_RECVPID_ACK; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(first_child, SIGKILL); - kill(child, SIGKILL); - bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); - } + /* Send ACK. */ + s = SYNC_RECVPID_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)"); + } - /* Send the init_func pid back to our parent. - * - * Send the init_func pid and the pid of the first child back to our parent. - * We need to send both back because we can't reap the first child we created (CLONE_PARENT). - * It becomes the responsibility of our parent to reap the first child. 
- */ - len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); - if (len < 0) { - kill(child, SIGKILL); - bail("unable to generate JSON for child pid"); - } + /* + * Send both the stage-1 and stage-2 pids back to runc. + * runc needs the stage-2 to continue process management, + * but because stage-1 was spawned with CLONE_PARENT we + * cannot reap it within stage-0 and thus we need to ask + * runc to reap the zombie for us. + */ + write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc", + stage1_pid, stage2_pid); + len = + dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid, + stage2_pid); + if (len < 0) { + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with runc: write(pid-JSON)"); } break; - case SYNC_CHILD_READY: - ready = true; + case SYNC_MOUNTSOURCES_PLS: + send_mountsources(syncfd, stage1_pid, config.mountsources, + config.mountsources_len); + + s = SYNC_MOUNTSOURCES_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(stage1_pid, SIGKILL); + bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)"); + } + break; + case SYNC_CHILD_FINISH: + write_log(DEBUG, "stage-1 complete"); + stage1_complete = true; break; default: bail("unexpected sync value: %u", s); } } + write_log(DEBUG, "<- stage-1 synchronisation loop"); /* Now sync with grandchild. 
*/ + syncfd = sync_grandchild_pipe[1]; + if (close(sync_grandchild_pipe[0]) < 0) + bail("failed to close sync_grandchild_pipe[0] fd"); - ready = false; - while (!ready) { + write_log(DEBUG, "-> stage-2 synchronisation loop"); + stage2_complete = false; + while (!stage2_complete) { enum sync_t s; - syncfd = sync_grandchild_pipe[1]; - close(sync_grandchild_pipe[0]); - + write_log(DEBUG, "signalling stage-2 to run"); s = SYNC_GRANDCHILD; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with child: write(SYNC_GRANDCHILD)"); } @@ -806,40 +1105,46 @@ void nsexec(void) bail("failed to sync with child: next state"); switch (s) { - case SYNC_CHILD_READY: - ready = true; + case SYNC_CHILD_FINISH: + write_log(DEBUG, "stage-2 complete"); + stage2_complete = true; break; default: bail("unexpected sync value: %u", s); } } + write_log(DEBUG, "<- stage-2 synchronisation loop"); + write_log(DEBUG, "<~ nsexec stage-0"); exit(0); } + break; /* * Stage 1: We're in the first child process. Our job is to join any - * provided namespaces in the netlink payload and unshare all - * of the requested namespaces. If we've been asked to - * CLONE_NEWUSER, we will ask our parent (stage 0) to set up - * our user mappings for us. Then, we create a new child - * (stage 2: JUMP_INIT) for PID namespace. We then send the - * child's PID to our parent (stage 0). + * provided namespaces in the netlink payload and unshare all of + * the requested namespaces. If we've been asked to CLONE_NEWUSER, + * we will ask our parent (stage 0) to set up our user mappings + * for us. Then, we create a new child (stage 2: STAGE_INIT) for + * PID namespace. We then send the child's PID to our parent + * (stage 0). */ - case JUMP_CHILD:{ - pid_t child; + case STAGE_CHILD:{ + pid_t stage2_pid = -1; enum sync_t s; /* We're in a child and thus need to tell the parent if we die. 
*/ syncfd = sync_child_pipe[0]; - close(sync_child_pipe[1]); + if (close(sync_child_pipe[1]) < 0) + bail("failed to close sync_child_pipe[1] fd"); /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); + write_log(DEBUG, "~> nsexec stage-1"); /* * We need to setns first. We cannot do this earlier (in stage 0) * because of the fact that we forked to get here (the PID of - * [stage 2: JUMP_INIT]) would be meaningless). We could send it + * [stage 2: STAGE_INIT]) would be meaningless). We could send it * using cmsg(3) but that's just annoying. */ if (config.namespaces) @@ -865,40 +1170,50 @@ void nsexec(void) * problem. */ if (config.cloneflags & CLONE_NEWUSER) { + write_log(DEBUG, "unshare user namespace"); if (unshare(CLONE_NEWUSER) < 0) bail("failed to unshare user namespace"); config.cloneflags &= ~CLONE_NEWUSER; /* - * We don't have the privileges to do any mapping here (see the - * clone_parent rant). So signal our parent to hook us up. + * We need to set ourselves as dumpable temporarily so that the + * parent process can write to our procfs files. */ - - /* Switching is only necessary if we joined namespaces. */ if (config.namespaces) { + write_log(DEBUG, "temporarily set process as dumpable"); if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) - bail("failed to set process as dumpable"); + bail("failed to temporarily set process as dumpable"); } + + /* + * We don't have the privileges to do any mapping here (see the + * clone_parent rant). So signal stage-0 to do the mapping for + * us. + */ + write_log(DEBUG, "request stage-0 to map user namespace"); s = SYNC_USERMAP_PLS; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); /* ... wait for mapping ... 
*/ - + write_log(DEBUG, "request stage-0 to map user namespace"); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); if (s != SYNC_USERMAP_ACK) bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); - /* Switching is only necessary if we joined namespaces. */ + + /* Revert temporary re-dumpable setting. */ if (config.namespaces) { + write_log(DEBUG, "re-set process as non-dumpable"); if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) - bail("failed to set process as dumpable"); + bail("failed to re-set process as non-dumpable"); } /* Become root in the namespace proper. */ if (setresuid(0, 0, 0) < 0) bail("failed to become root in user namespace"); } + /* * Unshare all of the namespaces. Now, it should be noted that this * ordering might break in the future (especially with rootless @@ -909,8 +1224,31 @@ void nsexec(void) * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) * was broken, so we'll just do it the long way anyway. */ + write_log(DEBUG, "unshare remaining namespace (except cgroupns)"); if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) - bail("failed to unshare namespaces"); + bail("failed to unshare remaining namespaces (except cgroupns)"); + + /* Ask our parent to send the mount sources fds. */ + if (config.mountsources) { + s = SYNC_MOUNTSOURCES_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(stage2_pid, SIGKILL); + bail("failed to sync with parent: write(SYNC_MOUNTSOURCES_PLS)"); + } + + /* Receive and install all mount sources fds. */ + receive_mountsources(syncfd); + + /* Parent finished to send the mount sources fds. 
*/ + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(stage2_pid, SIGKILL); + bail("failed to sync with parent: read(SYNC_MOUNTSOURCES_ACK)"); + } + if (s != SYNC_MOUNTSOURCES_ACK) { + kill(stage2_pid, SIGKILL); + bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s); + } + } /* * TODO: What about non-namespace clone flags that we're dropping here? @@ -921,41 +1259,45 @@ void nsexec(void) * which would break many applications and libraries, so we must fork * to actually enter the new PID namespace. */ - child = clone_parent(&env, JUMP_INIT); - if (child < 0) - bail("unable to fork: init_func"); + write_log(DEBUG, "spawn stage-2"); + stage2_pid = clone_parent(&env, STAGE_INIT); + if (stage2_pid < 0) + bail("unable to spawn stage-2"); /* Send the child to our parent, which knows what it's doing. */ + write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid); s = SYNC_RECVPID_PLS; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); } - if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { - kill(child, SIGKILL); - bail("failed to sync with parent: write(childpid)"); + if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with parent: write(stage2_pid)"); } /* ... wait for parent to get the pid ... 
*/ - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); } if (s != SYNC_RECVPID_ACK) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); } - s = SYNC_CHILD_READY; + write_log(DEBUG, "signal completion to stage-0"); + s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with parent: write(SYNC_CHILD_READY)"); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with parent: write(SYNC_CHILD_FINISH)"); } - /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ + /* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */ + write_log(DEBUG, "<~ nsexec stage-1"); exit(0); } + break; /* * Stage 2: We're the final child process, and the only process that will @@ -963,7 +1305,7 @@ void nsexec(void) * final cleanup steps and then return to the Go runtime to allow * init_linux.go to run. */ - case JUMP_INIT:{ + case STAGE_INIT:{ /* * We're inside the child now, having jumped from the * start_child() code after forking in the parent. @@ -972,12 +1314,15 @@ void nsexec(void) /* We're in a child and thus need to tell the parent if we die. */ syncfd = sync_grandchild_pipe[0]; - close(sync_grandchild_pipe[1]); - close(sync_child_pipe[0]); - close(sync_child_pipe[1]); + if (close(sync_grandchild_pipe[1]) < 0) + bail("failed to close sync_grandchild_pipe[1] fd"); + + if (close(sync_child_pipe[0]) < 0) + bail("failed to close sync_child_pipe[0] fd"); /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); + write_log(DEBUG, "~> nsexec stage-2"); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); @@ -998,33 +1343,31 @@ void nsexec(void) bail("setgroups failed"); } - /* ... 
wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */ if (config.cloneflags & CLONE_NEWCGROUP) { - uint8_t value; - if (read(pipenum, &value, sizeof(value)) != sizeof(value)) - bail("read synchronisation value failed"); - if (value == CREATECGROUPNS) { - if (unshare(CLONE_NEWCGROUP) < 0) - bail("failed to unshare cgroup namespace"); - } else - bail("received unknown synchronisation value"); + if (unshare(CLONE_NEWCGROUP) < 0) + bail("failed to unshare cgroup namespace"); } - s = SYNC_CHILD_READY; + write_log(DEBUG, "signal completion to stage-0"); + s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with patent: write(SYNC_CHILD_READY)"); + bail("failed to sync with parent: write(SYNC_CHILD_FINISH)"); /* Close sync pipes. */ - close(sync_grandchild_pipe[0]); + if (close(sync_grandchild_pipe[0]) < 0) + bail("failed to close sync_grandchild_pipe[0] fd"); /* Free netlink data. */ nl_free(&config); /* Finish executing, let the Go runtime take over. */ + write_log(DEBUG, "<= nsexec container setup"); + write_log(DEBUG, "booting up go runtime ..."); return; } + break; default: - bail("unexpected jump value"); + bail("unknown stage '%d' for jump value", current_stage); } /* Should never be reached. */ diff --git a/libcontainer/nsenter/test/escape.c b/libcontainer/nsenter/test/escape.c new file mode 120000 index 0000000..c53e316 --- /dev/null +++ b/libcontainer/nsenter/test/escape.c @@ -0,0 +1 @@ +../escape.c \ No newline at end of file diff --git a/libcontainer/nsenter/test/escape.go b/libcontainer/nsenter/test/escape.go new file mode 100644 index 0000000..f85d9e2 --- /dev/null +++ b/libcontainer/nsenter/test/escape.go @@ -0,0 +1,53 @@ +package escapetest + +// This file is part of escape_json_string unit test. +// It is in a separate package so cgo can be used together +// with go test. 
+ +// #include +// extern char *escape_json_string(char *str); +// #cgo CFLAGS: -DESCAPE_TEST=1 +import "C" + +import ( + "testing" + "unsafe" +) + +func testEscapeJSONString(t *testing.T, input, want string) { + in := C.CString(input) + out := C.escape_json_string(in) + got := C.GoString(out) + C.free(unsafe.Pointer(out)) + t.Logf("input: %q, output: %q", input, got) + if got != want { + t.Errorf("Failed on input: %q, want %q, got %q", input, want, got) + } +} + +func testEscapeJSON(t *testing.T) { + testCases := []struct { + input, output string + }{ + {"", ""}, + {"abcdef", "abcdef"}, + {`\\\\\\`, `\\\\\\\\\\\\`}, + {`with"quote`, `with\"quote`}, + {"\n\r\b\t\f\\", `\n\r\b\t\f\\`}, + {"\007", "\\u0007"}, + {"\017 \020 \037", "\\u000f \\u0010 \\u001f"}, + {"\033", "\\u001b"}, + {`<->`, `<->`}, + {"\176\177\200", "~\\u007f\200"}, + {"\000", ""}, + {"a\x7fxc", "a\\u007fxc"}, + {"a\033xc", "a\\u001bxc"}, + {"a\nxc", "a\\nxc"}, + {"a\\xc", "a\\\\xc"}, + {"Barney B\303\244r", "Barney B\303\244r"}, + } + + for _, tc := range testCases { + testEscapeJSONString(t, tc.input, tc.output) + } +} diff --git a/libcontainer/nsenter/test/escape_test.go b/libcontainer/nsenter/test/escape_test.go new file mode 100644 index 0000000..3841cd0 --- /dev/null +++ b/libcontainer/nsenter/test/escape_test.go @@ -0,0 +1,11 @@ +package escapetest + +import "testing" + +// The actual test function is in escape.go +// so that it can use cgo (import "C"). +// This wrapper is here for gotest to find. 
+ +func TestEscapeJSON(t *testing.T) { + testEscapeJSON(t) +} diff --git a/libcontainer/process.go b/libcontainer/process.go index d3e472a..8a5d340 100644 --- a/libcontainer/process.go +++ b/libcontainer/process.go @@ -1,7 +1,7 @@ package libcontainer import ( - "fmt" + "errors" "io" "math" "os" @@ -9,6 +9,8 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" ) +var errInvalidProcess = errors.New("invalid process") + type processOperations interface { wait() (*os.ProcessState, error) signal(sig os.Signal) error @@ -78,13 +80,22 @@ type Process struct { ops processOperations LogLevel string + + // SubCgroupPaths specifies sub-cgroups to run the process in. + // Map keys are controller names, map values are paths (relative to + // container's top-level cgroup). + // + // If empty, the default top-level container's cgroup is used. + // + // For cgroup v2, the only key allowed is "". + SubCgroupPaths map[string]string } // Wait waits for the process to exit. // Wait releases any resources associated with the Process func (p Process) Wait() (*os.ProcessState, error) { if p.ops == nil { - return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + return nil, errInvalidProcess } return p.ops.wait() } @@ -94,7 +105,7 @@ func (p Process) Pid() (int, error) { // math.MinInt32 is returned here, because it's invalid value // for the kill() system call. if p.ops == nil { - return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + return math.MinInt32, errInvalidProcess } return p.ops.pid(), nil } @@ -102,7 +113,7 @@ func (p Process) Pid() (int, error) { // Signal sends a signal to the Process. 
func (p Process) Signal(sig os.Signal) error { if p.ops == nil { - return newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + return errInvalidProcess } return p.ops.signal(sig) } diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index de989b5..e025445 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -1,5 +1,3 @@ -// +build linux - package libcontainer import ( @@ -7,26 +5,25 @@ import ( "errors" "fmt" "io" + "net" "os" "os/exec" "path/filepath" "strconv" - "syscall" // only for Signal + "time" "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/logs" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" - + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) -// Synchronisation value for cgroup namespace setup. -// The same constant is defined in nsexec.c as "CREATECGROUPNS". -const createCgroupns = 0x80 - type parentProcess interface { // pid returns the pid for the running process. 
pid() int @@ -49,7 +46,7 @@ type parentProcess interface { setExternalDescriptors(fds []string) - forwardChildLogs() + forwardChildLogs() chan error } type filePair struct { @@ -63,11 +60,13 @@ type setnsProcess struct { logFilePair filePair cgroupPaths map[string]string rootlessCgroups bool + manager cgroups.Manager intelRdtPath string config *initConfig fds []string process *Process bootstrapData io.Reader + initProcessPid int } func (p *setnsProcess) startTime() (uint64, error) { @@ -76,33 +75,76 @@ func (p *setnsProcess) startTime() (uint64, error) { } func (p *setnsProcess) signal(sig os.Signal) error { - s, ok := sig.(syscall.Signal) + s, ok := sig.(unix.Signal) if !ok { return errors.New("os: unsupported signal type") } return unix.Kill(p.pid(), s) } -func (p *setnsProcess) start() (err error) { +func (p *setnsProcess) start() (retErr error) { defer p.messageSockPair.parent.Close() - err = p.cmd.Start() + // get the "before" value of oom kill count + oom, _ := p.manager.OOMKillCount() + err := p.cmd.Start() // close the write-side of the pipes (controlled by child) p.messageSockPair.child.Close() p.logFilePair.child.Close() if err != nil { - return newSystemErrorWithCause(err, "starting setns process") + return fmt.Errorf("error starting setns process: %w", err) } + + waitInit := initWaiter(p.messageSockPair.parent) + defer func() { + if retErr != nil { + if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom { + // Someone in this cgroup was killed, this _might_ be us. 
+ retErr = fmt.Errorf("%w (possibly OOM-killed)", retErr) + } + werr := <-waitInit + if werr != nil { + logrus.WithError(werr).Warn() + } + err := ignoreTerminateErrors(p.terminate()) + if err != nil { + logrus.WithError(err).Warn("unable to terminate setnsProcess") + } + } + }() + if p.bootstrapData != nil { if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { - return newSystemErrorWithCause(err, "copying bootstrap data to pipe") + return fmt.Errorf("error copying bootstrap data to pipe: %w", err) } } - if err = p.execSetns(); err != nil { - return newSystemErrorWithCause(err, "executing setns process") + err = <-waitInit + if err != nil { + return err } - if len(p.cgroupPaths) > 0 { - if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups { - return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) + if err := p.execSetns(); err != nil { + return fmt.Errorf("error executing setns process: %w", err) + } + for _, path := range p.cgroupPaths { + if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups { + // On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY. + // https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643 + // Try to join the cgroup of InitProcessPid. + if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 { + initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid) + initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile) + if initCgErr == nil { + if initCgPath, ok := initCg[""]; ok { + initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath) + logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)", + p.pid(), p.cgroupPaths, err, initCg, initCgDirpath) + // NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container. 
+ err = cgroups.WriteCgroupProc(initCgDirpath, p.pid()) + } + } + } + if err != nil { + return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err) + } } } if p.intelRdtPath != "" { @@ -110,17 +152,17 @@ func (p *setnsProcess) start() (err error) { _, err := os.Stat(p.intelRdtPath) if err == nil { if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil { - return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid()) + return fmt.Errorf("error adding pid %d to Intel RDT: %w", p.pid(), err) } } } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { - return newSystemErrorWithCause(err, "setting rlimits for process") + return fmt.Errorf("error setting rlimits for process: %w", err) } if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil { - return newSystemErrorWithCause(err, "writing config to pipe") + return fmt.Errorf("error writing config to pipe: %w", err) } ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { @@ -131,17 +173,53 @@ func (p *setnsProcess) start() (err error) { case procHooks: // This shouldn't happen. 
panic("unexpected procHooks in setns") + case procSeccomp: + if p.config.Config.Seccomp.ListenerPath == "" { + return errors.New("listenerPath is not set") + } + + seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd)) + if err != nil { + return err + } + defer unix.Close(seccompFd) + + bundle, annotations := utils.Annotations(p.config.Config.Labels) + containerProcessState := &specs.ContainerProcessState{ + Version: specs.Version, + Fds: []string{specs.SeccompFdName}, + Pid: p.cmd.Process.Pid, + Metadata: p.config.Config.Seccomp.ListenerMetadata, + State: specs.State{ + Version: specs.Version, + ID: p.config.ContainerId, + Status: specs.StateRunning, + Pid: p.initProcessPid, + Bundle: bundle, + Annotations: annotations, + }, + } + if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath, + containerProcessState, seccompFd); err != nil { + return err + } + + // Sync with child. + if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { + return err + } + return nil default: - return newSystemError(fmt.Errorf("invalid JSON payload from child")) + return errors.New("invalid JSON payload from child") } }) if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { - return newSystemErrorWithCause(err, "calling shutdown on init pipe") + return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err} } // Must be done after Shutdown so the child will exit and we can wait for it. 
if ierr != nil { - p.wait() + _, _ = p.wait() return ierr } return nil @@ -154,17 +232,17 @@ func (p *setnsProcess) start() (err error) { func (p *setnsProcess) execSetns() error { status, err := p.cmd.Process.Wait() if err != nil { - p.cmd.Wait() - return newSystemErrorWithCause(err, "waiting on setns process to finish") + _ = p.cmd.Wait() + return fmt.Errorf("error waiting on setns process to finish: %w", err) } if !status.Success() { - p.cmd.Wait() - return newSystemError(&exec.ExitError{ProcessState: status}) + _ = p.cmd.Wait() + return &exec.ExitError{ProcessState: status} } var pid *pid if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil { - p.cmd.Wait() - return newSystemErrorWithCause(err, "reading pid from init pipe") + _ = p.cmd.Wait() + return fmt.Errorf("error reading pid from init pipe: %w", err) } // Clean up the zombie parent process @@ -215,8 +293,8 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) { p.fds = newFds } -func (p *setnsProcess) forwardChildLogs() { - go logs.ForwardLogs(p.logFilePair.parent) +func (p *setnsProcess) forwardChildLogs() chan error { + return logs.ForwardLogs(p.logFilePair.parent) } type initProcess struct { @@ -245,7 +323,7 @@ func (p *initProcess) externalDescriptors() []string { func (p *initProcess) getChildPid() (int, error) { var pid pid if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil { - p.cmd.Wait() + _ = p.cmd.Wait() return -1, err } @@ -262,11 +340,11 @@ func (p *initProcess) getChildPid() (int, error) { func (p *initProcess) waitForChildExit(childPid int) error { status, err := p.cmd.Process.Wait() if err != nil { - p.cmd.Wait() + _ = p.cmd.Wait() return err } if !status.Success() { - p.cmd.Wait() + _ = p.cmd.Wait() return &exec.ExitError{ProcessState: status} } @@ -279,44 +357,80 @@ func (p *initProcess) waitForChildExit(childPid int) error { return nil } -func (p *initProcess) start() error { - defer p.messageSockPair.parent.Close() +func (p 
*initProcess) start() (retErr error) { + defer p.messageSockPair.parent.Close() //nolint: errcheck err := p.cmd.Start() p.process.ops = p // close the write-side of the pipes (controlled by child) - p.messageSockPair.child.Close() - p.logFilePair.child.Close() + _ = p.messageSockPair.child.Close() + _ = p.logFilePair.child.Close() if err != nil { p.process.ops = nil - return newSystemErrorWithCause(err, "starting init process command") - } - // Do this before syncing with child so that no children can escape the - // cgroup. We don't need to worry about not doing this and not being root - // because we'd be using the rootless cgroup manager in that case. - if err := p.manager.Apply(p.pid()); err != nil { - return newSystemErrorWithCause(err, "applying cgroup configuration for process") - } - if p.intelRdtManager != nil { - if err := p.intelRdtManager.Apply(p.pid()); err != nil { - return newSystemErrorWithCause(err, "applying Intel RDT configuration for process") - } + return fmt.Errorf("unable to start init: %w", err) } + + waitInit := initWaiter(p.messageSockPair.parent) defer func() { - if err != nil { - // TODO: should not be the responsibility to call here - p.manager.Destroy() + if retErr != nil { + // Find out if init is killed by the kernel's OOM killer. + // Get the count before killing init as otherwise cgroup + // might be removed by systemd. + oom, err := p.manager.OOMKillCount() + if err != nil { + logrus.WithError(err).Warn("unable to get oom kill count") + } else if oom > 0 { + // Does not matter what the particular error was, + // its cause is most probably OOM, so report that. + const oomError = "container init was OOM-killed (memory limit too low?)" + + if logrus.GetLevel() >= logrus.DebugLevel { + // Only show the original error if debug is set, + // as it is not generally very useful. 
+ retErr = fmt.Errorf(oomError+": %w", retErr) + } else { + retErr = errors.New(oomError) + } + } + + werr := <-waitInit + if werr != nil { + logrus.WithError(werr).Warn() + } + + // Terminate the process to ensure we can remove cgroups. + if err := ignoreTerminateErrors(p.terminate()); err != nil { + logrus.WithError(err).Warn("unable to terminate initProcess") + } + + _ = p.manager.Destroy() if p.intelRdtManager != nil { - p.intelRdtManager.Destroy() + _ = p.intelRdtManager.Destroy() } } }() - if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { - return newSystemErrorWithCause(err, "copying bootstrap data to pipe") + // Do this before syncing with child so that no children can escape the + // cgroup. We don't need to worry about not doing this and not being root + // because we'd be using the rootless cgroup manager in that case. + if err := p.manager.Apply(p.pid()); err != nil { + return fmt.Errorf("unable to apply cgroup configuration: %w", err) } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Apply(p.pid()); err != nil { + return fmt.Errorf("unable to apply Intel RDT configuration: %w", err) + } + } + if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { + return fmt.Errorf("can't copy bootstrap data to pipe: %w", err) + } + err = <-waitInit + if err != nil { + return err + } + childPid, err := p.getChildPid() if err != nil { - return newSystemErrorWithCause(err, "getting the final child's pid from pipe") + return fmt.Errorf("can't get final child's PID from pipe: %w", err) } // Save the standard descriptor names before the container process @@ -324,45 +438,23 @@ func (p *initProcess) start() error { // we won't know at checkpoint time which file descriptor to look up. 
fds, err := getPipeFds(childPid) if err != nil { - return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid) + return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err) } p.setExternalDescriptors(fds) - // Do this before syncing with child so that no children - // can escape the cgroup - if err := p.manager.Apply(childPid); err != nil { - return newSystemErrorWithCause(err, "applying cgroup configuration for process") - } - if p.intelRdtManager != nil { - if err := p.intelRdtManager.Apply(childPid); err != nil { - return newSystemErrorWithCause(err, "applying Intel RDT configuration for process") - } - } - // Now it's time to setup cgroup namesapce - if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" { - if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil { - return newSystemErrorWithCause(err, "sending synchronization value to init process") - } - } // Wait for our first child to exit if err := p.waitForChildExit(childPid); err != nil { - return newSystemErrorWithCause(err, "waiting for our first child to exit") + return fmt.Errorf("error waiting for our first child to exit: %w", err) } - defer func() { - if err != nil { - // TODO: should not be the responsibility to call here - p.manager.Destroy() - if p.intelRdtManager != nil { - p.intelRdtManager.Destroy() - } - } - }() if err := p.createNetworkInterfaces(); err != nil { - return newSystemErrorWithCause(err, "creating network interfaces") + return fmt.Errorf("error creating network interfaces: %w", err) + } + if err := p.updateSpecState(); err != nil { + return fmt.Errorf("error updating spec state: %w", err) } if err := p.sendConfig(); err != nil { - return newSystemErrorWithCause(err, "sending config to init process") + return fmt.Errorf("error sending config to init process: %w", err) } var ( sentRun bool @@ -371,93 +463,156 @@ func (p *initProcess) start() error { ierr := 
parseSync(p.messageSockPair.parent, func(sync *syncT) error { switch sync.Type { + case procSeccomp: + if p.config.Config.Seccomp.ListenerPath == "" { + return errors.New("listenerPath is not set") + } + + seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd)) + if err != nil { + return err + } + defer unix.Close(seccompFd) + + s, err := p.container.currentOCIState() + if err != nil { + return err + } + + // initProcessStartTime hasn't been set yet. + s.Pid = p.cmd.Process.Pid + s.Status = specs.StateCreating + containerProcessState := &specs.ContainerProcessState{ + Version: specs.Version, + Fds: []string{specs.SeccompFdName}, + Pid: s.Pid, + Metadata: p.config.Config.Seccomp.ListenerMetadata, + State: *s, + } + if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath, + containerProcessState, seccompFd); err != nil { + return err + } + + // Sync with child. + if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil { + return err + } case procReady: // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { - return newSystemErrorWithCause(err, "setting rlimits for ready process") + return fmt.Errorf("error setting rlimits for ready process: %w", err) } - // call prestart hooks + // call prestart and CreateRuntime hooks if !p.config.Config.Namespaces.Contains(configs.NEWNS) { - // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions. - if err := p.manager.Set(p.config.Config); err != nil { - return newSystemErrorWithCause(err, "setting cgroup config for ready process") + // Setup cgroup before the hook, so that the prestart and CreateRuntime hook could apply cgroup permissions. 
+ if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil { + return fmt.Errorf("error setting cgroup config for ready process: %w", err) } if p.intelRdtManager != nil { if err := p.intelRdtManager.Set(p.config.Config); err != nil { - return newSystemErrorWithCause(err, "setting Intel RDT config for ready process") + return fmt.Errorf("error setting Intel RDT config for ready process: %w", err) } } - if p.config.Config.Hooks != nil { + if len(p.config.Config.Hooks) != 0 { s, err := p.container.currentOCIState() if err != nil { return err } // initProcessStartTime hasn't been set yet. s.Pid = p.cmd.Process.Pid - s.Status = "creating" - for i, hook := range p.config.Config.Hooks.Prestart { - if err := hook.Run(s); err != nil { - return newSystemErrorWithCausef(err, "running prestart hook %d", i) - } + s.Status = specs.StateCreating + hooks := p.config.Config.Hooks + + if err := hooks[configs.Prestart].RunHooks(s); err != nil { + return err + } + if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil { + return err } } } + + // generate a timestamp indicating when the container was started + p.container.created = time.Now().UTC() + p.container.state = &createdState{ + c: p.container, + } + + // NOTE: If the procRun state has been synced and the + // runc-create process has been killed for some reason, + // the runc-init[2:stage] process will be leaky. And + // the runc command also fails to parse root directory + // because the container doesn't have state.json. + // + // In order to cleanup the runc-init[2:stage] by + // runc-delete/stop, we should store the status before + // procRun sync. + state, uerr := p.container.updateState(p) + if uerr != nil { + return fmt.Errorf("unable to store init state: %w", err) + } + p.container.initProcessStartTime = state.InitProcessStartTime + // Sync with child. 
if err := writeSync(p.messageSockPair.parent, procRun); err != nil { - return newSystemErrorWithCause(err, "writing syncT 'run'") + return err } sentRun = true case procHooks: // Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions. - if err := p.manager.Set(p.config.Config); err != nil { - return newSystemErrorWithCause(err, "setting cgroup config for procHooks process") + if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil { + return fmt.Errorf("error setting cgroup config for procHooks process: %w", err) } if p.intelRdtManager != nil { if err := p.intelRdtManager.Set(p.config.Config); err != nil { - return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process") + return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err) } } - if p.config.Config.Hooks != nil { + if len(p.config.Config.Hooks) != 0 { s, err := p.container.currentOCIState() if err != nil { return err } // initProcessStartTime hasn't been set yet. s.Pid = p.cmd.Process.Pid - s.Status = "creating" - for i, hook := range p.config.Config.Hooks.Prestart { - if err := hook.Run(s); err != nil { - return newSystemErrorWithCausef(err, "running prestart hook %d", i) - } + s.Status = specs.StateCreating + hooks := p.config.Config.Hooks + + if err := hooks[configs.Prestart].RunHooks(s); err != nil { + return err + } + if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil { + return err } } // Sync with child. 
if err := writeSync(p.messageSockPair.parent, procResume); err != nil { - return newSystemErrorWithCause(err, "writing syncT 'resume'") + return err } sentResume = true default: - return newSystemError(fmt.Errorf("invalid JSON payload from child")) + return errors.New("invalid JSON payload from child") } return nil }) if !sentRun { - return newSystemErrorWithCause(ierr, "container init") + return fmt.Errorf("error during container init: %w", ierr) } if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume { - return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process")) + return errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process") } if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { - return newSystemErrorWithCause(err, "shutting down init pipe") + return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err} } // Must be done after Shutdown so the child will exit and we can wait for it. 
if ierr != nil { - p.wait() + _, _ = p.wait() return ierr } return nil @@ -465,14 +620,11 @@ func (p *initProcess) start() error { func (p *initProcess) wait() (*os.ProcessState, error) { err := p.cmd.Wait() - if err != nil { - return p.cmd.ProcessState, err - } // we should kill all processes in cgroup when init is died if we use host PID namespace if p.sharePidns { - signalAllProcesses(p.manager, unix.SIGKILL) + _ = signalAllProcesses(p.manager, unix.SIGKILL) } - return p.cmd.ProcessState, nil + return p.cmd.ProcessState, err } func (p *initProcess) terminate() error { @@ -491,6 +643,16 @@ func (p *initProcess) startTime() (uint64, error) { return stat.StartTime, err } +func (p *initProcess) updateSpecState() error { + s, err := p.container.currentOCIState() + if err != nil { + return err + } + + p.config.SpecState = s + return nil +} + func (p *initProcess) sendConfig() error { // send the config to the container's init process, we don't use JSON Encode // here because there might be a problem in JSON decoder in some cases, see: @@ -516,7 +678,7 @@ func (p *initProcess) createNetworkInterfaces() error { } func (p *initProcess) signal(sig os.Signal) error { - s, ok := sig.(syscall.Signal) + s, ok := sig.(unix.Signal) if !ok { return errors.New("os: unsupported signal type") } @@ -527,8 +689,48 @@ func (p *initProcess) setExternalDescriptors(newFds []string) { p.fds = newFds } -func (p *initProcess) forwardChildLogs() { - go logs.ForwardLogs(p.logFilePair.parent) +func (p *initProcess) forwardChildLogs() chan error { + return logs.ForwardLogs(p.logFilePair.parent) +} + +func recvSeccompFd(childPid, childFd uintptr) (int, error) { + pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0) + if errno != 0 { + return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno) + } + defer unix.Close(int(pidfd)) + + seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0) + if errno != 0 { + return -1, fmt.Errorf("performing 
SYS_PIDFD_GETFD syscall: %w", errno) + } + + return int(seccompFd), nil +} + +func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error { + conn, err := net.Dial("unix", listenerPath) + if err != nil { + return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err) + } + + socket, err := conn.(*net.UnixConn).File() + if err != nil { + return fmt.Errorf("cannot get seccomp socket: %w", err) + } + defer socket.Close() + + b, err := json.Marshal(state) + if err != nil { + return fmt.Errorf("cannot marshall seccomp state: %w", err) + } + + err = utils.SendFds(socket, b, fd) + if err != nil { + return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err) + } + + return nil } func getPipeFds(pid int) ([]string, error) { @@ -565,7 +767,7 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { defer func() { if err != nil { for _, fd := range fds { - unix.Close(int(fd)) + _ = unix.Close(int(fd)) } } }() @@ -591,8 +793,33 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { // change ownership of the pipes in case we are in a user namespace for _, fd := range fds { if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil { - return nil, err + return nil, &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err} } } return i, nil } + +// initWaiter returns a channel to wait on for making sure +// runc init has finished the initial setup. 
+func initWaiter(r io.Reader) chan error { + ch := make(chan error, 1) + go func() { + defer close(ch) + + inited := make([]byte, 1) + n, err := r.Read(inited) + if err == nil { + if n < 1 { + err = errors.New("short read") + } else if inited[0] != 0 { + err = fmt.Errorf("unexpected %d != 0", inited[0]) + } else { + ch <- nil + return + } + } + ch <- fmt.Errorf("waiting for init preliminary setup: %w", err) + }() + + return ch +} diff --git a/libcontainer/restored_process.go b/libcontainer/restored_process.go index 28d52ad..cdffbd3 100644 --- a/libcontainer/restored_process.go +++ b/libcontainer/restored_process.go @@ -1,49 +1,43 @@ -// +build linux - package libcontainer import ( - "fmt" + "errors" "os" + "os/exec" "github.com/opencontainers/runc/libcontainer/system" ) -func newRestoredProcess(pid int, fds []string) (*restoredProcess, error) { - var ( - err error - ) - proc, err := os.FindProcess(pid) - if err != nil { - return nil, err - } +func newRestoredProcess(cmd *exec.Cmd, fds []string) (*restoredProcess, error) { + var err error + pid := cmd.Process.Pid stat, err := system.Stat(pid) if err != nil { return nil, err } return &restoredProcess{ - proc: proc, + cmd: cmd, processStartTime: stat.StartTime, fds: fds, }, nil } type restoredProcess struct { - proc *os.Process + cmd *exec.Cmd processStartTime uint64 fds []string } func (p *restoredProcess) start() error { - return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError) + return errors.New("restored process cannot be started") } func (p *restoredProcess) pid() int { - return p.proc.Pid + return p.cmd.Process.Pid } func (p *restoredProcess) terminate() error { - err := p.proc.Kill() + err := p.cmd.Process.Kill() if _, werr := p.wait(); err == nil { err = werr } @@ -53,10 +47,14 @@ func (p *restoredProcess) terminate() error { func (p *restoredProcess) wait() (*os.ProcessState, error) { // TODO: how do we wait on the actual process? 
// maybe use --exec-cmd in criu - st, err := p.proc.Wait() + err := p.cmd.Wait() if err != nil { - return nil, err + var exitErr *exec.ExitError + if !errors.As(err, &exitErr) { + return nil, err + } } + st := p.cmd.ProcessState return st, nil } @@ -65,7 +63,7 @@ func (p *restoredProcess) startTime() (uint64, error) { } func (p *restoredProcess) signal(s os.Signal) error { - return p.proc.Signal(s) + return p.cmd.Process.Signal(s) } func (p *restoredProcess) externalDescriptors() []string { @@ -76,7 +74,8 @@ func (p *restoredProcess) setExternalDescriptors(newFds []string) { p.fds = newFds } -func (p *restoredProcess) forwardChildLogs() { +func (p *restoredProcess) forwardChildLogs() chan error { + return nil } // nonChildProcess represents a process where the calling process is not @@ -89,7 +88,7 @@ type nonChildProcess struct { } func (p *nonChildProcess) start() error { - return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError) + return errors.New("restored process cannot be started") } func (p *nonChildProcess) pid() int { @@ -97,11 +96,11 @@ func (p *nonChildProcess) pid() int { } func (p *nonChildProcess) terminate() error { - return newGenericError(fmt.Errorf("restored process cannot be terminated"), SystemError) + return errors.New("restored process cannot be terminated") } func (p *nonChildProcess) wait() (*os.ProcessState, error) { - return nil, newGenericError(fmt.Errorf("restored process cannot be waited on"), SystemError) + return nil, errors.New("restored process cannot be waited on") } func (p *nonChildProcess) startTime() (uint64, error) { @@ -124,5 +123,6 @@ func (p *nonChildProcess) setExternalDescriptors(newFds []string) { p.fds = newFds } -func (p *nonChildProcess) forwardChildLogs() { +func (p *nonChildProcess) forwardChildLogs() chan error { + return nil } diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 106c4c2..51660f5 100644 --- a/libcontainer/rootfs_linux.go +++ 
b/libcontainer/rootfs_linux.go @@ -1,36 +1,47 @@ -// +build linux - package libcontainer import ( + "errors" "fmt" "io" - "io/ioutil" "os" "os/exec" "path" "path/filepath" + "strconv" "strings" "time" securejoin "github.com/cyphar/filepath-securejoin" + "github.com/moby/sys/mountinfo" "github.com/mrunalp/fileutils" "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/mount" - "github.com/opencontainers/runc/libcontainer/system" - libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/userns" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" - + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV +type mountConfig struct { + root string + label string + cgroup2Path string + rootlessCgroups bool + cgroupns bool + fd *int +} + // needsSetupDev returns true if /dev needs to be set up. func needsSetupDev(config *configs.Config) bool { for _, m := range config.Mounts { - if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" { + if m.Device == "bind" && utils.CleanPath(m.Destination) == "/dev" { return false } } @@ -40,40 +51,57 @@ func needsSetupDev(config *configs.Config) bool { // prepareRootfs sets up the devices, mount points, and filesystems for use // inside a new mount namespace. It doesn't set anything as ro. You must call // finalizeRootfs after this function to finish setting up the rootfs. 
-func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { +func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds []int) (err error) { config := iConfig.Config if err := prepareRoot(config); err != nil { - return newSystemErrorWithCause(err, "preparing rootfs") + return fmt.Errorf("error preparing rootfs: %w", err) } - hasCgroupns := config.Namespaces.Contains(configs.NEWCGROUP) + if mountFds != nil && len(mountFds) != len(config.Mounts) { + return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v. Slice: %v", len(config.Mounts), len(mountFds), mountFds) + } + + mountConfig := &mountConfig{ + root: config.Rootfs, + label: config.MountLabel, + cgroup2Path: iConfig.Cgroup2Path, + rootlessCgroups: iConfig.RootlessCgroups, + cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), + } setupDev := needsSetupDev(config) - for _, m := range config.Mounts { + for i, m := range config.Mounts { for _, precmd := range m.PremountCmds { if err := mountCmd(precmd); err != nil { - return newSystemErrorWithCause(err, "running premount command") + return fmt.Errorf("error running premount command: %w", err) } } - if err := mountToRootfs(m, config.Rootfs, config.MountLabel, hasCgroupns); err != nil { - return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination) + + // Just before the loop we checked that if not empty, len(mountFds) == len(config.Mounts). + // Therefore, we can access mountFds[i] without any concerns. 
+ if mountFds != nil && mountFds[i] != -1 { + mountConfig.fd = &mountFds[i] + } + + if err := mountToRootfs(m, mountConfig); err != nil { + return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err) } for _, postcmd := range m.PostmountCmds { if err := mountCmd(postcmd); err != nil { - return newSystemErrorWithCause(err, "running postmount command") + return fmt.Errorf("error running postmount command: %w", err) } } } if setupDev { if err := createDevices(config); err != nil { - return newSystemErrorWithCause(err, "creating device nodes") + return fmt.Errorf("error creating device nodes: %w", err) } if err := setupPtmx(config); err != nil { - return newSystemErrorWithCause(err, "setting up ptmx") + return fmt.Errorf("error setting up ptmx: %w", err) } if err := setupDevSymlinks(config.Rootfs); err != nil { - return newSystemErrorWithCause(err, "setting up /dev symlinks") + return fmt.Errorf("error setting up /dev symlinks: %w", err) } } @@ -95,7 +123,14 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { // operation not being perfectly split). 
if err := unix.Chdir(config.Rootfs); err != nil { - return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs) + return &os.PathError{Op: "chdir", Path: config.Rootfs, Err: err} + } + + s := iConfig.SpecState + s.Pid = unix.Getpid() + s.Status = specs.StateCreating + if err := iConfig.Config.Hooks[configs.CreateContainer].RunHooks(s); err != nil { + return err } if config.NoPivotRoot { @@ -103,22 +138,22 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { } else if config.Namespaces.Contains(configs.NEWNS) { err = pivotRoot(config.Rootfs) } else { - err = chroot(config.Rootfs) + err = chroot() } if err != nil { - return newSystemErrorWithCause(err, "jailing process inside rootfs") + return fmt.Errorf("error jailing process inside rootfs: %w", err) } if setupDev { if err := reOpenDevNull(); err != nil { - return newSystemErrorWithCause(err, "reopening /dev/null inside container") + return fmt.Errorf("error reopening /dev/null inside container: %w", err) } } if cwd := iConfig.Cwd; cwd != "" { // Note that spec.Process.Cwd can contain unclean value like "../../../../foo/bar...". // However, we are safe to call MkDirAll directly because we are in the jail here. - if err := os.MkdirAll(cwd, 0755); err != nil { + if err := os.MkdirAll(cwd, 0o755); err != nil { return err } } @@ -129,47 +164,52 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) { // finalizeRootfs sets anything to ro if necessary. You must call // prepareRootfs first. func finalizeRootfs(config *configs.Config) (err error) { - // remount dev as ro if specified + // All tmpfs mounts and /dev were previously mounted as rw + // by mountPropagate. Remount them read-only as requested. 
for _, m := range config.Mounts { - if libcontainerUtils.CleanPath(m.Destination) == "/dev" { - if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY { - if err := remountReadonly(m); err != nil { - return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination) - } + if m.Flags&unix.MS_RDONLY != unix.MS_RDONLY { + continue + } + if m.Device == "tmpfs" || utils.CleanPath(m.Destination) == "/dev" { + if err := remountReadonly(m); err != nil { + return err } - break } } // set rootfs ( / ) as readonly if config.Readonlyfs { if err := setReadonly(); err != nil { - return newSystemErrorWithCause(err, "setting rootfs as readonly") + return fmt.Errorf("error setting rootfs as readonly: %w", err) } } - unix.Umask(0022) + if config.Umask != nil { + unix.Umask(int(*config.Umask)) + } else { + unix.Umask(0o022) + } return nil } // /tmp has to be mounted as private to allow MS_MOVE to work in all situations func prepareTmp(topTmpDir string) (string, error) { - tmpdir, err := ioutil.TempDir(topTmpDir, "runctop") + tmpdir, err := os.MkdirTemp(topTmpDir, "runctop") if err != nil { return "", err } - if err := unix.Mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil { + if err := mount(tmpdir, tmpdir, "", "bind", unix.MS_BIND, ""); err != nil { return "", err } - if err := unix.Mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil { + if err := mount("", tmpdir, "", "", uintptr(unix.MS_PRIVATE), ""); err != nil { return "", err } return tmpdir, nil } -func cleanupTmp(tmpdir string) error { - unix.Unmount(tmpdir, 0) - return os.RemoveAll(tmpdir) +func cleanupTmp(tmpdir string) { + _ = unix.Unmount(tmpdir, 0) + _ = os.RemoveAll(tmpdir) } func mountCmd(cmd configs.Command) error { @@ -177,13 +217,18 @@ func mountCmd(cmd configs.Command) error { command.Env = cmd.Env command.Dir = cmd.Dir if out, err := command.CombinedOutput(); err != nil { - return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err) + return fmt.Errorf("%#v failed: %s: %w", cmd, 
string(out), err) } return nil } -func prepareBindMount(m *configs.Mount, rootfs string) error { - stat, err := os.Stat(m.Source) +func prepareBindMount(m *configs.Mount, rootfs string, mountFd *int) error { + source := m.Source + if mountFd != nil { + source = "/proc/self/fd/" + strconv.Itoa(*mountFd) + } + + stat, err := os.Stat(source) if err != nil { // error out if the source of a bind mount does not exist as we will be // unable to bind anything to it. @@ -197,11 +242,9 @@ func prepareBindMount(m *configs.Mount, rootfs string) error { if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { return err } - if err := checkProcMount(rootfs, dest, m.Source); err != nil { + if err := checkProcMount(rootfs, dest, source); err != nil { return err } - // update the mount with the correct dest after symlinks are resolved. - m.Destination = dest if err := createIfNotExists(dest, stat.IsDir()); err != nil { return err } @@ -209,7 +252,7 @@ func prepareBindMount(m *configs.Mount, rootfs string) error { return nil } -func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error { +func mountCgroupV1(m *configs.Mount, c *mountConfig) error { binds, err := getCgroupMounts(m) if err != nil { return err @@ -229,31 +272,36 @@ func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b Data: "mode=755", PropagationFlags: m.PropagationFlags, } - if err := mountToRootfs(tmpfs, rootfs, mountLabel, enableCgroupns); err != nil { + + if err := mountToRootfs(tmpfs, c); err != nil { return err } + for _, b := range binds { - if enableCgroupns { - subsystemPath := filepath.Join(rootfs, b.Destination) - if err := os.MkdirAll(subsystemPath, 0755); err != nil { + if c.cgroupns { + subsystemPath := filepath.Join(c.root, b.Destination) + if err := os.MkdirAll(subsystemPath, 0o755); err != nil { return err } - flags := defaultMountFlags - if m.Flags&unix.MS_RDONLY != 0 { - flags = flags | unix.MS_RDONLY - } - cgroupmount := 
&configs.Mount{ - Source: "cgroup", - Device: "cgroup", - Destination: subsystemPath, - Flags: flags, - Data: filepath.Base(subsystemPath), - } - if err := mountNewCgroup(cgroupmount); err != nil { + if err := utils.WithProcfd(c.root, b.Destination, func(procfd string) error { + flags := defaultMountFlags + if m.Flags&unix.MS_RDONLY != 0 { + flags = flags | unix.MS_RDONLY + } + var ( + source = "cgroup" + data = filepath.Base(subsystemPath) + ) + if data == "systemd" { + data = cgroups.CgroupNamePrefix + data + source = "systemd" + } + return mount(source, b.Destination, procfd, "cgroup", uintptr(flags), data) + }); err != nil { return err } } else { - if err := mountToRootfs(b, rootfs, mountLabel, enableCgroupns); err != nil { + if err := mountToRootfs(b, c); err != nil { return err } } @@ -263,7 +311,7 @@ func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b // symlink(2) is very dumb, it will just shove the path into // the link and doesn't do any checks or relative path // conversion. Also, don't error out if the cgroup already exists. 
- if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) { + if err := os.Symlink(mc, filepath.Join(c.root, m.Destination, ss)); err != nil && !os.IsExist(err) { return err } } @@ -271,30 +319,88 @@ func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b return nil } -func mountCgroupV2(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error { - cgroupPath, err := securejoin.SecureJoin(rootfs, m.Destination) +func mountCgroupV2(m *configs.Mount, c *mountConfig) error { + dest, err := securejoin.SecureJoin(c.root, m.Destination) if err != nil { return err } - if err := os.MkdirAll(cgroupPath, 0755); err != nil { + if err := os.MkdirAll(dest, 0o755); err != nil { return err } - if err := unix.Mount(m.Source, cgroupPath, "cgroup2", uintptr(m.Flags), m.Data); err != nil { - // when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158) - if err == unix.EPERM || err == unix.EBUSY { - return unix.Mount("/sys/fs/cgroup", cgroupPath, "", uintptr(m.Flags)|unix.MS_BIND, "") + return utils.WithProcfd(c.root, m.Destination, func(procfd string) error { + if err := mount(m.Source, m.Destination, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil { + // when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158) + if errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY) { + src := fs2.UnifiedMountpoint + if c.cgroupns && c.cgroup2Path != "" { + // Emulate cgroupns by bind-mounting + // the container cgroup path rather than + // the whole /sys/fs/cgroup. 
+ src = c.cgroup2Path + } + err = mount(src, m.Destination, procfd, "", uintptr(m.Flags)|unix.MS_BIND, "") + if c.rootlessCgroups && errors.Is(err, unix.ENOENT) { + err = nil + } + } + return err } - return err - } - return nil + return nil + }) } -func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error { - var ( - dest = m.Destination - ) - if !strings.HasPrefix(dest, rootfs) { - dest = filepath.Join(rootfs, dest) +func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) { + // Set up a scratch dir for the tmpfs on the host. + tmpdir, err := prepareTmp("/tmp") + if err != nil { + return fmt.Errorf("tmpcopyup: failed to setup tmpdir: %w", err) + } + defer cleanupTmp(tmpdir) + tmpDir, err := os.MkdirTemp(tmpdir, "runctmpdir") + if err != nil { + return fmt.Errorf("tmpcopyup: failed to create tmpdir: %w", err) + } + defer os.RemoveAll(tmpDir) + + // Configure the *host* tmpdir as if it's the container mount. We change + // m.Destination since we are going to mount *on the host*. + oldDest := m.Destination + m.Destination = tmpDir + err = mountPropagate(m, "/", mountLabel, nil) + m.Destination = oldDest + if err != nil { + return err + } + defer func() { + if Err != nil { + if err := unmount(tmpDir, unix.MNT_DETACH); err != nil { + logrus.Warnf("tmpcopyup: %v", err) + } + } + }() + + return utils.WithProcfd(rootfs, m.Destination, func(procfd string) (Err error) { + // Copy the container data to the host tmpdir. We append "/" to force + // CopyDirectory to resolve the symlink rather than trying to copy the + // symlink itself. + if err := fileutils.CopyDirectory(procfd+"/", tmpDir); err != nil { + return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, procfd, tmpDir, err) + } + // Now move the mount into the container. 
+ if err := mount(tmpDir, m.Destination, procfd, "", unix.MS_MOVE, ""); err != nil { + return fmt.Errorf("tmpcopyup: failed to move mount: %w", err) + } + return nil + }) +} + +func mountToRootfs(m *configs.Mount, c *mountConfig) error { + rootfs := c.root + mountLabel := c.label + mountFd := c.fd + dest, err := securejoin.SecureJoin(rootfs, m.Destination) + if err != nil { + return err } switch m.Device { @@ -311,64 +417,37 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b } else if fi.Mode()&os.ModeDir == 0 { return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device) } - if err := os.MkdirAll(dest, 0755); err != nil { + if err := os.MkdirAll(dest, 0o755); err != nil { return err } // Selinux kernels do not support labeling of /proc or /sys - return mountPropagate(m, rootfs, "") + return mountPropagate(m, rootfs, "", nil) case "mqueue": - if err := os.MkdirAll(dest, 0755); err != nil { + if err := os.MkdirAll(dest, 0o755); err != nil { return err } - if err := mountPropagate(m, rootfs, mountLabel); err != nil { - // older kernels do not support labeling of /dev/mqueue - if err := mountPropagate(m, rootfs, ""); err != nil { - return err - } - return label.SetFileLabel(dest, mountLabel) + if err := mountPropagate(m, rootfs, "", nil); err != nil { + return err } - return nil + return label.SetFileLabel(dest, mountLabel) case "tmpfs": - copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP - tmpDir := "" stat, err := os.Stat(dest) if err != nil { - if err := os.MkdirAll(dest, 0755); err != nil { + if err := os.MkdirAll(dest, 0o755); err != nil { return err } } - if copyUp { - tmpdir, err := prepareTmp("/tmp") - if err != nil { - return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir") - } - defer cleanupTmp(tmpdir) - tmpDir, err = ioutil.TempDir(tmpdir, "runctmpdir") - if err != nil { - return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir") - } - defer 
os.RemoveAll(tmpDir) - m.Destination = tmpDir + + if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP { + err = doTmpfsCopyUp(m, rootfs, mountLabel) + } else { + err = mountPropagate(m, rootfs, mountLabel, nil) } - if err := mountPropagate(m, rootfs, mountLabel); err != nil { + + if err != nil { return err } - if copyUp { - if err := fileutils.CopyDirectory(dest, tmpDir); err != nil { - errMsg := fmt.Errorf("tmpcopyup: failed to copy %s to %s: %v", dest, tmpDir, err) - if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil { - return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg) - } - return errMsg - } - if err := unix.Mount(tmpDir, dest, "", unix.MS_MOVE, ""); err != nil { - errMsg := fmt.Errorf("tmpcopyup: failed to move mount %s to %s: %v", tmpDir, dest, err) - if err1 := unix.Unmount(tmpDir, unix.MNT_DETACH); err1 != nil { - return newSystemErrorWithCausef(err1, "tmpcopyup: %v: failed to unmount", errMsg) - } - return errMsg - } - } + if stat != nil { if err = os.Chmod(dest, stat.Mode()); err != nil { return err @@ -376,17 +455,17 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b } return nil case "bind": - if err := prepareBindMount(m, rootfs); err != nil { + if err := prepareBindMount(m, rootfs, mountFd); err != nil { return err } - if err := mountPropagate(m, rootfs, mountLabel); err != nil { + if err := mountPropagate(m, rootfs, mountLabel, mountFd); err != nil { return err } // bind mount won't change mount options, we need remount to make mount options effective. 
// first check that we have non-default options required before attempting a remount if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 { // only remount if unique mount options are set - if err := remount(m, rootfs); err != nil { + if err := remount(m, rootfs, mountFd); err != nil { return err } } @@ -402,45 +481,20 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b } case "cgroup": if cgroups.IsCgroup2UnifiedMode() { - if err := mountCgroupV2(m, rootfs, mountLabel, enableCgroupns); err != nil { - return err - } - } else { - - if err := mountCgroupV1(m, rootfs, mountLabel, enableCgroupns); err != nil { - return err - } - } - if m.Flags&unix.MS_RDONLY != 0 { - // remount cgroup root as readonly - mcgrouproot := &configs.Mount{ - Source: m.Destination, - Device: "bind", - Destination: m.Destination, - Flags: defaultMountFlags | unix.MS_RDONLY | unix.MS_BIND, - } - if err := remount(mcgrouproot, rootfs); err != nil { - return err - } + return mountCgroupV2(m, c) } + return mountCgroupV1(m, c) default: - // ensure that the destination of the mount is resolved of symlinks at mount time because - // any previous mounts can invalidate the next mount's destination. - // this can happen when a user specifies mounts within other mounts to cause breakouts or other - // evil stuff to try to escape the container's rootfs. - var err error - if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil { - return err - } if err := checkProcMount(rootfs, dest, m.Source); err != nil { return err } - // update the mount with the correct dest after symlinks are resolved. 
- m.Destination = dest - if err := os.MkdirAll(dest, 0755); err != nil { + if err := os.MkdirAll(dest, 0o755); err != nil { return err } - return mountPropagate(m, rootfs, mountLabel) + return mountPropagate(m, rootfs, mountLabel, mountFd) + } + if err := setRecAttr(m, rootfs); err != nil { + return err } return nil } @@ -485,28 +539,6 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { // if source is nil, don't stat the filesystem. This is used for restore of a checkpoint. func checkProcMount(rootfs, dest, source string) error { const procPath = "/proc" - // White list, it should be sub directories of invalid destinations - validDestinations := []string{ - // These entries can be bind mounted by files emulated by fuse, - // so commands like top, free displays stats in container. - "/proc/cpuinfo", - "/proc/diskstats", - "/proc/meminfo", - "/proc/stat", - "/proc/swaps", - "/proc/uptime", - "/proc/loadavg", - "/proc/net/dev", - } - for _, valid := range validDestinations { - path, err := filepath.Rel(filepath.Join(rootfs, valid), dest) - if err != nil { - return err - } - if path == "." { - return nil - } - } path, err := filepath.Rel(filepath.Join(rootfs, procPath), dest) if err != nil { return err @@ -532,19 +564,43 @@ func checkProcMount(rootfs, dest, source string) error { } return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest) } + + // Here dest is definitely under /proc. Do not allow those, + // except for a few specific entries emulated by lxcfs. + validProcMounts := []string{ + "/proc/cpuinfo", + "/proc/diskstats", + "/proc/meminfo", + "/proc/stat", + "/proc/swaps", + "/proc/uptime", + "/proc/loadavg", + "/proc/slabinfo", + "/proc/net/dev", + } + for _, valid := range validProcMounts { + path, err := filepath.Rel(filepath.Join(rootfs, valid), dest) + if err != nil { + return err + } + if path == "." 
{ + return nil + } + } + return fmt.Errorf("%q cannot be mounted because it is inside /proc", dest) } func isProc(path string) (bool, error) { var s unix.Statfs_t if err := unix.Statfs(path, &s); err != nil { - return false, err + return false, &os.PathError{Op: "statfs", Path: path, Err: err} } return s.Type == unix.PROC_SUPER_MAGIC, nil } func setupDevSymlinks(rootfs string) error { - var links = [][2]string{ + links := [][2]string{ {"/proc/self/fd", "/dev/fd"}, {"/proc/self/fd/0", "/dev/stdin"}, {"/proc/self/fd/1", "/dev/stdout"}, @@ -561,7 +617,7 @@ func setupDevSymlinks(rootfs string) error { dst = filepath.Join(rootfs, link[1]) ) if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) { - return fmt.Errorf("symlink %s %s %s", src, dst, err) + return err } } return nil @@ -575,20 +631,24 @@ func reOpenDevNull() error { var stat, devNullStat unix.Stat_t file, err := os.OpenFile("/dev/null", os.O_RDWR, 0) if err != nil { - return fmt.Errorf("Failed to open /dev/null - %s", err) - } - defer file.Close() - if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil { return err } + defer file.Close() //nolint: errcheck + if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil { + return &os.PathError{Op: "fstat", Path: file.Name(), Err: err} + } for fd := 0; fd < 3; fd++ { if err := unix.Fstat(fd, &stat); err != nil { - return err + return &os.PathError{Op: "fstat", Path: "fd " + strconv.Itoa(fd), Err: err} } if stat.Rdev == devNullStat.Rdev { // Close and re-open the fd. if err := unix.Dup3(int(file.Fd()), fd, 0); err != nil { - return err + return &os.PathError{ + Op: "dup3", + Path: "fd " + strconv.Itoa(int(file.Fd())), + Err: err, + } } } } @@ -597,9 +657,15 @@ func reOpenDevNull() error { // Create the device nodes in the container. 
func createDevices(config *configs.Config) error { - useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER) - oldMask := unix.Umask(0000) + useBindMount := userns.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER) + oldMask := unix.Umask(0o000) for _, node := range config.Devices { + + // The /dev/ptmx device is setup by setupPtmx() + if utils.CleanPath(node.Path) == "/dev/ptmx" { + continue + } + // containers running in a user namespace are not allowed to mknod // devices so we can just bind mount it from the host. if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil { @@ -611,96 +677,88 @@ func createDevices(config *configs.Config) error { return nil } -func bindMountDeviceNode(dest string, node *configs.Device) error { +func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error { f, err := os.Create(dest) if err != nil && !os.IsExist(err) { return err } if f != nil { - f.Close() + _ = f.Close() } - return unix.Mount(node.Path, dest, "bind", unix.MS_BIND, "") + return utils.WithProcfd(rootfs, dest, func(procfd string) error { + return mount(node.Path, dest, procfd, "bind", unix.MS_BIND, "") + }) } // Creates the device node in the rootfs of the container. -func createDeviceNode(rootfs string, node *configs.Device, bind bool) error { - dest := filepath.Join(rootfs, node.Path) - if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil { +func createDeviceNode(rootfs string, node *devices.Device, bind bool) error { + if node.Path == "" { + // The node only exists for cgroup reasons, ignore it here. 
+ return nil + } + dest, err := securejoin.SecureJoin(rootfs, node.Path) + if err != nil { + return err + } + if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil { return err } - if bind { - return bindMountDeviceNode(dest, node) + return bindMountDeviceNode(rootfs, dest, node) } if err := mknodDevice(dest, node); err != nil { - if os.IsExist(err) { + if errors.Is(err, os.ErrExist) { return nil - } else if os.IsPermission(err) { - return bindMountDeviceNode(dest, node) + } else if errors.Is(err, os.ErrPermission) { + return bindMountDeviceNode(rootfs, dest, node) } return err } return nil } -func mknodDevice(dest string, node *configs.Device) error { +func mknodDevice(dest string, node *devices.Device) error { fileMode := node.FileMode switch node.Type { - case 'c', 'u': - fileMode |= unix.S_IFCHR - case 'b': + case devices.BlockDevice: fileMode |= unix.S_IFBLK - case 'p': + case devices.CharDevice: + fileMode |= unix.S_IFCHR + case devices.FifoDevice: fileMode |= unix.S_IFIFO default: return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path) } - if err := unix.Mknod(dest, uint32(fileMode), node.Mkdev()); err != nil { + dev, err := node.Mkdev() + if err != nil { return err } - return unix.Chown(dest, int(node.Uid), int(node.Gid)) -} - -func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info { - for _, m := range mountinfo { - if m.Mountpoint == dir { - return m - } + if err := unix.Mknod(dest, uint32(fileMode), int(dev)); err != nil { + return &os.PathError{Op: "mknod", Path: dest, Err: err} } - return nil + return os.Chown(dest, int(node.Uid), int(node.Gid)) } // Get the parent mount point of directory passed in as argument. Also return // optional fields. 
func getParentMount(rootfs string) (string, string, error) { - var path string - - mountinfos, err := mount.GetMounts() + mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(rootfs)) if err != nil { return "", "", err } - - mountinfo := getMountInfo(mountinfos, rootfs) - if mountinfo != nil { - return rootfs, mountinfo.Optional, nil + if len(mi) < 1 { + return "", "", fmt.Errorf("could not find parent mount of %s", rootfs) } - path = rootfs - for { - path = filepath.Dir(path) - - mountinfo = getMountInfo(mountinfos, path) - if mountinfo != nil { - return path, mountinfo.Optional, nil - } - - if path == "/" { - break + // find the longest mount point + var idx, maxlen int + for i := range mi { + if len(mi[i].Mountpoint) > maxlen { + maxlen = len(mi[i].Mountpoint) + idx = i } } - - // If we are here, we did not find parent mount. Something is wrong. - return "", "", fmt.Errorf("Could not find parent mount of %s", rootfs) + return mi[idx].Mountpoint, mi[idx].Optional, nil } // Make parent mount private if it was shared @@ -725,7 +783,7 @@ func rootfsParentMountPrivate(rootfs string) error { // shared. Secondly when we bind mount rootfs it will propagate to // parent namespace and we don't want that to happen. 
if sharedMount { - return unix.Mount("", parentMount, "", unix.MS_PRIVATE, "") + return mount("", parentMount, "", "", unix.MS_PRIVATE, "") } return nil @@ -736,7 +794,7 @@ func prepareRoot(config *configs.Config) error { if config.RootPropagation != 0 { flag = config.RootPropagation } - if err := unix.Mount("", "/", "", uintptr(flag), ""); err != nil { + if err := mount("", "/", "", "", uintptr(flag), ""); err != nil { return err } @@ -747,11 +805,22 @@ func prepareRoot(config *configs.Config) error { return err } - return unix.Mount(config.Rootfs, config.Rootfs, "bind", unix.MS_BIND|unix.MS_REC, "") + return mount(config.Rootfs, config.Rootfs, "", "bind", unix.MS_BIND|unix.MS_REC, "") } func setReadonly() error { - return unix.Mount("/", "/", "bind", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "") + flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY) + + err := mount("", "/", "", "", flags, "") + if err == nil { + return nil + } + var s unix.Statfs_t + if err := unix.Statfs("/", &s); err != nil { + return &os.PathError{Op: "statfs", Path: "/", Err: err} + } + flags |= uintptr(s.Flags) + return mount("", "/", "", "", flags, "") } func setupPtmx(config *configs.Config) error { @@ -760,7 +829,7 @@ func setupPtmx(config *configs.Config) error { return err } if err := os.Symlink("pts/ptmx", ptmx); err != nil { - return fmt.Errorf("symlink dev ptmx %s", err) + return err } return nil } @@ -776,23 +845,23 @@ func pivotRoot(rootfs string) error { oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0) if err != nil { - return err + return &os.PathError{Op: "open", Path: "/", Err: err} } - defer unix.Close(oldroot) + defer unix.Close(oldroot) //nolint: errcheck newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0) if err != nil { - return err + return &os.PathError{Op: "open", Path: rootfs, Err: err} } - defer unix.Close(newroot) + defer unix.Close(newroot) //nolint: errcheck // Change to the new root so that the 
pivot_root actually acts on it. if err := unix.Fchdir(newroot); err != nil { - return err + return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err} } if err := unix.PivotRoot(".", "."); err != nil { - return fmt.Errorf("pivot_root %s", err) + return &os.PathError{Op: "pivot_root", Path: ".", Err: err} } // Currently our "." is oldroot (according to the current kernel code). @@ -801,7 +870,7 @@ func pivotRoot(rootfs string) error { // pivot_root(2). if err := unix.Fchdir(oldroot); err != nil { - return err + return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(oldroot), Err: err} } // Make oldroot rslave to make sure our unmounts don't propagate to the @@ -809,68 +878,92 @@ func pivotRoot(rootfs string) error { // known to cause issues due to races where we still have a reference to a // mount while a process in the host namespace are trying to operate on // something they think has no mounts (devicemapper in particular). - if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { + if err := mount("", ".", "", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { return err } - // Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd. - if err := unix.Unmount(".", unix.MNT_DETACH); err != nil { + // Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd. + if err := unmount(".", unix.MNT_DETACH); err != nil { return err } // Switch back to our shiny new root. if err := unix.Chdir("/"); err != nil { - return fmt.Errorf("chdir / %s", err) + return &os.PathError{Op: "chdir", Path: "/", Err: err} } return nil } func msMoveRoot(rootfs string) error { - mountinfos, err := mount.GetMounts() + // Before we move the root and chroot we have to mask all "full" sysfs and + // procfs mounts which exist on the host. 
This is because while the kernel + // has protections against mounting procfs if it has masks, when using + // chroot(2) the *host* procfs mount is still reachable in the mount + // namespace and the kernel permits procfs mounts inside --no-pivot + // containers. + // + // Users shouldn't be using --no-pivot except in exceptional circumstances, + // but to avoid such a trivial security flaw we apply a best-effort + // protection here. The kernel only allows a mount of a pseudo-filesystem + // like procfs or sysfs if there is a *full* mount (the root of the + // filesystem is mounted) without any other locked mount points covering a + // subtree of the mount. + // + // So we try to unmount (or mount tmpfs on top of) any mountpoint which is + // a full mount of either sysfs or procfs (since those are the most + // concerning filesystems to us). + mountinfos, err := mountinfo.GetMounts(func(info *mountinfo.Info) (skip, stop bool) { + // Collect every sysfs and procfs filesystem, except for those which + // are non-full mounts or are inside the rootfs of the container. + if info.Root != "/" || + (info.FSType != "proc" && info.FSType != "sysfs") || + strings.HasPrefix(info.Mountpoint, rootfs) { + skip = true + } + return + }) if err != nil { return err } - - absRootfs, err := filepath.Abs(rootfs) - if err != nil { - return err - } - for _, info := range mountinfos { - p, err := filepath.Abs(info.Mountpoint) - if err != nil { - return err - } - // Umount every syfs and proc file systems, except those under the container rootfs - if (info.Fstype != "proc" && info.Fstype != "sysfs") || filepath.HasPrefix(p, absRootfs) { - continue - } + p := info.Mountpoint // Be sure umount events are not propagated to the host. 
- if err := unix.Mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { + if err := mount("", p, "", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { + if errors.Is(err, unix.ENOENT) { + // If the mountpoint doesn't exist that means that we've + // already blasted away some parent directory of the mountpoint + // and so we don't care about this error. + continue + } return err } - if err := unix.Unmount(p, unix.MNT_DETACH); err != nil { - if err != unix.EINVAL && err != unix.EPERM { + if err := unmount(p, unix.MNT_DETACH); err != nil { + if !errors.Is(err, unix.EINVAL) && !errors.Is(err, unix.EPERM) { return err } else { // If we have not privileges for umounting (e.g. rootless), then // cover the path. - if err := unix.Mount("tmpfs", p, "tmpfs", 0, ""); err != nil { + if err := mount("tmpfs", p, "", "tmpfs", 0, ""); err != nil { return err } } } } - if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil { + + // Move the rootfs on top of "/" in our mount namespace. + if err := mount(rootfs, "/", "", "", unix.MS_MOVE, ""); err != nil { return err } - return chroot(rootfs) + return chroot() } -func chroot(rootfs string) error { +func chroot() error { if err := unix.Chroot("."); err != nil { - return err + return &os.PathError{Op: "chroot", Path: ".", Err: err} } - return unix.Chdir("/") + if err := unix.Chdir("/"); err != nil { + return &os.PathError{Op: "chdir", Path: "/", Err: err} + } + return nil } // createIfNotExists creates a file or a directory only if it does not already exist. 
@@ -878,16 +971,16 @@ func createIfNotExists(path string, isDir bool) error { if _, err := os.Stat(path); err != nil { if os.IsNotExist(err) { if isDir { - return os.MkdirAll(path, 0755) + return os.MkdirAll(path, 0o755) } - if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil { + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { return err } - f, err := os.OpenFile(path, os.O_CREATE, 0755) + f, err := os.OpenFile(path, os.O_CREATE, 0o755) if err != nil { return err } - f.Close() + _ = f.Close() } } return nil @@ -895,13 +988,24 @@ func createIfNotExists(path string, isDir bool) error { // readonlyPath will make a path read only. func readonlyPath(path string) error { - if err := unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REC, ""); err != nil { - if os.IsNotExist(err) { + if err := mount(path, path, "", "", unix.MS_BIND|unix.MS_REC, ""); err != nil { + if errors.Is(err, os.ErrNotExist) { return nil } return err } - return unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "") + + var s unix.Statfs_t + if err := unix.Statfs(path, &s); err != nil { + return &os.PathError{Op: "statfs", Path: path, Err: err} + } + flags := uintptr(s.Flags) & (unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC) + + if err := mount(path, path, "", "", flags|unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil { + return err + } + + return nil } // remountReadonly will remount an existing mount point and ensure that it is read-only. @@ -918,14 +1022,12 @@ func remountReadonly(m *configs.Mount) error { // nosuid, etc.). So, let's use that case so that we can do // this re-mount without failing in a userns. 
flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY - if err := unix.Mount("", dest, "", uintptr(flags), ""); err != nil { - switch err { - case unix.EBUSY: + if err := mount("", dest, "", "", uintptr(flags), ""); err != nil { + if errors.Is(err, unix.EBUSY) { time.Sleep(100 * time.Millisecond) continue - default: - return err } + return err } return nil } @@ -938,9 +1040,9 @@ func remountReadonly(m *configs.Mount) error { // For files, maskPath bind mounts /dev/null over the top of the specified path. // For directories, maskPath mounts read-only tmpfs over the top of the specified path. func maskPath(path string, mountLabel string) error { - if err := unix.Mount("/dev/null", path, "", unix.MS_BIND, ""); err != nil && !os.IsNotExist(err) { - if err == unix.ENOTDIR { - return unix.Mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel)) + if err := mount("/dev/null", path, "", "", unix.MS_BIND, ""); err != nil && !errors.Is(err, os.ErrNotExist) { + if errors.Is(err, unix.ENOTDIR) { + return mount("tmpfs", path, "", "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel)) } return err } @@ -951,59 +1053,85 @@ func maskPath(path string, mountLabel string) error { // For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward. 
func writeSystemProperty(key, value string) error { keyPath := strings.Replace(key, ".", "/", -1) - return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0644) + return os.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0o644) } -func remount(m *configs.Mount, rootfs string) error { - var ( - dest = m.Destination - ) - if !strings.HasPrefix(dest, rootfs) { - dest = filepath.Join(rootfs, dest) +func remount(m *configs.Mount, rootfs string, mountFd *int) error { + source := m.Source + if mountFd != nil { + source = "/proc/self/fd/" + strconv.Itoa(*mountFd) } - return unix.Mount(m.Source, dest, m.Device, uintptr(m.Flags|unix.MS_REMOUNT), "") + + return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error { + flags := uintptr(m.Flags | unix.MS_REMOUNT) + err := mount(source, m.Destination, procfd, m.Device, flags, "") + if err == nil { + return nil + } + // Check if the source has ro flag... + var s unix.Statfs_t + if err := unix.Statfs(source, &s); err != nil { + return &os.PathError{Op: "statfs", Path: source, Err: err} + } + if s.Flags&unix.MS_RDONLY != unix.MS_RDONLY { + return err + } + // ... and retry the mount with ro flag set. + flags |= unix.MS_RDONLY + return mount(source, m.Destination, procfd, m.Device, flags, "") + }) } // Do the mount operation followed by additional mounts required to take care -// of propagation flags. -func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error { +// of propagation flags. This will always be scoped inside the container rootfs. +func mountPropagate(m *configs.Mount, rootfs string, mountLabel string, mountFd *int) error { var ( - dest = m.Destination data = label.FormatMountLabel(m.Data, mountLabel) flags = m.Flags ) - if libcontainerUtils.CleanPath(dest) == "/dev" { + // Delay mounting the filesystem read-only if we need to do further + // operations on it. We need to set up files in "/dev", and other tmpfs + // mounts may need to be chmod-ed after mounting. 
These mounts will be + // remounted ro later in finalizeRootfs(), if necessary. + if m.Device == "tmpfs" || utils.CleanPath(m.Destination) == "/dev" { flags &= ^unix.MS_RDONLY } - copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP - if !(copyUp || strings.HasPrefix(dest, rootfs)) { - dest = filepath.Join(rootfs, dest) + // Because the destination is inside a container path which might be + // mutating underneath us, we verify that we are actually going to mount + // inside the container with WithProcfd() -- mounting through a procfd + // mounts on the target. + source := m.Source + if mountFd != nil { + source = "/proc/self/fd/" + strconv.Itoa(*mountFd) } - if err := unix.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil { + if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error { + return mount(source, m.Destination, procfd, m.Device, uintptr(flags), data) + }); err != nil { return err } - - for _, pflag := range m.PropagationFlags { - if err := unix.Mount("", dest, "", uintptr(pflag), ""); err != nil { - return err + // We have to apply mount propagation flags in a separate WithProcfd() call + // because the previous call invalidates the passed procfd -- the mount + // target needs to be re-opened. 
+ if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error { + for _, pflag := range m.PropagationFlags { + if err := mount("", m.Destination, procfd, "", uintptr(pflag), ""); err != nil { + return err + } } + return nil + }); err != nil { + return fmt.Errorf("change mount propagation through procfd: %w", err) } return nil } -func mountNewCgroup(m *configs.Mount) error { - var ( - data = m.Data - source = m.Source - ) - if data == "systemd" { - data = cgroups.CgroupNamePrefix + data - source = "systemd" +func setRecAttr(m *configs.Mount, rootfs string) error { + if m.RecAttr == nil { + return nil } - if err := unix.Mount(source, m.Destination, m.Device, uintptr(m.Flags), data); err != nil { - return err - } - return nil + return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error { + return unix.MountSetattr(-1, procfd, unix.AT_RECURSIVE, m.RecAttr) + }) } diff --git a/libcontainer/rootfs_linux_test.go b/libcontainer/rootfs_linux_test.go index 1bfe7c6..e3bfdc5 100644 --- a/libcontainer/rootfs_linux_test.go +++ b/libcontainer/rootfs_linux_test.go @@ -1,5 +1,3 @@ -// +build linux - package libcontainer import ( diff --git a/libcontainer/seccomp/config.go b/libcontainer/seccomp/config.go index c321227..d0c9bb7 100644 --- a/libcontainer/seccomp/config.go +++ b/libcontainer/seccomp/config.go @@ -2,6 +2,7 @@ package seccomp import ( "fmt" + "sort" "github.com/opencontainers/runc/libcontainer/configs" ) @@ -16,13 +17,36 @@ var operators = map[string]configs.Operator{ "SCMP_CMP_MASKED_EQ": configs.MaskEqualTo, } +// KnownOperators returns the list of the known operations. +// Used by `runc features`. 
+func KnownOperators() []string { + var res []string + for k := range operators { + res = append(res, k) + } + sort.Strings(res) + return res +} + var actions = map[string]configs.Action{ - "SCMP_ACT_KILL": configs.Kill, - "SCMP_ACT_ERRNO": configs.Errno, - "SCMP_ACT_TRAP": configs.Trap, - "SCMP_ACT_ALLOW": configs.Allow, - "SCMP_ACT_TRACE": configs.Trace, - "SCMP_ACT_LOG": configs.Log, + "SCMP_ACT_KILL": configs.Kill, + "SCMP_ACT_ERRNO": configs.Errno, + "SCMP_ACT_TRAP": configs.Trap, + "SCMP_ACT_ALLOW": configs.Allow, + "SCMP_ACT_TRACE": configs.Trace, + "SCMP_ACT_LOG": configs.Log, + "SCMP_ACT_NOTIFY": configs.Notify, +} + +// KnownActions returns the list of the known actions. +// Used by `runc features`. +func KnownActions() []string { + var res []string + for k := range actions { + res = append(res, k) + } + sort.Strings(res) + return res } var archs = map[string]string{ @@ -44,25 +68,34 @@ var archs = map[string]string{ "SCMP_ARCH_S390X": "s390x", } +// KnownArchs returns the list of the known archs. +// Used by `runc features`. +func KnownArchs() []string { + var res []string + for k := range archs { + res = append(res, k) + } + sort.Strings(res) + return res +} + // ConvertStringToOperator converts a string into a Seccomp comparison operator. // Comparison operators use the names they are assigned by Libseccomp's header. // Attempting to convert a string that is not a valid operator results in an // error. func ConvertStringToOperator(in string) (configs.Operator, error) { - if op, ok := operators[in]; ok == true { + if op, ok := operators[in]; ok { return op, nil } return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in) } // ConvertStringToAction converts a string into a Seccomp rule match action. -// Actions use the names they are assigned in Libseccomp's header, though some -// (notable, SCMP_ACT_TRACE) are not available in this implementation and will -// return errors. 
+// Actions use the names they are assigned in Libseccomp's header. // Attempting to convert a string that is not a valid action results in an // error. func ConvertStringToAction(in string) (configs.Action, error) { - if act, ok := actions[in]; ok == true { + if act, ok := actions[in]; ok { return act, nil } return 0, fmt.Errorf("string %s is not a valid action for seccomp", in) @@ -70,7 +103,7 @@ func ConvertStringToAction(in string) (configs.Action, error) { // ConvertStringToArch converts a string into a Seccomp comparison arch. func ConvertStringToArch(in string) (string, error) { - if arch, ok := archs[in]; ok == true { + if arch, ok := archs[in]; ok { return arch, nil } return "", fmt.Errorf("string %s is not a valid arch for seccomp", in) diff --git a/libcontainer/seccomp/fixtures/proc_self_status b/libcontainer/seccomp/fixtures/proc_self_status deleted file mode 100644 index 0e0084f..0000000 --- a/libcontainer/seccomp/fixtures/proc_self_status +++ /dev/null @@ -1,47 +0,0 @@ -Name: cat -State: R (running) -Tgid: 19383 -Ngid: 0 -Pid: 19383 -PPid: 19275 -TracerPid: 0 -Uid: 1000 1000 1000 1000 -Gid: 1000 1000 1000 1000 -FDSize: 256 -Groups: 24 25 27 29 30 44 46 102 104 108 111 1000 1001 -NStgid: 19383 -NSpid: 19383 -NSpgid: 19383 -NSsid: 19275 -VmPeak: 5944 kB -VmSize: 5944 kB -VmLck: 0 kB -VmPin: 0 kB -VmHWM: 744 kB -VmRSS: 744 kB -VmData: 324 kB -VmStk: 136 kB -VmExe: 48 kB -VmLib: 1776 kB -VmPTE: 32 kB -VmPMD: 12 kB -VmSwap: 0 kB -Threads: 1 -SigQ: 0/30067 -SigPnd: 0000000000000000 -ShdPnd: 0000000000000000 -SigBlk: 0000000000000000 -SigIgn: 0000000000000080 -SigCgt: 0000000000000000 -CapInh: 0000000000000000 -CapPrm: 0000000000000000 -CapEff: 0000000000000000 -CapBnd: 0000003fffffffff -CapAmb: 0000000000000000 -Seccomp: 0 -Cpus_allowed: f -Cpus_allowed_list: 0-3 -Mems_allowed: 00000000,00000001 -Mems_allowed_list: 0 -voluntary_ctxt_switches: 0 -nonvoluntary_ctxt_switches: 1 diff --git a/libcontainer/seccomp/patchbpf/enosys_linux.go 
b/libcontainer/seccomp/patchbpf/enosys_linux.go new file mode 100644 index 0000000..dfb8a0a --- /dev/null +++ b/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -0,0 +1,680 @@ +//go:build cgo && seccomp +// +build cgo,seccomp + +package patchbpf + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + "os" + "runtime" + "unsafe" + + libseccomp "github.com/seccomp/libseccomp-golang" + "github.com/sirupsen/logrus" + "golang.org/x/net/bpf" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" +) + +// #cgo pkg-config: libseccomp +/* +#include +#include +#include +#include + +const uint32_t C_ACT_ERRNO_ENOSYS = SCMP_ACT_ERRNO(ENOSYS); + +// Copied from . + +#ifndef SECCOMP_SET_MODE_FILTER +# define SECCOMP_SET_MODE_FILTER 1 +#endif +const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER; + +#ifndef SECCOMP_FILTER_FLAG_LOG +# define SECCOMP_FILTER_FLAG_LOG (1UL << 1) +#endif +const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG; + +#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER +# define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3) +#endif +const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER; + +// We use the AUDIT_ARCH_* values because those are the ones used by the kernel +// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we +// use so we get libseccomp's fallback definitions of AUDIT_ARCH_*. 
+ +const uint32_t C_AUDIT_ARCH_I386 = AUDIT_ARCH_I386; +const uint32_t C_AUDIT_ARCH_X86_64 = AUDIT_ARCH_X86_64; +const uint32_t C_AUDIT_ARCH_ARM = AUDIT_ARCH_ARM; +const uint32_t C_AUDIT_ARCH_AARCH64 = AUDIT_ARCH_AARCH64; +const uint32_t C_AUDIT_ARCH_MIPS = AUDIT_ARCH_MIPS; +const uint32_t C_AUDIT_ARCH_MIPS64 = AUDIT_ARCH_MIPS64; +const uint32_t C_AUDIT_ARCH_MIPS64N32 = AUDIT_ARCH_MIPS64N32; +const uint32_t C_AUDIT_ARCH_MIPSEL = AUDIT_ARCH_MIPSEL; +const uint32_t C_AUDIT_ARCH_MIPSEL64 = AUDIT_ARCH_MIPSEL64; +const uint32_t C_AUDIT_ARCH_MIPSEL64N32 = AUDIT_ARCH_MIPSEL64N32; +const uint32_t C_AUDIT_ARCH_PPC = AUDIT_ARCH_PPC; +const uint32_t C_AUDIT_ARCH_PPC64 = AUDIT_ARCH_PPC64; +const uint32_t C_AUDIT_ARCH_PPC64LE = AUDIT_ARCH_PPC64LE; +const uint32_t C_AUDIT_ARCH_S390 = AUDIT_ARCH_S390; +const uint32_t C_AUDIT_ARCH_S390X = AUDIT_ARCH_S390X; +*/ +import "C" + +var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS) + +func isAllowAction(action configs.Action) bool { + switch action { + // Trace is considered an "allow" action because a good tracer should + // support future syscalls (by handling -ENOSYS on its own), and giving + // -ENOSYS will be disruptive for emulation. + case configs.Allow, configs.Log, configs.Trace: + return true + default: + return false + } +} + +func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) { + var program []bpf.RawInstruction +loop: + for { + // Read the next instruction. We have to use NativeEndian because + // seccomp_export_bpf outputs the program in *host* endian-ness. + var insn unix.SockFilter + if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil { + if errors.Is(err, io.EOF) { + // Parsing complete. + break loop + } + if errors.Is(err, io.ErrUnexpectedEOF) { + // Parsing stopped mid-instruction. + return nil, fmt.Errorf("program parsing halted mid-instruction: %w", err) + } + // All other errors. 
+ return nil, fmt.Errorf("error parsing instructions: %w", err) + } + program = append(program, bpf.RawInstruction{ + Op: insn.Code, + Jt: insn.Jt, + Jf: insn.Jf, + K: insn.K, + }) + } + return program, nil +} + +func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) { + rdr, wtr, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("error creating scratch pipe: %w", err) + } + defer wtr.Close() + defer rdr.Close() + + readerBuffer := new(bytes.Buffer) + errChan := make(chan error, 1) + go func() { + _, err := io.Copy(readerBuffer, rdr) + errChan <- err + close(errChan) + }() + + if err := filter.ExportBPF(wtr); err != nil { + return nil, fmt.Errorf("error exporting BPF: %w", err) + } + // Close so that the reader actually gets EOF. + _ = wtr.Close() + + if copyErr := <-errChan; copyErr != nil { + return nil, fmt.Errorf("error reading from ExportBPF pipe: %w", copyErr) + } + + // Parse the instructions. + rawProgram, err := parseProgram(readerBuffer) + if err != nil { + return nil, fmt.Errorf("parsing generated BPF filter: %w", err) + } + program, ok := bpf.Disassemble(rawProgram) + if !ok { + return nil, errors.New("could not disassemble entire BPF filter") + } + return program, nil +} + +type nativeArch uint32 + +const invalidArch nativeArch = 0 + +func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) { + switch arch { + case libseccomp.ArchNative: + // Convert to actual native architecture. + arch, err := libseccomp.GetNativeArch() + if err != nil { + return invalidArch, fmt.Errorf("unable to get native arch: %w", err) + } + return archToNative(arch) + case libseccomp.ArchX86: + return nativeArch(C.C_AUDIT_ARCH_I386), nil + case libseccomp.ArchAMD64, libseccomp.ArchX32: + // NOTE: x32 is treated like x86_64 except all x32 syscalls have the + // 30th bit of the syscall number set to indicate that it's not a + // normal x86_64 syscall. 
+ return nativeArch(C.C_AUDIT_ARCH_X86_64), nil + case libseccomp.ArchARM: + return nativeArch(C.C_AUDIT_ARCH_ARM), nil + case libseccomp.ArchARM64: + return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil + case libseccomp.ArchMIPS: + return nativeArch(C.C_AUDIT_ARCH_MIPS), nil + case libseccomp.ArchMIPS64: + return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil + case libseccomp.ArchMIPS64N32: + return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil + case libseccomp.ArchMIPSEL: + return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil + case libseccomp.ArchMIPSEL64: + return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil + case libseccomp.ArchMIPSEL64N32: + return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil + case libseccomp.ArchPPC: + return nativeArch(C.C_AUDIT_ARCH_PPC), nil + case libseccomp.ArchPPC64: + return nativeArch(C.C_AUDIT_ARCH_PPC64), nil + case libseccomp.ArchPPC64LE: + return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil + case libseccomp.ArchS390: + return nativeArch(C.C_AUDIT_ARCH_S390), nil + case libseccomp.ArchS390X: + return nativeArch(C.C_AUDIT_ARCH_S390X), nil + default: + return invalidArch, fmt.Errorf("unknown architecture: %v", arch) + } +} + +type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall + +// Figure out largest syscall number referenced in the filter for each +// architecture. We will be generating code based on the native architecture +// representation, but SCMP_ARCH_X32 means we have to track cases where the +// same architecture has different largest syscalls based on the mode. +func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { + lastSyscalls := make(lastSyscallMap) + // Only loop over architectures which are present in the filter. Any other + // architectures will get the libseccomp bad architecture action anyway. 
+ for _, ociArch := range config.Architectures { + arch, err := libseccomp.GetArchFromString(ociArch) + if err != nil { + return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err) + } + + // Map native architecture to a real architecture value to avoid + // doubling-up the lastSyscall mapping. + if arch == libseccomp.ArchNative { + nativeArch, err := libseccomp.GetNativeArch() + if err != nil { + return nil, fmt.Errorf("unable to get native architecture: %w", err) + } + arch = nativeArch + } + + // Figure out native architecture representation of the architecture. + nativeArch, err := archToNative(arch) + if err != nil { + return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err) + } + + if _, ok := lastSyscalls[nativeArch]; !ok { + lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{} + } + if _, ok := lastSyscalls[nativeArch][arch]; ok { + // Because of ArchNative we may hit the same entry multiple times. + // Just skip it if we've seen this (nativeArch, ScmpArch) + // combination before. + continue + } + + // Find the largest syscall in the filter for this architecture. + var largestSyscall libseccomp.ScmpSyscall + for _, rule := range config.Syscalls { + sysno, err := libseccomp.GetSyscallFromNameByArch(rule.Name, arch) + if err != nil { + // Ignore unknown syscalls. + continue + } + if sysno > largestSyscall { + largestSyscall = sysno + } + } + if largestSyscall != 0 { + lastSyscalls[nativeArch][arch] = largestSyscall + } else { + logrus.Warnf("could not find any syscalls for arch %s", ociArch) + delete(lastSyscalls[nativeArch], arch) + } + } + return lastSyscalls, nil +} + +// FIXME FIXME FIXME +// +// This solution is less than ideal. 
In the future it would be great to have +// per-arch information about which syscalls were added in which kernel +// versions so we can create far more accurate filter rules (handling holes in +// the syscall table and determining -ENOSYS requirements based on kernel +// minimum version alone. +// +// This implementation can in principle cause issues with syscalls like +// close_range(2) which were added out-of-order in the syscall table between +// kernel releases. +func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) { + // A jump-table for each nativeArch used to generate the initial + // conditional jumps -- measured from the *END* of the program so they + // remain valid after prepending to the tail. + archJumpTable := map[nativeArch]uint32{} + + // Generate our own -ENOSYS rules for each architecture. They have to be + // generated in reverse (prepended to the tail of the program) because the + // JumpIf jumps need to be computed from the end of the program. + programTail := []bpf.Instruction{ + // Fall-through rules jump into the filter. + bpf.Jump{Skip: 1}, + // Rules which jump to here get -ENOSYS. + bpf.RetConstant{Val: retErrnoEnosys}, + } + + // Generate the syscall -ENOSYS rules. + for nativeArch, maxSyscalls := range lastSyscalls { + // The number of instructions from the tail of this section which need + // to be jumped in order to reach the -ENOSYS return. If the section + // does not jump, it will fall through to the actual filter. + baseJumpEnosys := uint32(len(programTail) - 1) + baseJumpFilter := baseJumpEnosys + 1 + + // Add the load instruction for the syscall number -- we jump here + // directly from the arch code so we need to do it here. Sadly we can't + // share this code between architecture branches. + section := []bpf.Instruction{ + // load [0] + bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4. 
+ } + + switch len(maxSyscalls) { + case 0: + // No syscalls found for this arch -- skip it and move on. + continue + case 1: + // Get the only syscall in the map. + var sysno libseccomp.ScmpSyscall + for _, no := range maxSyscalls { + sysno = no + } + + // The simplest case just boils down to a single jgt instruction, + // with special handling if baseJumpEnosys is larger than 255 (and + // thus a long jump is required). + var sectionTail []bpf.Instruction + if baseJumpEnosys+1 <= 255 { + sectionTail = []bpf.Instruction{ + // jgt [syscall],[baseJumpEnosys+1] + bpf.JumpIf{ + Cond: bpf.JumpGreaterThan, + Val: uint32(sysno), + SkipTrue: uint8(baseJumpEnosys + 1), + }, + // ja [baseJumpFilter] + bpf.Jump{Skip: baseJumpFilter}, + } + } else { + sectionTail = []bpf.Instruction{ + // jle [syscall],1 + bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1}, + // ja [baseJumpEnosys+1] + bpf.Jump{Skip: baseJumpEnosys + 1}, + // ja [baseJumpFilter] + bpf.Jump{Skip: baseJumpFilter}, + } + } + + // If we're on x86 we need to add a check for x32 and if we're in + // the wrong mode we jump over the section. + if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) { + // Grab the only architecture in the map. + var scmpArch libseccomp.ScmpArch + for arch := range maxSyscalls { + scmpArch = arch + } + + // Generate a prefix to check the mode. + switch scmpArch { + case libseccomp.ArchAMD64: + sectionTail = append([]bpf.Instruction{ + // jset (1<<30),[len(tail)-1] + bpf.JumpIf{ + Cond: bpf.JumpBitsSet, + Val: 1 << 30, + SkipTrue: uint8(len(sectionTail) - 1), + }, + }, sectionTail...) + case libseccomp.ArchX32: + sectionTail = append([]bpf.Instruction{ + // jset (1<<30),0,[len(tail)-1] + bpf.JumpIf{ + Cond: bpf.JumpBitsNotSet, + Val: 1 << 30, + SkipTrue: uint8(len(sectionTail) - 1), + }, + }, sectionTail...) + default: + return nil, fmt.Errorf("unknown amd64 native architecture %#x", scmpArch) + } + } + + section = append(section, sectionTail...) 
+ case 2: + // x32 and x86_64 are a unique case, we can't handle any others. + if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) { + return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch) + } + + x32sysno, ok := maxSyscalls[libseccomp.ArchX32] + if !ok { + return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls) + } + x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64] + if !ok { + return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls) + } + + // The x32 ABI indicates that a syscall is being made by an x32 + // process by setting the 30th bit of the syscall number, but we + // need to do some special-casing depending on whether we need to + // do long jumps. + if baseJumpEnosys+2 <= 255 { + // For the simple case we want to have something like: + // jset (1<<30),1 + // jgt [x86 syscall],[baseJumpEnosys+2],1 + // jgt [x32 syscall],[baseJumpEnosys+1] + // ja [baseJumpFilter] + section = append(section, []bpf.Instruction{ + // jset (1<<30),1 + bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1}, + // jgt [x86 syscall],[baseJumpEnosys+1],1 + bpf.JumpIf{ + Cond: bpf.JumpGreaterThan, + Val: uint32(x86sysno), + SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1, + }, + // jgt [x32 syscall],[baseJumpEnosys] + bpf.JumpIf{ + Cond: bpf.JumpGreaterThan, + Val: uint32(x32sysno), + SkipTrue: uint8(baseJumpEnosys + 1), + }, + // ja [baseJumpFilter] + bpf.Jump{Skip: baseJumpFilter}, + }...) 
+ } else { + // But if the [baseJumpEnosys+2] jump is larger than 255 we + // need to do a long jump like so: + // jset (1<<30),1 + // jgt [x86 syscall],1,2 + // jle [x32 syscall],1 + // ja [baseJumpEnosys+1] + // ja [baseJumpFilter] + section = append(section, []bpf.Instruction{ + // jset (1<<30),1 + bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1}, + // jgt [x86 syscall],1,2 + bpf.JumpIf{ + Cond: bpf.JumpGreaterThan, + Val: uint32(x86sysno), + SkipTrue: 1, SkipFalse: 2, + }, + // jle [x32 syscall],[baseJumpEnosys] + bpf.JumpIf{ + Cond: bpf.JumpLessOrEqual, + Val: uint32(x32sysno), + SkipTrue: 1, + }, + // ja [baseJumpEnosys+1] + bpf.Jump{Skip: baseJumpEnosys + 1}, + // ja [baseJumpFilter] + bpf.Jump{Skip: baseJumpFilter}, + }...) + } + default: + return nil, fmt.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls)) + } + + // Prepend this section to the tail. + programTail = append(section, programTail...) + + // Update jump table. + archJumpTable[nativeArch] = uint32(len(programTail)) + } + + // Add a dummy "jump to filter" for any architecture we might miss below. + // Such architectures will probably get the BadArch action of the filter + // regardless. + programTail = append([]bpf.Instruction{ + // ja [end of stub and start of filter] + bpf.Jump{Skip: uint32(len(programTail))}, + }, programTail...) + + // Generate the jump rules for each architecture. This has to be done in + // reverse as well for the same reason as above. We add to programTail + // directly because the jumps are impacted by each architecture rule we add + // as well. + // + // TODO: Maybe we want to optimise to avoid long jumps here? So sort the + // architectures based on how large the jumps are going to be, or + // re-sort the candidate architectures each time to make sure that we + // pick the largest jump which is going to be smaller than 255. + for nativeArch := range lastSyscalls { + // We jump forwards but the jump table is calculated from the *END*. 
+ jump := uint32(len(programTail)) - archJumpTable[nativeArch] + + // Same routine as above -- this is a basic jeq check, complicated + // slightly if it turns out that we need to do a long jump. + if jump <= 255 { + programTail = append([]bpf.Instruction{ + // jeq [arch],[jump] + bpf.JumpIf{ + Cond: bpf.JumpEqual, + Val: uint32(nativeArch), + SkipTrue: uint8(jump), + }, + }, programTail...) + } else { + programTail = append([]bpf.Instruction{ + // jne [arch],1 + bpf.JumpIf{ + Cond: bpf.JumpNotEqual, + Val: uint32(nativeArch), + SkipTrue: 1, + }, + // ja [jump] + bpf.Jump{Skip: jump}, + }, programTail...) + } + } + + // Prepend the load instruction for the architecture. + programTail = append([]bpf.Instruction{ + // load [4] + bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4. + }, programTail...) + + // And that's all folks! + return programTail, nil +} + +func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) { + rawProgram, err := bpf.Assemble(program) + if err != nil { + return nil, fmt.Errorf("error assembling program: %w", err) + } + + // Convert to []unix.SockFilter for unix.SockFilter. + var filter []unix.SockFilter + for _, insn := range rawProgram { + filter = append(filter, unix.SockFilter{ + Code: insn.Op, + Jt: insn.Jt, + Jf: insn.Jf, + K: insn.K, + }) + } + return filter, nil +} + +func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) { + // Patch the generated cBPF only when there is not a defaultErrnoRet set + // and it is different from ENOSYS + if config.DefaultErrnoRet != nil && *config.DefaultErrnoRet == uint(retErrnoEnosys) { + return nil, nil + } + // We only add the stub if the default action is not permissive. 
+ if isAllowAction(config.DefaultAction) { + logrus.Debugf("seccomp: skipping -ENOSYS stub filter generation") + return nil, nil + } + + lastSyscalls, err := findLastSyscalls(config) + if err != nil { + return nil, fmt.Errorf("error finding last syscalls for -ENOSYS stub: %w", err) + } + stubProgram, err := generateEnosysStub(lastSyscalls) + if err != nil { + return nil, fmt.Errorf("error generating -ENOSYS stub: %w", err) + } + return stubProgram, nil +} + +func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) { + program, err := disassembleFilter(filter) + if err != nil { + return nil, fmt.Errorf("error disassembling original filter: %w", err) + } + + patch, err := generatePatch(config) + if err != nil { + return nil, fmt.Errorf("error generating patch for filter: %w", err) + } + fullProgram := append(patch, program...) + + logrus.Debugf("seccomp: prepending -ENOSYS stub filter to user filter...") + for idx, insn := range patch { + logrus.Debugf(" [%4.1d] %s", idx, insn) + } + logrus.Debugf(" [....] --- original filter ---") + + fprog, err := assemble(fullProgram) + if err != nil { + return nil, fmt.Errorf("error assembling modified filter: %w", err) + } + return fprog, nil +} + +func filterFlags(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) { + // Ignore the error since pre-2.4 libseccomp is treated as API level 0. + apiLevel, _ := libseccomp.GetAPI() + + noNewPrivs, err = filter.GetNoNewPrivsBit() + if err != nil { + return 0, false, fmt.Errorf("unable to fetch no_new_privs filter bit: %w", err) + } + + if apiLevel >= 3 { + if logBit, err := filter.GetLogBit(); err != nil { + return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_LOG bit: %w", err) + } else if logBit { + flags |= uint(C.C_FILTER_FLAG_LOG) + } + } + + // TODO: Support seccomp flags not yet added to libseccomp-golang... 
+ + for _, call := range config.Syscalls { + if call.Action == configs.Notify { + flags |= uint(C.C_FILTER_FLAG_NEW_LISTENER) + break + } + } + + return +} + +func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err error) { + fprog := unix.SockFprog{ + Len: uint16(len(filter)), + Filter: &filter[0], + } + fd = -1 // only return a valid fd when C_FILTER_FLAG_NEW_LISTENER is set + // If no seccomp flags were requested we can use the old-school prctl(2). + if flags == 0 { + err = unix.Prctl(unix.PR_SET_SECCOMP, + unix.SECCOMP_MODE_FILTER, + uintptr(unsafe.Pointer(&fprog)), 0, 0) + } else { + fdptr, _, errno := unix.RawSyscall(unix.SYS_SECCOMP, + uintptr(C.C_SET_MODE_FILTER), + uintptr(flags), uintptr(unsafe.Pointer(&fprog))) + if errno != 0 { + err = errno + } + if flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0 { + fd = int(fdptr) + } + } + runtime.KeepAlive(filter) + runtime.KeepAlive(fprog) + return +} + +// PatchAndLoad takes a seccomp configuration and a libseccomp filter which has +// been pre-configured with the set of rules in the seccomp config. It then +// patches said filter to handle -ENOSYS in a much nicer manner than the +// default libseccomp default action behaviour, and loads the patched filter +// into the kernel for the current process. +func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) { + // Generate a patched filter. + fprog, err := enosysPatchFilter(config, filter) + if err != nil { + return -1, fmt.Errorf("error patching filter: %w", err) + } + + // Get the set of libseccomp flags set. + seccompFlags, noNewPrivs, err := filterFlags(config, filter) + if err != nil { + return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err) + } + + // Set no_new_privs if it was requested, though in runc we handle + // no_new_privs separately so warn if we hit this path. 
+ if noNewPrivs { + logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path") + if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { + return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err) + } + } + + // Finally, load the filter. + fd, err := sysSeccompSetFilter(seccompFlags, fprog) + if err != nil { + return -1, fmt.Errorf("error loading seccomp filter: %w", err) + } + + return fd, nil +} diff --git a/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/libcontainer/seccomp/patchbpf/enosys_linux_test.go new file mode 100644 index 0000000..b2ee625 --- /dev/null +++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go @@ -0,0 +1,296 @@ +//go:build cgo && seccomp +// +build cgo,seccomp + +package patchbpf + +import ( + "bytes" + "encoding/binary" + "fmt" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" + + libseccomp "github.com/seccomp/libseccomp-golang" + "golang.org/x/net/bpf" +) + +type seccompData struct { + Syscall uint32 // NOTE: We assume sizeof(int) == 4. + Arch uint32 + IP uint64 + Args [6]uint64 +} + +// mockSyscallPayload creates a fake seccomp_data struct with the given data. +func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch nativeArch, args ...uint64) []byte { + var buf bytes.Buffer + + data := seccompData{ + Syscall: uint32(sysno), + Arch: uint32(arch), + IP: 0xDEADBEEFCAFE, + } + + copy(data.Args[:], args) + if len(args) > 6 { + t.Fatalf("bad syscall payload: linux only supports 6-argument syscalls") + } + + // NOTE: We use BigEndian here because golang.org/x/net/bpf assumes that + // all payloads are big-endian while seccomp uses host endianness. + if err := binary.Write(&buf, binary.BigEndian, data); err != nil { + t.Fatalf("bad syscall payload: cannot write data: %v", err) + } + return buf.Bytes() +} + +// retFallthrough is returned by the mockFilter. 
If the mock filter returns
+// this value, it indicates "fallthrough to libseccomp-generated filter".
+const retFallthrough uint32 = 0xDEADBEEF
+
+// mockFilter returns a BPF VM that contains a mock filter with an -ENOSYS
+// stub. If the filter returns retFallthrough, the stub filter has permitted
+// the syscall to pass.
+func mockFilter(t *testing.T, config *configs.Seccomp) (*bpf.VM, []bpf.Instruction) {
+ patch, err := generatePatch(config)
+ if err != nil {
+ t.Fatalf("mock filter: generate enosys patch: %v", err)
+ }
+
+ program := append(patch, bpf.RetConstant{Val: retFallthrough})
+
+ vm, err := bpf.NewVM(program)
+ if err != nil {
+ t.Fatalf("mock filter: compile BPF VM: %v", err)
+ }
+ return vm, program
+}
+
+// fakeConfig generates a fake libcontainer seccomp configuration. The syscalls
+// are added with an action distinct from the default action.
+func fakeConfig(defaultAction configs.Action, explicitSyscalls []string, arches []string) *configs.Seccomp {
+ config := configs.Seccomp{
+ DefaultAction: defaultAction,
+ Architectures: arches,
+ }
+ syscallAction := configs.Allow
+ if syscallAction == defaultAction {
+ syscallAction = configs.Kill
+ }
+ for _, syscall := range explicitSyscalls {
+ config.Syscalls = append(config.Syscalls, &configs.Syscall{
+ Name: syscall,
+ Action: syscallAction,
+ })
+ }
+ return &config
+}
+
+// List copied from libseccomp's list of supported architectures.
+var testArches = []string{
+ "x86",
+ "amd64",
+ "x32",
+ "arm",
+ "arm64",
+ "mips",
+ "mips64",
+ "mips64n32",
+ "mipsel",
+ "mipsel64",
+ "mipsel64n32",
+ "ppc",
+ "ppc64",
+ "ppc64le",
+ "s390",
+ "s390x",
+}
+
+func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) {
+ explicitSyscalls := []string{
+ "setns",
+ "kcmp",
+ "renameat2",
+ "copy_file_range",
+ }
+
+ implicitSyscalls := []string{
+ "clone",
+ "openat",
+ "read",
+ "write",
+ }
+
+ futureSyscalls := []libseccomp.ScmpSyscall{1000, 7331}
+
+ // Quick lookups for which arches are enabled. 
+ archSet := map[string]bool{} + for _, arch := range arches { + archSet[arch] = true + } + + for _, test := range []struct { + start, end int + }{ + {0, 1}, // [setns] + {0, 2}, // [setns, process_vm_readv] + {1, 2}, // [process_vm_readv] + {1, 3}, // [process_vm_readv, renameat2, copy_file_range] + {1, 4}, // [process_vm_readv, renameat2, copy_file_range] + {3, 4}, // [copy_file_range] + } { + allowedSyscalls := explicitSyscalls[test.start:test.end] + config := fakeConfig(defaultAction, allowedSyscalls, arches) + filter, program := mockFilter(t, config) + + // The syscalls are in increasing order of newness, so all syscalls + // after the last allowed syscall will give -ENOSYS. + enosysStart := test.end + + for _, arch := range testArches { + type syscallTest struct { + syscall string + sysno libseccomp.ScmpSyscall + expected uint32 + } + + scmpArch, err := libseccomp.GetArchFromString(arch) + if err != nil { + t.Fatalf("unknown libseccomp architecture %q: %v", arch, err) + } + + nativeArch, err := archToNative(scmpArch) + if err != nil { + t.Fatalf("unknown audit architecture %q: %v", arch, err) + } + + var syscallTests []syscallTest + + // Add explicit syscalls (whether they will return -ENOSYS + // depends on the filter rules). + for idx, syscall := range explicitSyscalls { + expected := retFallthrough + if idx >= enosysStart { + expected = retErrnoEnosys + } + sysno, err := libseccomp.GetSyscallFromNameByArch(syscall, scmpArch) + if err != nil { + t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err) + } + syscallTests = append(syscallTests, syscallTest{ + syscall, + sysno, + expected, + }) + } + + // Add implicit syscalls. 
+ for _, syscall := range implicitSyscalls { + sysno, err := libseccomp.GetSyscallFromNameByArch(syscall, scmpArch) + if err != nil { + t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err) + } + syscallTests = append(syscallTests, syscallTest{ + sysno: sysno, + syscall: syscall, + expected: retFallthrough, + }) + } + + // Add future syscalls. + for _, sysno := range futureSyscalls { + baseSysno, err := libseccomp.GetSyscallFromNameByArch("copy_file_range", scmpArch) + if err != nil { + t.Fatalf("unknown syscall 'copy_file_range' on arch %q: %v", arch, err) + } + sysno += baseSysno + + syscallTests = append(syscallTests, syscallTest{ + sysno: sysno, + syscall: fmt.Sprintf("syscall_%#x", sysno), + expected: retErrnoEnosys, + }) + } + + // Test syscalls in the explicit list. + for _, test := range syscallTests { + // Override the expected value in the two special cases. + if !archSet[arch] || isAllowAction(defaultAction) { + test.expected = retFallthrough + } + + payload := mockSyscallPayload(t, test.sysno, nativeArch, 0x1337, 0xF00BA5) + // NOTE: golang.org/x/net/bpf returns int here rather + // than uint32. 
+ rawRet, err := filter.Run(payload) + if err != nil { + t.Fatalf("error running filter: %v", err) + } + ret := uint32(rawRet) + if ret != test.expected { + t.Logf("mock filter for %v %v:", arches, allowedSyscalls) + for idx, insn := range program { + t.Logf(" [%4.1d] %s", idx, insn) + } + t.Logf("payload: %#v", payload) + t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, nativeArch, test.syscall, test.sysno, ret, test.expected) + } + } + } + } +} + +var testActions = map[string]configs.Action{ + "allow": configs.Allow, + "log": configs.Log, + "errno": configs.Errno, + "kill": configs.Kill, +} + +func TestEnosysStub_SingleArch(t *testing.T) { + for _, arch := range testArches { + arches := []string{arch} + t.Run("arch="+arch, func(t *testing.T) { + for name, action := range testActions { + t.Run("action="+name, func(t *testing.T) { + testEnosysStub(t, action, arches) + }) + } + }) + } +} + +func TestEnosysStub_MultiArch(t *testing.T) { + for end := 0; end < len(testArches); end++ { + for start := 0; start < end; start++ { + arches := testArches[start:end] + if len(arches) <= 1 { + continue + } + for _, action := range testActions { + testEnosysStub(t, action, arches) + } + } + } +} + +func TestDisassembleHugeFilterDoesNotHang(t *testing.T) { + hugeFilter, err := libseccomp.NewFilter(libseccomp.ActAllow) + if err != nil { + t.Fatalf("failed to create seccomp filter: %v", err) + } + + for i := 1; i < 10000; i++ { + if err := hugeFilter.AddRule(libseccomp.ScmpSyscall(i), libseccomp.ActKill); err != nil { + t.Fatalf("failed to add rule to filter %d: %v", i, err) + } + } + + _, err = disassembleFilter(hugeFilter) + if err != nil { + t.Fatalf("failed to disassembleFilter: %v", err) + } + + // if we exit, we did not hang +} diff --git a/libcontainer/seccomp/patchbpf/enosys_unsupported.go b/libcontainer/seccomp/patchbpf/enosys_unsupported.go new file mode 100644 index 0000000..d23167a --- /dev/null +++ b/libcontainer/seccomp/patchbpf/enosys_unsupported.go @@ -0,0 
+1,4 @@ +//go:build !linux || !cgo || !seccomp +// +build !linux !cgo !seccomp + +package patchbpf diff --git a/libcontainer/seccomp/seccomp_linux.go b/libcontainer/seccomp/seccomp_linux.go index 1b7a071..f177b7f 100644 --- a/libcontainer/seccomp/seccomp_linux.go +++ b/libcontainer/seccomp/seccomp_linux.go @@ -1,25 +1,22 @@ -// +build linux,cgo,seccomp +//go:build cgo && seccomp +// +build cgo,seccomp package seccomp import ( - "bufio" + "errors" "fmt" - "os" - "strings" + + libseccomp "github.com/seccomp/libseccomp-golang" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" "github.com/opencontainers/runc/libcontainer/configs" - libseccomp "github.com/seccomp/libseccomp-golang" - - "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/seccomp/patchbpf" ) var ( - actAllow = libseccomp.ActAllow - actTrap = libseccomp.ActTrap - actKill = libseccomp.ActKill actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM)) - actLog = libseccomp.ActLog actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)) ) @@ -28,95 +25,120 @@ const ( syscallMaxArguments int = 6 ) -// Filters given syscalls in a container, preventing them from being used -// Started in the container init process, and carried over to all child processes -// Setns calls, however, require a separate invocation, as they are not children -// of the init until they join the namespace -func InitSeccomp(config *configs.Seccomp) error { +// InitSeccomp installs the seccomp filters to be used in the container as +// specified in config. +// Returns the seccomp file descriptor if any of the filters include a +// SCMP_ACT_NOTIFY action, otherwise returns -1. 
+func InitSeccomp(config *configs.Seccomp) (int, error) { if config == nil { - return fmt.Errorf("cannot initialize Seccomp - nil config passed") + return -1, errors.New("cannot initialize Seccomp - nil config passed") } - defaultAction, err := getAction(config.DefaultAction) + defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet) if err != nil { - return fmt.Errorf("error initializing seccomp - invalid default action") + return -1, errors.New("error initializing seccomp - invalid default action") + } + + // Ignore the error since pre-2.4 libseccomp is treated as API level 0. + apiLevel, _ := libseccomp.GetAPI() + for _, call := range config.Syscalls { + if call.Action == configs.Notify { + if apiLevel < 6 { + return -1, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel) + } + + // We can't allow the write syscall to notify to the seccomp agent. + // After InitSeccomp() is called, we need to syncParentSeccomp() to write the seccomp fd plain + // number, so the parent sends it to the seccomp agent. If we use SCMP_ACT_NOTIFY on write, we + // never can write the seccomp fd to the parent and therefore the seccomp agent never receives + // the seccomp fd and runc hangs during initialization. + // + // Note that read()/close(), which are also used in syncParentSeccomp(), _can_ use SCMP_ACT_NOTIFY. + // Because we write the seccomp fd on the pipe to the parent, the parent is able to proceed and + // send the seccomp fd to the agent (it is another process and not subject to the seccomp + // filter). We will be blocked on read()/close() inside syncParentSeccomp() but if the seccomp + // agent allows those syscalls to proceed, initialization works just fine and the agent can + // handle future read()/close() syscalls as it wants. 
+ if call.Name == "write" { + return -1, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall") + } + } + } + + // See comment on why write is not allowed. The same reason applies, as this can mean handling write too. + if defaultAction == libseccomp.ActNotify { + return -1, errors.New("SCMP_ACT_NOTIFY cannot be used as default action") } filter, err := libseccomp.NewFilter(defaultAction) if err != nil { - return fmt.Errorf("error creating filter: %s", err) + return -1, fmt.Errorf("error creating filter: %w", err) } // Add extra architectures for _, arch := range config.Architectures { scmpArch, err := libseccomp.GetArchFromString(arch) if err != nil { - return fmt.Errorf("error validating Seccomp architecture: %s", err) + return -1, fmt.Errorf("error validating Seccomp architecture: %w", err) } - if err := filter.AddArch(scmpArch); err != nil { - return fmt.Errorf("error adding architecture to seccomp filter: %s", err) + return -1, fmt.Errorf("error adding architecture to seccomp filter: %w", err) } } // Unset no new privs bit if err := filter.SetNoNewPrivsBit(false); err != nil { - return fmt.Errorf("error setting no new privileges: %s", err) + return -1, fmt.Errorf("error setting no new privileges: %w", err) } // Add a rule for each syscall for _, call := range config.Syscalls { if call == nil { - return fmt.Errorf("encountered nil syscall while initializing Seccomp") + return -1, errors.New("encountered nil syscall while initializing Seccomp") } - if err = matchCall(filter, call); err != nil { - return err + if err := matchCall(filter, call, defaultAction); err != nil { + return -1, err } } - if err = filter.Load(); err != nil { - return fmt.Errorf("error loading seccomp filter into kernel: %s", err) - } - - return nil -} - -// IsEnabled returns if the kernel has been configured to support seccomp. 
-func IsEnabled() bool { - // Try to read from /proc/self/status for kernels > 3.8 - s, err := parseStatusFile("/proc/self/status") + seccompFd, err := patchbpf.PatchAndLoad(config, filter) if err != nil { - // Check if Seccomp is supported, via CONFIG_SECCOMP. - if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL { - // Make sure the kernel has CONFIG_SECCOMP_FILTER. - if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL { - return true - } - } - return false + return -1, fmt.Errorf("error loading seccomp filter into kernel: %w", err) } - _, ok := s["Seccomp"] - return ok + + return seccompFd, nil } // Convert Libcontainer Action to Libseccomp ScmpAction -func getAction(act configs.Action) (libseccomp.ScmpAction, error) { +func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error) { switch act { case configs.Kill: - return actKill, nil + return libseccomp.ActKill, nil case configs.Errno: + if errnoRet != nil { + return libseccomp.ActErrno.SetReturnCode(int16(*errnoRet)), nil + } return actErrno, nil case configs.Trap: - return actTrap, nil + return libseccomp.ActTrap, nil case configs.Allow: - return actAllow, nil + return libseccomp.ActAllow, nil case configs.Trace: + if errnoRet != nil { + return libseccomp.ActTrace.SetReturnCode(int16(*errnoRet)), nil + } return actTrace, nil case configs.Log: - return actLog, nil + return libseccomp.ActLog, nil + case configs.Notify: + return libseccomp.ActNotify, nil + case configs.KillThread: + return libseccomp.ActKillThread, nil + case configs.KillProcess: + return libseccomp.ActKillProcess, nil default: - return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule") + return libseccomp.ActInvalid, errors.New("invalid action, cannot use in rule") } } @@ -138,7 +160,7 @@ func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) { case configs.MaskEqualTo: return libseccomp.CompareMaskedEqual, nil default: 
- return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule") + return libseccomp.CompareInvalid, errors.New("invalid operator, cannot use in rule") } } @@ -147,7 +169,7 @@ func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) { cond := libseccomp.ScmpCondition{} if arg == nil { - return cond, fmt.Errorf("cannot convert nil to syscall condition") + return cond, errors.New("cannot convert nil to syscall condition") } op, err := getOperator(arg.Op) @@ -159,32 +181,38 @@ func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) { } // Add a rule to match a single syscall -func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { +func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall, defAct libseccomp.ScmpAction) error { if call == nil || filter == nil { - return fmt.Errorf("cannot use nil as syscall to block") + return errors.New("cannot use nil as syscall to block") } if len(call.Name) == 0 { - return fmt.Errorf("empty string is not a valid syscall") - } - - // If we can't resolve the syscall, assume it's not supported on this kernel - // Ignore it, don't error out - callNum, err := libseccomp.GetSyscallFromName(call.Name) - if err != nil { - return nil + return errors.New("empty string is not a valid syscall") } // Convert the call's action to the libseccomp equivalent - callAct, err := getAction(call.Action) + callAct, err := getAction(call.Action, call.ErrnoRet) if err != nil { - return fmt.Errorf("action in seccomp profile is invalid: %s", err) + return fmt.Errorf("action in seccomp profile is invalid: %w", err) + } + if callAct == defAct { + // This rule is redundant, silently skip it + // to avoid error from AddRule. + return nil + } + + // If we can't resolve the syscall, assume it is not supported + // by this kernel. Warn about it, don't error out. 
+ callNum, err := libseccomp.GetSyscallFromName(call.Name) + if err != nil { + logrus.Debugf("unknown seccomp syscall %q ignored", call.Name) + return nil } // Unconditional match - just add the rule if len(call.Args) == 0 { - if err = filter.AddRule(callNum, callAct); err != nil { - return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err) + if err := filter.AddRule(callNum, callAct); err != nil { + return fmt.Errorf("error adding seccomp filter rule for syscall %s: %w", call.Name, err) } } else { // If two or more arguments have the same condition, @@ -195,7 +223,7 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { for _, cond := range call.Args { newCond, err := getCondition(cond) if err != nil { - return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %s", call.Name, err) + return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %w", call.Name, err) } argCounts[cond.Index] += 1 @@ -217,15 +245,15 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { for _, cond := range conditions { condArr := []libseccomp.ScmpCondition{cond} - if err = filter.AddRuleConditional(callNum, callAct, condArr); err != nil { - return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err) + if err := filter.AddRuleConditional(callNum, callAct, condArr); err != nil { + return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err) } } } else { // No conditions share same argument // Use new, proper behavior - if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil { - return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err) + if err := filter.AddRuleConditional(callNum, callAct, conditions); err != nil { + return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err) } } } @@ -233,29 +261,10 @@ func matchCall(filter *libseccomp.ScmpFilter, call 
*configs.Syscall) error { return nil } -func parseStatusFile(path string) (map[string]string, error) { - f, err := os.Open(path) - if err != nil { - return nil, err - } - defer f.Close() - - s := bufio.NewScanner(f) - status := make(map[string]string) - - for s.Scan() { - text := s.Text() - parts := strings.Split(text, ":") - - if len(parts) <= 1 { - continue - } - - status[parts[0]] = parts[1] - } - if err := s.Err(); err != nil { - return nil, err - } - - return status, nil +// Version returns major, minor, and micro. +func Version() (uint, uint, uint) { + return libseccomp.GetLibraryVersion() } + +// Enabled is true if seccomp support is compiled in. +const Enabled = true diff --git a/libcontainer/seccomp/seccomp_linux_test.go b/libcontainer/seccomp/seccomp_linux_test.go deleted file mode 100644 index 67a2ef6..0000000 --- a/libcontainer/seccomp/seccomp_linux_test.go +++ /dev/null @@ -1,17 +0,0 @@ -// +build linux,cgo,seccomp - -package seccomp - -import "testing" - -func TestParseStatusFile(t *testing.T) { - s, err := parseStatusFile("fixtures/proc_self_status") - if err != nil { - t.Fatal(err) - } - - if _, ok := s["Seccomp"]; !ok { - - t.Fatal("expected to find 'Seccomp' in the map but did not.") - } -} diff --git a/libcontainer/seccomp/seccomp_unsupported.go b/libcontainer/seccomp/seccomp_unsupported.go index 44df1ad..be2b324 100644 --- a/libcontainer/seccomp/seccomp_unsupported.go +++ b/libcontainer/seccomp/seccomp_unsupported.go @@ -1,3 +1,4 @@ +//go:build !linux || !cgo || !seccomp // +build !linux !cgo !seccomp package seccomp @@ -11,14 +12,17 @@ import ( var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") // InitSeccomp does nothing because seccomp is not supported. 
-func InitSeccomp(config *configs.Seccomp) error { +func InitSeccomp(config *configs.Seccomp) (int, error) { if config != nil { - return ErrSeccompNotEnabled + return -1, ErrSeccompNotEnabled } - return nil + return -1, nil } -// IsEnabled returns false, because it is not supported. -func IsEnabled() bool { - return false +// Version returns major, minor, and micro. +func Version() (uint, uint, uint) { + return 0, 0, 0 } + +// Enabled is true if seccomp support is compiled in. +const Enabled = false diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index 888981f..09ab552 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -1,20 +1,19 @@ -// +build linux - package libcontainer import ( + "errors" "fmt" "os" - "runtime" + "strconv" + + "github.com/opencontainers/selinux/go-selinux" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" "github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" - "github.com/opencontainers/selinux/go-selinux/label" - "github.com/pkg/errors" - - "golang.org/x/sys/unix" ) // linuxSetnsInit performs the container's initialization for running a new process @@ -23,29 +22,27 @@ type linuxSetnsInit struct { pipe *os.File consoleSocket *os.File config *initConfig + logFd int } func (l *linuxSetnsInit) getSessionRingName() string { - return fmt.Sprintf("_ses.%s", l.config.ContainerId) + return "_ses." 
+ l.config.ContainerId } func (l *linuxSetnsInit) Init() error { - runtime.LockOSThread() - defer runtime.UnlockOSThread() - if !l.config.Config.NoNewKeyring { - if err := label.SetKeyLabel(l.config.ProcessLabel); err != nil { + if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil { return err } - defer label.SetKeyLabel("") + defer selinux.SetKeyLabel("") //nolint: errcheck // Do not inherit the parent's session keyring. if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil { // Same justification as in standart_init_linux.go as to why we // don't bail on ENOSYS. // // TODO(cyphar): And we should have logging here too. - if errors.Cause(err) != unix.ENOSYS { - return errors.Wrap(err, "join session keyring") + if !errors.Is(err, unix.ENOSYS) { + return fmt.Errorf("unable to join session keyring: %w", err) } } } @@ -62,15 +59,20 @@ func (l *linuxSetnsInit) Init() error { return err } } - if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { + if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil { return err } - defer label.SetProcessLabel("") + defer selinux.SetExecLabel("") //nolint: errcheck // Without NoNewPrivileges seccomp is a privileged operation, so we need to // do this before dropping capabilities; otherwise do it as late as possible // just before execve so as few syscalls take place after it as possible. if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { - if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp) + if err != nil { + return err + } + + if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { return err } } @@ -84,9 +86,20 @@ func (l *linuxSetnsInit) Init() error { // place afterward (reducing the amount of syscalls that users need to // enable in their seccomp profiles). 
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { - if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { - return newSystemErrorWithCause(err, "init seccomp") + seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp) + if err != nil { + return fmt.Errorf("unable to init seccomp: %w", err) + } + + if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { + return err } } + logrus.Debugf("setns_init: about to exec") + // Close the log pipe fd so the parent's ForwardLogs can exit. + if err := unix.Close(l.logFd); err != nil { + return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} + } + return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) } diff --git a/libcontainer/specconv/example.go b/libcontainer/specconv/example.go index 23e225c..56bab3b 100644 --- a/libcontainer/specconv/example.go +++ b/libcontainer/specconv/example.go @@ -2,15 +2,17 @@ package specconv import ( "os" + "path/filepath" "strings" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runtime-spec/specs-go" ) // Example returns an example spec file, with many options set so a user can // see what a standard spec file looks like. 
func Example() *specs.Spec { - return &specs.Spec{ + spec := &specs.Spec{ Version: specs.Version, Root: &specs.Root{ Path: "rootfs", @@ -138,23 +140,29 @@ func Example() *specs.Spec { }, Namespaces: []specs.LinuxNamespace{ { - Type: "pid", + Type: specs.PIDNamespace, }, { - Type: "network", + Type: specs.NetworkNamespace, }, { - Type: "ipc", + Type: specs.IPCNamespace, }, { - Type: "uts", + Type: specs.UTSNamespace, }, { - Type: "mount", + Type: specs.MountNamespace, }, }, }, } + if cgroups.IsCgroup2UnifiedMode() { + spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{ + Type: specs.CgroupNamespace, + }) + } + return spec } // ToRootless converts the given spec file into one that should work with @@ -193,8 +201,14 @@ func ToRootless(spec *specs.Spec) { // Fix up mounts. var mounts []specs.Mount for _, mount := range spec.Mounts { - // Ignore all mounts that are under /sys. - if strings.HasPrefix(mount.Destination, "/sys") { + // Replace the /sys mount with an rbind. + if filepath.Clean(mount.Destination) == "/sys" { + mounts = append(mounts, specs.Mount{ + Source: "/sys", + Destination: "/sys", + Type: "none", + Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"}, + }) continue } @@ -209,13 +223,6 @@ func ToRootless(spec *specs.Spec) { mount.Options = options mounts = append(mounts, mount) } - // Add the sysfs mount as an rbind. - mounts = append(mounts, specs.Mount{ - Source: "/sys", - Destination: "/sys", - Type: "none", - Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"}, - }) spec.Mounts = mounts // Remove cgroup settings. 
diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index d9e73c4..c7ca4c8 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -1,146 +1,319 @@ -// +build linux - // Package specconv implements conversion of specifications to libcontainer // configurations package specconv import ( + "errors" "fmt" "os" "path/filepath" + "sort" "strings" + "sync" "time" + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runc/libcontainer/seccomp" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) -const wildcard = -1 +var ( + initMapsOnce sync.Once + namespaceMapping map[specs.LinuxNamespaceType]configs.NamespaceType + mountPropagationMapping map[string]int + recAttrFlags map[string]struct { + clear bool + flag uint64 + } + mountFlags, extensionFlags map[string]struct { + clear bool + flag int + } +) -var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{ - specs.PIDNamespace: configs.NEWPID, - specs.NetworkNamespace: configs.NEWNET, - specs.MountNamespace: configs.NEWNS, - specs.UserNamespace: configs.NEWUSER, - specs.IPCNamespace: configs.NEWIPC, - specs.UTSNamespace: configs.NEWUTS, - specs.CgroupNamespace: configs.NEWCGROUP, +func initMaps() { + initMapsOnce.Do(func() { + namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{ + specs.PIDNamespace: configs.NEWPID, + specs.NetworkNamespace: configs.NEWNET, + specs.MountNamespace: configs.NEWNS, + specs.UserNamespace: configs.NEWUSER, + specs.IPCNamespace: configs.NEWIPC, + specs.UTSNamespace: configs.NEWUTS, + specs.CgroupNamespace: configs.NEWCGROUP, + } + + 
mountPropagationMapping = map[string]int{ + "rprivate": unix.MS_PRIVATE | unix.MS_REC, + "private": unix.MS_PRIVATE, + "rslave": unix.MS_SLAVE | unix.MS_REC, + "slave": unix.MS_SLAVE, + "rshared": unix.MS_SHARED | unix.MS_REC, + "shared": unix.MS_SHARED, + "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, + "unbindable": unix.MS_UNBINDABLE, + "": 0, + } + + mountFlags = map[string]struct { + clear bool + flag int + }{ + "acl": {false, unix.MS_POSIXACL}, + "async": {true, unix.MS_SYNCHRONOUS}, + "atime": {true, unix.MS_NOATIME}, + "bind": {false, unix.MS_BIND}, + "defaults": {false, 0}, + "dev": {true, unix.MS_NODEV}, + "diratime": {true, unix.MS_NODIRATIME}, + "dirsync": {false, unix.MS_DIRSYNC}, + "exec": {true, unix.MS_NOEXEC}, + "iversion": {false, unix.MS_I_VERSION}, + "lazytime": {false, unix.MS_LAZYTIME}, + "loud": {true, unix.MS_SILENT}, + "mand": {false, unix.MS_MANDLOCK}, + "noacl": {true, unix.MS_POSIXACL}, + "noatime": {false, unix.MS_NOATIME}, + "nodev": {false, unix.MS_NODEV}, + "nodiratime": {false, unix.MS_NODIRATIME}, + "noexec": {false, unix.MS_NOEXEC}, + "noiversion": {true, unix.MS_I_VERSION}, + "nolazytime": {true, unix.MS_LAZYTIME}, + "nomand": {true, unix.MS_MANDLOCK}, + "norelatime": {true, unix.MS_RELATIME}, + "nostrictatime": {true, unix.MS_STRICTATIME}, + "nosuid": {false, unix.MS_NOSUID}, + "nosymfollow": {false, unix.MS_NOSYMFOLLOW}, // since kernel 5.10 + "rbind": {false, unix.MS_BIND | unix.MS_REC}, + "relatime": {false, unix.MS_RELATIME}, + "remount": {false, unix.MS_REMOUNT}, + "ro": {false, unix.MS_RDONLY}, + "rw": {true, unix.MS_RDONLY}, + "silent": {false, unix.MS_SILENT}, + "strictatime": {false, unix.MS_STRICTATIME}, + "suid": {true, unix.MS_NOSUID}, + "sync": {false, unix.MS_SYNCHRONOUS}, + "symfollow": {true, unix.MS_NOSYMFOLLOW}, // since kernel 5.10 + } + + recAttrFlags = map[string]struct { + clear bool + flag uint64 + }{ + "rro": {false, unix.MOUNT_ATTR_RDONLY}, + "rrw": {true, unix.MOUNT_ATTR_RDONLY}, + "rnosuid": {false, 
unix.MOUNT_ATTR_NOSUID}, + "rsuid": {true, unix.MOUNT_ATTR_NOSUID}, + "rnodev": {false, unix.MOUNT_ATTR_NODEV}, + "rdev": {true, unix.MOUNT_ATTR_NODEV}, + "rnoexec": {false, unix.MOUNT_ATTR_NOEXEC}, + "rexec": {true, unix.MOUNT_ATTR_NOEXEC}, + "rnodiratime": {false, unix.MOUNT_ATTR_NODIRATIME}, + "rdiratime": {true, unix.MOUNT_ATTR_NODIRATIME}, + "rrelatime": {false, unix.MOUNT_ATTR_RELATIME}, + "rnorelatime": {true, unix.MOUNT_ATTR_RELATIME}, + "rnoatime": {false, unix.MOUNT_ATTR_NOATIME}, + "ratime": {true, unix.MOUNT_ATTR_NOATIME}, + "rstrictatime": {false, unix.MOUNT_ATTR_STRICTATIME}, + "rnostrictatime": {true, unix.MOUNT_ATTR_STRICTATIME}, + "rnosymfollow": {false, unix.MOUNT_ATTR_NOSYMFOLLOW}, // since kernel 5.14 + "rsymfollow": {true, unix.MOUNT_ATTR_NOSYMFOLLOW}, // since kernel 5.14 + // No support for MOUNT_ATTR_IDMAP yet (needs UserNS FD) + } + + extensionFlags = map[string]struct { + clear bool + flag int + }{ + "tmpcopyup": {false, configs.EXT_COPYUP}, + } + }) } -var mountPropagationMapping = map[string]int{ - "rprivate": unix.MS_PRIVATE | unix.MS_REC, - "private": unix.MS_PRIVATE, - "rslave": unix.MS_SLAVE | unix.MS_REC, - "slave": unix.MS_SLAVE, - "rshared": unix.MS_SHARED | unix.MS_REC, - "shared": unix.MS_SHARED, - "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, - "unbindable": unix.MS_UNBINDABLE, - "": 0, +// KnownNamespaces returns the list of the known namespaces. +// Used by `runc features`. +func KnownNamespaces() []string { + initMaps() + var res []string + for k := range namespaceMapping { + res = append(res, string(k)) + } + sort.Strings(res) + return res } -// AllowedDevices is exposed for devicefilter_test.go -var AllowedDevices = []*configs.Device{ +// KnownMountOptions returns the list of the known mount options. +// Used by `runc features`. 
+func KnownMountOptions() []string { + initMaps() + var res []string + for k := range mountFlags { + res = append(res, k) + } + for k := range mountPropagationMapping { + if k != "" { + res = append(res, k) + } + } + for k := range recAttrFlags { + res = append(res, k) + } + for k := range extensionFlags { + res = append(res, k) + } + sort.Strings(res) + return res +} + +// AllowedDevices is the set of devices which are automatically included for +// all containers. +// +// XXX (cyphar) +// This behaviour is at the very least "questionable" (if not outright +// wrong) according to the runtime-spec. +// +// Yes, we have to include certain devices other than the ones the user +// specifies, but several devices listed here are not part of the spec +// (including "mknod for any device"?!). In addition, these rules are +// appended to the user-provided set which means that users *cannot disable +// this behaviour*. +// +// ... unfortunately I'm too scared to change this now because who knows how +// many people depend on this (incorrect and arguably insecure) behaviour. 
+var AllowedDevices = []*devices.Device{ // allow mknod for any device { - Type: 'c', - Major: wildcard, - Minor: wildcard, - Permissions: "m", - Allow: true, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: "m", + Allow: true, + }, }, { - Type: 'b', - Major: wildcard, - Minor: wildcard, - Permissions: "m", - Allow: true, + Rule: devices.Rule{ + Type: devices.BlockDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: "m", + Allow: true, + }, }, { - Type: 'c', - Path: "/dev/null", - Major: 1, - Minor: 3, - Permissions: "rwm", - Allow: true, + Path: "/dev/null", + FileMode: 0o666, + Uid: 0, + Gid: 0, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, }, { - Type: 'c', - Path: "/dev/random", - Major: 1, - Minor: 8, - Permissions: "rwm", - Allow: true, + Path: "/dev/random", + FileMode: 0o666, + Uid: 0, + Gid: 0, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 1, + Minor: 8, + Permissions: "rwm", + Allow: true, + }, }, { - Type: 'c', - Path: "/dev/full", - Major: 1, - Minor: 7, - Permissions: "rwm", - Allow: true, + Path: "/dev/full", + FileMode: 0o666, + Uid: 0, + Gid: 0, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, }, { - Type: 'c', - Path: "/dev/tty", - Major: 5, - Minor: 0, - Permissions: "rwm", - Allow: true, + Path: "/dev/tty", + FileMode: 0o666, + Uid: 0, + Gid: 0, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 5, + Minor: 0, + Permissions: "rwm", + Allow: true, + }, }, { - Type: 'c', - Path: "/dev/zero", - Major: 1, - Minor: 5, - Permissions: "rwm", - Allow: true, + Path: "/dev/zero", + FileMode: 0o666, + Uid: 0, + Gid: 0, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 1, + Minor: 5, + Permissions: "rwm", + Allow: true, + }, }, { - Type: 'c', - Path: "/dev/urandom", - Major: 1, - Minor: 9, - 
Permissions: "rwm", - Allow: true, - }, - { - Path: "/dev/console", - Type: 'c', - Major: 5, - Minor: 1, - Permissions: "rwm", - Allow: true, + Path: "/dev/urandom", + FileMode: 0o666, + Uid: 0, + Gid: 0, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 1, + Minor: 9, + Permissions: "rwm", + Allow: true, + }, }, // /dev/pts/ - pts namespaces are "coming soon" { - Path: "", - Type: 'c', - Major: 136, - Minor: wildcard, - Permissions: "rwm", - Allow: true, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 136, + Minor: devices.Wildcard, + Permissions: "rwm", + Allow: true, + }, }, { - Path: "", - Type: 'c', - Major: 5, - Minor: 2, - Permissions: "rwm", - Allow: true, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 5, + Minor: 2, + Permissions: "rwm", + Allow: true, + }, }, // tuntap { - Path: "", - Type: 'c', - Major: 10, - Minor: 200, - Permissions: "rwm", - Allow: true, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 10, + Minor: 200, + Permissions: "rwm", + Allow: true, + }, }, } @@ -168,7 +341,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { } spec := opts.Spec if spec.Root == nil { - return nil, fmt.Errorf("Root must be specified") + return nil, errors.New("root must be specified") } rootfsPath := spec.Root.Path if !filepath.IsAbs(rootfsPath) { @@ -176,38 +349,48 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { } labels := []string{} for k, v := range spec.Annotations { - labels = append(labels, fmt.Sprintf("%s=%s", k, v)) + labels = append(labels, k+"="+v) } config := &configs.Config{ Rootfs: rootfsPath, NoPivotRoot: opts.NoPivotRoot, Readonlyfs: spec.Root.Readonly, Hostname: spec.Hostname, - Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)), + Labels: append(labels, "bundle="+cwd), NoNewKeyring: opts.NoNewKeyring, RootlessEUID: opts.RootlessEUID, RootlessCgroups: opts.RootlessCgroups, } - exists := false for _, m := range spec.Mounts { - config.Mounts = 
append(config.Mounts, createLibcontainerMount(cwd, m)) + cm, err := createLibcontainerMount(cwd, m) + if err != nil { + return nil, fmt.Errorf("invalid mount %+v: %w", m, err) + } + config.Mounts = append(config.Mounts, cm) } - if err := createDevices(spec, config); err != nil { - return nil, err - } - c, err := CreateCgroupConfig(opts) + + defaultDevs, err := createDevices(spec, config) if err != nil { return nil, err } + + c, err := CreateCgroupConfig(opts, defaultDevs) + if err != nil { + return nil, err + } + config.Cgroups = c // set linux-specific config if spec.Linux != nil { + initMaps() + + var exists bool if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists { return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation) } if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) { - return nil, fmt.Errorf("rootfsPropagation of [r]private is not safe without pivot_root") + return nil, errors.New("rootfsPropagation of [r]private is not safe without pivot_root") } for _, ns := range spec.Linux.Namespaces { @@ -244,20 +427,61 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { config.Seccomp = seccomp } if spec.Linux.IntelRdt != nil { - config.IntelRdt = &configs.IntelRdt{} - if spec.Linux.IntelRdt.L3CacheSchema != "" { - config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema - } - if spec.Linux.IntelRdt.MemBwSchema != "" { - config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema + config.IntelRdt = &configs.IntelRdt{ + ClosID: spec.Linux.IntelRdt.ClosID, + L3CacheSchema: spec.Linux.IntelRdt.L3CacheSchema, + MemBwSchema: spec.Linux.IntelRdt.MemBwSchema, } } } + + // Set the host UID that should own the container's cgroup. + // This must be performed after setupUserNamespace, so that + // config.HostRootUID() returns the correct result. 
+ // + // Only set it if the container will have its own cgroup + // namespace and the cgroupfs will be mounted read/write. + // + hasCgroupNS := config.Namespaces.Contains(configs.NEWCGROUP) && config.Namespaces.PathOf(configs.NEWCGROUP) == "" + hasRwCgroupfs := false + if hasCgroupNS { + for _, m := range config.Mounts { + if m.Source == "cgroup" && filepath.Clean(m.Destination) == "/sys/fs/cgroup" && (m.Flags&unix.MS_RDONLY) == 0 { + hasRwCgroupfs = true + break + } + } + } + processUid := 0 + if spec.Process != nil { + // Chown the cgroup to the UID running the process, + // which is not necessarily UID 0 in the container + // namespace (e.g., an unprivileged UID in the host + // user namespace). + processUid = int(spec.Process.User.UID) + } + if hasCgroupNS && hasRwCgroupfs { + ownerUid, err := config.HostUID(processUid) + // There are two error cases; we can ignore both. + // + // 1. uidMappings is unset. Either there is no user + // namespace (fine), or it is an error (which is + // checked elsewhere). + // + // 2. The user is unmapped in the user namespace. This is an + // unusual configuration and might be an error. But it too + // will be checked elsewhere, so we can ignore it here. 
+ // + if err == nil { + config.Cgroups.OwnerUID = &ownerUid + } + } + if spec.Process != nil { config.OomScoreAdj = spec.Process.OOMScoreAdj - if spec.Process.SelinuxLabel != "" { - config.ProcessLabel = spec.Process.SelinuxLabel - } + config.NoNewPrivileges = spec.Process.NoNewPrivileges + config.Umask = spec.Process.User.Umask + config.ProcessLabel = spec.Process.SelinuxLabel if spec.Process.Capabilities != nil { config.Capabilities = &configs.Capabilities{ Bounding: spec.Process.Capabilities.Bounding, @@ -273,31 +497,123 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { return config, nil } -func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount { - flags, pgflags, data, ext := parseMountOptions(m.Options) - source := m.Source - device := m.Type - if flags&unix.MS_BIND != 0 { +func createLibcontainerMount(cwd string, m specs.Mount) (*configs.Mount, error) { + if !filepath.IsAbs(m.Destination) { + // Relax validation for backward compatibility + // TODO (runc v1.x.x): change warning to an error + // return nil, fmt.Errorf("mount destination %s is not absolute", m.Destination) + logrus.Warnf("mount destination %s is not absolute. Support for non-absolute mount destinations will be removed in a future release.", m.Destination) + } + mnt := parseMountOptions(m.Options) + + mnt.Destination = m.Destination + mnt.Source = m.Source + mnt.Device = m.Type + if mnt.Flags&unix.MS_BIND != 0 { // Any "type" the user specified is meaningless (and ignored) for // bind-mounts -- so we set it to "bind" because rootfs_linux.go // (incorrectly) relies on this for some checks. 
- device = "bind" - if !filepath.IsAbs(source) { - source = filepath.Join(cwd, m.Source) + mnt.Device = "bind" + if !filepath.IsAbs(mnt.Source) { + mnt.Source = filepath.Join(cwd, m.Source) } } - return &configs.Mount{ - Device: device, - Source: source, - Destination: m.Destination, - Data: data, - Flags: flags, - PropagationFlags: pgflags, - Extensions: ext, + + // None of the mount arguments can contain a null byte. Normally such + // strings would either cause some other failure or would just be truncated + // when we hit the null byte, but because we serialise these strings as + // netlink messages (which don't have special null-byte handling) we need + // to block this as early as possible. + if strings.IndexByte(mnt.Source, 0) >= 0 || + strings.IndexByte(mnt.Destination, 0) >= 0 || + strings.IndexByte(mnt.Device, 0) >= 0 { + return nil, errors.New("mount field contains null byte") } + + return mnt, nil } -func CreateCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { +// checkPropertyName checks if systemd property name is valid. A valid name +// should consist of latin letters only, and have least 3 of them. +func checkPropertyName(s string) error { + if len(s) < 3 { + return errors.New("too short") + } + // Check ASCII characters rather than Unicode runes. + for _, ch := range s { + if (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') { + continue + } + return errors.New("contains non-alphabetic character") + } + return nil +} + +// Some systemd properties are documented as having "Sec" suffix +// (e.g. TimeoutStopSec) but are expected to have "USec" suffix +// here, so let's provide conversion to improve compatibility. 
+func convertSecToUSec(value dbus.Variant) (dbus.Variant, error) { + var sec uint64 + const M = 1000000 + vi := value.Value() + switch value.Signature().String() { + case "y": + sec = uint64(vi.(byte)) * M + case "n": + sec = uint64(vi.(int16)) * M + case "q": + sec = uint64(vi.(uint16)) * M + case "i": + sec = uint64(vi.(int32)) * M + case "u": + sec = uint64(vi.(uint32)) * M + case "x": + sec = uint64(vi.(int64)) * M + case "t": + sec = vi.(uint64) * M + case "d": + sec = uint64(vi.(float64) * M) + default: + return value, errors.New("not a number") + } + return dbus.MakeVariant(sec), nil +} + +func initSystemdProps(spec *specs.Spec) ([]systemdDbus.Property, error) { + const keyPrefix = "org.systemd.property." + var sp []systemdDbus.Property + + for k, v := range spec.Annotations { + name := strings.TrimPrefix(k, keyPrefix) + if len(name) == len(k) { // prefix not there + continue + } + if err := checkPropertyName(name); err != nil { + return nil, fmt.Errorf("annotation %s name incorrect: %w", k, err) + } + value, err := dbus.ParseVariant(v, dbus.Signature{}) + if err != nil { + return nil, fmt.Errorf("annotation %s=%s value parse error: %w", k, v, err) + } + // Check for Sec suffix. + if trimName := strings.TrimSuffix(name, "Sec"); len(trimName) < len(name) { + // Check for a lowercase ascii a-z just before Sec. + if ch := trimName[len(trimName)-1]; ch >= 'a' && ch <= 'z' { + // Convert from Sec to USec. 
+ name = trimName + "USec" + value, err = convertSecToUSec(value) + if err != nil { + return nil, fmt.Errorf("annotation %s=%s value parse error: %w", k, v, err) + } + } + } + sp = append(sp, systemdDbus.Property{Name: name, Value: value}) + } + + return sp, nil +} + +func CreateCgroupConfig(opts *CreateOpts, defaultDevs []*devices.Device) (*configs.Cgroup, error) { var ( myCgroupPath string @@ -307,19 +623,30 @@ func CreateCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { ) c := &configs.Cgroup{ + Systemd: useSystemdCgroup, + Rootless: opts.RootlessCgroups, Resources: &configs.Resources{}, } + if useSystemdCgroup { + sp, err := initSystemdProps(spec) + if err != nil { + return nil, err + } + c.SystemdProps = sp + } + if spec.Linux != nil && spec.Linux.CgroupsPath != "" { - myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath) if useSystemdCgroup { myCgroupPath = spec.Linux.CgroupsPath + } else { + myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath) } } if useSystemdCgroup { if myCgroupPath == "" { - c.Parent = "system.slice" + // Default for c.Parent is set by systemd cgroup drivers. c.ScopePrefix = "runc" c.Name = name } else { @@ -342,255 +669,227 @@ func CreateCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { // In rootless containers, any attempt to make cgroup changes is likely to fail. // libcontainer will validate this but ignores the error. 
- c.Resources.AllowedDevices = AllowedDevices if spec.Linux != nil { r := spec.Linux.Resources - if r == nil { - return c, nil - } - for i, d := range spec.Linux.Resources.Devices { - var ( - t = "a" - major = int64(-1) - minor = int64(-1) - ) - if d.Type != "" { - t = d.Type + if r != nil { + for i, d := range spec.Linux.Resources.Devices { + var ( + t = "a" + major = int64(-1) + minor = int64(-1) + ) + if d.Type != "" { + t = d.Type + } + if d.Major != nil { + major = *d.Major + } + if d.Minor != nil { + minor = *d.Minor + } + if d.Access == "" { + return nil, fmt.Errorf("device access at %d field cannot be empty", i) + } + dt, err := stringToCgroupDeviceRune(t) + if err != nil { + return nil, err + } + c.Resources.Devices = append(c.Resources.Devices, &devices.Rule{ + Type: dt, + Major: major, + Minor: minor, + Permissions: devices.Permissions(d.Access), + Allow: d.Allow, + }) } - if d.Major != nil { - major = *d.Major + if r.Memory != nil { + if r.Memory.Limit != nil { + c.Resources.Memory = *r.Memory.Limit + } + if r.Memory.Reservation != nil { + c.Resources.MemoryReservation = *r.Memory.Reservation + } + if r.Memory.Swap != nil { + c.Resources.MemorySwap = *r.Memory.Swap + } + if r.Memory.Kernel != nil || r.Memory.KernelTCP != nil { + logrus.Warn("Kernel memory settings are ignored and will be removed") + } + if r.Memory.Swappiness != nil { + c.Resources.MemorySwappiness = r.Memory.Swappiness + } + if r.Memory.DisableOOMKiller != nil { + c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller + } } - if d.Minor != nil { - minor = *d.Minor - } - if d.Access == "" { - return nil, fmt.Errorf("device access at %d field cannot be empty", i) - } - dt, err := stringToCgroupDeviceRune(t) - if err != nil { - return nil, err - } - dd := &configs.Device{ - Type: dt, - Major: major, - Minor: minor, - Permissions: d.Access, - Allow: d.Allow, - } - c.Resources.Devices = append(c.Resources.Devices, dd) - } - if r.Memory != nil { - if r.Memory.Limit != nil { - 
c.Resources.Memory = *r.Memory.Limit - } - if r.Memory.Reservation != nil { - c.Resources.MemoryReservation = *r.Memory.Reservation - } - if r.Memory.Swap != nil { - c.Resources.MemorySwap = *r.Memory.Swap - } - if r.Memory.Kernel != nil { - c.Resources.KernelMemory = *r.Memory.Kernel - } - if r.Memory.KernelTCP != nil { - c.Resources.KernelMemoryTCP = *r.Memory.KernelTCP - } - if r.Memory.Swappiness != nil { - c.Resources.MemorySwappiness = r.Memory.Swappiness - } - if r.Memory.DisableOOMKiller != nil { - c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller - } - } - if r.CPU != nil { - if r.CPU.Shares != nil { - c.Resources.CpuShares = *r.CPU.Shares - } - if r.CPU.Quota != nil { - c.Resources.CpuQuota = *r.CPU.Quota - } - if r.CPU.Period != nil { - c.Resources.CpuPeriod = *r.CPU.Period - } - if r.CPU.RealtimeRuntime != nil { - c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime - } - if r.CPU.RealtimePeriod != nil { - c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod - } - if r.CPU.Cpus != "" { + if r.CPU != nil { + if r.CPU.Shares != nil { + c.Resources.CpuShares = *r.CPU.Shares + + // CpuWeight is used for cgroupv2 and should be converted + c.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(c.Resources.CpuShares) + } + if r.CPU.Quota != nil { + c.Resources.CpuQuota = *r.CPU.Quota + } + if r.CPU.Period != nil { + c.Resources.CpuPeriod = *r.CPU.Period + } + if r.CPU.RealtimeRuntime != nil { + c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime + } + if r.CPU.RealtimePeriod != nil { + c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod + } c.Resources.CpusetCpus = r.CPU.Cpus - } - if r.CPU.Mems != "" { c.Resources.CpusetMems = r.CPU.Mems } - } - if r.Pids != nil { - c.Resources.PidsLimit = r.Pids.Limit - } - if r.BlockIO != nil { - if r.BlockIO.Weight != nil { - c.Resources.BlkioWeight = *r.BlockIO.Weight + if r.Pids != nil { + c.Resources.PidsLimit = r.Pids.Limit } - if r.BlockIO.LeafWeight != nil { - c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight - } 
- if r.BlockIO.WeightDevice != nil { - for _, wd := range r.BlockIO.WeightDevice { - var weight, leafWeight uint16 - if wd.Weight != nil { - weight = *wd.Weight + if r.BlockIO != nil { + if r.BlockIO.Weight != nil { + c.Resources.BlkioWeight = *r.BlockIO.Weight + } + if r.BlockIO.LeafWeight != nil { + c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight + } + if r.BlockIO.WeightDevice != nil { + for _, wd := range r.BlockIO.WeightDevice { + var weight, leafWeight uint16 + if wd.Weight != nil { + weight = *wd.Weight + } + if wd.LeafWeight != nil { + leafWeight = *wd.LeafWeight + } + weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight) + c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) } - if wd.LeafWeight != nil { - leafWeight = *wd.LeafWeight + } + if r.BlockIO.ThrottleReadBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleReadBpsDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleWriteBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteBpsDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleReadIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleReadIOPSDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleWriteIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleWriteIOPSDevice = 
append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) } - weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight) - c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) } } - if r.BlockIO.ThrottleReadBpsDevice != nil { - for _, td := range r.BlockIO.ThrottleReadBpsDevice { - rate := td.Rate - throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) - c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) - } - } - if r.BlockIO.ThrottleWriteBpsDevice != nil { - for _, td := range r.BlockIO.ThrottleWriteBpsDevice { - rate := td.Rate - throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) - c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) - } - } - if r.BlockIO.ThrottleReadIOPSDevice != nil { - for _, td := range r.BlockIO.ThrottleReadIOPSDevice { - rate := td.Rate - throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) - c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) - } - } - if r.BlockIO.ThrottleWriteIOPSDevice != nil { - for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { - rate := td.Rate - throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) - c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) - } - } - } - for _, l := range r.HugepageLimits { - c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{ - Pagesize: l.Pagesize, - Limit: l.Limit, - }) - } - if r.Network != nil { - if r.Network.ClassID != nil { - c.Resources.NetClsClassid = *r.Network.ClassID - } - for _, m := range r.Network.Priorities { - c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{ - Interface: m.Name, - Priority: int64(m.Priority), + for _, l := range r.HugepageLimits { + 
c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{ + Pagesize: l.Pagesize, + Limit: l.Limit, }) } + if len(r.Rdma) > 0 { + c.Resources.Rdma = make(map[string]configs.LinuxRdma, len(r.Rdma)) + for k, v := range r.Rdma { + c.Resources.Rdma[k] = configs.LinuxRdma{ + HcaHandles: v.HcaHandles, + HcaObjects: v.HcaObjects, + } + } + } + if r.Network != nil { + if r.Network.ClassID != nil { + c.Resources.NetClsClassid = *r.Network.ClassID + } + for _, m := range r.Network.Priorities { + c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{ + Interface: m.Name, + Priority: int64(m.Priority), + }) + } + } + if len(r.Unified) > 0 { + // copy the map + c.Resources.Unified = make(map[string]string, len(r.Unified)) + for k, v := range r.Unified { + c.Resources.Unified[k] = v + } + } } } - // append the default allowed devices to the end of the list - c.Resources.Devices = append(c.Resources.Devices, AllowedDevices...) + + // Append the default allowed devices to the end of the list. 
+ for _, device := range defaultDevs { + c.Resources.Devices = append(c.Resources.Devices, &device.Rule) + } return c, nil } -func stringToCgroupDeviceRune(s string) (rune, error) { +func stringToCgroupDeviceRune(s string) (devices.Type, error) { switch s { case "a": - return 'a', nil + return devices.WildcardDevice, nil case "b": - return 'b', nil + return devices.BlockDevice, nil case "c": - return 'c', nil + return devices.CharDevice, nil default: return 0, fmt.Errorf("invalid cgroup device type %q", s) } } -func stringToDeviceRune(s string) (rune, error) { +func stringToDeviceRune(s string) (devices.Type, error) { switch s { case "p": - return 'p', nil - case "u": - return 'u', nil + return devices.FifoDevice, nil + case "u", "c": + return devices.CharDevice, nil case "b": - return 'b', nil - case "c": - return 'c', nil + return devices.BlockDevice, nil default: return 0, fmt.Errorf("invalid device type %q", s) } } -func createDevices(spec *specs.Spec, config *configs.Config) error { - // add whitelisted devices - config.Devices = []*configs.Device{ - { - Type: 'c', - Path: "/dev/null", - Major: 1, - Minor: 3, - FileMode: 0666, - Uid: 0, - Gid: 0, - }, - { - Type: 'c', - Path: "/dev/random", - Major: 1, - Minor: 8, - FileMode: 0666, - Uid: 0, - Gid: 0, - }, - { - Type: 'c', - Path: "/dev/full", - Major: 1, - Minor: 7, - FileMode: 0666, - Uid: 0, - Gid: 0, - }, - { - Type: 'c', - Path: "/dev/tty", - Major: 5, - Minor: 0, - FileMode: 0666, - Uid: 0, - Gid: 0, - }, - { - Type: 'c', - Path: "/dev/zero", - Major: 1, - Minor: 5, - FileMode: 0666, - Uid: 0, - Gid: 0, - }, - { - Type: 'c', - Path: "/dev/urandom", - Major: 1, - Minor: 9, - FileMode: 0666, - Uid: 0, - Gid: 0, - }, +func createDevices(spec *specs.Spec, config *configs.Config) ([]*devices.Device, error) { + // If a spec device is redundant with a default device, remove that default + // device (the spec one takes priority). 
+ dedupedAllowDevs := []*devices.Device{} + +next: + for _, ad := range AllowedDevices { + if ad.Path != "" && spec.Linux != nil { + for _, sd := range spec.Linux.Devices { + if sd.Path == ad.Path { + continue next + } + } + } + dedupedAllowDevs = append(dedupedAllowDevs, ad) + if ad.Path != "" { + config.Devices = append(config.Devices, ad) + } } - // merge in additional devices from the spec + + // Merge in additional devices from the spec. if spec.Linux != nil { for _, d := range spec.Linux.Devices { var uid, gid uint32 - var filemode os.FileMode = 0666 + var filemode os.FileMode = 0o666 if d.UID != nil { uid = *d.UID @@ -600,16 +899,18 @@ func createDevices(spec *specs.Spec, config *configs.Config) error { } dt, err := stringToDeviceRune(d.Type) if err != nil { - return err + return nil, err } if d.FileMode != nil { - filemode = *d.FileMode + filemode = *d.FileMode &^ unix.S_IFMT } - device := &configs.Device{ - Type: dt, + device := &devices.Device{ + Rule: devices.Rule{ + Type: dt, + Major: d.Major, + Minor: d.Minor, + }, Path: d.Path, - Major: d.Major, - Minor: d.Minor, FileMode: filemode, Uid: uid, Gid: gid, @@ -617,7 +918,8 @@ func createDevices(spec *specs.Spec, config *configs.Config) error { config.Devices = append(config.Devices, device) } } - return nil + + return dedupedAllowDevs, nil } func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { @@ -651,92 +953,56 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { return nil } -// parseMountOptions parses the string and returns the flags, propagation -// flags and any mount data that it contains. -func parseMountOptions(options []string) (int, []int, string, int) { +// parseMountOptions parses options and returns a configs.Mount +// structure with fields that depends on options set accordingly. 
+func parseMountOptions(options []string) *configs.Mount { var ( - flag int - pgflag []int - data []string - extFlags int + data []string + m configs.Mount + recAttrSet, recAttrClr uint64 ) - flags := map[string]struct { - clear bool - flag int - }{ - "acl": {false, unix.MS_POSIXACL}, - "async": {true, unix.MS_SYNCHRONOUS}, - "atime": {true, unix.MS_NOATIME}, - "bind": {false, unix.MS_BIND}, - "defaults": {false, 0}, - "dev": {true, unix.MS_NODEV}, - "diratime": {true, unix.MS_NODIRATIME}, - "dirsync": {false, unix.MS_DIRSYNC}, - "exec": {true, unix.MS_NOEXEC}, - "iversion": {false, unix.MS_I_VERSION}, - "lazytime": {false, unix.MS_LAZYTIME}, - "loud": {true, unix.MS_SILENT}, - "mand": {false, unix.MS_MANDLOCK}, - "noacl": {true, unix.MS_POSIXACL}, - "noatime": {false, unix.MS_NOATIME}, - "nodev": {false, unix.MS_NODEV}, - "nodiratime": {false, unix.MS_NODIRATIME}, - "noexec": {false, unix.MS_NOEXEC}, - "noiversion": {true, unix.MS_I_VERSION}, - "nolazytime": {true, unix.MS_LAZYTIME}, - "nomand": {true, unix.MS_MANDLOCK}, - "norelatime": {true, unix.MS_RELATIME}, - "nostrictatime": {true, unix.MS_STRICTATIME}, - "nosuid": {false, unix.MS_NOSUID}, - "rbind": {false, unix.MS_BIND | unix.MS_REC}, - "relatime": {false, unix.MS_RELATIME}, - "remount": {false, unix.MS_REMOUNT}, - "ro": {false, unix.MS_RDONLY}, - "rw": {true, unix.MS_RDONLY}, - "silent": {false, unix.MS_SILENT}, - "strictatime": {false, unix.MS_STRICTATIME}, - "suid": {true, unix.MS_NOSUID}, - "sync": {false, unix.MS_SYNCHRONOUS}, - } - propagationFlags := map[string]int{ - "private": unix.MS_PRIVATE, - "shared": unix.MS_SHARED, - "slave": unix.MS_SLAVE, - "unbindable": unix.MS_UNBINDABLE, - "rprivate": unix.MS_PRIVATE | unix.MS_REC, - "rshared": unix.MS_SHARED | unix.MS_REC, - "rslave": unix.MS_SLAVE | unix.MS_REC, - "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, - } - extensionFlags := map[string]struct { - clear bool - flag int - }{ - "tmpcopyup": {false, configs.EXT_COPYUP}, - } + initMaps() for _, 
o := range options { - // If the option does not exist in the flags table or the flag - // is not supported on the platform, - // then it is a data value for a specific fs type - if f, exists := flags[o]; exists && f.flag != 0 { + // If the option does not exist in the mountFlags table, + // or the flag is not supported on the platform, + // then it is a data value for a specific fs type. + if f, exists := mountFlags[o]; exists && f.flag != 0 { if f.clear { - flag &= ^f.flag + m.Flags &= ^f.flag } else { - flag |= f.flag + m.Flags |= f.flag + } + } else if f, exists := mountPropagationMapping[o]; exists && f != 0 { + m.PropagationFlags = append(m.PropagationFlags, f) + } else if f, exists := recAttrFlags[o]; exists { + if f.clear { + recAttrClr |= f.flag + } else { + recAttrSet |= f.flag + if f.flag&unix.MOUNT_ATTR__ATIME == f.flag { + // https://man7.org/linux/man-pages/man2/mount_setattr.2.html + // "cannot simply specify the access-time setting in attr_set, but must also include MOUNT_ATTR__ATIME in the attr_clr field." + recAttrClr |= unix.MOUNT_ATTR__ATIME + } } - } else if f, exists := propagationFlags[o]; exists && f != 0 { - pgflag = append(pgflag, f) } else if f, exists := extensionFlags[o]; exists && f.flag != 0 { if f.clear { - extFlags &= ^f.flag + m.Extensions &= ^f.flag } else { - extFlags |= f.flag + m.Extensions |= f.flag } } else { data = append(data, o) } } - return flag, pgflag, strings.Join(data, ","), extFlags + m.Data = strings.Join(data, ",") + if recAttrSet != 0 || recAttrClr != 0 { + m.RecAttr = &unix.MountAttr{ + Attr_set: recAttrSet, + Attr_clr: recAttrClr, + } + } + return &m } func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { @@ -749,6 +1015,11 @@ func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { return nil, nil } + // We don't currently support seccomp flags. 
+ if len(config.Flags) != 0 { + return nil, errors.New("seccomp flags are not yet supported by runc") + } + newConfig := new(configs.Seccomp) newConfig.Syscalls = []*configs.Syscall{} @@ -769,6 +1040,10 @@ func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { return nil, err } newConfig.DefaultAction = newDefaultAction + newConfig.DefaultErrnoRet = config.DefaultErrnoRet + + newConfig.ListenerPath = config.ListenerPath + newConfig.ListenerMetadata = config.ListenerMetadata // Loop through all syscall blocks and convert them to libcontainer format for _, call := range config.Syscalls { @@ -779,9 +1054,10 @@ func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { for _, name := range call.Names { newCall := configs.Syscall{ - Name: name, - Action: newAction, - Args: []*configs.Arg{}, + Name: name, + Action: newAction, + ErrnoRet: call.ErrnoRet, + Args: []*configs.Arg{}, } // Loop through all the arguments of the syscall and convert them for _, arg := range call.Args { @@ -807,20 +1083,31 @@ func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { } func createHooks(rspec *specs.Spec, config *configs.Config) { - config.Hooks = &configs.Hooks{} + config.Hooks = configs.Hooks{} if rspec.Hooks != nil { - for _, h := range rspec.Hooks.Prestart { cmd := createCommandHook(h) - config.Hooks.Prestart = append(config.Hooks.Prestart, configs.NewCommandHook(cmd)) + config.Hooks[configs.Prestart] = append(config.Hooks[configs.Prestart], configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.CreateRuntime { + cmd := createCommandHook(h) + config.Hooks[configs.CreateRuntime] = append(config.Hooks[configs.CreateRuntime], configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.CreateContainer { + cmd := createCommandHook(h) + config.Hooks[configs.CreateContainer] = append(config.Hooks[configs.CreateContainer], configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.StartContainer { + cmd := 
createCommandHook(h) + config.Hooks[configs.StartContainer] = append(config.Hooks[configs.StartContainer], configs.NewCommandHook(cmd)) } for _, h := range rspec.Hooks.Poststart { cmd := createCommandHook(h) - config.Hooks.Poststart = append(config.Hooks.Poststart, configs.NewCommandHook(cmd)) + config.Hooks[configs.Poststart] = append(config.Hooks[configs.Poststart], configs.NewCommandHook(cmd)) } for _, h := range rspec.Hooks.Poststop { cmd := createCommandHook(h) - config.Hooks.Poststop = append(config.Hooks.Poststop, configs.NewCommandHook(cmd)) + config.Hooks[configs.Poststop] = append(config.Hooks[configs.Poststop], configs.NewCommandHook(cmd)) } } } diff --git a/libcontainer/specconv/spec_linux_test.go b/libcontainer/specconv/spec_linux_test.go index da6a43a..56d8086 100644 --- a/libcontainer/specconv/spec_linux_test.go +++ b/libcontainer/specconv/spec_linux_test.go @@ -1,5 +1,3 @@ -// +build linux - package specconv import ( @@ -7,11 +5,12 @@ import ( "strings" "testing" - "golang.org/x/sys/unix" - + dbus "github.com/godbus/dbus/v5" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs/validate" + "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" ) func TestCreateCommandHookTimeout(t *testing.T) { @@ -41,6 +40,33 @@ func TestCreateHooks(t *testing.T) { Args: []string{"--some", "thing"}, }, }, + CreateRuntime: []specs.Hook{ + { + Path: "/some/hook/path", + }, + { + Path: "/some/hook2/path", + Args: []string{"--some", "thing"}, + }, + }, + CreateContainer: []specs.Hook{ + { + Path: "/some/hook/path", + }, + { + Path: "/some/hook2/path", + Args: []string{"--some", "thing"}, + }, + }, + StartContainer: []specs.Hook{ + { + Path: "/some/hook/path", + }, + { + Path: "/some/hook2/path", + Args: []string{"--some", "thing"}, + }, + }, Poststart: []specs.Hook{ { Path: "/some/hook/path", @@ -76,29 +102,96 @@ func TestCreateHooks(t 
*testing.T) { conf := &configs.Config{} createHooks(rspec, conf) - prestart := conf.Hooks.Prestart + prestart := conf.Hooks[configs.Prestart] if len(prestart) != 2 { t.Error("Expected 2 Prestart hooks") } - poststart := conf.Hooks.Poststart + createRuntime := conf.Hooks[configs.CreateRuntime] + + if len(createRuntime) != 2 { + t.Error("Expected 2 createRuntime hooks") + } + + createContainer := conf.Hooks[configs.CreateContainer] + + if len(createContainer) != 2 { + t.Error("Expected 2 createContainer hooks") + } + + startContainer := conf.Hooks[configs.StartContainer] + + if len(startContainer) != 2 { + t.Error("Expected 2 startContainer hooks") + } + + poststart := conf.Hooks[configs.Poststart] if len(poststart) != 3 { t.Error("Expected 3 Poststart hooks") } - poststop := conf.Hooks.Poststop + poststop := conf.Hooks[configs.Poststop] if len(poststop) != 4 { t.Error("Expected 4 Poststop hooks") } - } -func TestSetupSeccomp(t *testing.T) { + +func TestSetupSeccompNil(t *testing.T) { + seccomp, err := SetupSeccomp(nil) + if err != nil { + t.Error("Expected error to be nil") + } + + if seccomp != nil { + t.Error("Expected seccomp to be nil") + } +} + +func TestSetupSeccompEmpty(t *testing.T) { + conf := &specs.LinuxSeccomp{} + seccomp, err := SetupSeccomp(conf) + if err != nil { + t.Error("Expected error to be nil") + } + + if seccomp != nil { + t.Error("Expected seccomp to be nil") + } +} + +// TestSetupSeccompWrongAction tests that a wrong action triggers an error +func TestSetupSeccompWrongAction(t *testing.T) { conf := &specs.LinuxSeccomp{ - DefaultAction: "SCMP_ACT_ERRNO", - Architectures: []specs.Arch{specs.ArchX86_64, specs.ArchARM}, + DefaultAction: "SCMP_ACT_NON_EXIXTENT_ACTION", + } + _, err := SetupSeccomp(conf) + if err == nil { + t.Error("Expected error") + } +} + +// TestSetupSeccompWrongArchitecture tests that a wrong architecture triggers an error +func TestSetupSeccompWrongArchitecture(t *testing.T) { + conf := &specs.LinuxSeccomp{ + DefaultAction: 
"SCMP_ACT_ALLOW", + Architectures: []specs.Arch{"SCMP_ARCH_NON_EXISTENT_ARCH"}, + } + _, err := SetupSeccomp(conf) + if err == nil { + t.Error("Expected error") + } +} + +func TestSetupSeccomp(t *testing.T) { + errnoRet := uint(55) + conf := &specs.LinuxSeccomp{ + DefaultAction: "SCMP_ACT_ERRNO", + Architectures: []specs.Arch{specs.ArchX86_64, specs.ArchARM}, + ListenerPath: "/var/run/mysocket", + ListenerMetadata: "mymetadatastring", Syscalls: []specs.LinuxSyscall{ { Names: []string{"clone"}, @@ -113,26 +206,42 @@ func TestSetupSeccomp(t *testing.T) { }, }, { - Names: []string{ - "select", - "semctl", - "semget", - "semop", - "semtimedop", - "send", - "sendfile", - }, - Action: "SCMP_ACT_ALLOW", + Names: []string{"semctl"}, + Action: "SCMP_ACT_KILL", + }, + { + Names: []string{"semget"}, + Action: "SCMP_ACT_ERRNO", + }, + { + Names: []string{"send"}, + Action: "SCMP_ACT_ERRNO", + ErrnoRet: &errnoRet, + }, + { + Names: []string{"lchown"}, + Action: "SCMP_ACT_TRAP", + }, + { + Names: []string{"lremovexattr"}, + Action: "SCMP_ACT_TRACE", + }, + { + Names: []string{"mbind"}, + Action: "SCMP_ACT_LOG", + }, + { + Names: []string{"mknod"}, + Action: "SCMP_ACT_NOTIFY", }, }, } seccomp, err := SetupSeccomp(conf) - if err != nil { t.Errorf("Couldn't create Seccomp config: %v", err) } - if seccomp.DefaultAction != 2 { // SCMP_ACT_ERRNO + if seccomp.DefaultAction != configs.Errno { t.Error("Wrong conversion for DefaultAction") } @@ -144,6 +253,14 @@ func TestSetupSeccomp(t *testing.T) { t.Error("Expected architectures are not found") } + if seccomp.ListenerPath != "/var/run/mysocket" { + t.Error("Expected ListenerPath is wrong") + } + + if seccomp.ListenerMetadata != "mymetadatastring" { + t.Error("Expected ListenerMetadata is wrong") + } + calls := seccomp.Syscalls callsLength := len(calls) @@ -151,24 +268,59 @@ func TestSetupSeccomp(t *testing.T) { t.Errorf("Expected 8 syscalls, got :%d", callsLength) } - for i, call := range calls { - if i == 0 { + for _, call := range 
calls { + switch call.Name { + case "clone": + if call.Action != configs.Allow { + t.Error("Wrong conversion for the clone syscall action") + } expectedCloneSyscallArgs := configs.Arg{ Index: 0, - Op: 7, // SCMP_CMP_MASKED_EQ + Op: configs.MaskEqualTo, Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP, ValueTwo: 0, } if expectedCloneSyscallArgs != *call.Args[0] { t.Errorf("Wrong arguments conversion for the clone syscall under test") } + case "semctl": + if call.Action != configs.Kill { + t.Errorf("Wrong conversion for the %s syscall action", call.Name) + } + case "semget": + if call.Action != configs.Errno { + t.Errorf("Wrong conversion for the %s syscall action", call.Name) + } + if call.ErrnoRet != nil { + t.Errorf("Wrong error ret for the %s syscall", call.Name) + } + case "send": + if call.Action != configs.Errno { + t.Errorf("Wrong conversion for the %s syscall action", call.Name) + } + if *call.ErrnoRet != errnoRet { + t.Errorf("Wrong error ret for the %s syscall", call.Name) + } + case "lchown": + if call.Action != configs.Trap { + t.Errorf("Wrong conversion for the %s syscall action", call.Name) + } + case "lremovexattr": + if call.Action != configs.Trace { + t.Errorf("Wrong conversion for the %s syscall action", call.Name) + } + case "mbind": + if call.Action != configs.Log { + t.Errorf("Wrong conversion for the %s syscall action", call.Name) + } + case "mknod": + if call.Action != configs.Notify { + t.Errorf("Wrong conversion for the %s syscall action", call.Name) + } + default: + t.Errorf("Unexpected syscall %s found", call.Name) } - if call.Action != 4 { - t.Error("Wrong conversion for the clone syscall action") - } - } - } func TestLinuxCgroupWithMemoryResource(t *testing.T) { @@ -213,7 +365,7 @@ func TestLinuxCgroupWithMemoryResource(t *testing.T) { Spec: spec, } - cgroup, err := CreateCgroupConfig(opts) + cgroup, err := CreateCgroupConfig(opts, nil) if 
err != nil { t.Errorf("Couldn't create Cgroup config: %v", err) } @@ -230,12 +382,6 @@ func TestLinuxCgroupWithMemoryResource(t *testing.T) { if cgroup.Resources.MemorySwap != swap { t.Errorf("Expected to have %d as swap, got %d", swap, cgroup.Resources.MemorySwap) } - if cgroup.Resources.KernelMemory != kernel { - t.Errorf("Expected to have %d as Kernel Memory, got %d", kernel, cgroup.Resources.KernelMemory) - } - if cgroup.Resources.KernelMemoryTCP != kernelTCP { - t.Errorf("Expected to have %d as TCP Kernel Memory, got %d", kernelTCP, cgroup.Resources.KernelMemoryTCP) - } if cgroup.Resources.MemorySwappiness != swappinessPtr { t.Errorf("Expected to have %d as memory swappiness, got %d", swappinessPtr, cgroup.Resources.MemorySwappiness) } @@ -257,8 +403,7 @@ func TestLinuxCgroupSystemd(t *testing.T) { Spec: spec, } - cgroup, err := CreateCgroupConfig(opts) - + cgroup, err := CreateCgroupConfig(opts, nil) if err != nil { t.Errorf("Couldn't create Cgroup config: %v", err) } @@ -293,13 +438,12 @@ func TestLinuxCgroupSystemdWithEmptyPath(t *testing.T) { Spec: spec, } - cgroup, err := CreateCgroupConfig(opts) - + cgroup, err := CreateCgroupConfig(opts, nil) if err != nil { t.Errorf("Couldn't create Cgroup config: %v", err) } - expectedParent := "system.slice" + expectedParent := "" if cgroup.Parent != expectedParent { t.Errorf("Expected to have %s as Parent instead of %s", expectedParent, cgroup.Parent) } @@ -328,11 +472,12 @@ func TestLinuxCgroupSystemdWithInvalidPath(t *testing.T) { Spec: spec, } - _, err := CreateCgroupConfig(opts) + _, err := CreateCgroupConfig(opts, nil) if err == nil { t.Error("Expected to produce an error if not using the correct format for cgroup paths belonging to systemd") } } + func TestLinuxCgroupsPathSpecified(t *testing.T) { cgroupsPath := "/user/cgroups/path/id" @@ -347,7 +492,7 @@ func TestLinuxCgroupsPathSpecified(t *testing.T) { Spec: spec, } - cgroup, err := CreateCgroupConfig(opts) + cgroup, err := CreateCgroupConfig(opts, nil) if 
err != nil { t.Errorf("Couldn't create Cgroup config: %v", err) } @@ -365,7 +510,7 @@ func TestLinuxCgroupsPathNotSpecified(t *testing.T) { Spec: spec, } - cgroup, err := CreateCgroupConfig(opts) + cgroup, err := CreateCgroupConfig(opts, nil) if err != nil { t.Errorf("Couldn't create Cgroup config: %v", err) } @@ -390,6 +535,33 @@ func TestSpecconvExampleValidate(t *testing.T) { t.Errorf("Couldn't create libcontainer config: %v", err) } + if config.NoNewPrivileges != spec.Process.NoNewPrivileges { + t.Errorf("specconv NoNewPrivileges mismatch. Expected %v got %v", + spec.Process.NoNewPrivileges, config.NoNewPrivileges) + } + + validator := validate.New() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected specconv to produce valid container config: %v", err) + } +} + +func TestSpecconvNoLinuxSection(t *testing.T) { + spec := Example() + spec.Root.Path = "/" + spec.Linux = nil + spec.Hostname = "" + + opts := &CreateOpts{ + CgroupName: "ContainerID", + Spec: spec, + } + + config, err := CreateLibcontainerConfig(opts) + if err != nil { + t.Errorf("Couldn't create libcontainer config: %v", err) + } + validator := validate.New() if err := validator.Validate(config); err != nil { t.Errorf("Expected specconv to produce valid container config: %v", err) @@ -425,7 +597,7 @@ func TestDupNamespaces(t *testing.T) { func TestNonZeroEUIDCompatibleSpecconvValidate(t *testing.T) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { - t.Skip("userns is unsupported") + t.Skip("Test requires userns.") } spec := Example() @@ -450,3 +622,291 @@ func TestNonZeroEUIDCompatibleSpecconvValidate(t *testing.T) { t.Errorf("Expected specconv to produce valid rootless container config: %v", err) } } + +func TestInitSystemdProps(t *testing.T) { + type inT struct { + name, value string + } + type expT struct { + isErr bool + name string + value interface{} + } + + testCases := []struct { + desc string + in inT + exp expT + }{ + { + in: 
inT{"org.systemd.property.TimeoutStopUSec", "uint64 123456789"}, + exp: expT{false, "TimeoutStopUSec", uint64(123456789)}, + }, + { + desc: "convert USec to Sec (default numeric type)", + in: inT{"org.systemd.property.TimeoutStopSec", "456"}, + exp: expT{false, "TimeoutStopUSec", uint64(456000000)}, + }, + { + desc: "convert USec to Sec (byte)", + in: inT{"org.systemd.property.TimeoutStopSec", "byte 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (int16)", + in: inT{"org.systemd.property.TimeoutStopSec", "int16 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (uint16)", + in: inT{"org.systemd.property.TimeoutStopSec", "uint16 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (int32)", + in: inT{"org.systemd.property.TimeoutStopSec", "int32 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (uint32)", + in: inT{"org.systemd.property.TimeoutStopSec", "uint32 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (int64)", + in: inT{"org.systemd.property.TimeoutStopSec", "int64 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (uint64)", + in: inT{"org.systemd.property.TimeoutStopSec", "uint64 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (float)", + in: inT{"org.systemd.property.TimeoutStopSec", "234.789"}, + exp: expT{false, "TimeoutStopUSec", uint64(234789000)}, + }, + { + desc: "convert USec to Sec (bool -- invalid value)", + in: inT{"org.systemd.property.TimeoutStopSec", "false"}, + exp: expT{true, "", ""}, + }, + { + desc: "convert USec to Sec (string -- invalid value)", + in: inT{"org.systemd.property.TimeoutStopSec", "'covfefe'"}, + exp: expT{true, "", ""}, + }, + { + desc: "convert USec to Sec 
(bad variable name, no conversion)", + in: inT{"org.systemd.property.FOOSec", "123"}, + exp: expT{false, "FOOSec", 123}, + }, + { + in: inT{"org.systemd.property.CollectMode", "'inactive-or-failed'"}, + exp: expT{false, "CollectMode", "inactive-or-failed"}, + }, + { + desc: "unrelated property", + in: inT{"some.other.annotation", "0"}, + exp: expT{false, "", ""}, + }, + { + desc: "too short property name", + in: inT{"org.systemd.property.Xo", "1"}, + exp: expT{true, "", ""}, + }, + { + desc: "invalid character in property name", + in: inT{"org.systemd.property.Number1", "1"}, + exp: expT{true, "", ""}, + }, + { + desc: "invalid property value", + in: inT{"org.systemd.property.ValidName", "invalid-value"}, + exp: expT{true, "", ""}, + }, + } + + spec := &specs.Spec{} + + for _, tc := range testCases { + tc := tc + spec.Annotations = map[string]string{tc.in.name: tc.in.value} + + outMap, err := initSystemdProps(spec) + // t.Logf("input %+v, expected %+v, got err:%v out:%+v", tc.in, tc.exp, err, outMap) + + if tc.exp.isErr != (err != nil) { + t.Errorf("input %+v, expecting error: %v, got %v", tc.in, tc.exp.isErr, err) + } + expLen := 1 // expect a single item + if tc.exp.name == "" { + expLen = 0 // expect nothing + } + if len(outMap) != expLen { + t.Fatalf("input %+v, expected %d, got %d entries: %v", tc.in, expLen, len(outMap), outMap) + } + if expLen == 0 { + continue + } + + out := outMap[0] + if tc.exp.name != out.Name { + t.Errorf("input %+v, expecting name: %q, got %q", tc.in, tc.exp.name, out.Name) + } + expValue := dbus.MakeVariant(tc.exp.value).String() + if expValue != out.Value.String() { + t.Errorf("input %+v, expecting value: %s, got %s", tc.in, expValue, out.Value) + } + } +} + +func TestIsValidName(t *testing.T) { + testCases := []struct { + in string + valid bool + }{ + {"", false}, // too short + {"xx", false}, // too short + {"xxx", true}, + {"someValidName", true}, + {"A name", false}, // space + {"3335", false}, // numbers + {"Name1", false}, // 
numbers + {"Кир", false}, // non-ascii + {"მადლობა", false}, // non-ascii + {"合い言葉", false}, // non-ascii + } + + for _, tc := range testCases { + err := checkPropertyName(tc.in) + if (err == nil) != tc.valid { + t.Errorf("case %q: expected valid: %v, got error: %v", tc.in, tc.valid, err) + } + } +} + +func BenchmarkIsValidName(b *testing.B) { + for i := 0; i < b.N; i++ { + for _, s := range []string{"", "xx", "xxx", "someValidName", "A name", "Кир", "მადლობა", "合い言葉"} { + _ = checkPropertyName(s) + } + } +} + +func TestNullProcess(t *testing.T) { + spec := Example() + spec.Process = nil + + _, err := CreateLibcontainerConfig(&CreateOpts{ + Spec: spec, + }) + if err != nil { + t.Errorf("Null process should be forbidden") + } +} + +func TestCreateDevices(t *testing.T) { + spec := Example() + + // dummy uid/gid for /dev/tty; will enable the test to check if createDevices() + // preferred the spec's device over the redundant default device + ttyUid := uint32(1000) + ttyGid := uint32(1000) + fm := os.FileMode(0o666) + + spec.Linux = &specs.Linux{ + Devices: []specs.LinuxDevice{ + { + // This is purposely redundant with one of runc's default devices + Path: "/dev/tty", + Type: "c", + Major: 5, + Minor: 0, + FileMode: &fm, + UID: &ttyUid, + GID: &ttyGid, + }, + { + // This is purposely not redundant with one of runc's default devices + Path: "/dev/ram0", + Type: "b", + Major: 1, + Minor: 0, + }, + }, + } + + conf := &configs.Config{} + + defaultDevs, err := createDevices(spec, conf) + if err != nil { + t.Errorf("failed to create devices: %v", err) + } + + // Verify the returned default devices has the /dev/tty entry deduplicated + found := false + for _, d := range defaultDevs { + if d.Path == "/dev/tty" { + if found { + t.Errorf("createDevices failed: returned a duplicated device entry: %v", defaultDevs) + } + found = true + } + } + + // Verify that createDevices() placed all default devices in the config + for _, allowedDev := range AllowedDevices { + if 
allowedDev.Path == "" { + continue + } + + found := false + for _, configDev := range conf.Devices { + if configDev.Path == allowedDev.Path { + found = true + } + } + if !found { + configDevPaths := []string{} + for _, configDev := range conf.Devices { + configDevPaths = append(configDevPaths, configDev.Path) + } + t.Errorf("allowedDevice %s was not found in the config's devices: %v", allowedDev.Path, configDevPaths) + } + } + + // Verify that createDevices() deduplicated the /dev/tty entry in the config + for _, configDev := range conf.Devices { + if configDev.Path == "/dev/tty" { + wantDev := &devices.Device{ + Path: "/dev/tty", + FileMode: 0o666, + Uid: 1000, + Gid: 1000, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 5, + Minor: 0, + }, + } + + if *configDev != *wantDev { + t.Errorf("redundant dev was not deduplicated correctly: want %v, got %v", wantDev, configDev) + } + } + } + + // Verify that createDevices() added the entry for /dev/ram0 in the config + found = false + for _, configDev := range conf.Devices { + if configDev.Path == "/dev/ram0" { + found = true + break + } + } + if !found { + t.Errorf("device /dev/ram0 not found in config devices; got %v", conf.Devices) + } +} diff --git a/libcontainer/stacktrace/capture.go b/libcontainer/stacktrace/capture.go deleted file mode 100644 index 0bbe149..0000000 --- a/libcontainer/stacktrace/capture.go +++ /dev/null @@ -1,27 +0,0 @@ -package stacktrace - -import "runtime" - -// Capture captures a stacktrace for the current calling go program -// -// skip is the number of frames to skip -func Capture(userSkip int) Stacktrace { - var ( - skip = userSkip + 1 // add one for our own function - frames []Frame - prevPc uintptr - ) - for i := skip; ; i++ { - pc, file, line, ok := runtime.Caller(i) - //detect if caller is repeated to avoid loop, gccgo - //currently runs into a loop without this check - if !ok || pc == prevPc { - break - } - frames = append(frames, NewFrame(pc, file, line)) - prevPc = pc - } - 
return Stacktrace{ - Frames: frames, - } -} diff --git a/libcontainer/stacktrace/capture_test.go b/libcontainer/stacktrace/capture_test.go deleted file mode 100644 index 978f6c4..0000000 --- a/libcontainer/stacktrace/capture_test.go +++ /dev/null @@ -1,31 +0,0 @@ -package stacktrace - -import ( - "strings" - "testing" -) - -func captureFunc() Stacktrace { - return Capture(0) -} - -func TestCaptureTestFunc(t *testing.T) { - stack := captureFunc() - - if len(stack.Frames) == 0 { - t.Fatal("expected stack frames to be returned") - } - - // the first frame is the caller - frame := stack.Frames[0] - if expected := "captureFunc"; frame.Function != expected { - t.Fatalf("expected function %q but received %q", expected, frame.Function) - } - expected := "/runc/libcontainer/stacktrace" - if !strings.HasSuffix(frame.Package, expected) { - t.Fatalf("expected package %q but received %q", expected, frame.Package) - } - if expected := "capture_test.go"; frame.File != expected { - t.Fatalf("expected file %q but received %q", expected, frame.File) - } -} diff --git a/libcontainer/stacktrace/frame.go b/libcontainer/stacktrace/frame.go deleted file mode 100644 index 0d590d9..0000000 --- a/libcontainer/stacktrace/frame.go +++ /dev/null @@ -1,38 +0,0 @@ -package stacktrace - -import ( - "path/filepath" - "runtime" - "strings" -) - -// NewFrame returns a new stack frame for the provided information -func NewFrame(pc uintptr, file string, line int) Frame { - fn := runtime.FuncForPC(pc) - if fn == nil { - return Frame{} - } - pack, name := parseFunctionName(fn.Name()) - return Frame{ - Line: line, - File: filepath.Base(file), - Package: pack, - Function: name, - } -} - -func parseFunctionName(name string) (string, string) { - i := strings.LastIndex(name, ".") - if i == -1 { - return "", name - } - return name[:i], name[i+1:] -} - -// Frame contains all the information for a stack frame within a go program -type Frame struct { - File string - Function string - Package string - Line int -} 
diff --git a/libcontainer/stacktrace/frame_test.go b/libcontainer/stacktrace/frame_test.go deleted file mode 100644 index c6fc78e..0000000 --- a/libcontainer/stacktrace/frame_test.go +++ /dev/null @@ -1,20 +0,0 @@ -package stacktrace - -import "testing" - -func TestParsePackageName(t *testing.T) { - var ( - name = "github.com/opencontainers/runc/libcontainer/stacktrace.captureFunc" - expectedPackage = "github.com/opencontainers/runc/libcontainer/stacktrace" - expectedFunction = "captureFunc" - ) - - pack, funcName := parseFunctionName(name) - if pack != expectedPackage { - t.Fatalf("expected package %q but received %q", expectedPackage, pack) - } - - if funcName != expectedFunction { - t.Fatalf("expected function %q but received %q", expectedFunction, funcName) - } -} diff --git a/libcontainer/stacktrace/stacktrace.go b/libcontainer/stacktrace/stacktrace.go deleted file mode 100644 index 5e8b58d..0000000 --- a/libcontainer/stacktrace/stacktrace.go +++ /dev/null @@ -1,5 +0,0 @@ -package stacktrace - -type Stacktrace struct { - Frames []Frame -} diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 4e03b8b..585a04f 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -1,23 +1,22 @@ -// +build linux - package libcontainer import ( + "errors" "fmt" "os" "os/exec" - "runtime" - "syscall" //only for Exec + "strconv" + + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" "github.com/opencontainers/runc/libcontainer/apparmor" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" - "github.com/opencontainers/selinux/go-selinux/label" - "github.com/pkg/errors" - - "golang.org/x/sys/unix" ) type linuxStandardInit struct { @@ -25,6 +24,8 @@ 
type linuxStandardInit struct { consoleSocket *os.File parentPid int fifoFd int + logFd int + mountFds []int config *initConfig } @@ -41,17 +42,15 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { // Create a unique per session container name that we can join in setns; // However, other containers can also join it. - return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms + return "_ses." + l.config.ContainerId, 0xffffffff, newperms } func (l *linuxStandardInit) Init() error { - runtime.LockOSThread() - defer runtime.UnlockOSThread() if !l.config.Config.NoNewKeyring { - if err := label.SetKeyLabel(l.config.ProcessLabel); err != nil { + if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil { return err } - defer label.SetKeyLabel("") + defer selinux.SetKeyLabel("") //nolint: errcheck ringname, keepperms, newperms := l.getSessionRingParams() // Do not inherit the parent's session keyring. @@ -64,15 +63,15 @@ func (l *linuxStandardInit) Init() error { // // TODO(cyphar): Log this so people know what's going on, once we // have proper logging in 'runc init'. - if errors.Cause(err) != unix.ENOSYS { - return errors.Wrap(err, "join session keyring") + if !errors.Is(err, unix.ENOSYS) { + return fmt.Errorf("unable to join session keyring: %w", err) } } else { - // Make session keyring searcheable. If we've gotten this far we + // Make session keyring searchable. If we've gotten this far we // bail on any error -- we don't want to have a keyring with bad // permissions. 
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { - return errors.Wrap(err, "mod keyring permissions") + return fmt.Errorf("unable to mod keyring permissions: %w", err) } } } @@ -84,10 +83,25 @@ func (l *linuxStandardInit) Init() error { return err } - label.Init() - if err := prepareRootfs(l.pipe, l.config); err != nil { + // initialises the labeling system + selinux.GetEnabled() + + // We don't need the mountFds after prepareRootfs() nor if it fails. + err := prepareRootfs(l.pipe, l.config, l.mountFds) + for _, m := range l.mountFds { + if m == -1 { + continue + } + + if err := unix.Close(m); err != nil { + return fmt.Errorf("Unable to close mountFds fds: %w", err) + } + } + + if err != nil { return err } + // Set up the console. This has to be done *before* we finalize the rootfs, // but *after* we've given the user the chance to set up all of the mounts // they wanted. @@ -96,7 +110,7 @@ func (l *linuxStandardInit) Init() error { return err } if err := system.Setctty(); err != nil { - return errors.Wrap(err, "setctty") + return &os.SyscallError{Syscall: "ioctl(setctty)", Err: err} } } @@ -109,52 +123,57 @@ func (l *linuxStandardInit) Init() error { if hostname := l.config.Config.Hostname; hostname != "" { if err := unix.Sethostname([]byte(hostname)); err != nil { - return errors.Wrap(err, "sethostname") + return &os.SyscallError{Syscall: "sethostname", Err: err} } } if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { - return errors.Wrap(err, "apply apparmor profile") + return fmt.Errorf("unable to apply apparmor profile: %w", err) } for key, value := range l.config.Config.Sysctl { if err := writeSystemProperty(key, value); err != nil { - return errors.Wrapf(err, "write sysctl key %s", key) + return err } } for _, path := range l.config.Config.ReadonlyPaths { if err := readonlyPath(path); err != nil { - return errors.Wrapf(err, "readonly path %s", path) + return fmt.Errorf("can't make %q read-only: %w", path, err) } 
} for _, path := range l.config.Config.MaskPaths { if err := maskPath(path, l.config.Config.MountLabel); err != nil { - return errors.Wrapf(err, "mask path %s", path) + return fmt.Errorf("can't mask path %s: %w", path, err) } } pdeath, err := system.GetParentDeathSignal() if err != nil { - return errors.Wrap(err, "get pdeath signal") + return fmt.Errorf("can't get pdeath signal: %w", err) } if l.config.NoNewPrivileges { if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { - return errors.Wrap(err, "set nonewprivileges") + return &os.SyscallError{Syscall: "prctl(SET_NO_NEW_PRIVS)", Err: err} } } // Tell our parent that we're ready to Execv. This must be done before the // Seccomp rules have been applied, because we need to be able to read and // write to a socket. if err := syncParentReady(l.pipe); err != nil { - return errors.Wrap(err, "sync ready") + return fmt.Errorf("sync ready: %w", err) } - if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil { - return errors.Wrap(err, "set process label") + if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil { + return fmt.Errorf("can't set process label: %w", err) } - defer label.SetProcessLabel("") + defer selinux.SetExecLabel("") //nolint: errcheck // Without NoNewPrivileges seccomp is a privileged operation, so we need to // do this before dropping capabilities; otherwise do it as late as possible // just before execve so as few syscalls take place after it as possible. if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges { - if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp) + if err != nil { + return err + } + + if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { return err } } @@ -164,7 +183,7 @@ func (l *linuxStandardInit) Init() error { // finalizeNamespace can change user/group which clears the parent death // signal, so we restore it here. 
if err := pdeath.Restore(); err != nil { - return errors.Wrap(err, "restore pdeath signal") + return fmt.Errorf("can't restore pdeath signal: %w", err) } // Compare the parent from the initial start of the init process and make // sure that it did not change. if the parent changes that means it died @@ -179,36 +198,57 @@ func (l *linuxStandardInit) Init() error { if err != nil { return err } + // Set seccomp as close to execve as possible, so as few syscalls take + // place afterward (reducing the amount of syscalls that users need to + // enable in their seccomp profiles). However, this needs to be done + // before closing the pipe since we need it to pass the seccompFd to + // the parent. + if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { + seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp) + if err != nil { + return fmt.Errorf("unable to init seccomp: %w", err) + } + + if err := syncParentSeccomp(l.pipe, seccompFd); err != nil { + return err + } + } // Close the pipe to signal that we have completed our init. - l.pipe.Close() + logrus.Debugf("init: closing the pipe to signal completion") + _ = l.pipe.Close() + + // Close the log pipe fd so the parent's ForwardLogs can exit. + if err := unix.Close(l.logFd); err != nil { + return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} + } + // Wait for the FIFO to be opened on the other side before exec-ing the // user process. We open it through /proc/self/fd/$fd, because the fd that // was given to us was an O_PATH fd to the fifo itself. Linux allows us to // re-open an O_PATH fd through /proc. 
- fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0) + fifoPath := "/proc/self/fd/" + strconv.Itoa(l.fifoFd) + fd, err := unix.Open(fifoPath, unix.O_WRONLY|unix.O_CLOEXEC, 0) if err != nil { - return newSystemErrorWithCause(err, "open exec fifo") + return &os.PathError{Op: "open exec fifo", Path: fifoPath, Err: err} } if _, err := unix.Write(fd, []byte("0")); err != nil { - return newSystemErrorWithCause(err, "write 0 exec fifo") + return &os.PathError{Op: "write exec fifo", Path: fifoPath, Err: err} } + // Close the O_PATH fifofd fd before exec because the kernel resets // dumpable in the wrong order. This has been fixed in newer kernels, but // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels. // N.B. the core issue itself (passing dirfds to the host filesystem) has // since been resolved. // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 - unix.Close(l.fifoFd) - // Set seccomp as close to execve as possible, so as few syscalls take - // place afterward (reducing the amount of syscalls that users need to - // enable in their seccomp profiles). 
- if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges { - if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { - return newSystemErrorWithCause(err, "init seccomp") - } + _ = unix.Close(l.fifoFd) + + s := l.config.SpecState + s.Pid = unix.Getpid() + s.Status = specs.StateCreated + if err := l.config.Config.Hooks[configs.StartContainer].RunHooks(s); err != nil { + return err } - if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil { - return newSystemErrorWithCause(err, "exec user process") - } - return nil + + return system.Exec(name, l.config.Args[0:], os.Environ()) } diff --git a/libcontainer/state_linux.go b/libcontainer/state_linux.go index 5c16a42..aa6259b 100644 --- a/libcontainer/state_linux.go +++ b/libcontainer/state_linux.go @@ -1,5 +1,3 @@ -// +build linux - package libcontainer import ( @@ -8,7 +6,7 @@ import ( "path/filepath" "github.com/opencontainers/runc/libcontainer/configs" - + "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -38,7 +36,8 @@ type containerState interface { } func destroy(c *linuxContainer) error { - if !c.config.Namespaces.Contains(configs.NEWPID) { + if !c.config.Namespaces.Contains(configs.NEWPID) || + c.config.Namespaces.PathOf(configs.NEWPID) != "" { if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil { logrus.Warn(err) } @@ -61,17 +60,21 @@ func destroy(c *linuxContainer) error { } func runPoststopHooks(c *linuxContainer) error { - if c.config.Hooks != nil { - s, err := c.currentOCIState() - if err != nil { - return err - } - for _, hook := range c.config.Hooks.Poststop { - if err := hook.Run(s); err != nil { - return err - } - } + hooks := c.config.Hooks + if hooks == nil { + return nil } + + s, err := c.currentOCIState() + if err != nil { + return err + } + s.Status = specs.StateStopped + + if err := hooks[configs.Poststop].RunHooks(s); err != nil { + return err + } + return nil } @@ -111,12 +114,8 @@ 
func (r *runningState) status() Status { func (r *runningState) transition(s containerState) error { switch s.(type) { case *stoppedState: - t, err := r.c.runType() - if err != nil { - return err - } - if t == Running { - return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped) + if r.c.runType() == Running { + return ErrRunning } r.c.state = s return nil @@ -130,12 +129,8 @@ func (r *runningState) transition(s containerState) error { } func (r *runningState) destroy() error { - t, err := r.c.runType() - if err != nil { - return err - } - if t == Running { - return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped) + if r.c.runType() == Running { + return ErrRunning } return destroy(r.c) } @@ -160,7 +155,7 @@ func (i *createdState) transition(s containerState) error { } func (i *createdState) destroy() error { - i.c.initProcess.signal(unix.SIGKILL) + _ = i.c.initProcess.signal(unix.SIGKILL) return destroy(i.c) } @@ -186,17 +181,14 @@ func (p *pausedState) transition(s containerState) error { } func (p *pausedState) destroy() error { - t, err := p.c.runType() - if err != nil { - return err - } + t := p.c.runType() if t != Running && t != Created { if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil { return err } return destroy(p.c) } - return newGenericError(fmt.Errorf("container is paused"), ContainerPaused) + return ErrPaused } // restoredState is the same as the running state but also has associated checkpoint diff --git a/libcontainer/state_linux_test.go b/libcontainer/state_linux_test.go index 6ef516b..413626d 100644 --- a/libcontainer/state_linux_test.go +++ b/libcontainer/state_linux_test.go @@ -1,8 +1,7 @@ -// +build linux - package libcontainer import ( + "errors" "reflect" "testing" ) @@ -24,11 +23,6 @@ func TestStateStatus(t *testing.T) { } } -func isStateTransitionError(err error) bool { - _, ok := err.(*stateTransitionError) - return ok -} - func testTransitions(t *testing.T, initialState 
containerState, valid []containerState) { validMap := map[reflect.Type]interface{}{} for _, validState := range valid { @@ -48,7 +42,8 @@ func testTransitions(t *testing.T, initialState containerState, valid []containe if err == nil { t.Fatal("transition should fail") } - if !isStateTransitionError(err) { + var stErr *stateTransitionError + if !errors.As(err, &stErr) { t.Fatal("expected stateTransitionError") } }) diff --git a/libcontainer/sync.go b/libcontainer/sync.go index a8704a2..c9a23ef 100644 --- a/libcontainer/sync.go +++ b/libcontainer/sync.go @@ -2,6 +2,7 @@ package libcontainer import ( "encoding/json" + "errors" "fmt" "io" @@ -12,60 +13,81 @@ type syncType string // Constants that are used for synchronisation between the parent and child // during container setup. They come in pairs (with procError being a generic -// response which is followed by a &genericError). +// response which is followed by an &initError). // // [ child ] <-> [ parent ] // // procHooks --> [run hooks] // <-- procResume // -// procConsole --> -// <-- procConsoleReq -// [send(fd)] --> [recv(fd)] -// <-- procConsoleAck -// // procReady --> [final setup] // <-- procRun +// +// procSeccomp --> [pick up seccomp fd with pidfd_getfd()] +// <-- procSeccompDone const ( - procError syncType = "procError" - procReady syncType = "procReady" - procRun syncType = "procRun" - procHooks syncType = "procHooks" - procResume syncType = "procResume" + procError syncType = "procError" + procReady syncType = "procReady" + procRun syncType = "procRun" + procHooks syncType = "procHooks" + procResume syncType = "procResume" + procSeccomp syncType = "procSeccomp" + procSeccompDone syncType = "procSeccompDone" ) type syncT struct { Type syncType `json:"type"` + Fd int `json:"fd"` +} + +// initError is used to wrap errors for passing them via JSON, +// as encoding/json can't unmarshal into error type. 
+type initError struct { + Message string `json:"message,omitempty"` +} + +func (i initError) Error() string { + return i.Message } // writeSync is used to write to a synchronisation pipe. An error is returned // if there was a problem writing the payload. func writeSync(pipe io.Writer, sync syncType) error { - return utils.WriteJSON(pipe, syncT{sync}) + return writeSyncWithFd(pipe, sync, -1) +} + +// writeSyncWithFd is used to write to a synchronisation pipe. An error is +// returned if there was a problem writing the payload. +func writeSyncWithFd(pipe io.Writer, sync syncType, fd int) error { + if err := utils.WriteJSON(pipe, syncT{sync, fd}); err != nil { + return fmt.Errorf("writing syncT %q: %w", string(sync), err) + } + return nil } // readSync is used to read from a synchronisation pipe. An error is returned -// if we got a genericError, the pipe was closed, or we got an unexpected flag. +// if we got an initError, the pipe was closed, or we got an unexpected flag. func readSync(pipe io.Reader, expected syncType) error { var procSync syncT if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { - if err == io.EOF { - return fmt.Errorf("parent closed synchronisation channel") + if errors.Is(err, io.EOF) { + return errors.New("parent closed synchronisation channel") + } + return fmt.Errorf("failed reading error from parent: %w", err) + } + + if procSync.Type == procError { + var ierr initError + + if err := json.NewDecoder(pipe).Decode(&ierr); err != nil { + return fmt.Errorf("failed reading error from parent: %w", err) } - if procSync.Type == procError { - var ierr genericError + return &ierr + } - if err := json.NewDecoder(pipe).Decode(&ierr); err != nil { - return fmt.Errorf("failed reading error from parent: %v", err) - } - - return &ierr - } - - if procSync.Type != expected { - return fmt.Errorf("invalid synchronisation flag from parent") - } + if procSync.Type != expected { + return errors.New("invalid synchronisation flag from parent") } return 
nil } @@ -77,17 +99,17 @@ func parseSync(pipe io.Reader, fn func(*syncT) error) error { for { var sync syncT if err := dec.Decode(&sync); err != nil { - if err == io.EOF { + if errors.Is(err, io.EOF) { break } return err } // We handle this case outside fn for cleanliness reasons. - var ierr *genericError + var ierr *initError if sync.Type == procError { - if err := dec.Decode(&ierr); err != nil && err != io.EOF { - return newSystemErrorWithCause(err, "decoding proc error from init") + if err := dec.Decode(&ierr); err != nil && !errors.Is(err, io.EOF) { + return fmt.Errorf("error decoding proc error from init: %w", err) } if ierr != nil { return ierr diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go index a4ae890..e1d6eb1 100644 --- a/libcontainer/system/linux.go +++ b/libcontainer/system/linux.go @@ -1,3 +1,4 @@ +//go:build linux // +build linux package system @@ -5,26 +6,11 @@ package system import ( "os" "os/exec" - "syscall" // only for exec "unsafe" - "github.com/opencontainers/runc/libcontainer/user" "golang.org/x/sys/unix" ) -// If arg2 is nonzero, set the "child subreaper" attribute of the -// calling process; if arg2 is zero, unset the attribute. When a -// process is marked as a child subreaper, all of the children -// that it creates, and their descendants, will be marked as -// having a subreaper. In effect, a subreaper fulfills the role -// of init(1) for its descendant processes. Upon termination of -// a process that is orphaned (i.e., its immediate parent has -// already terminated) and marked as having a subreaper, the -// nearest still living ancestor subreaper will receive a SIGCHLD -// signal and be able to wait(2) on the process to discover its -// termination status. 
-const PR_SET_CHILD_SUBREAPER = 36 - type ParentDeathSignal int func (p ParentDeathSignal) Restore() error { @@ -51,15 +37,16 @@ func Execv(cmd string, args []string, env []string) error { return err } - return syscall.Exec(name, args, env) + return Exec(name, args, env) } -func Prlimit(pid, resource int, limit unix.Rlimit) error { - _, _, err := unix.RawSyscall6(unix.SYS_PRLIMIT64, uintptr(pid), uintptr(resource), uintptr(unsafe.Pointer(&limit)), uintptr(unsafe.Pointer(&limit)), 0, 0) - if err != 0 { - return err +func Exec(cmd string, args []string, env []string) error { + for { + err := unix.Exec(cmd, args, env) + if err != unix.EINTR { //nolint:errorlint // unix errors are bare + return &os.PathError{Op: "exec", Path: cmd, Err: err} + } } - return nil } func SetParentDeathSignal(sig uintptr) error { @@ -100,47 +87,9 @@ func Setctty() error { return nil } -// RunningInUserNS detects whether we are currently running in a user namespace. -// Originally copied from github.com/lxc/lxd/shared/util.go -func RunningInUserNS() bool { - uidmap, err := user.CurrentProcessUIDMap() - if err != nil { - // This kernel-provided file only exists if user namespaces are supported - return false - } - return UIDMapInUserNS(uidmap) -} - -func UIDMapInUserNS(uidmap []user.IDMap) bool { - /* - * We assume we are in the initial user namespace if we have a full - * range - 4294967295 uids starting at uid 0. 
- */ - if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 { - return false - } - return true -} - -// GetParentNSeuid returns the euid within the parent user namespace -func GetParentNSeuid() int64 { - euid := int64(os.Geteuid()) - uidmap, err := user.CurrentProcessUIDMap() - if err != nil { - // This kernel-provided file only exists if user namespaces are supported - return euid - } - for _, um := range uidmap { - if um.ID <= euid && euid <= um.ID+um.Count-1 { - return um.ParentID + euid - um.ID - } - } - return euid -} - // SetSubreaper sets the value i as the subreaper setting for the calling process func SetSubreaper(i int) error { - return unix.Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) + return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) } // GetSubreaper returns the subreaper setting for the calling process diff --git a/libcontainer/system/proc.go b/libcontainer/system/proc.go index 79232a4..774443e 100644 --- a/libcontainer/system/proc.go +++ b/libcontainer/system/proc.go @@ -2,7 +2,7 @@ package system import ( "fmt" - "io/ioutil" + "os" "path/filepath" "strconv" "strings" @@ -19,6 +19,8 @@ const ( // Only values for Linux 3.14 and later are listed here Stopped State = 'T' TracingStop State = 't' Zombie State = 'Z' + Parked State = 'P' + Idle State = 'I' ) // String forms of the state from proc(5)'s documentation for @@ -39,6 +41,10 @@ func (s State) String() string { return "tracing stop" case Zombie: return "zombie" + case Parked: + return "parked" + case Idle: + return "idle" // kernel thread default: return fmt.Sprintf("unknown (%c)", s) } @@ -48,9 +54,6 @@ func (s State) String() string { // described in proc(5) with names based on the /proc/[pid]/status // fields. type Stat_t struct { - // PID is the process ID. - PID uint - // Name is the command run by the process. 
Name string @@ -64,50 +67,61 @@ type Stat_t struct { // Stat returns a Stat_t instance for the specified process. func Stat(pid int) (stat Stat_t, err error) { - bytes, err := ioutil.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat")) + bytes, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat")) if err != nil { return stat, err } return parseStat(string(bytes)) } -// GetProcessStartTime is deprecated. Use Stat(pid) and -// Stat_t.StartTime instead. -func GetProcessStartTime(pid int) (string, error) { - stat, err := Stat(pid) - if err != nil { - return "", err - } - return fmt.Sprintf("%d", stat.StartTime), nil -} - func parseStat(data string) (stat Stat_t, err error) { - // From proc(5), field 2 could contain space and is inside `(` and `)`. - // The following is an example: + // Example: // 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0 - i := strings.LastIndex(data, ")") - if i <= 2 || i >= len(data)-1 { - return stat, fmt.Errorf("invalid stat data: %q", data) + // The fields are space-separated, see full description in proc(5). + // + // We are only interested in: + // * field 2: process name. It is the only field enclosed into + // parenthesis, as it can contain spaces (and parenthesis) inside. + // * field 3: process state, a single character (%c) + // * field 22: process start time, a long unsigned integer (%llu). + + // 1. Look for the first '(' and the last ')' first, what's in between is Name. + // We expect at least 20 fields and a space after the last one. + + const minAfterName = 20*2 + 1 // the min field is '0 '. 
+ + first := strings.IndexByte(data, '(') + if first < 0 || first+minAfterName >= len(data) { + return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data) } - parts := strings.SplitN(data[:i], "(", 2) - if len(parts) != 2 { - return stat, fmt.Errorf("invalid stat data: %q", data) + last := strings.LastIndexByte(data, ')') + if last <= first || last+minAfterName >= len(data) { + return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data) } - stat.Name = parts[1] - _, err = fmt.Sscanf(parts[0], "%d", &stat.PID) + stat.Name = data[first+1 : last] + + // 2. Remove fields 1 and 2 and a space after. State is right after. + data = data[last+2:] + stat.State = State(data[0]) + + // 3. StartTime is field 22, data is at field 3 now, so we need to skip 19 spaces. + skipSpaces := 22 - 3 + for first = 0; skipSpaces > 0 && first < len(data); first++ { + if data[first] == ' ' { + skipSpaces-- + } + } + // Now first points to StartTime; look for space right after. + i := strings.IndexByte(data[first:], ' ') + if i < 0 { + return stat, fmt.Errorf("invalid stat data (too short): %q", data) + } + stat.StartTime, err = strconv.ParseUint(data[first:first+i], 10, 64) if err != nil { - return stat, err + return stat, fmt.Errorf("invalid stat data (bad start time): %w", err) } - // parts indexes should be offset by 3 from the field number given - // proc(5), because parts is zero-indexed and we've removed fields - // one (PID) and two (Name) in the paren-split. 
- parts = strings.Split(data[i+2:], " ") - var state int - fmt.Sscanf(parts[3-3], "%c", &state) - stat.State = State(state) - fmt.Sscanf(parts[22-3], "%d", &stat.StartTime) return stat, nil } diff --git a/libcontainer/system/proc_test.go b/libcontainer/system/proc_test.go index 7e1acc5..949d840 100644 --- a/libcontainer/system/proc_test.go +++ b/libcontainer/system/proc_test.go @@ -1,45 +1,180 @@ package system -import "testing" +import ( + "errors" + "math/bits" + "os" + "reflect" + "strconv" + "testing" +) -func TestParseStartTime(t *testing.T) { - data := map[string]Stat_t{ - "4902 (gunicorn: maste) S 4885 4902 4902 0 -1 4194560 29683 29929 61 83 78 16 96 17 20 0 1 0 9126532 52965376 1903 18446744073709551615 4194304 7461796 140733928751520 140733928698072 139816984959091 0 0 16781312 137447943 1 0 0 17 3 0 0 9 0 0 9559488 10071156 33050624 140733928758775 140733928758945 140733928758945 140733928759264 0": { - PID: 4902, - Name: "gunicorn: maste", - State: 'S', - StartTime: 9126532, - }, - "9534 (cat) R 9323 9534 9323 34828 9534 4194304 95 0 0 0 0 0 0 0 20 0 1 0 9214966 7626752 168 18446744073709551615 4194304 4240332 140732237651568 140732237650920 140570710391216 0 0 0 0 0 0 0 17 1 0 0 0 0 0 6340112 6341364 21553152 140732237653865 140732237653885 140732237653885 140732237656047 0": { - PID: 9534, - Name: "cat", - State: 'R', - StartTime: 9214966, - }, +var procdata = map[string]Stat_t{ + "4902 (gunicorn: maste) S 4885 4902 4902 0 -1 4194560 29683 29929 61 83 78 16 96 17 20 0 1 0 9126532 52965376 1903 18446744073709551615 4194304 7461796 140733928751520 140733928698072 139816984959091 0 0 16781312 137447943 1 0 0 17 3 0 0 9 0 0 9559488 10071156 33050624 140733928758775 140733928758945 140733928758945 140733928759264 0": { + Name: "gunicorn: maste", + State: 'S', + StartTime: 9126532, + }, + "9534 (cat) R 9323 9534 9323 34828 9534 4194304 95 0 0 0 0 0 0 0 20 0 1 0 9214966 7626752 168 18446744073709551615 4194304 4240332 140732237651568 140732237650920 
140570710391216 0 0 0 0 0 0 0 17 1 0 0 0 0 0 6340112 6341364 21553152 140732237653865 140732237653885 140732237653885 140732237656047 0": { + Name: "cat", + State: 'R', + StartTime: 9214966, + }, + "12345 ((ugly )pr()cess() R 9323 9534 9323 34828 9534 4194304 95 0 0 0 0 0 0 0 20 0 1 0 9214966 7626752 168 18446744073709551615 4194304 4240332 140732237651568 140732237650920 140570710391216 0 0 0 0 0 0 0 17 1 0 0 0 0 0 6340112 6341364 21553152 140732237653865 140732237653885 140732237653885 140732237656047 0": { + Name: "(ugly )pr()cess(", + State: 'R', + StartTime: 9214966, + }, + "24767 (irq/44-mei_me) S 2 0 0 0 -1 2129984 0 0 0 0 0 0 0 0 -51 0 1 0 8722075 0 0 18446744073709551615 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 1 50 1 0 0 0 0 0 0 0 0 0 0 0": { + Name: "irq/44-mei_me", + State: 'S', + StartTime: 8722075, + }, + "0 () I 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0": { + Name: "", + State: 'I', + StartTime: 0, + }, + // Not entirely correct, but minimally viable input (StartTime and a space after). 
+ "1 (woo hoo) S 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 ": { + Name: "woo hoo", + State: 'S', + StartTime: 4, + }, +} - "24767 (irq/44-mei_me) S 2 0 0 0 -1 2129984 0 0 0 0 0 0 0 0 -51 0 1 0 8722075 0 0 18446744073709551615 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 1 50 1 0 0 0 0 0 0 0 0 0 0 0": { - PID: 24767, - Name: "irq/44-mei_me", - State: 'S', - StartTime: 8722075, - }, - } - for line, expected := range data { +func TestParseStat(t *testing.T) { + for line, exp := range procdata { st, err := parseStat(line) if err != nil { - t.Fatal(err) - } - if st.PID != expected.PID { - t.Fatalf("expected PID %q but received %q", expected.PID, st.PID) - } - if st.State != expected.State { - t.Fatalf("expected state %q but received %q", expected.State, st.State) - } - if st.Name != expected.Name { - t.Fatalf("expected name %q but received %q", expected.Name, st.Name) - } - if st.StartTime != expected.StartTime { - t.Fatalf("expected start time %q but received %q", expected.StartTime, st.StartTime) + t.Errorf("input %q, unexpected error %v", line, err) + } else if !reflect.DeepEqual(st, exp) { + t.Errorf("input %q, expected %+v, got %+v", line, exp, st) } } } + +func TestParseStatBadInput(t *testing.T) { + cases := []struct { + desc, input string + }{ + { + "no (", + "123 ) S 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0", + }, + { + "no )", + "123 ( S 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0", + }, + { + ") at end", + "123 (cmd) S 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0)", + }, + { + "misplaced ()", + "123 )one( S 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0", + }, + { + "misplaced empty ()", + "123 )( S 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0", + }, + { + "empty line", + "", + }, + { + "short line", + "123 
(cmd) S 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0", + }, + { + "short line (no space after stime)", + "123 (cmd) S 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42", + }, + { + "bad stime", + "123 (cmd) S 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 ", + }, + { + "bad stime 2", // would be valid if not -1 + "123 (cmd) S -1 ", + }, + { + "a tad short", + "1234 (cmd) ", + }, + { + "bad stime", + "123 (cmd) S 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1", + }, + } + for _, c := range cases { + st, err := parseStat(c.input) + if err == nil { + t.Errorf("case %q, expected error, got nil, %+v", c.desc, st) + } + } +} + +func BenchmarkParseStat(b *testing.B) { + var ( + st, exp Stat_t + line string + err error + ) + + for i := 0; i != b.N; i++ { + for line, exp = range procdata { + st, err = parseStat(line) + } + } + if err != nil { + b.Fatal(err) + } + if !reflect.DeepEqual(st, exp) { + b.Fatal("wrong result") + } +} + +func BenchmarkParseRealStat(b *testing.B) { + var ( + st Stat_t + err error + total int + ) + b.StopTimer() + fd, err := os.Open("/proc") + if err != nil { + b.Fatal(err) + } + defer fd.Close() + + for i := 0; i != b.N; i++ { + count := 0 + if _, err := fd.Seek(0, 0); err != nil { + b.Fatal(err) + } + names, err := fd.Readdirnames(-1) + if err != nil { + b.Fatal(err) + } + for _, n := range names { + pid, err := strconv.ParseUint(n, 10, bits.UintSize) + if err != nil { + continue + } + b.StartTimer() + st, err = Stat(int(pid)) + b.StopTimer() + if err != nil { + // Ignore a process that just finished. 
+ if errors.Is(err, os.ErrNotExist) { + continue + } + b.Fatal(err) + } + count++ + } + total += count + } + b.Logf("N: %d, parsed %d pids, last stat: %+v, err: %v", b.N, total, st, err) +} diff --git a/libcontainer/system/syscall_linux_32.go b/libcontainer/system/syscall_linux_32.go index c5ca5d8..1acc5cb 100644 --- a/libcontainer/system/syscall_linux_32.go +++ b/libcontainer/system/syscall_linux_32.go @@ -1,3 +1,4 @@ +//go:build linux && (386 || arm) // +build linux // +build 386 arm diff --git a/libcontainer/system/syscall_linux_64.go b/libcontainer/system/syscall_linux_64.go index e05e30a..1ed0dba 100644 --- a/libcontainer/system/syscall_linux_64.go +++ b/libcontainer/system/syscall_linux_64.go @@ -1,3 +1,4 @@ +//go:build linux && (arm64 || amd64 || mips || mipsle || mips64 || mips64le || ppc || ppc64 || ppc64le || riscv64 || s390x) // +build linux // +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le riscv64 s390x diff --git a/libcontainer/system/sysconfig.go b/libcontainer/system/sysconfig.go deleted file mode 100644 index b8434f1..0000000 --- a/libcontainer/system/sysconfig.go +++ /dev/null @@ -1,12 +0,0 @@ -// +build cgo,linux - -package system - -/* -#include -*/ -import "C" - -func GetClockTicks() int { - return int(C.sysconf(C._SC_CLK_TCK)) -} diff --git a/libcontainer/system/sysconfig_notcgo.go b/libcontainer/system/sysconfig_notcgo.go deleted file mode 100644 index d93b5d5..0000000 --- a/libcontainer/system/sysconfig_notcgo.go +++ /dev/null @@ -1,15 +0,0 @@ -// +build !cgo windows - -package system - -func GetClockTicks() int { - // TODO figure out a better alternative for platforms where we're missing cgo - // - // TODO Windows. This could be implemented using Win32 QueryPerformanceFrequency(). - // https://msdn.microsoft.com/en-us/library/windows/desktop/ms644905(v=vs.85).aspx - // - // An example of its usage can be found here. 
- // https://msdn.microsoft.com/en-us/library/windows/desktop/dn553408(v=vs.85).aspx - - return 100 -} diff --git a/libcontainer/system/unsupported.go b/libcontainer/system/unsupported.go deleted file mode 100644 index b94be74..0000000 --- a/libcontainer/system/unsupported.go +++ /dev/null @@ -1,27 +0,0 @@ -// +build !linux - -package system - -import ( - "os" - - "github.com/opencontainers/runc/libcontainer/user" -) - -// RunningInUserNS is a stub for non-Linux systems -// Always returns false -func RunningInUserNS() bool { - return false -} - -// UIDMapInUserNS is a stub for non-Linux systems -// Always returns false -func UIDMapInUserNS(uidmap []user.IDMap) bool { - return false -} - -// GetParentNSeuid returns the euid within the parent user namespace -// Always returns os.Geteuid on non-linux -func GetParentNSeuid() int { - return os.Geteuid() -} diff --git a/libcontainer/system/xattrs_linux.go b/libcontainer/system/xattrs_linux.go deleted file mode 100644 index a6823fc..0000000 --- a/libcontainer/system/xattrs_linux.go +++ /dev/null @@ -1,35 +0,0 @@ -package system - -import "golang.org/x/sys/unix" - -// Returns a []byte slice if the xattr is set and nil otherwise -// Requires path and its attribute as arguments -func Lgetxattr(path string, attr string) ([]byte, error) { - var sz int - // Start with a 128 length byte array - dest := make([]byte, 128) - sz, errno := unix.Lgetxattr(path, attr, dest) - - switch { - case errno == unix.ENODATA: - return nil, errno - case errno == unix.ENOTSUP: - return nil, errno - case errno == unix.ERANGE: - // 128 byte array might just not be good enough, - // A dummy buffer is used to get the real size - // of the xattrs on disk - sz, errno = unix.Lgetxattr(path, attr, []byte{}) - if errno != nil { - return nil, errno - } - dest = make([]byte, sz) - sz, errno = unix.Lgetxattr(path, attr, dest) - if errno != nil { - return nil, errno - } - case errno != nil: - return nil, errno - } - return dest[:sz], nil -} diff --git 
a/libcontainer/user/MAINTAINERS b/libcontainer/user/MAINTAINERS deleted file mode 100644 index edbe200..0000000 --- a/libcontainer/user/MAINTAINERS +++ /dev/null @@ -1,2 +0,0 @@ -Tianon Gravi (@tianon) -Aleksa Sarai (@cyphar) diff --git a/libcontainer/user/lookup.go b/libcontainer/user/lookup.go deleted file mode 100644 index 6fd8dd0..0000000 --- a/libcontainer/user/lookup.go +++ /dev/null @@ -1,41 +0,0 @@ -package user - -import ( - "errors" -) - -var ( - // The current operating system does not provide the required data for user lookups. - ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data") - // No matching entries found in file. - ErrNoPasswdEntries = errors.New("no matching entries in passwd file") - ErrNoGroupEntries = errors.New("no matching entries in group file") -) - -// LookupUser looks up a user by their username in /etc/passwd. If the user -// cannot be found (or there is no /etc/passwd file on the filesystem), then -// LookupUser returns an error. -func LookupUser(username string) (User, error) { - return lookupUser(username) -} - -// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot -// be found (or there is no /etc/passwd file on the filesystem), then LookupId -// returns an error. -func LookupUid(uid int) (User, error) { - return lookupUid(uid) -} - -// LookupGroup looks up a group by its name in /etc/group. If the group cannot -// be found (or there is no /etc/group file on the filesystem), then LookupGroup -// returns an error. -func LookupGroup(groupname string) (Group, error) { - return lookupGroup(groupname) -} - -// LookupGid looks up a group by its group id in /etc/group. If the group cannot -// be found (or there is no /etc/group file on the filesystem), then LookupGid -// returns an error. 
-func LookupGid(gid int) (Group, error) { - return lookupGid(gid) -} diff --git a/libcontainer/user/lookup_unix.go b/libcontainer/user/lookup_unix.go index 92b5ae8..f95c140 100644 --- a/libcontainer/user/lookup_unix.go +++ b/libcontainer/user/lookup_unix.go @@ -1,3 +1,4 @@ +//go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris // +build darwin dragonfly freebsd linux netbsd openbsd solaris package user @@ -16,13 +17,19 @@ const ( unixGroupPath = "/etc/group" ) -func lookupUser(username string) (User, error) { +// LookupUser looks up a user by their username in /etc/passwd. If the user +// cannot be found (or there is no /etc/passwd file on the filesystem), then +// LookupUser returns an error. +func LookupUser(username string) (User, error) { return lookupUserFunc(func(u User) bool { return u.Name == username }) } -func lookupUid(uid int) (User, error) { +// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot +// be found (or there is no /etc/passwd file on the filesystem), then LookupUid +// returns an error. +func LookupUid(uid int) (User, error) { return lookupUserFunc(func(u User) bool { return u.Uid == uid }) @@ -51,13 +58,19 @@ func lookupUserFunc(filter func(u User) bool) (User, error) { return users[0], nil } -func lookupGroup(groupname string) (Group, error) { +// LookupGroup looks up a group by its name in /etc/group. If the group cannot +// be found (or there is no /etc/group file on the filesystem), then LookupGroup +// returns an error. +func LookupGroup(groupname string) (Group, error) { return lookupGroupFunc(func(g Group) bool { return g.Name == groupname }) } -func lookupGid(gid int) (Group, error) { +// LookupGid looks up a group by its group id in /etc/group. If the group cannot +// be found (or there is no /etc/group file on the filesystem), then LookupGid +// returns an error. 
+func LookupGid(gid int) (Group, error) { return lookupGroupFunc(func(g Group) bool { return g.Gid == gid }) diff --git a/libcontainer/user/lookup_windows.go b/libcontainer/user/lookup_windows.go deleted file mode 100644 index 65cd40e..0000000 --- a/libcontainer/user/lookup_windows.go +++ /dev/null @@ -1,40 +0,0 @@ -// +build windows - -package user - -import ( - "fmt" - "os/user" -) - -func lookupUser(username string) (User, error) { - u, err := user.Lookup(username) - if err != nil { - return User{}, err - } - return userFromOS(u) -} - -func lookupUid(uid int) (User, error) { - u, err := user.LookupId(fmt.Sprintf("%d", uid)) - if err != nil { - return User{}, err - } - return userFromOS(u) -} - -func lookupGroup(groupname string) (Group, error) { - g, err := user.LookupGroup(groupname) - if err != nil { - return Group{}, err - } - return groupFromOS(g) -} - -func lookupGid(gid int) (Group, error) { - g, err := user.LookupGroupId(fmt.Sprintf("%d", gid)) - if err != nil { - return Group{}, err - } - return groupFromOS(g) -} diff --git a/libcontainer/user/user.go b/libcontainer/user/user.go index 38caded..2473c5e 100644 --- a/libcontainer/user/user.go +++ b/libcontainer/user/user.go @@ -2,21 +2,27 @@ package user import ( "bufio" + "bytes" + "errors" "fmt" "io" "os" - "os/user" "strconv" "strings" ) const ( - minId = 0 - maxId = 1<<31 - 1 //for 32-bit systems compatibility + minID = 0 + maxID = 1<<31 - 1 // for 32-bit systems compatibility ) var ( - ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId) + // ErrNoPasswdEntries is returned if no matching entries were found in /etc/passwd. + ErrNoPasswdEntries = errors.New("no matching entries in passwd file") + // ErrNoGroupEntries is returned if no matching entries were found in /etc/group. + ErrNoGroupEntries = errors.New("no matching entries in group file") + // ErrRange is returned if a UID or GID is outside of the valid range. 
+ ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minID, maxID) ) type User struct { @@ -29,28 +35,6 @@ type User struct { Shell string } -// userFromOS converts an os/user.(*User) to local User -// -// (This does not include Pass, Shell or Gecos) -func userFromOS(u *user.User) (User, error) { - newUser := User{ - Name: u.Username, - Home: u.HomeDir, - } - id, err := strconv.Atoi(u.Uid) - if err != nil { - return newUser, err - } - newUser.Uid = id - - id, err = strconv.Atoi(u.Gid) - if err != nil { - return newUser, err - } - newUser.Gid = id - return newUser, nil -} - type Group struct { Name string Pass string @@ -58,23 +42,6 @@ type Group struct { List []string } -// groupFromOS converts an os/user.(*Group) to local Group -// -// (This does not include Pass, Shell or Gecos) -func groupFromOS(g *user.Group) (Group, error) { - newGroup := Group{ - Name: g.Name, - } - - id, err := strconv.Atoi(g.Gid) - if err != nil { - return newGroup, err - } - newGroup.Gid = id - - return newGroup, nil -} - // SubID represents an entry in /etc/sub{u,g}id type SubID struct { Name string @@ -89,11 +56,11 @@ type IDMap struct { Count int64 } -func parseLine(line string, v ...interface{}) { - parseParts(strings.Split(line, ":"), v...) +func parseLine(line []byte, v ...interface{}) { + parseParts(bytes.Split(line, []byte(":")), v...) } -func parseParts(parts []string, v ...interface{}) { +func parseParts(parts [][]byte, v ...interface{}) { if len(parts) == 0 { return } @@ -109,16 +76,16 @@ func parseParts(parts []string, v ...interface{}) { // This is legit. switch e := v[i].(type) { case *string: - *e = p + *e = string(p) case *int: // "numbers", with conversion errors ignored because of some misbehaving configuration files. - *e, _ = strconv.Atoi(p) + *e, _ = strconv.Atoi(string(p)) case *int64: - *e, _ = strconv.ParseInt(p, 10, 64) + *e, _ = strconv.ParseInt(string(p), 10, 64) case *[]string: // Comma-separated lists. 
- if p != "" { - *e = strings.Split(p, ",") + if len(p) != 0 { + *e = strings.Split(string(p), ",") } else { *e = []string{} } @@ -153,7 +120,7 @@ func ParsePasswdFileFilter(path string, filter func(User) bool) ([]User, error) func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) { if r == nil { - return nil, fmt.Errorf("nil source for passwd-formatted data") + return nil, errors.New("nil source for passwd-formatted data") } var ( @@ -162,12 +129,8 @@ func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) { ) for s.Scan() { - if err := s.Err(); err != nil { - return nil, err - } - - line := strings.TrimSpace(s.Text()) - if line == "" { + line := bytes.TrimSpace(s.Bytes()) + if len(line) == 0 { continue } @@ -183,6 +146,9 @@ func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) { out = append(out, p) } } + if err := s.Err(); err != nil { + return nil, err + } return out, nil } @@ -212,21 +178,55 @@ func ParseGroupFileFilter(path string, filter func(Group) bool) ([]Group, error) func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) { if r == nil { - return nil, fmt.Errorf("nil source for group-formatted data") + return nil, errors.New("nil source for group-formatted data") } + rd := bufio.NewReader(r) + out := []Group{} - var ( - s = bufio.NewScanner(r) - out = []Group{} - ) + // Read the file line-by-line. + for { + var ( + isPrefix bool + wholeLine []byte + err error + ) - for s.Scan() { - if err := s.Err(); err != nil { - return nil, err + // Read the next line. We do so in chunks (as much as reader's + // buffer is able to keep), check if we read enough columns + // already on each step and store final result in wholeLine. + for { + var line []byte + line, isPrefix, err = rd.ReadLine() + + if err != nil { + // We should return no error if EOF is reached + // without a match. 
+ if err == io.EOF { //nolint:errorlint // comparison with io.EOF is legit, https://github.com/polyfloyd/go-errorlint/pull/12 + err = nil + } + return out, err + } + + // Simple common case: line is short enough to fit in a + // single reader's buffer. + if !isPrefix && len(wholeLine) == 0 { + wholeLine = line + break + } + + wholeLine = append(wholeLine, line...) + + // Check if we read the whole line already. + if !isPrefix { + break + } } - text := s.Text() - if text == "" { + // There's no spec for /etc/passwd or /etc/group, but we try to follow + // the same rules as the glibc parser, which allows comments and blank + // space at the beginning of a line. + wholeLine = bytes.TrimSpace(wholeLine) + if len(wholeLine) == 0 || wholeLine[0] == '#' { continue } @@ -236,14 +236,12 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) { // root:x:0:root // adm:x:4:root,adm,daemon p := Group{} - parseLine(text, &p.Name, &p.Pass, &p.Gid, &p.List) + parseLine(wholeLine, &p.Name, &p.Pass, &p.Gid, &p.List) if filter == nil || filter(p) { out = append(out, p) } } - - return out, nil } type ExecUser struct { @@ -314,7 +312,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) ( // Allow for userArg to have either "user" syntax, or optionally "user:group" syntax var userArg, groupArg string - parseLine(userSpec, &userArg, &groupArg) + parseLine([]byte(userSpec), &userArg, &groupArg) // Convert userArg and groupArg to be numeric, so we don't have to execute // Atoi *twice* for each iteration over lines. 
@@ -341,7 +339,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) ( if userArg == "" { userArg = strconv.Itoa(user.Uid) } - return nil, fmt.Errorf("unable to find user %s: %v", userArg, err) + return nil, fmt.Errorf("unable to find user %s: %w", userArg, err) } var matchedUserName string @@ -357,12 +355,12 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) ( if uidErr != nil { // Not numeric. - return nil, fmt.Errorf("unable to find user %s: %v", userArg, ErrNoPasswdEntries) + return nil, fmt.Errorf("unable to find user %s: %w", userArg, ErrNoPasswdEntries) } user.Uid = uidArg // Must be inside valid uid range. - if user.Uid < minId || user.Uid > maxId { + if user.Uid < minID || user.Uid > maxID { return nil, ErrRange } @@ -392,7 +390,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) ( return g.Name == groupArg }) if err != nil && group != nil { - return nil, fmt.Errorf("unable to find groups for spec %v: %v", matchedUserName, err) + return nil, fmt.Errorf("unable to find groups for spec %v: %w", matchedUserName, err) } // Only start modifying user.Gid if it is in explicit form. @@ -406,12 +404,12 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) ( if gidErr != nil { // Not numeric. - return nil, fmt.Errorf("unable to find group %s: %v", groupArg, ErrNoGroupEntries) + return nil, fmt.Errorf("unable to find group %s: %w", groupArg, ErrNoGroupEntries) } user.Gid = gidArg // Must be inside valid gid range. - if user.Gid < minId || user.Gid > maxId { + if user.Gid < minID || user.Gid > maxID { return nil, ErrRange } @@ -435,7 +433,7 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) ( // or the given group data is nil, the id will be returned as-is // provided it is in the legal range. 
func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) { - var groups = []Group{} + groups := []Group{} if group != nil { var err error groups, err = ParseGroupFilter(group, func(g Group) bool { @@ -447,7 +445,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err return false }) if err != nil { - return nil, fmt.Errorf("Unable to find additional groups %v: %v", additionalGroups, err) + return nil, fmt.Errorf("Unable to find additional groups %v: %w", additionalGroups, err) } } @@ -468,15 +466,16 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err // we asked for a group but didn't find it. let's check to see // if we wanted a numeric group if !found { - gid, err := strconv.Atoi(ag) + gid, err := strconv.ParseInt(ag, 10, 64) if err != nil { - return nil, fmt.Errorf("Unable to find group %s", ag) + // Not a numeric ID either. + return nil, fmt.Errorf("Unable to find group %s: %w", ag, ErrNoGroupEntries) } // Ensure gid is inside gid range. 
- if gid < minId || gid >= maxId { + if gid < minID || gid > maxID { return nil, ErrRange } - gidMap[gid] = struct{}{} + gidMap[int(gid)] = struct{}{} } } gids := []int{} @@ -523,7 +522,7 @@ func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) { if r == nil { - return nil, fmt.Errorf("nil source for subid-formatted data") + return nil, errors.New("nil source for subid-formatted data") } var ( @@ -532,12 +531,8 @@ func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) { ) for s.Scan() { - if err := s.Err(); err != nil { - return nil, err - } - - line := strings.TrimSpace(s.Text()) - if line == "" { + line := bytes.TrimSpace(s.Bytes()) + if len(line) == 0 { continue } @@ -549,6 +544,9 @@ func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) { out = append(out, p) } } + if err := s.Err(); err != nil { + return nil, err + } return out, nil } @@ -577,7 +575,7 @@ func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) { if r == nil { - return nil, fmt.Errorf("nil source for idmap-formatted data") + return nil, errors.New("nil source for idmap-formatted data") } var ( @@ -586,23 +584,22 @@ func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) { ) for s.Scan() { - if err := s.Err(); err != nil { - return nil, err - } - - line := strings.TrimSpace(s.Text()) - if line == "" { + line := bytes.TrimSpace(s.Bytes()) + if len(line) == 0 { continue } // see: man 7 user_namespaces p := IDMap{} - parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count) + parseParts(bytes.Fields(line), &p.ID, &p.ParentID, &p.Count) if filter == nil || filter(p) { out = append(out, p) } } + if err := s.Err(); err != nil { + return nil, err + } return out, nil } diff --git a/libcontainer/user/user_fuzzer.go 
b/libcontainer/user/user_fuzzer.go new file mode 100644 index 0000000..e018eae --- /dev/null +++ b/libcontainer/user/user_fuzzer.go @@ -0,0 +1,43 @@ +//go:build gofuzz +// +build gofuzz + +package user + +import ( + "io" + "strings" +) + +func IsDivisbleBy(n int, divisibleby int) bool { + return (n % divisibleby) == 0 +} + +func FuzzUser(data []byte) int { + if len(data) == 0 { + return -1 + } + if !IsDivisbleBy(len(data), 5) { + return -1 + } + + var divided [][]byte + + chunkSize := len(data) / 5 + + for i := 0; i < len(data); i += chunkSize { + end := i + chunkSize + + divided = append(divided, data[i:end]) + } + + _, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil) + + var passwd, group io.Reader + + group = strings.NewReader(string(divided[1])) + _, _ = GetAdditionalGroups([]string{string(divided[2])}, group) + + passwd = strings.NewReader(string(divided[3])) + _, _ = GetExecUser(string(divided[4]), nil, passwd, group) + return 1 +} diff --git a/libcontainer/user/user_test.go b/libcontainer/user/user_test.go index a4aabdc..c0c762d 100644 --- a/libcontainer/user/user_test.go +++ b/libcontainer/user/user_test.go @@ -1,14 +1,13 @@ package user import ( + "fmt" "io" "reflect" "sort" "strconv" "strings" "testing" - - "github.com/opencontainers/runc/libcontainer/utils" ) func TestUserParseLine(t *testing.T) { @@ -18,42 +17,42 @@ func TestUserParseLine(t *testing.T) { d int ) - parseLine("", &a, &b) + parseLine([]byte(""), &a, &b) if a != "" || b != "" { t.Fatalf("a and b should be empty ('%v', '%v')", a, b) } - parseLine("a", &a, &b) + parseLine([]byte("a"), &a, &b) if a != "a" || b != "" { t.Fatalf("a should be 'a' and b should be empty ('%v', '%v')", a, b) } - parseLine("bad boys:corny cows", &a, &b) + parseLine([]byte("bad boys:corny cows"), &a, &b) if a != "bad boys" || b != "corny cows" { t.Fatalf("a should be 'bad boys' and b should be 'corny cows' ('%v', '%v')", a, b) } - parseLine("", &c) + parseLine([]byte(""), &c) if len(c) != 0 { 
t.Fatalf("c should be empty (%#v)", c) } - parseLine("d,e,f:g:h:i,j,k", &c, &a, &b, &c) + parseLine([]byte("d,e,f:g:h:i,j,k"), &c, &a, &b, &c) if a != "g" || b != "h" || len(c) != 3 || c[0] != "i" || c[1] != "j" || c[2] != "k" { t.Fatalf("a should be 'g', b should be 'h', and c should be ['i','j','k'] ('%v', '%v', '%#v')", a, b, c) } - parseLine("::::::::::", &a, &b, &c) + parseLine([]byte("::::::::::"), &a, &b, &c) if a != "" || b != "" || len(c) != 0 { t.Fatalf("a, b, and c should all be empty ('%v', '%v', '%#v')", a, b, c) } - parseLine("not a number", &d) + parseLine([]byte("not a number"), &d) if d != 0 { t.Fatalf("d should be 0 (%v)", d) } - parseLine("b:12:c", &a, &d, &b) + parseLine([]byte("b:12:c"), &a, &d, &b) if a != "b" || b != "c" || d != 12 { t.Fatalf("a should be 'b' and b should be 'c', and d should be 12 ('%v', '%v', %v)", a, b, d) } @@ -84,12 +83,12 @@ func TestUserParseGroup(t *testing.T) { root:x:0:root adm:x:4:root,adm,daemon this is just some garbage data -`), nil) +`+largeGroup()), nil) if err != nil { t.Fatalf("Unexpected error: %v", err) } - if len(groups) != 3 { - t.Fatalf("Expected 3 groups, got %v", len(groups)) + if len(groups) != 4 { + t.Fatalf("Expected 4 groups, got %v", len(groups)) } if groups[0].Gid != 0 || groups[0].Name != "root" || len(groups[0].List) != 1 { t.Fatalf("Expected groups[0] to be 0 - root - 1 member, got %v - %v - %v", groups[0].Gid, groups[0].Name, len(groups[0].List)) @@ -105,16 +104,18 @@ root:x:0:0:root user:/root:/bin/bash adm:x:42:43:adm:/var/adm:/bin/false 111:x:222:333::/var/garbage odd:x:111:112::/home/odd::::: +user7456:x:7456:100:Vasya:/home/user7456 this is just some garbage data ` - const groupContent = ` + groupContent := ` root:x:0:root adm:x:43: -grp:x:1234:root,adm +grp:x:1234:root,adm,user7456 444:x:555:111 odd:x:444: this is just some garbage data -` +` + largeGroup() + defaultExecUser := ExecUser{ Uid: 8888, Gid: 8888, @@ -218,6 +219,16 @@ this is just some garbage data Home: "/home/odd", }, }, 
+ // Test for #3036. + { + ref: "7456", + expected: ExecUser{ + Uid: 7456, + Gid: 100, + Sgids: []int{1234, 1000}, // 1000 is largegroup GID + Home: "/home/user7456", + }, + }, } for _, test := range tests { @@ -390,13 +401,13 @@ func TestGetAdditionalGroups(t *testing.T) { hasError bool } - const groupContent = ` + groupContent := ` root:x:0:root adm:x:43: grp:x:1234:root,adm adm:x:4343:root,adm-duplicate this is just some garbage data -` +` + largeGroup() tests := []foo{ { // empty group @@ -440,15 +451,17 @@ this is just some garbage data expected: nil, hasError: true, }, - } - - if utils.GetIntSize() > 4 { - tests = append(tests, foo{ + { // groups with too large id - groups: []string{strconv.Itoa( 1<<31 -1 )}, + groups: []string{strconv.FormatInt(1<<31, 10)}, expected: nil, hasError: true, - }) + }, + { + // group with very long list of users + groups: []string{"largegroup"}, + expected: []int{1000}, + }, } for _, test := range tests { @@ -463,7 +476,7 @@ this is just some garbage data t.Errorf("Parse(%#v) has error %v", test, err) continue } - sort.Sort(sort.IntSlice(gids)) + sort.Ints(gids) if !reflect.DeepEqual(gids, test.expected) { t.Errorf("Gids(%v), expect %v from groups %v", gids, test.expected, test.groups) } @@ -499,9 +512,19 @@ func TestGetAdditionalGroupsNumeric(t *testing.T) { t.Errorf("Parse(%#v) has error %v", test, err) continue } - sort.Sort(sort.IntSlice(gids)) + sort.Ints(gids) if !reflect.DeepEqual(gids, test.expected) { t.Errorf("Gids(%v), expect %v from groups %v", gids, test.expected, test.groups) } } } + +// Generate a proper "largegroup" entry for group tests. 
+func largeGroup() (res string) { + var b strings.Builder + b.WriteString("largegroup:x:1000:user1") + for i := 2; i <= 7500; i++ { + fmt.Fprintf(&b, ",user%d", i) + } + return b.String() +} diff --git a/libcontainer/userns/userns.go b/libcontainer/userns/userns.go new file mode 100644 index 0000000..f6cb98e --- /dev/null +++ b/libcontainer/userns/userns.go @@ -0,0 +1,5 @@ +package userns + +// RunningInUserNS detects whether we are currently running in a user namespace. +// Originally copied from github.com/lxc/lxd/shared/util.go +var RunningInUserNS = runningInUserNS diff --git a/libcontainer/userns/userns_fuzzer.go b/libcontainer/userns/userns_fuzzer.go new file mode 100644 index 0000000..1e00ab8 --- /dev/null +++ b/libcontainer/userns/userns_fuzzer.go @@ -0,0 +1,16 @@ +//go:build gofuzz +// +build gofuzz + +package userns + +import ( + "strings" + + "github.com/opencontainers/runc/libcontainer/user" +) + +func FuzzUIDMap(data []byte) int { + uidmap, _ := user.ParseIDMap(strings.NewReader(string(data))) + _ = uidMapInUserNS(uidmap) + return 1 +} diff --git a/libcontainer/userns/userns_linux.go b/libcontainer/userns/userns_linux.go new file mode 100644 index 0000000..724e6df --- /dev/null +++ b/libcontainer/userns/userns_linux.go @@ -0,0 +1,37 @@ +package userns + +import ( + "sync" + + "github.com/opencontainers/runc/libcontainer/user" +) + +var ( + inUserNS bool + nsOnce sync.Once +) + +// runningInUserNS detects whether we are currently running in a user namespace. +// Originally copied from github.com/lxc/lxd/shared/util.go +func runningInUserNS() bool { + nsOnce.Do(func() { + uidmap, err := user.CurrentProcessUIDMap() + if err != nil { + // This kernel-provided file only exists if user namespaces are supported + return + } + inUserNS = uidMapInUserNS(uidmap) + }) + return inUserNS +} + +func uidMapInUserNS(uidmap []user.IDMap) bool { + /* + * We assume we are in the initial user namespace if we have a full + * range - 4294967295 uids starting at uid 0. 
+ */ + if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 { + return false + } + return true +} diff --git a/libcontainer/system/linux_test.go b/libcontainer/userns/userns_linux_test.go similarity index 92% rename from libcontainer/system/linux_test.go rename to libcontainer/userns/userns_linux_test.go index 4d613d8..90d9270 100644 --- a/libcontainer/system/linux_test.go +++ b/libcontainer/userns/userns_linux_test.go @@ -1,6 +1,4 @@ -// +build linux - -package system +package userns import ( "strings" @@ -37,7 +35,7 @@ func TestUIDMapInUserNS(t *testing.T) { if err != nil { t.Fatal(err) } - actual := UIDMapInUserNS(uidmap) + actual := uidMapInUserNS(uidmap) if c.expected != actual { t.Fatalf("expected %v, got %v for %q", c.expected, actual, c.s) } diff --git a/libcontainer/userns/userns_unsupported.go b/libcontainer/userns/userns_unsupported.go new file mode 100644 index 0000000..f35c13a --- /dev/null +++ b/libcontainer/userns/userns_unsupported.go @@ -0,0 +1,18 @@ +//go:build !linux +// +build !linux + +package userns + +import "github.com/opencontainers/runc/libcontainer/user" + +// runningInUserNS is a stub for non-Linux systems +// Always returns false +func runningInUserNS() bool { + return false +} + +// uidMapInUserNS is a stub for non-Linux systems +// Always returns false +func uidMapInUserNS(uidmap []user.IDMap) bool { + return false +} diff --git a/libcontainer/utils/cmsg.go b/libcontainer/utils/cmsg.go index c8a9364..7ef9da2 100644 --- a/libcontainer/utils/cmsg.go +++ b/libcontainer/utils/cmsg.go @@ -1,5 +1,3 @@ -// +build linux - package utils /* @@ -88,6 +86,11 @@ func SendFd(socket *os.File, name string, fd uintptr) error { if len(name) >= MaxNameLen { return fmt.Errorf("sendfd: filename too long: %s", name) } - oob := unix.UnixRights(int(fd)) - return unix.Sendmsg(int(socket.Fd()), []byte(name), oob, nil, 0) + return SendFds(socket, []byte(name), int(fd)) +} + +// SendFds sends a list of files 
descriptor and msg over the given AF_UNIX socket. +func SendFds(socket *os.File, msg []byte, fds ...int) error { + oob := unix.UnixRights(fds...) + return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0) } diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go index 40ccfaa..6b9fc34 100644 --- a/libcontainer/utils/utils.go +++ b/libcontainer/utils/utils.go @@ -1,13 +1,17 @@ package utils import ( + "encoding/binary" "encoding/json" + "fmt" "io" "os" "path/filepath" + "strconv" "strings" "unsafe" + securejoin "github.com/cyphar/filepath-securejoin" "golang.org/x/sys/unix" ) @@ -15,14 +19,18 @@ const ( exitSignalOffset = 128 ) -// ResolveRootfs ensures that the current working directory is -// not a symlink and returns the absolute path to the rootfs -func ResolveRootfs(uncleanRootfs string) (string, error) { - rootfs, err := filepath.Abs(uncleanRootfs) - if err != nil { - return "", err +// NativeEndian is the native byte order of the host system. +var NativeEndian binary.ByteOrder + +func init() { + // Copied from . + i := uint32(1) + b := (*[4]byte)(unsafe.Pointer(&i)) + if b[0] == 1 { + NativeEndian = binary.LittleEndian + } else { + NativeEndian = binary.BigEndian } - return filepath.EvalSymlinks(rootfs) } // ExitStatus returns the correct exit status for a process based on if it @@ -73,6 +81,57 @@ func CleanPath(path string) string { return filepath.Clean(path) } +// stripRoot returns the passed path, stripping the root path if it was +// (lexicially) inside it. Note that both passed paths will always be treated +// as absolute, and the returned path will also always be absolute. In +// addition, the paths are cleaned before stripping the root. +func stripRoot(root, path string) string { + // Make the paths clean and absolute. 
+ root, path = CleanPath("/"+root), CleanPath("/"+path) + switch { + case path == root: + path = "/" + case root == "/": + // do nothing + case strings.HasPrefix(path, root+"/"): + path = strings.TrimPrefix(path, root+"/") + } + return CleanPath("/" + path) +} + +// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) +// corresponding to the unsafePath resolved within the root. Before passing the +// fd, this path is verified to have been inside the root -- so operating on it +// through the passed fdpath should be safe. Do not access this path through +// the original path strings, and do not attempt to use the pathname outside of +// the passed closure (the file handle will be freed once the closure returns). +func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { + // Remove the root then forcefully resolve inside the root. + unsafePath = stripRoot(root, unsafePath) + path, err := securejoin.SecureJoin(root, unsafePath) + if err != nil { + return fmt.Errorf("resolving path inside rootfs failed: %w", err) + } + + // Open the target path. + fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("open o_path procfd: %w", err) + } + defer fh.Close() + + // Double-check the path is the one we expected. + procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd())) + if realpath, err := os.Readlink(procfd); err != nil { + return fmt.Errorf("procfd verification failed: %w", err) + } else if realpath != path { + return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) + } + + // Run the closure. + return fn(procfd) +} + // SearchLabels searches a list of key-value pairs for the provided key and // returns the corresponding value. The pairs must be separated with '='. 
func SearchLabels(labels []string, query string) string { @@ -106,7 +165,3 @@ func Annotations(labels []string) (bundle string, userAnnotations map[string]str } return } - -func GetIntSize() int { - return int(unsafe.Sizeof(1)) -} diff --git a/libcontainer/utils/utils_test.go b/libcontainer/utils/utils_test.go index 395eedc..52fc936 100644 --- a/libcontainer/utils/utils_test.go +++ b/libcontainer/utils/utils_test.go @@ -2,9 +2,6 @@ package utils import ( "bytes" - "fmt" - "os" - "path/filepath" "testing" "golang.org/x/sys/unix" @@ -31,47 +28,6 @@ func TestSearchLabels(t *testing.T) { } } -func TestResolveRootfs(t *testing.T) { - dir := "rootfs" - os.Mkdir(dir, 0600) - defer os.Remove(dir) - - path, err := ResolveRootfs(dir) - if err != nil { - t.Fatal(err) - } - pwd, err := os.Getwd() - if err != nil { - t.Fatal(err) - } - if path != fmt.Sprintf("%s/%s", pwd, "rootfs") { - t.Errorf("expected rootfs to be abs and was %s", path) - } -} - -func TestResolveRootfsWithSymlink(t *testing.T) { - dir := "rootfs" - tmpDir, _ := filepath.EvalSymlinks(os.TempDir()) - os.Symlink(tmpDir, dir) - defer os.Remove(dir) - - path, err := ResolveRootfs(dir) - if err != nil { - t.Fatal(err) - } - - if path != tmpDir { - t.Errorf("expected rootfs to be the real path %s and was %s", path, os.TempDir()) - } -} - -func TestResolveRootfsWithNonExistingDir(t *testing.T) { - _, err := ResolveRootfs("foo") - if err == nil { - t.Error("expected error to happen but received nil") - } -} - func TestExitStatus(t *testing.T) { status := unix.WaitStatus(0) ex := ExitStatus(status) @@ -140,3 +96,38 @@ func TestCleanPath(t *testing.T) { t.Errorf("expected to receive '/foo' and received %s", path) } } + +func TestStripRoot(t *testing.T) { + for _, test := range []struct { + root, path, out string + }{ + // Works with multiple components. + {"/a/b", "/a/b/c", "/c"}, + {"/hello/world", "/hello/world/the/quick-brown/fox", "/the/quick-brown/fox"}, + // '/' must be a no-op. 
+ {"/", "/a/b/c", "/a/b/c"}, + // Must be the correct order. + {"/a/b", "/a/c/b", "/a/c/b"}, + // Must be at start. + {"/abc/def", "/foo/abc/def/bar", "/foo/abc/def/bar"}, + // Must be a lexical parent. + {"/foo/bar", "/foo/barSAMECOMPONENT", "/foo/barSAMECOMPONENT"}, + // Must only strip the root once. + {"/foo/bar", "/foo/bar/foo/bar/baz", "/foo/bar/baz"}, + // Deal with .. in a fairly sane way. + {"/foo/bar", "/foo/bar/../baz", "/foo/baz"}, + {"/foo/bar", "../../../../../../foo/bar/baz", "/baz"}, + {"/foo/bar", "/../../../../../../foo/bar/baz", "/baz"}, + {"/foo/bar/../baz", "/foo/baz/bar", "/bar"}, + {"/foo/bar/../baz", "/foo/baz/../bar/../baz/./foo", "/foo"}, + // All paths are made absolute before stripping. + {"foo/bar", "/foo/bar/baz/bee", "/baz/bee"}, + {"/foo/bar", "foo/bar/baz/beef", "/baz/beef"}, + {"foo/bar", "foo/bar/baz/beets", "/baz/beets"}, + } { + got := stripRoot(test.root, test.path) + if got != test.out { + t.Errorf("stripRoot(%q, %q) -- got %q, expected %q", test.root, test.path, got, test.out) + } + } +} diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index 1576f2d..220d0b4 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -1,3 +1,4 @@ +//go:build !windows // +build !windows package utils @@ -14,7 +15,7 @@ import ( func EnsureProcHandle(fh *os.File) error { var buf unix.Statfs_t if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { - return fmt.Errorf("ensure %s is on procfs: %v", fh.Name(), err) + return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err) } if buf.Type != unix.PROC_SUPER_MAGIC { return fmt.Errorf("%s is not on procfs", fh.Name()) @@ -52,7 +53,7 @@ func CloseExecFrom(minFd int) error { // Intentionally ignore errors from unix.CloseOnExec -- the cases where // this might fail are basically file descriptors that have already // been closed (including and especially the one that was created when - // ioutil.ReadDir did the "opendir" syscall). 
+ // os.ReadDir did the "opendir" syscall). unix.CloseOnExec(fd) } return nil diff --git a/list.go b/list.go index 0313d8c..3503dcd 100644 --- a/list.go +++ b/list.go @@ -1,18 +1,15 @@ -// +build linux - package main import ( + "encoding/json" + "errors" "fmt" - "io/ioutil" "os" "path/filepath" "syscall" "text/tabwriter" "time" - "encoding/json" - "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/user" "github.com/opencontainers/runc/libcontainer/utils" @@ -107,7 +104,7 @@ To list containers created using a non-default value for "--root": return err } default: - return fmt.Errorf("invalid format option") + return errors.New("invalid format option") } return nil }, @@ -123,7 +120,7 @@ func getContainers(context *cli.Context) ([]containerState, error) { if err != nil { return nil, err } - list, err := ioutil.ReadDir(absRoot) + list, err := os.ReadDir(absRoot) if err != nil { fatal(err) } @@ -131,11 +128,15 @@ func getContainers(context *cli.Context) ([]containerState, error) { var s []containerState for _, item := range list { if item.IsDir() { - // This cast is safe on Linux. - stat := item.Sys().(*syscall.Stat_t) - owner, err := user.LookupUid(int(stat.Uid)) + st, err := os.Stat(filepath.Join(absRoot, item.Name())) if err != nil { - owner.Name = fmt.Sprintf("#%d", stat.Uid) + fatal(err) + } + // This cast is safe on Linux. 
+ uid := st.Sys().(*syscall.Stat_t).Uid + owner, err := user.LookupUid(int(uid)) + if err != nil { + owner.Name = fmt.Sprintf("#%d", uid) } container, err := factory.Load(item.Name()) diff --git a/main.go b/main.go index 3a8c163..4d66638 100644 --- a/main.go +++ b/main.go @@ -1,22 +1,25 @@ package main import ( + "errors" "fmt" "io" "os" + "path/filepath" + "runtime" + "strconv" "strings" - "github.com/opencontainers/runc/libcontainer/logs" - + "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "github.com/urfave/cli" ) -// version will be populated by the Makefile, read from -// VERSION file of the source code. -var version = "" +// version must be set from the contents of VERSION file by go build's +// -X main.version= option in the Makefile. +var version = "unknown" // gitCommit will be the hash that the binary was built from // and will be populated by the Makefile @@ -54,46 +57,43 @@ func main() { app.Name = "runc" app.Usage = usage - var v []string - if version != "" { - v = append(v, version) - } + v := []string{version} + if gitCommit != "" { - v = append(v, fmt.Sprintf("commit: %s", gitCommit)) + v = append(v, "commit: "+gitCommit) + } + v = append(v, "spec: "+specs.Version) + v = append(v, "go: "+runtime.Version()) + + major, minor, micro := seccomp.Version() + if major+minor+micro > 0 { + v = append(v, fmt.Sprintf("libseccomp: %d.%d.%d", major, minor, micro)) } - v = append(v, fmt.Sprintf("spec: %s", specs.Version)) app.Version = strings.Join(v, "\n") + xdgRuntimeDir := "" root := "/run/runc" if shouldHonorXDGRuntimeDir() { if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" { root = runtimeDir + "/runc" - // According to the XDG specification, we need to set anything in - // XDG_RUNTIME_DIR to have a sticky bit if we don't want it to get - // auto-pruned. 
- if err := os.MkdirAll(root, 0700); err != nil { - fatal(err) - } - if err := os.Chmod(root, 0700|os.ModeSticky); err != nil { - fatal(err) - } + xdgRuntimeDir = root } } app.Flags = []cli.Flag{ cli.BoolFlag{ Name: "debug", - Usage: "enable debug output for logging", + Usage: "enable debug logging", }, cli.StringFlag{ Name: "log", Value: "", - Usage: "set the log file path where internal debug information is written", + Usage: "set the log file to write runc logs to (default is '/dev/stderr')", }, cli.StringFlag{ Name: "log-format", Value: "text", - Usage: "set the format used by logs ('text' (default), or 'json')", + Usage: "set the log format ('text' (default), or 'json')", }, cli.StringFlag{ Name: "root", @@ -121,7 +121,6 @@ func main() { deleteCommand, eventsCommand, execCommand, - initCommand, killCommand, listCommand, pauseCommand, @@ -133,9 +132,27 @@ func main() { startCommand, stateCommand, updateCommand, + featuresCommand, } app.Before = func(context *cli.Context) error { - return logs.ConfigureLogging(createLogConfig(context)) + if !context.IsSet("root") && xdgRuntimeDir != "" { + // According to the XDG specification, we need to set anything in + // XDG_RUNTIME_DIR to have a sticky bit if we don't want it to get + // auto-pruned. 
+ if err := os.MkdirAll(root, 0o700); err != nil { + fmt.Fprintln(os.Stderr, "the path in $XDG_RUNTIME_DIR must be writable by the user") + fatal(err) + } + if err := os.Chmod(root, os.FileMode(0o700)|os.ModeSticky); err != nil { + fmt.Fprintln(os.Stderr, "you should check permission of the path in $XDG_RUNTIME_DIR") + fatal(err) + } + } + if err := reviseRootDir(context); err != nil { + return err + } + + return configLogrus(context) } // If the command returns an error, cli takes upon itself to print @@ -153,24 +170,48 @@ type FatalWriter struct { func (f *FatalWriter) Write(p []byte) (n int, err error) { logrus.Error(string(p)) - return f.cliErrWriter.Write(p) + if !logrusToStderr() { + return f.cliErrWriter.Write(p) + } + return len(p), nil } -func createLogConfig(context *cli.Context) logs.Config { - logFilePath := context.GlobalString("log") - logPipeFd := "" - if logFilePath == "" { - logPipeFd = "2" - } - config := logs.Config{ - LogPipeFd: logPipeFd, - LogLevel: logrus.InfoLevel, - LogFilePath: logFilePath, - LogFormat: context.GlobalString("log-format"), - } +func configLogrus(context *cli.Context) error { if context.GlobalBool("debug") { - config.LogLevel = logrus.DebugLevel + logrus.SetLevel(logrus.DebugLevel) + logrus.SetReportCaller(true) + // Shorten function and file names reported by the logger, by + // trimming common "github.com/opencontainers/runc" prefix. + // This is only done for text formatter. 
+ _, file, _, _ := runtime.Caller(0) + prefix := filepath.Dir(file) + "/" + logrus.SetFormatter(&logrus.TextFormatter{ + CallerPrettyfier: func(f *runtime.Frame) (string, string) { + function := strings.TrimPrefix(f.Function, prefix) + "()" + fileLine := strings.TrimPrefix(f.File, prefix) + ":" + strconv.Itoa(f.Line) + return function, fileLine + }, + }) } - return config + switch f := context.GlobalString("log-format"); f { + case "": + // do nothing + case "text": + // do nothing + case "json": + logrus.SetFormatter(new(logrus.JSONFormatter)) + default: + return errors.New("invalid log-format: " + f) + } + + if file := context.GlobalString("log"); file != "" { + f, err := os.OpenFile(file, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0o644) + if err != nil { + return err + } + logrus.SetOutput(f) + } + + return nil } diff --git a/man/md2man-all.sh b/man/md2man-all.sh index f850ddf..eaee58e 100755 --- a/man/md2man-all.sh +++ b/man/md2man-all.sh @@ -9,7 +9,7 @@ cd "$(dirname "$(readlink -f "$BASH_SOURCE")")" pwd } -if ! ( which go-md2man &>/dev/null ); then +if ! type go-md2man; then echo "To install man pages, please install 'go-md2man'." exit 0 fi diff --git a/man/runc-checkpoint.8.md b/man/runc-checkpoint.8.md index 08e6b1f..373259d 100644 --- a/man/runc-checkpoint.8.md +++ b/man/runc-checkpoint.8.md @@ -1,30 +1,76 @@ % runc-checkpoint "8" # NAME - runc checkpoint - checkpoint a running container +**runc-checkpoint** - checkpoint a running container # SYNOPSIS - runc checkpoint [command options] `` - -Where "``" is the name for the instance of the container to be -checkpointed. +**runc checkpoint** [_option_ ...] _container-id_ # DESCRIPTION - The checkpoint command saves the state of the container instance. +The **checkpoint** command saves the state of the running container instance +with the help of **criu**(8) tool, to be restored later. 
# OPTIONS - --image-path value path for saving criu image files - --work-path value path for saving work files and logs - --parent-path value path for previous criu image files in pre-dump - --leave-running leave the process running after checkpointing - --tcp-established allow open tcp connections - --ext-unix-sk allow external unix sockets - --shell-job allow shell jobs - --lazy-pages use userfaultfd to lazily restore memory pages - --status-fd value criu writes \0 to this FD once lazy-pages is ready - --page-server value ADDRESS:PORT of the page server - --file-locks handle file locks, for safety - --pre-dump dump container's memory information only, leave the container running after this - --manage-cgroups-mode value cgroups mode: 'soft' (default), 'full' and 'strict' - --empty-ns value create a namespace, but don't restore its properties - --auto-dedup enable auto deduplication of memory images +**--image-path** _path_ +: Set path for saving criu image files. The default is *./checkpoint*. + +**--work-path** _path_ +: Set path for saving criu work files and logs. The default is to reuse the +image files directory. + +**--parent-path** _path_ +: Set path for previous criu image files, in pre-dump. + +**--leave-running** +: Leave the process running after checkpointing. + +**--tcp-established** +: Allow checkpoint/restore of established TCP connections. See +[criu --tcp-establised option](https://criu.org/CLI/opt/--tcp-established). + +**--ext-unix-sk** +: Allow checkpoint/restore of external unix sockets. See +[criu --ext-unix-sk option](https://criu.org/CLI/opt/--ext-unix-sk). + +**--shell-job** +: Allow checkpoint/restore of shell jobs. + +**--lazy-pages** +: Use lazy migration mechanism. See +[criu --lazy-pages option](https://criu.org/CLI/opt/--lazy-pages). + +**--status-fd** _fd_ +: Pass a file descriptor _fd_ to **criu**. Once **lazy-pages** server is ready, +**criu** writes **\0** (a zero byte) to that _fd_. Used together with +**--lazy-pages**. 
+ +**--page-server** _IP-address_:_port_ +: Start a page server at the specified _IP-address_ and _port_. This is used +together with **criu lazy-pages**. See +[criu lazy migration](https://criu.org/Lazy_migration). + +**--file-locks** +: Allow checkpoint/restore of file locks. See +[criu --file-locks option](https://criu.org/CLI/opt/--file-locks). + +**--pre-dump** +: Do a pre-dump, i.e. dump container's memory information only, leaving the +container running. See [criu iterative migration](https://criu.org/Iterative_migration). + +**--manage-cgroups-mode** **soft**|**full**|**strict**. +: Cgroups mode. Default is **soft**. See +[criu --manage-cgroups option](https://criu.org/CLI/opt/--manage-cgroups). + +**--empty-ns** _namespace_ +: Checkpoint a _namespace_, but don't save its properties. See +[criu --empty-ns option](https://criu.org/CLI/opt/--empty-ns). + +**--auto-dedup** +: Enable auto deduplication of memory images. See +[criu --auto-dedup option](https://criu.org/CLI/opt/--auto-dedup). + +# SEE ALSO +**criu**(8), +**runc-restore**(8), +**runc**(8), +**criu**(8). diff --git a/man/runc-create.8.md b/man/runc-create.8.md index 99c0a2c..cfe6d17 100644 --- a/man/runc-create.8.md +++ b/man/runc-create.8.md @@ -1,29 +1,44 @@ % runc-create "8" # NAME - runc create - create a container +**runc-create** - create a container # SYNOPSIS - runc create [command options] `` - -Where "``" is your name for the instance of the container that you -are starting. The name you provide for the container instance must be unique on -your host. +**runc create** [_option_ ...] _container-id_ # DESCRIPTION - The create command creates an instance of a container for a bundle. The bundle -is a directory with a specification file named "config.json" and a root -filesystem. - -The specification file includes an args parameter. The args parameter is used -to specify command(s) that get run when the container is started. 
To change the -command(s) that get executed on start, edit the args parameter of the spec. See -"runc spec --help" for more explanation. +The **create** command creates an instance of a container from a bundle. +The bundle is a directory with a specification file named _config.json_, +and a root filesystem. # OPTIONS - --bundle value, -b value path to the root of the bundle directory, defaults to the current directory - --console-socket value path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal - --pid-file value specify the file to write the process id to - --no-pivot do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk - --no-new-keyring do not create a new session keyring for the container. This will cause the container to inherit the calling processes session key - --preserve-fds value Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0) + +**--bundle**|**-b** _path_ +: Path to the root of the bundle directory. Default is current directory. + +**--console-socket** _path_ +: Path to an **AF_UNIX** socket which will receive a file descriptor +referencing the master end of the console's pseudoterminal. See +[docs/terminals](https://github.com/opencontainers/runc/blob/master/docs/terminals.md). + +**--pid-file** _path_ +: Specify the file to write the initial container process' PID to. + +**--no-pivot** +: Do not use pivot root to jail process inside rootfs. This should not be used +except in exceptional circumstances, and may be unsafe from the security +standpoint. + +**--no-new-keyring** +: Do not create a new session keyring for the container. This will cause the +container to inherit the calling processes session key. + +**--preserve-fds** _N_ +: Pass _N_ additional file descriptors to the container (**stdio** + +**$LISTEN_FDS** + _N_ in total). Default is **0**. 
+ +# SEE ALSO + +**runc-spec**(8), +**runc-start**(8), +**runc**(8). diff --git a/man/runc-delete.8.md b/man/runc-delete.8.md index 84922a0..249528c 100644 --- a/man/runc-delete.8.md +++ b/man/runc-delete.8.md @@ -1,19 +1,24 @@ % runc-delete "8" # NAME - runc delete - delete any resources held by the container often used with detached container +**runc-delete** - delete any resources held by the container # SYNOPSIS - runc delete [command options] `` - -Where "``" is the name for the instance of the container. +**runc delete** [**--force**|**-f**] _container-id_ # OPTIONS - --force, -f Forcibly deletes the container if it is still running (uses SIGKILL) +**--force**|**-f** +: Forcibly delete the running container, using **SIGKILL** **signal**(7) +to stop it first. -# EXAMPLE -For example, if the container id is "ubuntu01" and runc list currently shows the -status of "ubuntu01" as "stopped" the following will delete resources held for -"ubuntu01" removing "ubuntu01" from the runc list of containers: +# EXAMPLES +If the container id is **ubuntu01** and **runc list** currently shows +its status as **stopped**, the following will delete resources held for +**ubuntu01**, removing it from the **runc list**: - # runc delete ubuntu01 + # runc delete ubuntu01 + +# SEE ALSO + +**runc-kill**(8), +**runc**(8). diff --git a/man/runc-events.8.md b/man/runc-events.8.md index d998a38..eab7865 100644 --- a/man/runc-events.8.md +++ b/man/runc-events.8.md @@ -1,17 +1,23 @@ % runc-events "8" # NAME - runc events - display container events such as OOM notifications, cpu, memory, and IO usage statistics +**runc-events** - display container events and statistics. # SYNOPSIS - runc events [command options] `` - -Where "``" is the name for the instance of the container. +**runc events** [_option_ ...] _container-id_ # DESCRIPTION - The events command displays information about the container. By default the -information is displayed once every 5 seconds. 
+The **events** command displays information about the container. By default, +it works continuously, displaying stats every 5 seconds, and container events +as they occur. # OPTIONS - --interval value set the stats collection interval (default: 5s) - --stats display the container's stats then exit +**--interval** _time_ +: Set the stats collection interval. Default is **5s**. + +**--stats** +: Show the container's stats once then exit. + +# SEE ALSO + +**runc**(8). diff --git a/man/runc-exec.8.md b/man/runc-exec.8.md index dbaaefe..c0ca411 100644 --- a/man/runc-exec.8.md +++ b/man/runc-exec.8.md @@ -1,33 +1,91 @@ % runc-exec "8" # NAME - runc exec - execute new process inside the container +**runc-exec** - execute new process inside the container # SYNOPSIS - runc exec [command options] `` -- `` [args...] +**runc exec** [_option_ ...] _container-id_ [--] _command_ [_arg_ ...] -Where "``" is the name for the instance of the container and -"``" is the command to be executed in the container. - -# EXAMPLE -For example, if the container is configured to run the linux ps command the -following will output a list of processes running in the container: - - # runc exec ps +**runc exec** [_option_ ...] 
**-p** _process.json_ _container-id_ # OPTIONS - --console value specify the pty slave path for use with the container - --cwd value current working directory in the container - --env value, -e value set environment variables - --tty, -t allocate a pseudo-TTY - --user value, -u value UID (format: [:]) - --additional-gids value, -g value additional gids - --process value, -p value path to the process.json - --detach, -d detach from the container's process - --pid-file value specify the file to write the process id to - --process-label value set the asm process label for the process commonly used with selinux - --apparmor value set the apparmor profile for the process - --no-new-privs set the no new privileges value for the process - --cap value, -c value add a capability to the bounding set for the process - --no-subreaper disable the use of the subreaper used to reap reparented processes - --preserve-fds value pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0) +**--console-socket** _path_ +: Path to an **AF_UNIX** socket which will receive a file descriptor +referencing the master end of the console's pseudoterminal. See +[docs/terminals](https://github.com/opencontainers/runc/blob/master/docs/terminals.md). + +**--cwd** _path_ +: Change to _path_ in the container before executing the command. + +**--env**|**-e** _name_=_value_ +: Set an environment variable _name_ to _value_. Can be specified multiple times. + +**--tty**|**-t** +: Allocate a pseudo-TTY. + +**--user**|**-u** _uid_[:_gid_] +: Run the _command_ as a user (and, optionally, group) specified by _uid_ (and +_gid_). + +**--additional-gids**|**-g** _gid_ +: Add additional group IDs. Can be specified multiple times. 
+ +**--process**|**-p** _process.json_ +: Instead of specifying all the exec parameters directly on the command line, +get them from a _process.json_, a JSON file containing the process +specification as defined by the +[OCI runtime spec](https://github.com/opencontainers/runtime-spec/blob/master/config.md#process). + +**--detach**|**-d** +: Detach from the container's process. + +**--pid-file** _path_ +: Specify the file to write the container process' PID to. + +**--process-label** _label_ +: Set the asm process label for the process commonly used with **selinux**(7). + +**--apparmor** _profile_ +: Set the **apparmor**(7) _profile_ for the process. + +**--no-new-privs** +: Set the "no new privileges" value for the process. + +**--cap** _cap_ +: Add a capability to the bounding set for the process. Can be specified +multiple times. + +**--preserve-fds** _N_ +: Pass _N_ additional file descriptors to the container (**stdio** + +**$LISTEN_FDS** + _N_ in total). Default is **0**. + +**--ignore-paused** +: Allow exec in a paused container. By default, if a container is paused, +**runc exec** errors out; this option can be used to override it. +A paused container needs to be resumed for the exec to complete. + +**--cgroup** _path_ | _controller_[,_controller_...]:_path_ +: Execute a process in a sub-cgroup. If the specified cgroup does not exist, an +error is returned. Default is empty path, which means to use container's top +level cgroup. +: For cgroup v1 only, a particular _controller_ (or multiple comma-separated +controllers) can be specified, and the option can be used multiple times to set +different paths for different controllers. +: Note for cgroup v2, in case the process can't join the top level cgroup, +**runc exec** fallback is to try joining the cgroup of container's init. +This fallback can be disabled by using **--cgroup /**. + +# EXIT STATUS + +Exits with a status of _command_ (unless **-d** is used), or **255** if +an error occurred. 
+ +# EXAMPLES +If the container can run **ps**(1) command, the following +will output a list of processes running in the container: + + # runc exec ps + +# SEE ALSO + +**runc**(8). diff --git a/man/runc-kill.8.md b/man/runc-kill.8.md index 1ea579a..c18fe94 100644 --- a/man/runc-kill.8.md +++ b/man/runc-kill.8.md @@ -1,20 +1,31 @@ % runc-kill "8" # NAME - runc kill - kill sends the specified signal (default: SIGTERM) to the container's init process +**runc-kill** - send a specified signal to container # SYNOPSIS - runc kill [command options] `` `` +**runc kill** [**--all**|**-a**] _container-id_ [_signal_] -Where "``" is the name for the instance of the container and -"``" is the signal to be sent to the init process. +# DESCRIPTION + +By default, **runc kill** sends **SIGTERM** to the container's initial process +only. + +A different signal can be specified either by its name (with or without the +**SIG** prefix), or its numeric value. Use **kill**(1) with **-l** option +to list available signals. # OPTIONS - --all, -a send the specified signal to all processes inside the container +**--all**|**-a** +: Send the signal to all processes inside the container. -# EXAMPLE +# EXAMPLES -For example, if the container id is "ubuntu01" the following will send a "KILL" -signal to the init process of the "ubuntu01" container: +The following will send a **KILL** signal to the init process of the +**ubuntu01** container: - # runc kill ubuntu01 KILL + # runc kill ubuntu01 KILL + +# SEE ALSO + +**runc**(1). diff --git a/man/runc-list.8.md b/man/runc-list.8.md index 46cd5d0..43545d5 100644 --- a/man/runc-list.8.md +++ b/man/runc-list.8.md @@ -1,21 +1,39 @@ % runc-list "8" # NAME - runc list - lists containers started by runc with the given root +**runc-list** - lists containers # SYNOPSIS - runc list [command options] +**runc list** [_option_ ...] -# EXAMPLE -Where the given root is specified via the global option "--root" -(default: "/run/runc"). 
+# DESCRIPTION -To list containers created via the default "--root": - # runc list - -To list containers created using a non-default value for "--root": - # runc --root value list +The **list** commands lists containers. Note that a global **--root** +option can be specified to change the default root. For the description +of **--root**, see **runc**(8). # OPTIONS - --format value, -f value select one of: table or json (default: "table") - --quiet, -q display only container IDs +**--format**|**-f** **table**|**json** +: Specify the format. Default is **table**. The **json** format provides +more details. + +**--quiet**|**-q** +: Only display container IDs. + +# EXAMPLES +To list containers created with the default root: + + # runc list + +To list containers in a human-readable JSON (with the help of **jq**(1) +utility): + + # runc list -f json | jq + +To list containers created with the root of **/tmp/myroot**: + + # runc --root /tmp/myroot + +# SEE ALSO + +**runc**(8). diff --git a/man/runc-pause.8.md b/man/runc-pause.8.md index 965f7da..e6e8381 100644 --- a/man/runc-pause.8.md +++ b/man/runc-pause.8.md @@ -1,14 +1,18 @@ % runc-pause "8" # NAME - runc pause - pause suspends all processes inside the container +**runc-pause** - suspend all processes inside the container # SYNOPSIS - runc pause `` - -Where "``" is the name for the instance of the container to be -paused. +**runc pause** _container-id_ # DESCRIPTION - The pause command suspends all processes in the instance of the container. -Use runc list to identify instances of containers and their current status. +The **pause** command suspends all processes in the instance of the container +identified by _container-id_. + +Use **runc list** to identify instances of containers and their current status. + +# SEE ALSO +**runc-list**(8), +**runc-resume**(8), +**runc**(8). 
diff --git a/man/runc-ps.8.md b/man/runc-ps.8.md index 1fad467..9f6cf96 100644 --- a/man/runc-ps.8.md +++ b/man/runc-ps.8.md @@ -1,15 +1,26 @@ % runc-ps "8" # NAME - runc ps - ps displays the processes running inside a container +**runc-ps** - display the processes inside a container # SYNOPSIS - runc ps [command options] `` [ps options] +**runc ps** [_option_ ...] _container-id_ [_ps-option_ ...] + +# DESCRIPTION +The command **ps** is a wrapper around the stock **ps**(1) utility, +which filters its output to only contain processes belonging to a specified +_container-id_. Therefore, the PIDs shown are the host PIDs. + +Any **ps**(1) options can be used, but some might break the filtering. +In particular, if PID column is not available, an error is returned, +and if there are columns with values containing spaces before the PID +column, the result is undefined. # OPTIONS - --format value, -f value select one of: table(default) or json +**--format**|**-f** **table**|**json** +: Output format. Default is **table**. The **json** format shows a mere array +of PIDs belonging to a container; if used, all **ps** options are gnored. -The default format is table. The following will output the processes of a container -in json format: - - # runc ps -f json +# SEE ALSO +**runc-list**(8), +**runc**(8). diff --git a/man/runc-restore.8.md b/man/runc-restore.8.md index e475bd5..a2b3da6 100644 --- a/man/runc-restore.8.md +++ b/man/runc-restore.8.md @@ -1,28 +1,89 @@ % runc-restore "8" # NAME - runc restore - restore a container from a previous checkpoint +**runc-restore** - restore a container from a previous checkpoint # SYNOPSIS - runc restore [command options] `` - -Where "``" is the name for the instance of the container to be -restored. +**runc restore** [_option_ ...] _container-id_ # DESCRIPTION - Restores the saved state of the container instance that was previously saved -using the runc checkpoint command. 
+Restores the container instance from a previously performed **runc checkpoint**. # OPTIONS - --image-path value path to criu image files for restoring - --work-path value path for saving work files and logs - --tcp-established allow open tcp connections - --ext-unix-sk allow external unix sockets - --shell-job allow shell jobs - --file-locks handle file locks, for safety - --manage-cgroups-mode value cgroups mode: 'soft' (default), 'full' and 'strict' - --bundle value, -b value path to the root of the bundle directory - --detach, -d detach from the container's process - --pid-file value specify the file to write the process id to - --no-subreaper disable the use of the subreaper used to reap reparented processes - --no-pivot do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk +**--console-socket** _path_ +: Path to an **AF_UNIX** socket which will receive a file descriptor +referencing the master end of the console's pseudoterminal. See +[docs/terminals](https://github.com/opencontainers/runc/blob/master/docs/terminals.md). + +**--image-path** _path_ +: Set path to get criu image files to restore from. + +**--work-path** _path_ +: Set path for saving criu work files and logs. The default is to reuse the +image files directory. + +**--tcp-established** +: Allow checkpoint/restore of established TCP connections. See +[criu --tcp-establised option](https://criu.org/CLI/opt/--tcp-established). + +**--ext-unix-sk** +: Allow checkpoint/restore of external unix sockets. See +[criu --ext-unix-sk option](https://criu.org/CLI/opt/--ext-unix-sk). + +**--shell-job** +: Allow checkpoint/restore of shell jobs. + +**--file-locks** +: Allow checkpoint/restore of file locks. See +[criu --file-locks option](https://criu.org/CLI/opt/--file-locks). + +**--manage-cgroups-mode** **soft**|**full**|**strict**. +: Cgroups mode. Default is **soft**. 
See +[criu --manage-cgroups option](https://criu.org/CLI/opt/--manage-cgroups). + +**--bundle**|**-b** _path_ +: Path to the root of the bundle directory. Default is current directory. + +**--detach**|**-d** +: Detach from the container's process. + +**--pid-file** _path_ +: Specify the file to write the initial container process' PID to. + +**--no-subreaper** +: Disable the use of the subreaper used to reap reparented processes. + +**--no-pivot** +: Do not use pivot root to jail process inside rootfs. This should not be used +except in exceptional circumstances, and may be unsafe from the security +standpoint. + +**--empty-ns** _namespace_ +: Create a _namespace_, but don't restore its properties. See +[criu --empty-ns option](https://criu.org/CLI/opt/--empty-ns). + +**--auto-dedup** +: Enable auto deduplication of memory images. See +[criu --auto-dedup option](https://criu.org/CLI/opt/--auto-dedup). + +**--lazy-pages** +: Use lazy migration mechanism. This requires a running **criu lazy-pages** +daemon. See [criu --lazy-pages option](https://criu.org/CLI/opt/--lazy-pages). + +**--lsm-profile** _type_:_label_ +: Specify an LSM profile to be used during restore. Here _type_ can either be +**apparmor** or **selinux**, and _label_ is a valid LSM label. For example, +**--lsm-profile "selinux:system_u:system_r:container_t:s0:c82,c137"**. +By default, the checkpointed LSM profile is used upon restore. + +**--lsm-mount-context** _context_ +: Specify an LSM mount context to be used during restore. Only mounts with an +existing context will have their context replaced. With this option it is +possible to change SELinux mount options. Instead of mounting with the +checkpointed context, the specified _context_ will be used. +For example, **--lsm-mount-context "system_u:object_r:container_file_t:s0:c82,c137"**. + +# SEE ALSO +**criu**(8), +**runc-checkpoint**(8), +**runc**(8). 
diff --git a/man/runc-resume.8.md b/man/runc-resume.8.md index 25d342f..cb809d0 100644 --- a/man/runc-resume.8.md +++ b/man/runc-resume.8.md @@ -1,14 +1,18 @@ % runc-resume "8" # NAME - runc resume - resumes all processes that have been previously paused +**runc-resume** - resume all processes that have been previously paused # SYNOPSIS - runc resume `` - -Where "``" is the name for the instance of the container to be -resumed. +**runc resume** _container-id_ # DESCRIPTION - The resume command resumes all processes in the instance of the container. -Use runc list to identify instances of containers and their current status. +The **resume** command resumes all processes in the instance of the container +identified by _container-id_. + +Use **runc list** to identify instances of containers and their current status. + +# SEE ALSO +**runc-list**(8), +**runc-pause**(8), +**runc**(8). diff --git a/man/runc-run.8.md b/man/runc-run.8.md index ad2b8b2..4959469 100644 --- a/man/runc-run.8.md +++ b/man/runc-run.8.md @@ -1,31 +1,53 @@ % runc-run "8" # NAME - runc run - create and run a container +**runc-run** - create and start a container # SYNOPSIS - runc run [command options] `` - -Where "``" is your name for the instance of the container that you -are starting. The name you provide for the container instance must be unique on -your host. +**runc run** [_option_ ...] _container-id_ # DESCRIPTION - The run command creates an instance of a container for a bundle. The bundle -is a directory with a specification file named "config.json" and a root -filesystem. - -The specification file includes an args parameter. The args parameter is used -to specify command(s) that get run when the container is started. To change the -command(s) that get executed on start, edit the args parameter of the spec. See -"runc spec --help" for more explanation. +The **run** command creates an instance of a container from a bundle, and +starts it. 
You can think of **run** as a shortcut for **create** followed by +**start**. # OPTIONS - --bundle value, -b value path to the root of the bundle directory, defaults to the current directory - --console-socket value path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal - --detach, -d detach from the container's process - --pid-file value specify the file to write the process id to - --no-subreaper disable the use of the subreaper used to reap reparented processes - --no-pivot do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk - --no-new-keyring do not create a new session keyring for the container. This will cause the container to inherit the calling processes session key - --preserve-fds value Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0) +**--bundle**|**-b** _path_ +: Path to the root of the bundle directory. Default is current directory. + +**--console-socket** _path_ +: Path to an **AF_UNIX** socket which will receive a file descriptor +referencing the master end of the console's pseudoterminal. See +[docs/terminals](https://github.com/opencontainers/runc/blob/master/docs/terminals.md). + +**--detach**|**-d** +: Detach from the container's process. + +**--pid-file** _path_ +: Specify the file to write the initial container process' PID to. + +**--no-subreaper** +: Disable the use of the subreaper used to reap reparented processes. + +**--no-pivot** +: Do not use pivot root to jail process inside rootfs. This should not be used +except in exceptional circumstances, and may be unsafe from the security +standpoint. + +**--no-new-keyring** +: Do not create a new session keyring for the container. This will cause the +container to inherit the calling processes session key. 
+ +**--preserve-fds** _N_ +: Pass _N_ additional file descriptors to the container (**stdio** + +**$LISTEN_FDS** + _N_ in total). Default is **0**. + +**--keep** +: Keep container's state directory and cgroup. This can be helpful if a user +wants to check the state (e.g. of cgroup controllers) after the container has +exited. If this option is used, a manual **runc delete** is needed afterwards +to clean an exited container's artefacts. + +# SEE ALSO + +**runc**(8). diff --git a/man/runc-spec.8.md b/man/runc-spec.8.md index 6a181cd..8e7dbfc 100644 --- a/man/runc-spec.8.md +++ b/man/runc-spec.8.md @@ -1,56 +1,71 @@ % runc-spec "8" # NAME - runc spec - create a new specification file +**runc-spec** - create a new specification file # SYNOPSIS - runc spec [command options] [arguments...] +**runc spec** [_option_ ...] # DESCRIPTION - The spec command creates the new specification file named "config.json" for +The **spec** command creates the new specification file named _config.json_ for the bundle. The spec generated is just a starter file. Editing of the spec is required to -achieve desired results. For example, the newly generated spec includes an args -parameter that is initially set to call the "sh" command when the container is -started. Calling "sh" may work for an ubuntu container or busybox, but will not -work for containers that do not include the "sh" program. +achieve desired results. For example, the newly generated spec includes an +**args** parameter that is initially set to call the **sh** command when the +container is started. Calling **sh** may work for an ubuntu container or busybox, +but will not work for containers that do not include the **sh** binary. -# EXAMPLE - To run docker's hello-world container one needs to set the args parameter -in the spec to call hello. This can be done using the sed command or a text -editor. 
The following commands create a bundle for hello-world, change the -default args parameter in the spec from "sh" to "/hello", then run the hello -command in a new hello-world container named container1: +# OPTIONS +**--bundle**|**-b** _path_ +: Set _path_ to the root of the bundle directory. - mkdir hello - cd hello - docker pull hello-world - docker export $(docker create hello-world) > hello-world.tar - mkdir rootfs - tar -C rootfs -xf hello-world.tar - runc spec - sed -i 's;"sh";"/hello";' config.json - runc start container1 +**--rootless** +: Generate a configuration for a rootless container. Note this option +is entirely different from the global **--rootless** option. -In the start command above, "container1" is the name for the instance of the +# EXAMPLES +To run a simple "hello-world" container, one needs to set the **args** +parameter in the spec to call hello. This can be done using **sed**(1), +**jq**(1), or a text editor. + +The following commands will: + - create a bundle for hello-world; + - change the command to run in a container to **/hello** using **jq**(1); + - run the **hello** command in a new hello-world container named **container1**. + + mkdir hello + cd hello + docker pull hello-world + docker export $(docker create hello-world) > hello-world.tar + mkdir rootfs + tar -C rootfs -xf hello-world.tar + runc spec + jq '.process.args |= ["/hello"]' < config.json > new.json + mv -f new.json config.json + runc run container1 + +In the **run** command above, **container1** is the name for the instance of the container that you are starting. The name you provide for the container instance must be unique on your host. -An alternative for generating a customized spec config is to use "oci-runtime-tool", the -sub-command "oci-runtime-tool generate" has lots of options that can be used to do any -customizations as you want, see [runtime-tools](https://github.com/opencontainers/runtime-tools) -to get more information. 
+An alternative for generating a customized spec config is to use +**oci-runtime-tool**; its sub-command **oci-runtime-tool generate** has lots of +options that can be used to do any customizations as you want. See +[runtime-tools](https://github.com/opencontainers/runtime-tools) to get more +information. -When starting a container through runc, runc needs root privilege. If not -already running as root, you can use sudo to give runc root privilege. For -example: "sudo runc start container1" will give runc root privilege to start the -container on your host. +When starting a container through **runc**, the latter usually needs root +privileges. If not already running as root, you can use **sudo**(8), for +example: -Alternatively, you can start a rootless container, which has the ability to run without root privileges. -For this to work, the specification file needs to be adjusted accordingly. -You can pass the parameter **--rootless** to this command to generate a proper rootless spec file. + sudo runc start container1 -# OPTIONS - --bundle value, -b value path to the root of the bundle directory - --rootless generate a configuration for a rootless container +Alternatively, you can start a rootless container, which has the ability to run +without root privileges. For this to work, the specification file needs to be +adjusted accordingly. You can pass the **--rootless** option to this command +to generate a proper rootless spec file. + +# SEE ALSO +**runc-run**(8), +**runc**(8). diff --git a/man/runc-start.8.md b/man/runc-start.8.md index e4bbacc..fa72ca5 100644 --- a/man/runc-start.8.md +++ b/man/runc-start.8.md @@ -1,14 +1,15 @@ % runc-start "8" # NAME - runc start - start executes the user defined process in a created container +**runc start** - start a previously created container # SYNOPSIS - runc start `` - -Where "``" is your name for the instance of the container that you -are starting. 
The name you provide for the container instance must be unique on -your host. +**runc start** _container-id_ # DESCRIPTION - The start command executes the user defined process in a created container. +The **start** command executes the process defined in _config.json_ in a +container previously created by **runc-create**(8). + +# SEE ALSO +**runc-create**(8), +**runc**(8). diff --git a/man/runc-state.8.md b/man/runc-state.8.md index 768f79f..727b80a 100644 --- a/man/runc-state.8.md +++ b/man/runc-state.8.md @@ -1,13 +1,15 @@ % runc-state "8" # NAME - runc state - output the state of a container +**runc-state** - show the state of a container # SYNOPSIS - runc state `` - -Where "``" is your name for the instance of the container. +**runc state** _container-id_ # DESCRIPTION - The state command outputs current state information for the -instance of a container. +The **state** command outputs current state information for the specified +_container-id_ in a JSON format. + +# SEE ALSO + +**runc**(8). diff --git a/man/runc-update.8.md b/man/runc-update.8.md index fa269d6..8ceaa38 100644 --- a/man/runc-update.8.md +++ b/man/runc-update.8.md @@ -1,55 +1,94 @@ % runc-update "8" # NAME - runc update - update container resource constraints +**runc-update** - update running container resource constraints # SYNOPSIS - runc update [command options] `` +**runc update** [_option_ ...] _container-id_ + +**runc update** **-r** _resources.json_|**-** _container-id_ # DESCRIPTION - The data can be read from a file or the standard input, the -accepted format is as follow (unchanged values can be omitted): +The **update** command change the resource constraints of a running container +instance. 
- { - "memory": { - "limit": 0, - "reservation": 0, - "swap": 0, - "kernel": 0, - "kernelTCP": 0 - }, - "cpu": { - "shares": 0, - "quota": 0, - "period": 0, - "realtimeRuntime": 0, - "realtimePeriod": 0, - "cpus": "", - "mems": "" - }, - "blockIO": { - "blkioWeight": 0 - } - } +The resources can be set using options, or, if **-r** is used, parsed from JSON +provided as a file or from stdin. -Note: if data is to be read from a file or the standard input, all -other options are ignored. +In case **-r** is used, the JSON format is like this: + + { + "memory": { + "limit": 0, + "reservation": 0, + "swap": 0, + "kernel": 0, + "kernelTCP": 0 + }, + "cpu": { + "shares": 0, + "quota": 0, + "period": 0, + "realtimeRuntime": 0, + "realtimePeriod": 0, + "cpus": "", + "mems": "" + }, + "blockIO": { + "blkioWeight": 0 + } + } # OPTIONS - --resources value, -r value path to the file containing the resources to update or '-' to read from the standard input - --blkio-weight value Specifies per cgroup weight, range is from 10 to 1000 (default: 0) - --cpu-period value CPU CFS period to be used for hardcapping (in usecs). 0 to use system default - --cpu-quota value CPU CFS hardcap limit (in usecs). Allowed cpu time in a given period - --cpu-rt-period value CPU realtime period to be used for hardcapping (in usecs). 0 to use system default - --cpu-rt-runtime value CPU realtime hardcap limit (in usecs). Allowed cpu time in a given period - --cpu-share value CPU shares (relative weight vs. 
other containers) - --cpuset-cpus value CPU(s) to use - --cpuset-mems value Memory node(s) to use - --kernel-memory value Kernel memory limit (in bytes) - --kernel-memory-tcp value Kernel memory limit (in bytes) for tcp buffer - --memory value Memory limit (in bytes) - --memory-reservation value Memory reservation or soft_limit (in bytes) - --memory-swap value Total memory usage (memory + swap); set '-1' to enable unlimited swap - --pids-limit value Maximum number of pids allowed in the container (default: 0) - --l3-cache-schema The string of Intel RDT/CAT L3 cache schema - --mem-bw-schema The string of Intel RDT/MBA memory bandwidth schema +**--resources**|**-r** _resources.json_ +: Read the new resource limits from _resources.json_. Use **-** to read from +stdin. If this option is used, all other options are ignored. + +**--blkio-weight** _weight_ +: Set a new io weight. + +**--cpu-period** _num_ +: Set CPU CFS period to be used for hardcapping (in microseconds). + +**--cpu-quota** _num_ +: Set CPU usage limit within a given period (in microseconds). + +**--cpu-rt-period** _num_ +: Set CPU realtime period to be used for hardcapping (in microseconds). + +**--cpu-rt-runtime** _num_ +: Set CPU realtime hardcap limit (in usecs). Allowed cpu time in a given period. + +**--cpu-share** _num_ +: Set CPU shares (relative weight vs. other containers). + +**--cpuset-cpus** _list_ +: Set CPU(s) to use. The _list_ can contain commas and ranges. For example: +**0-3,7**. + +**--cpuset-mems** _list_ +: Set memory node(s) to use. The _list_ format is the same as for +**--cpuset-cpus**. + +**--memory** _num_ +: Set memory limit to _num_ bytes. + +**--memory-reservation** _num_ +: Set memory reservation, or soft limit, to _num_ bytes. + +**--memory-swap** _num_ +: Set total memory + swap usage to _num_ bytes. Use **-1** to unset the limit +(i.e. use unlimited swap). + +**--pids-limit** _num_ +: Set the maximum number of processes allowed in the container. 
+ +**--l3-cache-schema** _value_ +: Set the value for Intel RDT/CAT L3 cache schema. + +**--mem-bw-schema** _value_ +: Set the Intel RDT/MBA memory bandwidth schema. + +# SEE ALSO + +**runc**(8). diff --git a/man/runc.8.md b/man/runc.8.md index 49df525..09db1ef 100644 --- a/man/runc.8.md +++ b/man/runc.8.md @@ -1,11 +1,12 @@ % runc "8" # NAME - runc - Open Container Initiative runtime +**runc** - Open Container Initiative runtime # SYNOPSIS - runc [global options] command [command options] [arguments...] - + +**runc** [_global-option_ ...] _command_ [_command-option_ ...] [_argument_ ...] + # DESCRIPTION runc is a command line client for running applications packaged according to the Open Container Initiative (OCI) format and is a compliant implementation of the @@ -17,45 +18,132 @@ existing process monitoring tools and the container will be spawned as a direct child of the process supervisor. Containers are configured using bundles. A bundle for a container is a directory -that includes a specification file named "config.json" and a root filesystem. -The root filesystem contains the contents of the container. +that includes a specification file named _config.json_ and a root filesystem. +The root filesystem contains the contents of the container. -To start a new instance of a container: +To run a new instance of a container: - # runc start [ -b bundle ] + # runc run [ -b bundle ] container-id -Where "``" is your name for the instance of the container that you +Where _container-id_ is your name for the instance of the container that you are starting. The name you provide for the container instance must be unique on -your host. Providing the bundle directory using "-b" is optional. The default -value for "bundle" is the current directory. +your host. + +Providing the bundle directory using **-b** is optional. The default +value for _bundle_ is the current directory. 
# COMMANDS - checkpoint checkpoint a running container - create create a container - delete delete any resources held by the container often used with detached containers - events display container events such as OOM notifications, cpu, memory, IO and network stats - exec execute new process inside the container - init initialize the namespaces and launch the process (do not call it outside of runc) - kill kill sends the specified signal (default: SIGTERM) to the container's init process - list lists containers started by runc with the given root - pause pause suspends all processes inside the container - ps displays the processes running inside a container - restore restore a container from a previous checkpoint - resume resumes all processes that have been previously paused - run create and run a container - spec create a new specification file - start executes the user defined process in a created container - state output the state of a container - update update container resource constraints - help, h Shows a list of commands or help for one command - +**checkpoint** +: Checkpoint a running container. See **runc-checkpoint**(8). + +**create** +: Create a container. See **runc-create**(8). + +**delete** +: Delete any resources held by the container often used with detached +containers. See **runc-delete**(8). + +**events** +: Display container events such as OOM notifications, cpu, memory, IO and +network stats. See **runc-events**(8). + +**exec** +: Execute a new process inside the container. See **runc-exec**(8). + +**init** +: Initialize the namespaces and launch the container init process. This command +is not supposed to be used directly. + +**kill** +: Send a specified signal to the container's init process. See +**runc-kill**(8). + +**list** +: List containers started by runc with the given **--root**. See +**runc-list**(8). + +**pause** +: Suspend all processes inside the container. See **runc-pause**(8). 
+ +**ps** +: Show processes running inside the container. See **runc-ps**(8). + +**restore** +: Restore a container from a previous checkpoint. See **runc-restore**(8). + +**resume** +: Resume all processes that have been previously paused. See **runc-resume**(8). + +**run** +: Create and start a container. See **runc-run**(8). + +**spec** +: Create a new specification file (_config.json_). See **runc-spec**(8). + +**start** +: Start a container previously created by **runc create**. See **runc-start**(8). + +**state** +: Show the container state. See **runc-state**(8). + +**update** +: Update container resource constraints. See **runc-update**(8). + +**help**, **h** +: Show a list of commands or help for a particular command. + # GLOBAL OPTIONS - --debug enable debug output for logging - --log value set the log file path where internal debug information is written (default: "/dev/null") - --log-format value set the format used by logs ('text' (default), or 'json') (default: "text") - --root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc" or $XDG_RUNTIME_DIR/runc for rootless containers) - --criu value path to the criu binary used for checkpoint and restore (default: "criu") - --systemd-cgroup enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. "system.slice:runc:434234" - --rootless value enable rootless mode ('true', 'false', or 'auto') (default: "auto") - --help, -h show help - --version, -v print the version + +These options can be used with any command, and must precede the **command**. + +**--debug** +: Enable debug logging. + +**--log** _path_ +: Set the log destination to _path_. The default is to log to stderr. + +**--log-format** **text**|**json** +: Set the log format (default is **text**). + +**--root** _path_ +: Set the root directory to store containers' state. The _path_ should be +located on tmpfs. 
Default is */run/runc*, or *$XDG_RUNTIME_DIR/runc* for +rootless containers. + +**--criu** _path_ +: Set the path to the **criu**(8) binary used for checkpoint and restore. +Default is **criu**. + +**--systemd-cgroup** +: Enable systemd cgroup support. If this is set, the container spec +(_config.json_) is expected to have **cgroupsPath** value in the +*slice:prefix:name* form (e.g. **system.slice:runc:434234**). + +**--rootless** **true**|**false**|**auto** +: Enable or disable rootless mode. Default is **auto**, meaning to auto-detect +whether rootless should be enabled. + +**--help**|**-h** +: Show help. + +**--version**|**-v** +: Show version. + +# SEE ALSO + +**runc-checkpoint**(8), +**runc-create**(8), +**runc-delete**(8), +**runc-events**(8), +**runc-exec**(8), +**runc-kill**(8), +**runc-list**(8), +**runc-pause**(8), +**runc-ps**(8), +**runc-restore**(8), +**runc-resume**(8), +**runc-run**(8), +**runc-spec**(8), +**runc-start**(8), +**runc-state**(8), +**runc-update**(8). diff --git a/notify_socket.go b/notify_socket.go index e7453c6..76aa27c 100644 --- a/notify_socket.go +++ b/notify_socket.go @@ -1,17 +1,16 @@ -// +build linux - package main import ( "bytes" - "fmt" "net" "os" + "path" "path/filepath" + "strconv" + "time" + "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runtime-spec/specs-go" - - "github.com/sirupsen/logrus" "github.com/urfave/cli" ) @@ -27,12 +26,12 @@ func newNotifySocket(context *cli.Context, notifySocketHost string, id string) * } root := filepath.Join(context.GlobalString("root"), id) - path := filepath.Join(root, "notify.sock") + socketPath := filepath.Join(root, "notify", "notify.sock") notifySocket := ¬ifySocket{ socket: nil, host: notifySocketHost, - socketPath: path, + socketPath: socketPath, } return notifySocket @@ -44,13 +43,18 @@ func (s *notifySocket) Close() error { // If systemd is supporting sd_notify protocol, this function will add support // for sd_notify protocol from within the container. 
-func (s *notifySocket) setupSpec(context *cli.Context, spec *specs.Spec) { - mount := specs.Mount{Destination: s.host, Source: s.socketPath, Options: []string{"bind"}} +func (s *notifySocket) setupSpec(spec *specs.Spec) { + pathInContainer := filepath.Join("/run/notify", path.Base(s.socketPath)) + mount := specs.Mount{ + Destination: path.Dir(pathInContainer), + Source: path.Dir(s.socketPath), + Options: []string{"bind", "nosuid", "noexec", "nodev", "ro"}, + } spec.Mounts = append(spec.Mounts, mount) - spec.Process.Env = append(spec.Process.Env, fmt.Sprintf("NOTIFY_SOCKET=%s", s.host)) + spec.Process.Env = append(spec.Process.Env, "NOTIFY_SOCKET="+pathInContainer) } -func (s *notifySocket) setupSocket() error { +func (s *notifySocket) bindSocket() error { addr := net.UnixAddr{ Name: s.socketPath, Net: "unixgram", @@ -61,7 +65,7 @@ func (s *notifySocket) setupSocket() error { return err } - err = os.Chmod(s.socketPath, 0777) + err = os.Chmod(s.socketPath, 0o777) if err != nil { socket.Close() return err @@ -71,46 +75,95 @@ func (s *notifySocket) setupSocket() error { return nil } -// pid1 must be set only with -d, as it is used to set the new process as the main process -// for the service in systemd -func (s *notifySocket) run(pid1 int) { - buf := make([]byte, 512) - notifySocketHostAddr := net.UnixAddr{Name: s.host, Net: "unixgram"} +func (s *notifySocket) setupSocketDirectory() error { + return os.Mkdir(path.Dir(s.socketPath), 0o755) +} + +func notifySocketStart(context *cli.Context, notifySocketHost, id string) (*notifySocket, error) { + notifySocket := newNotifySocket(context, notifySocketHost, id) + if notifySocket == nil { + return nil, nil + } + + if err := notifySocket.bindSocket(); err != nil { + return nil, err + } + return notifySocket, nil +} + +func (n *notifySocket) waitForContainer(container libcontainer.Container) error { + s, err := container.State() + if err != nil { + return err + } + return n.run(s.InitProcessPid) +} + +func (n *notifySocket) 
run(pid1 int) error { + if n.socket == nil { + return nil + } + notifySocketHostAddr := net.UnixAddr{Name: n.host, Net: "unixgram"} client, err := net.DialUnix("unixgram", nil, ¬ifySocketHostAddr) if err != nil { - logrus.Error(err) - return + return err } - for { - r, err := s.socket.Read(buf) - if err != nil { - break - } - var out bytes.Buffer - for _, line := range bytes.Split(buf[0:r], []byte{'\n'}) { - if bytes.HasPrefix(line, []byte("READY=")) { - _, err = out.Write(line) - if err != nil { - return - } - _, err = out.Write([]byte{'\n'}) - if err != nil { - return - } + ticker := time.NewTicker(time.Millisecond * 100) + defer ticker.Stop() - _, err = client.Write(out.Bytes()) - if err != nil { - return - } - - // now we can inform systemd to use pid1 as the pid to monitor - if pid1 > 0 { - newPid := fmt.Sprintf("MAINPID=%d\n", pid1) - client.Write([]byte(newPid)) - } + fileChan := make(chan []byte) + go func() { + for { + buf := make([]byte, 4096) + r, err := n.socket.Read(buf) + if err != nil { return } + got := buf[0:r] + // systemd-ready sends a single datagram with the state string as payload, + // so we don't need to worry about partial messages. 
+ for _, line := range bytes.Split(got, []byte{'\n'}) { + if bytes.HasPrefix(got, []byte("READY=")) { + fileChan <- line + return + } + } + + } + }() + + for { + select { + case <-ticker.C: + _, err := os.Stat(filepath.Join("/proc", strconv.Itoa(pid1))) + if err != nil { + return nil + } + case b := <-fileChan: + var out bytes.Buffer + _, err = out.Write(b) + if err != nil { + return err + } + + _, err = out.Write([]byte{'\n'}) + if err != nil { + return err + } + + _, err = client.Write(out.Bytes()) + if err != nil { + return err + } + + // now we can inform systemd to use pid1 as the pid to monitor + newPid := "MAINPID=" + strconv.Itoa(pid1) + _, err := client.Write([]byte(newPid + "\n")) + if err != nil { + return err + } + return nil } } } diff --git a/pause.go b/pause.go index 224c79f..a7f0aac 100644 --- a/pause.go +++ b/pause.go @@ -1,5 +1,3 @@ -// +build linux - package main import ( diff --git a/ps.go b/ps.go index e7f635f..4083e55 100644 --- a/ps.go +++ b/ps.go @@ -1,9 +1,8 @@ -// +build linux - package main import ( "encoding/json" + "errors" "fmt" "os" "os/exec" @@ -52,11 +51,11 @@ var psCommand = cli.Command{ case "json": return json.NewEncoder(os.Stdout).Encode(pids) default: - return fmt.Errorf("invalid format option") + return errors.New("invalid format option") } // [1:] is to remove command name, ex: - // context.Args(): [containet_id ps_arg1 ps_arg2 ...] + // context.Args(): [container_id ps_arg1 ps_arg2 ...] // psArgs: [ps_arg1 ps_arg2 ...] // psArgs := context.Args()[1:] @@ -67,7 +66,7 @@ var psCommand = cli.Command{ cmd := exec.Command("ps", psArgs...) 
output, err := cmd.CombinedOutput() if err != nil { - return fmt.Errorf("%s: %s", err, output) + return fmt.Errorf("%w: %s", err, output) } lines := strings.Split(string(output), "\n") @@ -84,7 +83,7 @@ var psCommand = cli.Command{ fields := strings.Fields(line) p, err := strconv.Atoi(fields[pidIndex]) if err != nil { - return fmt.Errorf("unexpected pid '%s': %s", fields[pidIndex], err) + return fmt.Errorf("unable to parse pid: %w", err) } for _, pid := range pids { @@ -109,5 +108,5 @@ func getPidIndex(title string) (int, error) { } } - return pidIndex, fmt.Errorf("couldn't find PID field in ps output") + return pidIndex, errors.New("couldn't find PID field in ps output") } diff --git a/restore.go b/restore.go index 53f50d2..59d2904 100644 --- a/restore.go +++ b/restore.go @@ -1,12 +1,10 @@ -// +build linux - package main import ( "os" "github.com/opencontainers/runc/libcontainer" - "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/userns" "github.com/sirupsen/logrus" "github.com/urfave/cli" ) @@ -91,25 +89,31 @@ using the runc checkpoint command.`, Name: "lazy-pages", Usage: "use userfaultfd to lazily restore memory pages", }, + cli.StringFlag{ + Name: "lsm-profile", + Value: "", + Usage: "Specify an LSM profile to be used during restore in the form of TYPE:NAME.", + }, + cli.StringFlag{ + Name: "lsm-mount-context", + Value: "", + Usage: "Specify an LSM mount context to be used during restore.", + }, }, Action: func(context *cli.Context) error { if err := checkArgs(context, 1, exactArgs); err != nil { return err } // XXX: Currently this is untested with rootless containers. 
- if os.Geteuid() != 0 || system.RunningInUserNS() { + if os.Geteuid() != 0 || userns.RunningInUserNS() { logrus.Warn("runc checkpoint is untested with rootless containers") } - spec, err := setupSpec(context) - if err != nil { - return err - } options := criuOptions(context) if err := setEmptyNsMask(context, options); err != nil { return err } - status, err := startContainer(context, spec, CT_ACT_RESTORE, options) + status, err := startContainer(context, CT_ACT_RESTORE, options) if err != nil { return err } @@ -121,14 +125,15 @@ using the runc checkpoint command.`, } func criuOptions(context *cli.Context) *libcontainer.CriuOpts { - imagePath := getCheckpointImagePath(context) - if err := os.MkdirAll(imagePath, 0755); err != nil { + imagePath, parentPath, err := prepareImagePaths(context) + if err != nil { fatal(err) } + return &libcontainer.CriuOpts{ ImagesDirectory: imagePath, WorkDirectory: context.String("work-path"), - ParentImage: context.String("parent-path"), + ParentImage: parentPath, LeaveRunning: context.Bool("leave-running"), TcpEstablished: context.Bool("tcp-established"), ExternalUnixConnections: context.Bool("ext-unix-sk"), @@ -137,6 +142,8 @@ func criuOptions(context *cli.Context) *libcontainer.CriuOpts { PreDump: context.Bool("pre-dump"), AutoDedup: context.Bool("auto-dedup"), LazyPages: context.Bool("lazy-pages"), - StatusFd: context.String("status-fd"), + StatusFd: context.Int("status-fd"), + LsmProfile: context.String("lsm-profile"), + LsmMountContext: context.String("lsm-mount-context"), } } diff --git a/rlimit_linux.go b/rlimit_linux.go index c97a0fb..1f7a205 100644 --- a/rlimit_linux.go +++ b/rlimit_linux.go @@ -1,43 +1,28 @@ package main -import "fmt" +import ( + "fmt" -const ( - RLIMIT_CPU = iota // CPU time in sec - RLIMIT_FSIZE // Maximum filesize - RLIMIT_DATA // max data size - RLIMIT_STACK // max stack size - RLIMIT_CORE // max core file size - RLIMIT_RSS // max resident set size - RLIMIT_NPROC // max number of processes - 
RLIMIT_NOFILE // max number of open files - RLIMIT_MEMLOCK // max locked-in-memory address space - RLIMIT_AS // address space limit - RLIMIT_LOCKS // maximum file locks held - RLIMIT_SIGPENDING // max number of pending signals - RLIMIT_MSGQUEUE // maximum bytes in POSIX mqueues - RLIMIT_NICE // max nice prio allowed to raise to - RLIMIT_RTPRIO // maximum realtime priority - RLIMIT_RTTIME // timeout for RT tasks in us + "golang.org/x/sys/unix" ) var rlimitMap = map[string]int{ - "RLIMIT_CPU": RLIMIT_CPU, - "RLIMIT_FSIZE": RLIMIT_FSIZE, - "RLIMIT_DATA": RLIMIT_DATA, - "RLIMIT_STACK": RLIMIT_STACK, - "RLIMIT_CORE": RLIMIT_CORE, - "RLIMIT_RSS": RLIMIT_RSS, - "RLIMIT_NPROC": RLIMIT_NPROC, - "RLIMIT_NOFILE": RLIMIT_NOFILE, - "RLIMIT_MEMLOCK": RLIMIT_MEMLOCK, - "RLIMIT_AS": RLIMIT_AS, - "RLIMIT_LOCKS": RLIMIT_LOCKS, - "RLIMIT_SIGPENDING": RLIMIT_SIGPENDING, - "RLIMIT_MSGQUEUE": RLIMIT_MSGQUEUE, - "RLIMIT_NICE": RLIMIT_NICE, - "RLIMIT_RTPRIO": RLIMIT_RTPRIO, - "RLIMIT_RTTIME": RLIMIT_RTTIME, + "RLIMIT_CPU": unix.RLIMIT_CPU, + "RLIMIT_FSIZE": unix.RLIMIT_FSIZE, + "RLIMIT_DATA": unix.RLIMIT_DATA, + "RLIMIT_STACK": unix.RLIMIT_STACK, + "RLIMIT_CORE": unix.RLIMIT_CORE, + "RLIMIT_RSS": unix.RLIMIT_RSS, + "RLIMIT_NPROC": unix.RLIMIT_NPROC, + "RLIMIT_NOFILE": unix.RLIMIT_NOFILE, + "RLIMIT_MEMLOCK": unix.RLIMIT_MEMLOCK, + "RLIMIT_AS": unix.RLIMIT_AS, + "RLIMIT_LOCKS": unix.RLIMIT_LOCKS, + "RLIMIT_SIGPENDING": unix.RLIMIT_SIGPENDING, + "RLIMIT_MSGQUEUE": unix.RLIMIT_MSGQUEUE, + "RLIMIT_NICE": unix.RLIMIT_NICE, + "RLIMIT_RTPRIO": unix.RLIMIT_RTPRIO, + "RLIMIT_RTTIME": unix.RLIMIT_RTTIME, } func strToRlimit(key string) (int, error) { diff --git a/rootless_linux.go b/rootless_linux.go index 3c425dc..ae01703 100644 --- a/rootless_linux.go +++ b/rootless_linux.go @@ -1,11 +1,11 @@ -// +build linux - package main import ( "os" - "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + 
"github.com/opencontainers/runc/libcontainer/userns" + "github.com/sirupsen/logrus" "github.com/urfave/cli" ) @@ -19,19 +19,32 @@ func shouldUseRootlessCgroupManager(context *cli.Context) (bool, error) { if b != nil { return *b, nil } - - if context.GlobalBool("systemd-cgroup") { - return false, nil - } } if os.Geteuid() != 0 { return true, nil } - if !system.RunningInUserNS() { + if !userns.RunningInUserNS() { // euid == 0 , in the initial ns (i.e. the real root) return false, nil } // euid = 0, in a userns. + // + // [systemd driver] + // We can call DetectUID() to parse the OwnerUID value from `busctl --user --no-pager status` result. + // The value corresponds to sd_bus_creds_get_owner_uid(3). + // If the value is 0, we have rootful systemd inside userns, so we do not need the rootless cgroup manager. + // + // On error, we assume we are root. An error may happen during shelling out to `busctl` CLI, + // mostly when $DBUS_SESSION_BUS_ADDRESS is unset. + if context.GlobalBool("systemd-cgroup") { + ownerUID, err := systemd.DetectUID() + if err != nil { + logrus.WithError(err).Debug("failed to get the OwnerUID value, assuming the value to be 0") + ownerUID = 0 + } + return ownerUID != 0, nil + } + // [cgroupfs driver] // As we are unaware of cgroups path, we can't determine whether we have the full // access to the cgroups path. // Either way, we can safely decide to use the rootless cgroups manager. @@ -45,7 +58,7 @@ func shouldHonorXDGRuntimeDir() bool { if os.Geteuid() != 0 { return true } - if !system.RunningInUserNS() { + if !userns.RunningInUserNS() { // euid == 0 , in the initial ns (i.e. the real root) // in this case, we should use /run/runc and ignore // $XDG_RUNTIME_DIR (e.g. 
/run/user/0) for backward diff --git a/run.go b/run.go index f8d6317..8278166 100644 --- a/run.go +++ b/run.go @@ -1,8 +1,7 @@ -// +build linux - package main import ( + "fmt" "os" "github.com/urfave/cli" @@ -40,6 +39,10 @@ command(s) that get executed on start, edit the args parameter of the spec. See Name: "detach, d", Usage: "detach from the container's process", }, + cli.BoolFlag{ + Name: "keep", + Usage: "do not delete the container after it exits", + }, cli.StringFlag{ Name: "pid-file", Value: "", @@ -66,19 +69,12 @@ command(s) that get executed on start, edit the args parameter of the spec. See if err := checkArgs(context, 1, exactArgs); err != nil { return err } - if err := revisePidFile(context); err != nil { - return err - } - spec, err := setupSpec(context) - if err != nil { - return err - } - status, err := startContainer(context, spec, CT_ACT_RUN, nil) + status, err := startContainer(context, CT_ACT_RUN, nil) if err == nil { // exit with the container's exit status so any external supervisor is // notified of the exit with the correct exit status. 
os.Exit(status) } - return err + return fmt.Errorf("runc run failed: %w", err) }, } diff --git a/script/.validate b/script/.validate deleted file mode 100644 index 170d674..0000000 --- a/script/.validate +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -if [ -z "$VALIDATE_UPSTREAM" ]; then - # this is kind of an expensive check, so let's not do this twice if we - # are running more than one validate bundlescript - - VALIDATE_REPO='https://github.com/opencontainers/runc.git' - VALIDATE_BRANCH='master' - - if [ "$TRAVIS" = 'true' -a "$TRAVIS_PULL_REQUEST" != 'false' ]; then - VALIDATE_REPO="https://github.com/${TRAVIS_REPO_SLUG}.git" - VALIDATE_BRANCH="${TRAVIS_BRANCH}" - fi - - VALIDATE_HEAD="$(git rev-parse --verify HEAD)" - - git fetch -q "$VALIDATE_REPO" "refs/heads/$VALIDATE_BRANCH" - VALIDATE_UPSTREAM="$(git rev-parse --verify FETCH_HEAD)" - - VALIDATE_COMMIT_LOG="$VALIDATE_UPSTREAM..$VALIDATE_HEAD" - VALIDATE_COMMIT_DIFF="$VALIDATE_UPSTREAM...$VALIDATE_HEAD" - - validate_diff() { - if [ "$VALIDATE_UPSTREAM" != "$VALIDATE_HEAD" ]; then - git diff "$VALIDATE_COMMIT_DIFF" "$@" - fi - } - validate_log() { - if [ "$VALIDATE_UPSTREAM" != "$VALIDATE_HEAD" ]; then - git log "$VALIDATE_COMMIT_LOG" "$@" - fi - } -fi diff --git a/script/check-config.sh b/script/check-config.sh index 6b8158e..0fe1391 100755 --- a/script/check-config.sh +++ b/script/check-config.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -e -# bits of this were adapted from check_config.sh in docker +# bits of this were adapted from check_config.sh in docker # see also https://github.com/docker/docker/blob/master/contrib/check-config.sh possibleConfigs=( @@ -164,17 +164,21 @@ echo echo 'Generally Necessary:' echo -n '- ' -cgroupSubsystemDir="$(awk '/[, ](cpu|cpuacct|cpuset|devices|freezer|memory)[, ]/ && $3 == "cgroup" { print $2 }' /proc/mounts | head -n1)" -cgroupDir="$(dirname "$cgroupSubsystemDir")" -if [ -d "$cgroupDir/cpu" -o -d "$cgroupDir/cpuacct" -o -d "$cgroupDir/cpuset" -o -d "$cgroupDir/devices" 
-o -d "$cgroupDir/freezer" -o -d "$cgroupDir/memory" ]; then - echo "$(wrap_good 'cgroup hierarchy' 'properly mounted') [$cgroupDir]" +if [ "$(stat -f -c %t /sys/fs/cgroup 2>/dev/null)" = "63677270" ]; then + echo "$(wrap_good 'cgroup hierarchy' 'cgroupv2')" else - if [ "$cgroupSubsystemDir" ]; then - echo "$(wrap_bad 'cgroup hierarchy' 'single mountpoint!') [$cgroupSubsystemDir]" + cgroupSubsystemDir="$(awk '/[, ](cpu|cpuacct|cpuset|devices|freezer|memory)[, ]/ && $3 == "cgroup" { print $2 }' /proc/mounts | head -n1)" + cgroupDir="$(dirname "$cgroupSubsystemDir")" + if [ -d "$cgroupDir/cpu" -o -d "$cgroupDir/cpuacct" -o -d "$cgroupDir/cpuset" -o -d "$cgroupDir/devices" -o -d "$cgroupDir/freezer" -o -d "$cgroupDir/memory" ]; then + echo "$(wrap_good 'cgroup hierarchy' 'properly mounted') [$cgroupDir]" else - echo "$(wrap_bad 'cgroup hierarchy' 'nonexistent??')" + if [ "$cgroupSubsystemDir" ]; then + echo "$(wrap_bad 'cgroup hierarchy' 'single mountpoint!') [$cgroupSubsystemDir]" + else + echo "$(wrap_bad 'cgroup hierarchy' 'nonexistent??')" + fi + echo " $(wrap_color '(see https://github.com/tianon/cgroupfs-mount)' yellow)" fi - echo " $(wrap_color '(see https://github.com/tianon/cgroupfs-mount)' yellow)" fi if [ "$(cat /sys/module/apparmor/parameters/enabled 2>/dev/null)" = 'Y' ]; then @@ -199,14 +203,23 @@ flags=( CGROUPS CGROUP_CPUACCT CGROUP_DEVICE CGROUP_FREEZER CGROUP_SCHED CPUSETS MEMCG KEYS VETH BRIDGE BRIDGE_NETFILTER - NF_NAT_IPV4 IP_NF_FILTER IP_NF_TARGET_MASQUERADE + IP_NF_FILTER IP_NF_TARGET_MASQUERADE NETFILTER_XT_MATCH_{ADDRTYPE,CONNTRACK,IPVS} - IP_NF_NAT NF_NAT NF_NAT_NEEDED - + IP_NF_NAT NF_NAT + # required for bind-mounting /dev/mqueue into containers POSIX_MQUEUE ) check_flags "${flags[@]}" + +if [ "$kernelMajor" -lt 5 ] || [ "$kernelMajor" -eq 5 -a "$kernelMinor" -le 1 ]; then + check_flags NF_NAT_IPV4 +fi + +if [ "$kernelMajor" -lt 5 ] || [ "$kernelMajor" -eq 5 -a "$kernelMinor" -le 2 ]; then + check_flags NF_NAT_NEEDED +fi + echo echo 
'Optional Features:' @@ -215,11 +228,16 @@ echo 'Optional Features:' check_distro_userns check_flags SECCOMP + check_flags SECCOMP_FILTER check_flags CGROUP_PIDS - check_flags MEMCG_SWAP MEMCG_SWAP_ENABLED - if is_set MEMCG_SWAP && ! is_set MEMCG_SWAP_ENABLED; then - echo " $(wrap_color '(note that cgroup swap accounting is not enabled in your kernel config, you can enable it by setting boot option "swapaccount=1")' bold black)" + check_flags MEMCG_SWAP + + if [ "$kernelMajor" -lt 5 ] || [ "$kernelMajor" -eq 5 -a "$kernelMinor" -le 8 ]; then + check_flags MEMCG_SWAP_ENABLED + if is_set MEMCG_SWAP && ! is_set MEMCG_SWAP_ENABLED; then + echo " $(wrap_color '(note that cgroup swap accounting is not enabled in your kernel config, you can enable it by setting boot option "swapaccount=1")' bold black)" + fi fi } @@ -237,8 +255,12 @@ else netprio=CGROUP_NET_PRIO fi +if [ "$kernelMajor" -lt 5 ]; then + check_flags IOSCHED_CFQ CFQ_GROUP_IOSCHED +fi + flags=( - BLK_CGROUP BLK_DEV_THROTTLING IOSCHED_CFQ CFQ_GROUP_IOSCHED + BLK_CGROUP BLK_DEV_THROTTLING CGROUP_PERF CGROUP_HUGETLB NET_CLS_CGROUP $netprio @@ -249,5 +271,7 @@ flags=( IP_VS_PROTO_TCP IP_VS_PROTO_UDP IP_VS_RR + SECURITY_SELINUX + SECURITY_APPARMOR ) check_flags "${flags[@]}" diff --git a/script/lib.sh b/script/lib.sh new file mode 100644 index 0000000..9afa0b4 --- /dev/null +++ b/script/lib.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# set_cross_vars sets a few environment variables used for cross-compiling, +# based on the architecture specified in $1. 
+function set_cross_vars() { + GOARCH="$1" # default, may be overridden below + unset GOARM + + case $1 in + arm64) + HOST=aarch64-linux-gnu + ;; + armel) + HOST=arm-linux-gnueabi + GOARCH=arm + GOARM=6 + ;; + armhf) + HOST=arm-linux-gnueabihf + GOARCH=arm + GOARM=7 + ;; + ppc64le) + HOST=powerpc64le-linux-gnu + ;; + s390x) + HOST=s390x-linux-gnu + ;; + *) + echo "set_cross_vars: unsupported architecture: $1" >&2 + exit 1 + ;; + esac + + CC=$HOST-gcc + STRIP=$HOST-strip + + export HOST GOARM GOARCH CC STRIP +} diff --git a/script/release.sh b/script/release.sh deleted file mode 100755 index a1ebc95..0000000 --- a/script/release.sh +++ /dev/null @@ -1,130 +0,0 @@ -#!/bin/bash -# Copyright (C) 2017 SUSE LLC. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e - -## ---> -# Project-specific options and functions. In *theory* you shouldn't need to -# touch anything else in this script in order to use this elsewhere. -project="runc" -root="$(readlink -f "$(dirname "${BASH_SOURCE}")/..")" - -# This function takes an output path as an argument, where the built -# (preferably static) binary should be placed. -function build_project() { - builddir="$(dirname "$1")" - - # Build with all tags enabled. - make -C "$root" COMMIT_NO= BUILDTAGS="seccomp selinux apparmor" static - mv "$root/$project" "$1" -} - -# End of the easy-to-configure portion. -## <--- - -# Print usage information. 
-function usage() { - echo "usage: release.sh [-S ] [-c ] [-r ] [-v ]" >&2 - exit 1 -} - -# Log something to stderr. -function log() { - echo "[*] $*" >&2 -} - -# Log something to stderr and then exit with 0. -function bail() { - log "$@" - exit 0 -} - -# Conduct a sanity-check to make sure that GPG provided with the given -# arguments can sign something. Inability to sign things is not a fatal error. -function gpg_cansign() { - gpg "$@" --clear-sign /dev/null -} - -# When creating releases we need to build static binaries, an archive of the -# current commit, and generate detached signatures for both. -keyid="" -commit="HEAD" -version="" -releasedir="" -hashcmd="" -while getopts "S:c:r:v:h:" opt; do - case "$opt" in - S) - keyid="$OPTARG" - ;; - c) - commit="$OPTARG" - ;; - r) - releasedir="$OPTARG" - ;; - v) - version="$OPTARG" - ;; - h) - hashcmd="$OPTARG" - ;; - \:) - echo "Missing argument: -$OPTARG" >&2 - usage - ;; - \?) - echo "Invalid option: -$OPTARG" >&2 - usage - ;; - esac -done - -version="${version:-$(<"$root/VERSION")}" -releasedir="${releasedir:-release/$version}" -hashcmd="${hashcmd:-sha256sum}" -goarch="$(go env GOARCH || echo "amd64")" - -log "creating $project release in '$releasedir'" -log " version: $version" -log " commit: $commit" -log " key: ${keyid:-DEFAULT}" -log " hash: $hashcmd" - -# Make explicit what we're doing. -set -x - -# Make the release directory. -rm -rf "$releasedir" && mkdir -p "$releasedir" - -# Build project. -build_project "$releasedir/$project.$goarch" - -# Generate new archive. -git archive --format=tar --prefix="$project-$version/" "$commit" | xz > "$releasedir/$project.tar.xz" - -# Generate sha256 checksums for both. -( cd "$releasedir" ; "$hashcmd" "$project".{"$goarch",tar.xz} > "$project.$hashcmd" ; ) - -# Set up the gpgflags. -[[ "$keyid" ]] && export gpgflags="--default-key $keyid" -gpg_cansign $gpgflags || bail "Could not find suitable GPG key, skipping signing step." - -# Sign everything. 
-gpg $gpgflags --detach-sign --armor "$releasedir/$project.$goarch" -gpg $gpgflags --detach-sign --armor "$releasedir/$project.tar.xz" -gpg $gpgflags --clear-sign --armor \ - --output "$releasedir/$project.$hashcmd"{.tmp,} && \ - mv "$releasedir/$project.$hashcmd"{.tmp,} diff --git a/script/release_build.sh b/script/release_build.sh new file mode 100755 index 0000000..2525161 --- /dev/null +++ b/script/release_build.sh @@ -0,0 +1,187 @@ +#!/bin/bash +# Copyright (C) 2017 SUSE LLC. +# Copyright (C) 2017-2021 Open Containers Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +## ---> +# Project-specific options and functions. In *theory* you shouldn't need to +# touch anything else in this script in order to use this elsewhere. +: "${LIBSECCOMP_VERSION:=2.5.3}" +project="runc" +root="$(readlink -f "$(dirname "${BASH_SOURCE[0]}")/..")" + +# shellcheck source=./script/lib.sh +source "$root/script/lib.sh" + +# This function takes an output path as an argument, where the built +# (preferably static) binary should be placed. +# Parameters: +# $1 -- destination directory to place build artefacts to. +# $2 -- native architecture (a .suffix for a native binary file name). +# $@ -- additional architectures to cross-build for. 
+function build_project() { + local builddir + builddir="$(dirname "$1")" + shift + local native_arch="$1" + shift + local arches=("$@") + + # Assume that if /opt/libseccomp exists, then we are run + # via Dockerfile, and seccomp is already built. + local seccompdir=/opt/libseccomp temp_dir + if [ ! -d "$seccompdir" ]; then + temp_dir="$(mktemp -d)" + seccompdir="$temp_dir" + # Download and build libseccomp. + "$root/script/seccomp.sh" "$LIBSECCOMP_VERSION" "$seccompdir" "${arches[@]}" + fi + + # For reproducible builds, add these to EXTRA_LDFLAGS: + # -w to disable DWARF generation; + # -s to disable symbol table; + # -buildid= to remove variable build id. + local ldflags="-w -s -buildid=" + # Add -a to go build flags to make sure it links against + # the provided libseccomp, not the system one (otherwise + # it can reuse cached pkg-config results). + local make_args=(COMMIT_NO= EXTRA_FLAGS="-a" EXTRA_LDFLAGS="${ldflags}" static) + + # Build natively. + make -C "$root" \ + PKG_CONFIG_PATH="$seccompdir/lib/pkgconfig" \ + "${make_args[@]}" + strip "$root/$project" + # Sanity check: make sure libseccomp version is as expected. + local ver + ver=$("$root/$project" --version | awk '$1 == "libseccomp:" {print $2}') + if [ "$ver" != "$LIBSECCOMP_VERSION" ]; then + echo >&2 "libseccomp version mismatch: want $LIBSECCOMP_VERSION, got $ver" + exit 1 + fi + + mv "$root/$project" "$builddir/$project.$native_arch" + + # Cross-build for for other architectures. + local arch + for arch in "${arches[@]}"; do + set_cross_vars "$arch" + make -C "$root" \ + PKG_CONFIG_PATH="$seccompdir/$arch/lib/pkgconfig" \ + "${make_args[@]}" + "$STRIP" "$root/$project" + mv "$root/$project" "$builddir/$project.$arch" + done + + # Copy libseccomp source tarball. + cp "$seccompdir"/src/* "$builddir" + + # Clean up. + if [ -n "$tempdir" ]; then + rm -rf "$tempdir" + fi +} + +# End of the easy-to-configure portion. +## <--- + +# Print usage information. 
+function usage() { + echo "usage: release_build.sh [-a ]... [-c ] [-H ]" >&2 + echo " [-r ] [-v ]" >&2 + exit 1 +} + +# Log something to stderr. +function log() { + echo "[*] $*" >&2 +} + +# Log something to stderr and then exit with 0. +function bail() { + log "$@" + exit 0 +} + +# When creating releases we need to build static binaries, an archive of the +# current commit, and generate detached signatures for both. +commit="HEAD" +version="" +releasedir="" +hashcmd="" +declare -a add_arches + +while getopts "a:c:H:hr:v:" opt; do + case "$opt" in + a) + add_arches+=("$OPTARG") + ;; + c) + commit="$OPTARG" + ;; + H) + hashcmd="$OPTARG" + ;; + h) + usage + ;; + r) + releasedir="$OPTARG" + ;; + v) + version="$OPTARG" + ;; + :) + echo "Missing argument: -$OPTARG" >&2 + usage + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + usage + ;; + esac +done + +version="${version:-$(<"$root/VERSION")}" +releasedir="${releasedir:-release/$version}" +hashcmd="${hashcmd:-sha256sum}" +native_arch="$(go env GOARCH || echo "amd64")" +# Suffixes of files to checksum/sign. +suffixes=("$native_arch" "${add_arches[@]}" tar.xz) + +log "creating $project release in '$releasedir'" +log " version: $version" +log " commit: $commit" +log " hash: $hashcmd" + +# Make explicit what we're doing. +set -x + +# Make the release directory. +rm -rf "$releasedir" && mkdir -p "$releasedir" + +# Build project. +build_project "$releasedir/$project" "$native_arch" "${add_arches[@]}" + +# Generate new archive. +git archive --format=tar --prefix="$project-$version/" "$commit" | xz >"$releasedir/$project.tar.xz" + +# Generate sha256 checksums for binaries and libseccomp tarball. +( + cd "$releasedir" + # Add $project. prefix to all suffixes. 
+ "$hashcmd" "${suffixes[@]/#/$project.}" >"$project.$hashcmd" +) diff --git a/script/release_sign.sh b/script/release_sign.sh new file mode 100755 index 0000000..8cc224a --- /dev/null +++ b/script/release_sign.sh @@ -0,0 +1,108 @@ +#!/bin/bash +# Copyright (C) 2017 SUSE LLC. +# Copyright (C) 2017-2021 Open Containers Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +project="runc" +root="$(readlink -f "$(dirname "${BASH_SOURCE[0]}")/..")" + +# Print usage information. +function usage() { + echo "usage: release_sign.sh [-S ] [-H ]" >&2 + echo " [-r ] [-v ]" >&2 + exit 1 +} + +# Log something to stderr. +function log() { + echo "[*] $*" >&2 +} + +# Log something to stderr and then exit with 0. +function bail() { + log "$@" + exit 0 +} + +# Conduct a sanity-check to make sure that GPG provided with the given +# arguments can sign something. Inability to sign things is not a fatal error. +function gpg_cansign() { + gpg "$@" --clear-sign /dev/null +} + +# When creating releases we need to build static binaries, an archive of the +# current commit, and generate detached signatures for both. +keyid="" +version="" +releasedir="" +hashcmd="" + +while getopts "H:hr:S:v:" opt; do + case "$opt" in + H) + hashcmd="$OPTARG" + ;; + h) + usage + ;; + r) + releasedir="$OPTARG" + ;; + S) + keyid="$OPTARG" + ;; + v) + version="$OPTARG" + ;; + :) + echo "Missing argument: -$OPTARG" >&2 + usage + ;; + \?) 
+ echo "Invalid option: -$OPTARG" >&2 + usage + ;; + esac +done + +version="${version:-$(<"$root/VERSION")}" +releasedir="${releasedir:-release/$version}" +hashcmd="${hashcmd:-sha256sum}" + +log "signing $project release in '$releasedir'" +log " key: ${keyid:-DEFAULT}" +log " hash: $hashcmd" + +# Make explicit what we're doing. +set -x + +# Set up the gpgflags. +gpgflags=() +[[ "$keyid" ]] && gpgflags=(--default-key "$keyid") +gpg_cansign "${gpgflags[@]}" || bail "Could not find suitable GPG key, skipping signing step." + +# Only needed for local signing -- change the owner since by default it's built +# inside a container which means it'll have the wrong owner and permissions. +[ -w "$releasedir" ] || sudo chown -R "$USER:$GROUP" "$releasedir" + +# Sign everything. +for bin in "$releasedir/$project".*; do + [[ "$(basename "$bin")" == "$project.$hashcmd" ]] && continue # skip hash + gpg "${gpgflags[@]}" --detach-sign --armor "$bin" +done +gpg "${gpgflags[@]}" --clear-sign --armor \ + --output "$releasedir/$project.$hashcmd"{.tmp,} && + mv "$releasedir/$project.$hashcmd"{.tmp,} diff --git a/script/seccomp.sh b/script/seccomp.sh new file mode 100755 index 0000000..2c2ea84 --- /dev/null +++ b/script/seccomp.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +set -e -u -o pipefail + +# shellcheck source=./script/lib.sh +source "$(dirname "${BASH_SOURCE[0]}")/lib.sh" + +# Due to libseccomp being LGPL we must include its sources, +# so download, install and build against it. +# Parameters: +# $1 -- libseccomp version to download and build. +# $2 -- destination directory. +# $@ -- additional architectures to cross-compile for. +function build_libseccomp() { + local ver="$1" + shift + local dest="$1" + shift + local arches=("$@") + local tar="libseccomp-${ver}.tar.gz" + + # Download and extract. 
+ wget "https://github.com/seccomp/libseccomp/releases/download/v${ver}/${tar}"{,.asc} + local srcdir + srcdir="$(mktemp -d)" + tar xf "$tar" -C "$srcdir" + pushd "$srcdir/libseccomp-$ver" || return + + # Build natively and install to /usr/local. + ./configure \ + --prefix="$dest" --libdir="$dest/lib" \ + --enable-static --enable-shared + make install + make clean + + # Build and install for additional architectures. + local arch + for arch in "${arches[@]}"; do + set_cross_vars "$arch" + ./configure --host "$HOST" \ + --prefix="$dest/$arch" --libdir="$dest/$arch/lib" \ + --enable-static --enable-shared + make install + make clean + done + + # Place the source tarball to $dest/src. + popd || return + mkdir "$dest"/src + mv "$tar"{,.asc} "$dest"/src +} + +if [ $# -lt 2 ]; then + echo "Usage: seccomp.sh [ ...]" >&2 + exit 1 +fi + +build_libseccomp "$@" diff --git a/script/tmpmount b/script/tmpmount deleted file mode 100755 index 5ac6bc2..0000000 --- a/script/tmpmount +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -mount -t tmpfs none /tmp -exec "$@" diff --git a/script/validate-c b/script/validate-c deleted file mode 100755 index 7c01b51..0000000 --- a/script/validate-c +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -source "$(dirname "$BASH_SOURCE")/.validate" - -IFS=$'\n' -files=($(validate_diff --diff-filter=ACMR --name-only -- '*.c' | grep -v '^vendor/' || true)) -unset IFS - -# indent(1): "You must use the ‘-T’ option to tell indent the name of all the typenames in your program that are defined by typedef." -INDENT="indent -linux -l120 -T size_t -T jmp_buf" -if [ -z "$(indent --version 2>&1 | grep GNU)" ]; then - echo "Skipping C indentation checks, as GNU indent is not installed." 
- exit 0 -fi - -badFiles=() -for f in "${files[@]}"; do - orig=$(mktemp) - formatted=$(mktemp) - # we use "git show" here to validate that what's committed is formatted - git show "$VALIDATE_HEAD:$f" > ${orig} - ${INDENT} ${orig} -o ${formatted} - if [ "$(diff -u ${orig} ${formatted})" ]; then - badFiles+=("$f") - fi - rm -f ${orig} ${formatted} -done - -if [ ${#badFiles[@]} -eq 0 ]; then - echo 'Congratulations! All C source files are properly formatted.' -else - { - echo "These files are not properly formatted:" - for f in "${badFiles[@]}"; do - echo " - $f" - done - echo - echo "Please reformat the above files using \"${INDENT}\" and commit the result." - echo - } >&2 - false -fi diff --git a/script/validate-gofmt b/script/validate-gofmt deleted file mode 100755 index 8337ed2..0000000 --- a/script/validate-gofmt +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -source "$(dirname "$BASH_SOURCE")/.validate" - -IFS=$'\n' -files=($(validate_diff --diff-filter=ACMR --name-only -- '*.go' | grep -v '^vendor/' || true)) -unset IFS - -badFiles=() -for f in "${files[@]}"; do - # we use "git show" here to validate that what's committed is formatted - if [ "$(git show "$VALIDATE_HEAD:$f" | gofmt -s -l)" ]; then - badFiles+=("$f") - fi -done - -if [ ${#badFiles[@]} -eq 0 ]; then - echo 'Congratulations! All Go source files are properly formatted.' -else - { - echo "These files are not properly gofmt'd:" - for f in "${badFiles[@]}"; do - echo " - $f" - done - echo - echo 'Please reformat the above files using "gofmt -s -w" and commit the result.' 
- echo - } >&2 - false -fi diff --git a/signalmap.go b/signalmap.go deleted file mode 100644 index f9a6347..0000000 --- a/signalmap.go +++ /dev/null @@ -1,47 +0,0 @@ -// +build linux -// +build !mips,!mipsle,!mips64,!mips64le - -package main - -import ( - "syscall" - - "golang.org/x/sys/unix" -) - -var signalMap = map[string]syscall.Signal{ - "ABRT": unix.SIGABRT, - "ALRM": unix.SIGALRM, - "BUS": unix.SIGBUS, - "CHLD": unix.SIGCHLD, - "CLD": unix.SIGCLD, - "CONT": unix.SIGCONT, - "FPE": unix.SIGFPE, - "HUP": unix.SIGHUP, - "ILL": unix.SIGILL, - "INT": unix.SIGINT, - "IO": unix.SIGIO, - "IOT": unix.SIGIOT, - "KILL": unix.SIGKILL, - "PIPE": unix.SIGPIPE, - "POLL": unix.SIGPOLL, - "PROF": unix.SIGPROF, - "PWR": unix.SIGPWR, - "QUIT": unix.SIGQUIT, - "SEGV": unix.SIGSEGV, - "STKFLT": unix.SIGSTKFLT, - "STOP": unix.SIGSTOP, - "SYS": unix.SIGSYS, - "TERM": unix.SIGTERM, - "TRAP": unix.SIGTRAP, - "TSTP": unix.SIGTSTP, - "TTIN": unix.SIGTTIN, - "TTOU": unix.SIGTTOU, - "URG": unix.SIGURG, - "USR1": unix.SIGUSR1, - "USR2": unix.SIGUSR2, - "VTALRM": unix.SIGVTALRM, - "WINCH": unix.SIGWINCH, - "XCPU": unix.SIGXCPU, - "XFSZ": unix.SIGXFSZ, -} diff --git a/signalmap_mipsx.go b/signalmap_mipsx.go deleted file mode 100644 index 046bf15..0000000 --- a/signalmap_mipsx.go +++ /dev/null @@ -1,45 +0,0 @@ -// +build linux,mips linux,mipsle linux,mips64 linux,mips64le - -package main - -import ( - "syscall" - - "golang.org/x/sys/unix" -) - -var signalMap = map[string]syscall.Signal{ - "ABRT": unix.SIGABRT, - "ALRM": unix.SIGALRM, - "BUS": unix.SIGBUS, - "CHLD": unix.SIGCHLD, - "CLD": unix.SIGCLD, - "CONT": unix.SIGCONT, - "FPE": unix.SIGFPE, - "HUP": unix.SIGHUP, - "ILL": unix.SIGILL, - "INT": unix.SIGINT, - "IO": unix.SIGIO, - "IOT": unix.SIGIOT, - "KILL": unix.SIGKILL, - "PIPE": unix.SIGPIPE, - "POLL": unix.SIGPOLL, - "PROF": unix.SIGPROF, - "PWR": unix.SIGPWR, - "QUIT": unix.SIGQUIT, - "SEGV": unix.SIGSEGV, - "STOP": unix.SIGSTOP, - "SYS": unix.SIGSYS, - "TERM": unix.SIGTERM, - 
"TRAP": unix.SIGTRAP, - "TSTP": unix.SIGTSTP, - "TTIN": unix.SIGTTIN, - "TTOU": unix.SIGTTOU, - "URG": unix.SIGURG, - "USR1": unix.SIGUSR1, - "USR2": unix.SIGUSR2, - "VTALRM": unix.SIGVTALRM, - "WINCH": unix.SIGWINCH, - "XCPU": unix.SIGXCPU, - "XFSZ": unix.SIGXFSZ, -} diff --git a/signals.go b/signals.go index b67f65a..2555b76 100644 --- a/signals.go +++ b/signals.go @@ -1,11 +1,8 @@ -// +build linux - package main import ( "os" "os/signal" - "syscall" // only for Signal "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/system" @@ -67,10 +64,11 @@ func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach if h.notifySocket != nil { if detach { - h.notifySocket.run(pid1) + _ = h.notifySocket.run(pid1) return 0, nil } - go h.notifySocket.run(0) + _ = h.notifySocket.run(os.Getpid()) + go func() { _ = h.notifySocket.run(0) }() } // Perform the initial tty resize. Always ignore errors resizing because @@ -96,16 +94,13 @@ func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach // call Wait() on the process even though we already have the exit // status because we must ensure that any of the go specific process // fun such as flushing pipes are complete before we return. 
- process.Wait() - if h.notifySocket != nil { - h.notifySocket.Close() - } + _, _ = process.Wait() return e.status, nil } } default: logrus.Debugf("sending signal to process %s", s) - if err := unix.Kill(pid1, s.(syscall.Signal)); err != nil { + if err := unix.Kill(pid1, s.(unix.Signal)); err != nil { logrus.Error(err) } } @@ -123,7 +118,7 @@ func (h *signalHandler) reap() (exits []exit, err error) { for { pid, err := unix.Wait4(-1, &ws, unix.WNOHANG, &rus) if err != nil { - if err == unix.ECHILD { + if err == unix.ECHILD { //nolint:errorlint // unix errors are bare return exits, nil } return nil, err diff --git a/spec.go b/spec.go index 322a83d..806d2f1 100644 --- a/spec.go +++ b/spec.go @@ -1,11 +1,8 @@ -// +build linux - package main import ( "encoding/json" "fmt" - "io/ioutil" "os" "github.com/opencontainers/runc/libcontainer/configs" @@ -111,7 +108,7 @@ created by an unprivileged user. if err != nil { return err } - return ioutil.WriteFile(specConfig, data, 0666) + return os.WriteFile(specConfig, data, 0o666) }, } diff --git a/start.go b/start.go index 2bb698b..338737c 100644 --- a/start.go +++ b/start.go @@ -3,6 +3,7 @@ package main import ( "errors" "fmt" + "os" "github.com/opencontainers/runc/libcontainer" "github.com/urfave/cli" @@ -31,13 +32,23 @@ your host.`, } switch status { case libcontainer.Created: - return container.Exec() + notifySocket, err := notifySocketStart(context, os.Getenv("NOTIFY_SOCKET"), container.ID()) + if err != nil { + return err + } + if err := container.Exec(); err != nil { + return err + } + if notifySocket != nil { + return notifySocket.waitForContainer(container) + } + return nil case libcontainer.Stopped: return errors.New("cannot start a container that has stopped") case libcontainer.Running: return errors.New("cannot start an already running container") default: - return fmt.Errorf("cannot start a container in the %s state\n", status) + return fmt.Errorf("cannot start a container in the %s state", status) } }, } diff --git 
a/state.go b/state.go index 718813c..b645e5a 100644 --- a/state.go +++ b/state.go @@ -1,5 +1,3 @@ -// +build linux - package main import ( diff --git a/tests/fuzzing/oss_fuzz_build.sh b/tests/fuzzing/oss_fuzz_build.sh new file mode 100755 index 0000000..6072dd9 --- /dev/null +++ b/tests/fuzzing/oss_fuzz_build.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# This file is only meant to be run by OSS-fuzz and will not work +# if run outside of it. +# The api, compile_go_fuzzer() is provided by the OSS-fuzz +# environment and is a high level helper function for a series +# of compilation and linking steps to build the fuzzers in the +# OSS-fuzz environment. +# More info about compile_go_fuzzer() can be found here: +# https://google.github.io/oss-fuzz/getting-started/new-project-guide/go-lang/#buildsh +compile_go_fuzzer github.com/opencontainers/runc/libcontainer/system FuzzUIDMap id_map_fuzzer linux,gofuzz +compile_go_fuzzer github.com/opencontainers/runc/libcontainer/user FuzzUser user_fuzzer +compile_go_fuzzer github.com/opencontainers/runc/libcontainer/configs FuzzUnmarshalJSON configs_fuzzer diff --git a/tests/integration/README.md b/tests/integration/README.md index 8ee6ebf..6d0201c 100644 --- a/tests/integration/README.md +++ b/tests/integration/README.md @@ -9,7 +9,8 @@ Integration tests on the other hand are meant to test a specific feature end to end. Integration tests are written in *bash* using the -[bats](https://github.com/sstephenson/bats) framework. +[bats (Bash Automated Testing System)](https://github.com/bats-core/bats-core) +framework. 
## Running integration tests @@ -31,13 +32,14 @@ $ make integration TESTPATH="/checkpoint.bats" ``` -To run them on your host, you will need to setup a development environment plus -[bats](https://github.com/sstephenson/bats#installing-bats-from-source) +To run them on your host, you need to set up a development environment plus +[bats (Bash Automated Testing System)](https://github.com/bats-core/bats-core#installing-bats-from-source). + For example: ``` $ cd ~/go/src/github.com -$ git clone https://github.com/sstephenson/bats.git -$ cd bats +$ git clone https://github.com/bats-core/bats-core.git +$ cd bats-core $ ./install.sh /usr/local ``` @@ -47,8 +49,7 @@ $ ./install.sh /usr/local ## Writing integration tests -[helper functions] -(https://github.com/opencontainers/runc/blob/master/test/integration/helpers.bash) +[helper functions](https://github.com/opencontainers/runc/blob/master/tests/integration/helpers.bash) are provided in order to facilitate writing tests. ```sh @@ -59,15 +60,12 @@ load helpers # setup is called at the beginning of every test. function setup() { - # see functions teardown_hello and setup_hello in helpers.bash, used to - # create a pristine environment for running your tests - teardown_hello setup_hello } # teardown is called at the end of every test. 
function teardown() { - teardown_hello + teardown_bundle } @test "this is a simple test" { diff --git a/tests/integration/cgroup_delegation.bats b/tests/integration/cgroup_delegation.bats new file mode 100644 index 0000000..db1407c --- /dev/null +++ b/tests/integration/cgroup_delegation.bats @@ -0,0 +1,61 @@ +#!/usr/bin/env bats + +load helpers + +function teardown() { + teardown_bundle +} + +function setup() { + requires root cgroups_v2 systemd + + setup_busybox + + # chown test temp dir to allow host user to read it + chown 100000 "$ROOT" + + # chown rootfs to allow host user to mkdir mount points + chown 100000 "$ROOT"/bundle/rootfs + + set_cgroups_path + + # configure a user namespace + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] + ' +} + +@test "runc exec (cgroup v2, ro cgroupfs, new cgroupns) does not chown cgroup" { + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown + [ "$status" -eq 0 ] + + runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup" + [ "$status" -eq 0 ] + [ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user +} + +@test "runc exec (cgroup v2, rw cgroupfs, inh cgroupns) does not chown cgroup" { + set_cgroup_mount_writable + + # inherit cgroup namespace (remove cgroup from namespaces list) + update_config '.linux.namespaces |= map(select(.type != "cgroup"))' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown + [ "$status" -eq 0 ] + + runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup" + [ "$status" -eq 0 ] + [ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user +} + +@test "runc exec (cgroup v2, rw cgroupfs, new cgroupns) does chown cgroup" { + set_cgroup_mount_writable + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown + [ "$status" -eq 0 ] + + runc exec test_cgroup_chown sh -c "stat -c 
%U /sys/fs/cgroup" + [ "$status" -eq 0 ] + [ "$output" = "root" ] # /sys/fs/cgroup owned by root (of user namespace) +} diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index 17812ab..ff7cf6d 100644 --- a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -3,125 +3,409 @@ load helpers function teardown() { - rm -f $BATS_TMPDIR/runc-cgroups-integration-test.json - teardown_running_container test_cgroups_kmem - teardown_running_container test_cgroups_permissions - teardown_busybox + teardown_bundle } function setup() { - teardown - setup_busybox -} - -function check_cgroup_value() { - cgroup=$1 - source=$2 - expected=$3 - - current=$(cat $cgroup/$source) - echo $cgroup/$source - echo "current" $current "!?" "$expected" - [ "$current" -eq "$expected" ] -} - -@test "runc update --kernel-memory (initialized)" { - [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup - requires cgroups_kmem - - set_cgroups_path "$BUSYBOX_BUNDLE" - - # Set some initial known values - DATA=$(cat <<-EOF - "memory": { - "kernel": 16777216 - }, -EOF - ) - DATA=$(echo ${DATA} | sed 's/\n/\\n/g') - sed -i "s/\(\"resources\": {\)/\1\n${DATA}/" ${BUSYBOX_BUNDLE}/config.json - - # run a detached busybox to work with - runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_kmem - [ "$status" -eq 0 ] - - # update kernel memory limit - runc update test_cgroups_kmem --kernel-memory 50331648 - [ "$status" -eq 0 ] - - # check the value - check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648 -} - -@test "runc update --kernel-memory (uninitialized)" { - [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup - requires cgroups_kmem - - set_cgroups_path "$BUSYBOX_BUNDLE" - - # run a detached busybox to work with - runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_kmem - [ "$status" -eq 0 ] - - # update kernel memory limit - runc update test_cgroups_kmem --kernel-memory 50331648 - # Since kernel 4.6, we can update kernel memory without 
initialization - # because it's accounted by default. - if [ "$KERNEL_MAJOR" -lt 4 ] || [ "$KERNEL_MAJOR" -eq 4 -a "$KERNEL_MINOR" -le 5 ]; then - [ ! "$status" -eq 0 ] - else - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648 - fi + setup_busybox } @test "runc create (no limits + no cgrouppath + no permission) succeeds" { - runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions - [ "$status" -eq 0 ] + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions + [ "$status" -eq 0 ] } @test "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" { - requires rootless - requires rootless_no_cgroup + requires rootless rootless_no_cgroup - set_cgroups_path "$BUSYBOX_BUNDLE" + set_cgroups_path - runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions - [ "$status" -eq 1 ] - [[ ${lines[1]} == *"permission denied"* ]] + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions + [ "$status" -eq 1 ] + [[ "$output" == *"unable to apply cgroup configuration"*"permission denied"* ]] } @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" { - requires rootless - requires rootless_no_cgroup + requires rootless rootless_no_cgroup - set_resources_limit "$BUSYBOX_BUNDLE" + set_resources_limit - runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions - [ "$status" -eq 1 ] - [[ ${lines[1]} == *"cannot set pids limit: container could not join or create cgroup"* ]] + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions + [ "$status" -eq 1 ] + [[ "$output" == *"rootless needs no limits + no cgrouppath when no permission is granted for cgroups"* ]] || + [[ "$output" == *"cannot set pids limit: container could not join or create cgroup"* ]] } @test "runc create (limits + cgrouppath + permission on the cgroup dir) succeeds" { - [[ "$ROOTLESS" -ne 0 ]] && requires 
rootless_cgroup + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup - set_cgroups_path "$BUSYBOX_BUNDLE" - set_resources_limit "$BUSYBOX_BUNDLE" + set_cgroups_path + set_resources_limit - runc run -d --console-socket $CONSOLE_SOCKET test_cgroups_permissions - [ "$status" -eq 0 ] + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions + [ "$status" -eq 0 ] + if [ "$CGROUP_UNIFIED" != "no" ]; then + if [ -n "${RUNC_USE_SYSTEMD}" ]; then + if [ "$(id -u)" = "0" ]; then + check_cgroup_value "cgroup.controllers" "$(cat /sys/fs/cgroup/machine.slice/cgroup.controllers)" + else + # Filter out hugetlb and misc as systemd is unable to delegate them. + check_cgroup_value "cgroup.controllers" "$(sed -e 's/ hugetlb//' -e 's/ misc//' /sys/fs/cgroup/foo/cgroup.procs" + [ "$status" -eq 0 ] + + # the init process is now in "/foo", but an exec process can still join "/" + # because we haven't enabled any domain controller. + runc exec test_cgroups_group cat /proc/self/cgroup + [ "$status" -eq 0 ] + [[ ${lines[0]} == "0::/" ]] + + # turn on a domain controller (memory) + runc exec test_cgroups_group sh -euxc 'echo $$ > /sys/fs/cgroup/foo/cgroup.procs; echo +memory > /sys/fs/cgroup/cgroup.subtree_control' + [ "$status" -eq 0 ] + + # an exec process can no longer join "/" after turning on a domain controller. + # falls back to "/foo". + runc exec test_cgroups_group cat /proc/self/cgroup + [ "$status" -eq 0 ] + [[ ${lines[0]} == "0::/foo" ]] + + # teardown: remove "/foo" + # shellcheck disable=SC2016 + runc exec test_cgroups_group sh -uxc 'echo -memory > /sys/fs/cgroup/cgroup.subtree_control; for f in $(cat /sys/fs/cgroup/foo/cgroup.procs); do echo $f > /sys/fs/cgroup/cgroup.procs; done; rmdir /sys/fs/cgroup/foo' + runc exec test_cgroups_group test ! 
-d /sys/fs/cgroup/foo + [ "$status" -eq 0 ] + # +} + +@test "runc run (cgroup v1 + unified resources should fail)" { + requires root cgroups_v1 + + set_cgroups_path + set_resources_limit + update_config '.linux.resources.unified |= {"memory.min": "131072"}' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_unified + [ "$status" -ne 0 ] + [[ "$output" == *'invalid configuration'* ]] +} + +@test "runc run (blkio weight)" { + requires cgroups_v2 + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + set_cgroups_path + update_config '.linux.resources.blockIO |= {"weight": 750}' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_unified + [ "$status" -eq 0 ] + + runc exec test_cgroups_unified sh -c 'cat /sys/fs/cgroup/io.bfq.weight' + if [[ "$status" -eq 0 ]]; then + [ "$output" = 'default 750' ] + else + runc exec test_cgroups_unified sh -c 'cat /sys/fs/cgroup/io.weight' + [ "$output" = 'default 7475' ] + fi +} + +@test "runc run (per-device io weight for bfq)" { + requires root # to create a loop device + + dd if=/dev/zero of=backing.img bs=4096 count=1 + dev=$(losetup --find --show backing.img) || skip "unable to create a loop device" + + # See if BFQ scheduler is available. + if ! { grep -qw bfq "/sys/block/${dev#/dev/}/queue/scheduler" && + echo bfq >"/sys/block/${dev#/dev/}/queue/scheduler"; }; then + losetup -d "$dev" + skip "BFQ scheduler not available" + fi + + set_cgroups_path + + IFS=$' \t:' read -r major minor <<<"$(lsblk -nd -o MAJ:MIN "$dev")" + update_config ' .linux.devices += [{path: "'"$dev"'", type: "b", major: '"$major"', minor: '"$minor"'}] + | .linux.resources.blockIO.weight |= 333 + | .linux.resources.blockIO.weightDevice |= [ + { major: '"$major"', minor: '"$minor"', weight: 444 } + ]' + runc run -d --console-socket "$CONSOLE_SOCKET" test_dev_weight + [ "$status" -eq 0 ] + + # The loop device itself is no longer needed. 
+ losetup -d "$dev" + + if [ "$CGROUP_UNIFIED" = "yes" ]; then + file="io.bfq.weight" + else + file="blkio.bfq.weight_device" + fi + weights=$(get_cgroup_value $file) + [[ "$weights" == *"default 333"* ]] + [[ "$weights" == *"$major:$minor 444"* ]] +} + +@test "runc run (cgroup v2 resources.unified only)" { + requires root cgroups_v2 + + set_cgroups_path + update_config ' .linux.resources.unified |= { + "memory.min": "131072", + "memory.low": "524288", + "memory.high": "5242880", + "memory.max": "10485760", + "memory.swap.max": "20971520", + "pids.max": "99", + "cpu.max": "10000 100000", + "cpu.weight": "42" + }' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_unified + [ "$status" -eq 0 ] + + runc exec test_cgroups_unified sh -c 'cd /sys/fs/cgroup && grep . *.min *.max *.low *.high' + [ "$status" -eq 0 ] + echo "$output" + + echo "$output" | grep -q '^memory.min:131072$' + echo "$output" | grep -q '^memory.low:524288$' + echo "$output" | grep -q '^memory.high:5242880$' + echo "$output" | grep -q '^memory.max:10485760$' + echo "$output" | grep -q '^memory.swap.max:20971520$' + echo "$output" | grep -q '^pids.max:99$' + echo "$output" | grep -q '^cpu.max:10000 100000$' + + check_systemd_value "MemoryMin" 131072 + check_systemd_value "MemoryLow" 524288 + check_systemd_value "MemoryHigh" 5242880 + check_systemd_value "MemoryMax" 10485760 + check_systemd_value "MemorySwapMax" 20971520 + check_systemd_value "TasksMax" 99 + check_cpu_quota 10000 100000 "100ms" + check_cpu_weight 42 +} + +@test "runc run (cgroup v2 resources.unified override)" { + requires root cgroups_v2 + + set_cgroups_path + # CPU shares of 3333 corresponds to CPU weight of 128. 
+ update_config ' .linux.resources.memory |= {"limit": 33554432} + | .linux.resources.memorySwap |= {"limit": 33554432} + | .linux.resources.cpu |= { + "shares": 3333, + "quota": 40000, + "period": 100000 + } + | .linux.resources.unified |= { + "memory.min": "131072", + "memory.max": "10485760", + "pids.max": "42", + "cpu.max": "5000 50000", + "cpu.weight": "42" + }' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_unified + [ "$status" -eq 0 ] + + runc exec test_cgroups_unified cat /sys/fs/cgroup/memory.min + [ "$status" -eq 0 ] + [ "$output" = '131072' ] + + runc exec test_cgroups_unified cat /sys/fs/cgroup/memory.max + [ "$status" -eq 0 ] + [ "$output" = '10485760' ] + + runc exec test_cgroups_unified cat /sys/fs/cgroup/pids.max + [ "$status" -eq 0 ] + [ "$output" = '42' ] + check_systemd_value "TasksMax" 42 + + check_cpu_quota 5000 50000 "100ms" + + check_cpu_weight 42 +} + +@test "runc run (cgroupv2 mount inside container)" { + requires cgroups_v2 + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + set_cgroups_path + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_unified + [ "$status" -eq 0 ] + + # Make sure we don't have any extra cgroups inside + runc exec test_cgroups_unified find /sys/fs/cgroup/ -type d + [ "$status" -eq 0 ] + [ "$(wc -l <<<"$output")" -eq 1 ] +} + +@test "runc exec (cgroup v1+hybrid joins correct cgroup)" { + requires root cgroups_hybrid + + set_cgroups_path + + runc run --pid-file pid.txt -d --console-socket "$CONSOLE_SOCKET" test_cgroups_group + [ "$status" -eq 0 ] + + pid=$(cat pid.txt) + run_cgroup=$(tail -1 "$FREEZER" + + # Start a container. + runc run -d --console-socket "$CONSOLE_SOCKET" ct1 + [ "$status" -eq 1 ] + # A warning should be printed. + [[ "$output" == *"container's cgroup unexpectedly frozen"* ]] + + # Same check for runc create. + runc create --console-socket "$CONSOLE_SOCKET" ct2 + [ "$status" -eq 1 ] + # A warning should be printed. 
+ [[ "$output" == *"container's cgroup unexpectedly frozen"* ]] + + # Cleanup. + rmdir "$FREEZER_DIR" } diff --git a/tests/integration/checkpoint.bats b/tests/integration/checkpoint.bats index 87696df..4b7e442 100644 --- a/tests/integration/checkpoint.bats +++ b/tests/integration/checkpoint.bats @@ -3,346 +3,405 @@ load helpers function setup() { - if [[ -n "${RUNC_USE_SYSTEMD}" ]] ; then - skip "CRIU test suite is skipped on systemd cgroup driver for now." - fi + # XXX: currently criu require root containers. + requires criu root - teardown_busybox - setup_busybox + setup_busybox } function teardown() { - teardown_busybox + teardown_bundle +} + +function setup_pipes() { + # The changes to 'terminal' are needed for running in detached mode + # shellcheck disable=SC2016 + update_config ' (.. | select(.terminal? != null)) .terminal |= false + | (.. | select(.[]? == "sh")) += ["-c", "for i in `seq 10`; do read xxx || continue; echo ponG $xxx; done"]' + + # Create three sets of pipes for __runc run. + # for stderr + exec {pipe}<> <(:) + exec {err_r}/proc/self/fd/$pipe + exec {pipe}>&- + # for stdout + exec {pipe}<> <(:) + exec {out_r}/proc/self/fd/$pipe + exec {pipe}>&- + # for stdin + exec {pipe}<> <(:) + exec {in_r}/proc/self/fd/$pipe + exec {pipe}>&- +} + +function check_pipes() { + local output stderr + + echo Ping >&${in_w} + exec {in_w}>&- + exec {out_w}>&- + exec {err_w}>&- + + exec {in_r}>&- + output=$(cat <&${out_r}) + exec {out_r}>&- + stderr=$(cat <&${err_r}) + exec {err_r}>&- + + [[ "${output}" == *"ponG Ping"* ]] + if [ -n "$stderr" ]; then + fail "runc stderr: $stderr" + fi +} + +# Usage: runc_run_with_pipes container-name +function runc_run_with_pipes() { + # Start a container to be checkpointed, with stdin/stdout redirected + # so that check_pipes can be used to check it's working fine. + # We have to redirect stderr as well because otherwise it is + # redirected to a bats log file, which is not accessible to CRIU + # (i.e. 
outside of container) so checkpointing will fail. + ret=0 + __runc run -d "$1" <&${in_r} >&${out_w} 2>&${err_w} || ret=$? + if [ "$ret" -ne 0 ]; then + echo "runc run -d $1 (status: $ret):" + exec {err_w}>&- + cat <&${err_r} + fail "runc run failed" + fi + + testcontainer "$1" running +} + +# Usage: runc_restore_with_pipes work-dir container-name [optional-arguments ...] +function runc_restore_with_pipes() { + workdir="$1" + shift + name="$1" + shift + + ret=0 + __runc --criu "$CRIU" restore -d --work-path "$workdir" --image-path ./image-dir "$@" "$name" <&${in_r} >&${out_w} 2>&${err_w} || ret=$? + if [ "$ret" -ne 0 ]; then + echo "__runc restore $name failed (status: $ret)" + exec {err_w}>&- + cat <&${err_r} + echo "CRIU log errors (if any):" + grep -B 5 Error "$workdir"/*.log ./image-dir/*.log || true + fail "runc restore failed" + fi + + testcontainer "$name" running + + runc exec --cwd /bin "$name" echo ok + [ "$status" -eq 0 ] + [[ ${output} == "ok" ]] +} + +function simple_cr() { + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + for _ in $(seq 2); do + # checkpoint the running container + runc --criu "$CRIU" "$@" checkpoint --work-path ./work-dir test_busybox + grep -B 5 Error ./work-dir/dump.log || true + [ "$status" -eq 0 ] + + # after checkpoint busybox is no longer running + testcontainer test_busybox checkpointed + + # restore from checkpoint + runc --criu "$CRIU" "$@" restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox + grep -B 5 Error ./work-dir/restore.log || true + [ "$status" -eq 0 ] + + # busybox should be back up and running + testcontainer test_busybox running + done } @test "checkpoint and restore" { - # XXX: currently criu require root containers. 
- requires criu root + simple_cr +} - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] +@test "checkpoint and restore (bind mount, destination is symlink)" { + mkdir -p rootfs/real/conf + ln -s /real/conf rootfs/conf + update_config ' .mounts += [{ + source: ".", + destination: "/conf", + options: ["bind"] + }]' + simple_cr +} - testcontainer test_busybox running +@test "checkpoint and restore (with --debug)" { + simple_cr --debug +} - for i in `seq 2`; do - # checkpoint the running container - runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox - ret=$? - # if you are having problems getting criu to work uncomment the following dump: - #cat /run/opencontainer/containers/test_busybox/criu.work/dump.log - cat ./work-dir/dump.log | grep -B 5 Error || true - [ "$ret" -eq 0 ] +@test "checkpoint and restore (cgroupns)" { + # cgroupv2 already enables cgroupns so this case was tested above already + requires cgroups_v1 cgroupns - # after checkpoint busybox is no longer running - runc state test_busybox - [ "$status" -ne 0 ] + # enable CGROUPNS + update_config '.linux.namespaces += [{"type": "cgroup"}]' - # restore from checkpoint - runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket $CONSOLE_SOCKET test_busybox - ret=$? - cat ./work-dir/restore.log | grep -B 5 Error || true - [ "$ret" -eq 0 ] + simple_cr +} - # busybox should be back up and running - testcontainer test_busybox running - done +@test "checkpoint --pre-dump (bad --parent-path)" { + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + # runc should fail with absolute parent image path. + runc --criu "$CRIU" checkpoint --parent-path "$(pwd)"/parent-dir --work-path ./work-dir --image-path ./image-dir test_busybox + [[ "${output}" == *"--parent-path"* ]] + [ "$status" -ne 0 ] + + # runc should fail with invalid parent image path. 
+ runc --criu "$CRIU" checkpoint --parent-path ./parent-dir --work-path ./work-dir --image-path ./image-dir test_busybox + [[ "${output}" == *"--parent-path"* ]] + [ "$status" -ne 0 ] } @test "checkpoint --pre-dump and restore" { - # XXX: currently criu require root containers. - requires criu root + setup_pipes + runc_run_with_pipes test_busybox - # The changes to 'terminal' are needed for running in detached mode - sed -i 's;"terminal": true;"terminal": false;' config.json - sed -i 's/"sh"/"sh","-c","for i in `seq 10`; do read xxx || continue; echo ponG $xxx; done"/' config.json + #test checkpoint pre-dump + mkdir parent-dir + runc --criu "$CRIU" checkpoint --pre-dump --image-path ./parent-dir test_busybox + [ "$status" -eq 0 ] - # The following code creates pipes for stdin and stdout. - # CRIU can't handle fifo-s, so we need all these tricks. - fifo=`mktemp -u /tmp/runc-fifo-XXXXXX` - mkfifo $fifo + # busybox should still be running + testcontainer test_busybox running - # stdout - cat $fifo | cat $fifo & - pid=$! - exec 50/proc/$pid/fd/0 + # checkpoint the running container + mkdir image-dir + mkdir work-dir + runc --criu "$CRIU" checkpoint --parent-path ../parent-dir --work-path ./work-dir --image-path ./image-dir test_busybox + grep -B 5 Error ./work-dir/dump.log || true + [ "$status" -eq 0 ] - # stdin - cat $fifo | cat $fifo & - pid=$! - exec 60/proc/$pid/fd/0 + # check parent path is valid + [ -e ./image-dir/parent ] - echo -n > $fifo - unlink $fifo + # after checkpoint busybox is no longer running + testcontainer test_busybox checkpointed - # run busybox - __runc run -d test_busybox <&60 >&51 2>&51 - [ $? 
-eq 0 ] - - testcontainer test_busybox running - - #test checkpoint pre-dump - mkdir parent-dir - runc --criu "$CRIU" checkpoint --pre-dump --image-path ./parent-dir test_busybox - [ "$status" -eq 0 ] - - # busybox should still be running - runc state test_busybox - [ "$status" -eq 0 ] - [[ "${output}" == *"running"* ]] - - # checkpoint the running container - mkdir image-dir - mkdir work-dir - runc --criu "$CRIU" checkpoint --parent-path ./parent-dir --work-path ./work-dir --image-path ./image-dir test_busybox - cat ./work-dir/dump.log | grep -B 5 Error || true - [ "$status" -eq 0 ] - - # after checkpoint busybox is no longer running - runc state test_busybox - [ "$status" -ne 0 ] - - # restore from checkpoint - __runc --criu "$CRIU" restore -d --work-path ./work-dir --image-path ./image-dir test_busybox <&60 >&51 2>&51 - ret=$? - cat ./work-dir/restore.log | grep -B 5 Error || true - [ $ret -eq 0 ] - - # busybox should be back up and running - testcontainer test_busybox running - - runc exec --cwd /bin test_busybox echo ok - [ "$status" -eq 0 ] - [[ ${output} == "ok" ]] - - echo Ping >&61 - exec 61>&- - exec 51>&- - run cat <&50 - [ "$status" -eq 0 ] - [[ "${output}" == *"ponG Ping"* ]] + runc_restore_with_pipes ./work-dir test_busybox + check_pipes } @test "checkpoint --lazy-pages and restore" { - # XXX: currently criu require root containers. - requires criu root + # check if lazy-pages is supported + if ! 
"${CRIU}" check --feature uffd-noncoop; then + skip "this criu does not support lazy migration" + fi - # check if lazy-pages is supported - run ${CRIU} check --feature uffd-noncoop - if [ "$status" -eq 1 ]; then - # this criu does not support lazy migration; skip the test - skip "this criu does not support lazy migration" - fi + setup_pipes + runc_run_with_pipes test_busybox - # The changes to 'terminal' are needed for running in detached mode - sed -i 's;"terminal": true;"terminal": false;' config.json - # This should not be necessary: https://github.com/checkpoint-restore/criu/issues/575 - sed -i 's;"readonly": true;"readonly": false;' config.json - sed -i 's/"sh"/"sh","-c","for i in `seq 10`; do read xxx || continue; echo ponG $xxx; done"/' config.json + # checkpoint the running container + mkdir image-dir + mkdir work-dir - # The following code creates pipes for stdin and stdout. - # CRIU can't handle fifo-s, so we need all these tricks. - fifo=`mktemp -u /tmp/runc-fifo-XXXXXX` - mkfifo $fifo + # For lazy migration we need to know when CRIU is ready to serve + # the memory pages via TCP. + exec {pipe}<> <(:) + # shellcheck disable=SC2094 + exec {lazy_r}/proc/self/fd/$pipe + exec {pipe}>&- - # For lazy migration we need to know when CRIU is ready to serve - # the memory pages via TCP. - lazy_pipe=`mktemp -u /tmp/lazy-pipe-XXXXXX` - mkfifo $lazy_pipe + # TCP port for lazy migration + port=27277 - # TCP port for lazy migration - port=27277 + __runc --criu "$CRIU" checkpoint --lazy-pages --page-server 0.0.0.0:${port} --status-fd ${lazy_w} --work-path ./work-dir --image-path ./image-dir test_busybox & + cpt_pid=$! - # stdout - cat $fifo | cat $fifo & - pid=$! 
- exec 50/proc/$pid/fd/0 + # wait for lazy page server to be ready + out=$(timeout 2 dd if=/proc/self/fd/${lazy_r} bs=1 count=1 2>/dev/null | od) + exec {lazy_r}>&- + exec {lazy_w}>&- + # shellcheck disable=SC2116,SC2086 + out=$(echo $out) # rm newlines + # show errors if there are any before we fail + grep -B5 Error ./work-dir/dump.log || true + # expecting \0 which od prints as + [ "$out" = "0000000 000000 0000001" ] - # stdin - cat $fifo | cat $fifo & - pid=$! - exec 60/proc/$pid/fd/0 + # Check if inventory.img was written + [ -e image-dir/inventory.img ] - echo -n > $fifo - unlink $fifo + # Start CRIU in lazy-daemon mode + ${CRIU} lazy-pages --page-server --address 127.0.0.1 --port ${port} -D image-dir & + lp_pid=$! - # run busybox - __runc run -d test_busybox <&60 >&51 2>&51 - [ $? -eq 0 ] + # Restore lazily from checkpoint. + # The restored container needs a different name (as well as systemd + # unit name, in case systemd cgroup driver is used) as the checkpointed + # container is not yet destroyed. It is only destroyed at that point + # in time when the last page is lazily transferred to the destination. + # Killing the CRIU on the checkpoint side will let the container + # continue to run if the migration failed at some point. + [ -n "$RUNC_USE_SYSTEMD" ] && set_cgroups_path + runc_restore_with_pipes ./image-dir test_busybox_restore --lazy-pages - testcontainer test_busybox running + wait $cpt_pid - # checkpoint the running container - mkdir image-dir - mkdir work-dir - # Double fork taken from helpers.bats - # We need to start 'runc checkpoint --lazy-pages' in the background, - # so we double fork in the shell. - (runc --criu "$CRIU" checkpoint --lazy-pages --page-server 0.0.0.0:${port} --status-fd ${lazy_pipe} --work-path ./work-dir --image-path ./image-dir test_busybox & ) & - # Sleeping here. This is ugly, but not sure how else to handle it. - # The return code of the in the background running runc is needed, if - # there is some basic error. 
If the lazy migration is ready can - # be handled by $lazy_pipe. Which probably will always be ready - # after sleeping two seconds. - sleep 2 - # Check if inventory.img was written - [ -e image-dir/inventory.img ] - # If the inventory.img exists criu checkpointed some things, let's see - # if there were other errors in the log file. - run grep -B 5 Error ./work-dir/dump.log -q - [ "$status" -eq 1 ] + wait $lp_pid - # This will block until CRIU is ready to serve memory pages - cat $lazy_pipe - [ "$status" -eq 1 ] - - unlink $lazy_pipe - - # Double fork taken from helpers.bats - # We need to start 'criu lazy-pages' in the background, - # so we double fork in the shell. - # Start CRIU in lazy-daemon mode - $(${CRIU} lazy-pages --page-server --address 127.0.0.1 --port ${port} -D image-dir &) & - - # Restore lazily from checkpoint. - # The restored container needs a different name as the checkpointed - # container is not yet destroyed. It is only destroyed at that point - # in time when the last page is lazily transferred to the destination. - # Killing the CRIU on the checkpoint side will let the container - # continue to run if the migration failed at some point. - __runc --criu "$CRIU" restore -d --work-path ./image-dir --image-path ./image-dir --lazy-pages test_busybox_restore <&60 >&51 2>&51 - ret=$? - [ $ret -eq 0 ] - run grep -B 5 Error ./work-dir/dump.log -q - [ "$status" -eq 1 ] - - # busybox should be back up and running - testcontainer test_busybox_restore running - - runc exec --cwd /bin test_busybox_restore echo ok - [ "$status" -eq 0 ] - [[ ${output} == "ok" ]] - - echo Ping >&61 - exec 61>&- - exec 51>&- - run cat <&50 - [ "$status" -eq 0 ] - [[ "${output}" == *"ponG Ping"* ]] + check_pipes } @test "checkpoint and restore in external network namespace" { - # XXX: currently criu require root containers. - requires criu root + # check if external_net_ns is supported; only with criu 3.10++ + if ! 
"${CRIU}" check --feature external_net_ns; then + # this criu does not support external_net_ns; skip the test + skip "this criu does not support external network namespaces" + fi - # check if external_net_ns is supported; only with criu 3.10++ - run ${CRIU} check --feature external_net_ns - if [ "$status" -eq 1 ]; then - # this criu does not support external_net_ns; skip the test - skip "this criu does not support external network namespaces" - fi + # create a temporary name for the test network namespace + tmp=$(mktemp) + rm -f "$tmp" + ns_name=$(basename "$tmp") + # create network namespace + ip netns add "$ns_name" + ns_path=$(ip netns add "$ns_name" 2>&1 | sed -e 's/.*"\(.*\)".*/\1/') + # shellcheck disable=SC2012 + ns_inode=$(ls -iL "$ns_path" | awk '{ print $1 }') - # create a temporary name for the test network namespace - tmp=`mktemp` - rm -f $tmp - ns_name=`basename $tmp` - # create network namespace - ip netns add $ns_name - ns_path=`ip netns add $ns_name 2>&1 | sed -e 's/.*"\(.*\)".*/\1/'` + # tell runc which network namespace to use + update_config '(.. | select(.type? 
== "network")) .path |= "'"$ns_path"'"' - ns_inode=`ls -iL $ns_path | awk '{ print $1 }'` + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - # tell runc which network namespace to use - sed -i "s;\"type\": \"network\";\"type\": \"network\",\"path\": \"$ns_path\";" config.json + testcontainer test_busybox running - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + for _ in $(seq 2); do + # checkpoint the running container; this automatically tells CRIU to + # handle the network namespace defined in config.json as an external + runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox + grep -B 5 Error ./work-dir/dump.log || true + [ "$status" -eq 0 ] - testcontainer test_busybox running + # after checkpoint busybox is no longer running + testcontainer test_busybox checkpointed - for i in `seq 2`; do - # checkpoint the running container; this automatically tells CRIU to - # handle the network namespace defined in config.json as an external - runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox - ret=$? - # if you are having problems getting criu to work uncomment the following dump: - #cat /run/opencontainer/containers/test_busybox/criu.work/dump.log - cat ./work-dir/dump.log | grep -B 5 Error || true - [ "$ret" -eq 0 ] + # restore from checkpoint; this should restore the container into the existing network namespace + runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox + grep -B 5 Error ./work-dir/restore.log || true + [ "$status" -eq 0 ] - # after checkpoint busybox is no longer running - runc state test_busybox - [ "$status" -ne 0 ] + # busybox should be back up and running + testcontainer test_busybox running - # restore from checkpoint; this should restore the container into the existing network namespace - runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket $CONSOLE_SOCKET test_busybox - ret=$? 
- cat ./work-dir/restore.log | grep -B 5 Error || true - [ "$ret" -eq 0 ] - - # busybox should be back up and running - testcontainer test_busybox running - - # container should be running in same network namespace as before - pid=`__runc state test_busybox | jq '.pid'` - ns_inode_new=`readlink /proc/$pid/ns/net | sed -e 's/.*\[\(.*\)\]/\1/'` - echo "old network namespace inode $ns_inode" - echo "new network namespace inode $ns_inode_new" - [ "$ns_inode" -eq "$ns_inode_new" ] - done - ip netns del $ns_name + # container should be running in same network namespace as before + pid=$(__runc state test_busybox | jq '.pid') + ns_inode_new=$(readlink /proc/"$pid"/ns/net | sed -e 's/.*\[\(.*\)\]/\1/') + echo "old network namespace inode $ns_inode" + echo "new network namespace inode $ns_inode_new" + [ "$ns_inode" -eq "$ns_inode_new" ] + done + ip netns del "$ns_name" } @test "checkpoint and restore with container specific CRIU config" { - # XXX: currently criu require root containers. - requires criu root + tmp=$(mktemp /tmp/runc-criu-XXXXXX.conf) + # This is the file we write to /etc/criu/default.conf + tmplog1=$(mktemp /tmp/runc-criu-log-XXXXXX.log) + unlink "$tmplog1" + tmplog1=$(basename "$tmplog1") + # That is the actual configuration file to be used + tmplog2=$(mktemp /tmp/runc-criu-log-XXXXXX.log) + unlink "$tmplog2" + tmplog2=$(basename "$tmplog2") + # This adds the annotation 'org.criu.config' to set a container + # specific CRIU config file. + update_config '.annotations += {"org.criu.config": "'"$tmp"'"}' - tmp=`mktemp /tmp/runc-criu-XXXXXX.conf` - # This is the file we write to /etc/criu/default.conf - tmplog1=`mktemp /tmp/runc-criu-log-XXXXXX.log` - unlink $tmplog1 - tmplog1=`basename $tmplog1` - # That is the actual configuration file to be used - tmplog2=`mktemp /tmp/runc-criu-log-XXXXXX.log` - unlink $tmplog2 - tmplog2=`basename $tmplog2` - # This adds the annotation 'org.criu.config' to set a container - # specific CRIU config file. 
- sed -i "s;\"process\";\"annotations\":{\"org.criu.config\": \"$tmp\"},\"process\";" config.json - # Tell CRIU to use another configuration file - mkdir -p /etc/criu - echo "log-file=$tmplog1" > /etc/criu/default.conf - # Make sure the RPC defined configuration file overwrites the previous - echo "log-file=$tmplog2" > $tmp + # Tell CRIU to use another configuration file + mkdir -p /etc/criu + echo "log-file=$tmplog1" >/etc/criu/default.conf + # Make sure the RPC defined configuration file overwrites the previous + echo "log-file=$tmplog2" >"$tmp" - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox running + testcontainer test_busybox running - # checkpoint the running container - runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox - [ "$status" -eq 0 ] - ! test -f ./work-dir/$tmplog1 - test -f ./work-dir/$tmplog2 + # checkpoint the running container + runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox + grep -B 5 Error ./work-dir/dump.log || true + [ "$status" -eq 0 ] + ! test -f ./work-dir/"$tmplog1" + test -f ./work-dir/"$tmplog2" - # after checkpoint busybox is no longer running - runc state test_busybox - [ "$status" -ne 0 ] + # after checkpoint busybox is no longer running + testcontainer test_busybox checkpointed - test -f ./work-dir/$tmplog2 && unlink ./work-dir/$tmplog2 - # restore from checkpoint - runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] - ! test -f ./work-dir/$tmplog1 - test -f ./work-dir/$tmplog2 + test -f ./work-dir/"$tmplog2" && unlink ./work-dir/"$tmplog2" + # restore from checkpoint + runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox + grep -B 5 Error ./work-dir/restore.log || true + [ "$status" -eq 0 ] + ! 
test -f ./work-dir/"$tmplog1" + test -f ./work-dir/"$tmplog2" - # busybox should be back up and running - testcontainer test_busybox running - unlink $tmp - test -f ./work-dir/$tmplog2 && unlink ./work-dir/$tmplog2 + # busybox should be back up and running + testcontainer test_busybox running + unlink "$tmp" + test -f ./work-dir/"$tmplog2" && unlink ./work-dir/"$tmplog2" } +@test "checkpoint and restore with nested bind mounts" { + bind1=$(mktemp -d -p .) + bind2=$(mktemp -d -p .) + update_config ' .mounts += [{ + type: "bind", + source: "'"$bind1"'", + destination: "/test", + options: ["rw", "bind"] + }, + { + type: "bind", + source: "'"$bind2"'", + destination: "/test/for/nested", + options: ["rw", "bind"] + }]' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + # checkpoint the running container + runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox + grep -B 5 Error ./work-dir/dump.log || true + [ "$status" -eq 0 ] + + # after checkpoint busybox is no longer running + testcontainer test_busybox checkpointed + + # cleanup mountpoints created by runc during creation + # the mountpoints should be recreated during restore - that is the actual thing tested here + rm -rf "${bind1:?}"/* + + # restore from checkpoint + runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox + grep -B 5 Error ./work-dir/restore.log || true + [ "$status" -eq 0 ] + + # busybox should be back up and running + testcontainer test_busybox running +} diff --git a/tests/integration/create.bats b/tests/integration/create.bats index abd4da2..afa9f9d 100644 --- a/tests/integration/create.bats +++ b/tests/integration/create.bats @@ -3,87 +3,81 @@ load helpers function setup() { - teardown_busybox - setup_busybox + setup_busybox } function teardown() { - teardown_busybox + teardown_bundle } @test "runc create" { - runc create --console-socket $CONSOLE_SOCKET 
test_busybox - [ "$status" -eq 0 ] + runc create --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox created + testcontainer test_busybox created - # start the command - runc start test_busybox - [ "$status" -eq 0 ] + # start the command + runc start test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox running + testcontainer test_busybox running } @test "runc create exec" { - runc create --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + runc create --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox created + testcontainer test_busybox created - runc exec test_busybox true - [ "$status" -eq 0 ] + runc exec test_busybox true + [ "$status" -eq 0 ] - testcontainer test_busybox created + testcontainer test_busybox created - # start the command - runc start test_busybox - [ "$status" -eq 0 ] + # start the command + runc start test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox running + testcontainer test_busybox running } @test "runc create --pid-file" { - runc create --pid-file pid.txt --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + runc create --pid-file pid.txt --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox created + testcontainer test_busybox created - # check pid.txt was generated - [ -e pid.txt ] + # check pid.txt was generated + [ -e pid.txt ] - run cat pid.txt - [ "$status" -eq 0 ] - [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]] + [[ $(cat pid.txt) == $(__runc state test_busybox | jq '.pid') ]] - # start the command - runc start test_busybox - [ "$status" -eq 0 ] + # start the command + runc start test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox running + testcontainer test_busybox running } @test "runc create --pid-file with new CWD" { - # create pid_file directory as the CWD - run mkdir pid_file - [ "$status" -eq 0 ] - 
run cd pid_file - [ "$status" -eq 0 ] + bundle="$(pwd)" + # create pid_file directory as the CWD + mkdir pid_file + cd pid_file - runc create --pid-file pid.txt -b $BUSYBOX_BUNDLE --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + runc create --pid-file pid.txt -b "$bundle" --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox created + testcontainer test_busybox created - # check pid.txt was generated - [ -e pid.txt ] + # check pid.txt was generated + [ -e pid.txt ] - run cat pid.txt - [ "$status" -eq 0 ] - [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]] + [[ $(cat pid.txt) == $(__runc state test_busybox | jq '.pid') ]] - # start the command - runc start test_busybox - [ "$status" -eq 0 ] + # start the command + runc start test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox running + testcontainer test_busybox running } diff --git a/tests/integration/cwd.bats b/tests/integration/cwd.bats new file mode 100644 index 0000000..114efb9 --- /dev/null +++ b/tests/integration/cwd.bats @@ -0,0 +1,74 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + setup_busybox +} + +function teardown() { + teardown_bundle +} + +# Test case for https://github.com/opencontainers/runc/pull/2086 +@test "runc exec --user with no access to cwd" { + requires root + + chown 42 rootfs/root + chmod 700 rootfs/root + + update_config ' .process.cwd = "/root" + | .process.user.uid = 42 + | .process.args |= ["sleep", "1h"]' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec --user 0 test_busybox true + [ "$status" -eq 0 ] +} + +# Verify a cwd owned by the container user can be chdir'd to, +# even if runc doesn't have the privilege to do so. +@test "runc create sets up user before chdir to cwd if needed" { + requires rootless rootless_idmap + + # Some setup for this test (AUX_DIR and AUX_UID) is done + # by rootless.sh. Check that setup is done... 
+ if [[ ! -d "$AUX_DIR" || -z "$AUX_UID" ]]; then + skip "bad/unset AUX_DIR/AUX_UID" + fi + # ... and is correct, i.e. the current user + # does not have permission to access AUX_DIR. + if ls -l "$AUX_DIR" 2>/dev/null; then + skip "bad AUX_DIR permissions" + fi + + update_config ' .mounts += [{ + source: "'"$AUX_DIR"'", + destination: "'"$AUX_DIR"'", + options: ["bind"] + }] + | .process.user.uid = '"$AUX_UID"' + | .process.cwd = "'"$AUX_DIR"'" + | .process.args |= ["ls", "'"$AUX_DIR"'"]' + + runc run test_busybox + [ "$status" -eq 0 ] +} + +# Verify a cwd not owned by the container user can be chdir'd to, +# if runc does have the privilege to do so. +@test "runc create can chdir if runc has access" { + requires root + + mkdir -p rootfs/home/nonroot + chmod 700 rootfs/home/nonroot + + update_config ' .process.cwd = "/root" + | .process.user.uid = 42 + | .process.args |= ["ls", "/tmp"]' + + runc run test_busybox + [ "$status" -eq 0 ] +} diff --git a/tests/integration/debug.bats b/tests/integration/debug.bats index e02cf4a..333745e 100644 --- a/tests/integration/debug.bats +++ b/tests/integration/debug.bats @@ -3,79 +3,70 @@ load helpers function setup() { - teardown_hello - setup_hello + setup_hello } function teardown() { - teardown_hello + teardown_bundle +} + +function check_debug() { + [[ "$*" == *"nsexec container setup"* ]] + [[ "$*" == *"child process in init()"* ]] + [[ "$*" == *"init: closing the pipe to signal completion"* ]] } @test "global --debug" { - # run hello-world - runc --debug run test_hello - echo "${output}" - [ "$status" -eq 0 ] + # run hello-world + runc --debug run test_hello + [ "$status" -eq 0 ] - # check expected debug output was sent to stderr - [[ "${output}" == *"level=debug"* ]] - [[ "${output}" == *"nsexec started"* ]] - [[ "${output}" == *"child process in init()"* ]] + # check expected debug output was sent to stderr + [[ "${output}" == *"level=debug"* ]] + check_debug "$output" } @test "global --debug to --log" { - # run 
hello-world - runc --log log.out --debug run test_hello - [ "$status" -eq 0 ] + # run hello-world + runc --log log.out --debug run test_hello + [ "$status" -eq 0 ] - # check output does not include debug info - [[ "${output}" != *"level=debug"* ]] + # check output does not include debug info + [[ "${output}" != *"level=debug"* ]] - # check log.out was generated - [ -e log.out ] - - # check expected debug output was sent to log.out - run cat log.out - [ "$status" -eq 0 ] - [[ "${output}" == *"level=debug"* ]] - [[ "${output}" == *"nsexec started"* ]] - [[ "${output}" == *"child process in init()"* ]] + cat log.out >&2 + # check expected debug output was sent to log.out + output=$(cat log.out) + [[ "${output}" == *"level=debug"* ]] + check_debug "$output" } @test "global --debug to --log --log-format 'text'" { - # run hello-world - runc --log log.out --log-format "text" --debug run test_hello - [ "$status" -eq 0 ] + # run hello-world + runc --log log.out --log-format "text" --debug run test_hello + [ "$status" -eq 0 ] - # check output does not include debug info - [[ "${output}" != *"level=debug"* ]] + # check output does not include debug info + [[ "${output}" != *"level=debug"* ]] - # check log.out was generated - [ -e log.out ] - - # check expected debug output was sent to log.out - run cat log.out - [ "$status" -eq 0 ] - [[ "${output}" == *"level=debug"* ]] - [[ "${output}" == *"nsexec started"* ]] - [[ "${output}" == *"child process in init()"* ]] + cat log.out >&2 + # check expected debug output was sent to log.out + output=$(cat log.out) + [[ "${output}" == *"level=debug"* ]] + check_debug "$output" } @test "global --debug to --log --log-format 'json'" { - # run hello-world - runc --log log.out --log-format "json" --debug run test_hello - [ "$status" -eq 0 ] + # run hello-world + runc --log log.out --log-format "json" --debug run test_hello + [ "$status" -eq 0 ] - # check output does not include debug info - [[ "${output}" != *"level=debug"* ]] + # check 
output does not include debug info + [[ "${output}" != *"level=debug"* ]] - # check log.out was generated - [ -e log.out ] - - # check expected debug output was sent to log.out - run cat log.out - [ "$status" -eq 0 ] - [[ "${output}" == *'"level":"debug"'* ]] - [[ "${output}" == *"nsexec started"* ]] - [[ "${output}" == *"child process in init()"* ]] + cat log.out >&2 + # check expected debug output was sent to log.out + output=$(cat log.out) + [[ "${output}" == *'"level":"debug"'* ]] + check_debug "$output" } diff --git a/tests/integration/delete.bats b/tests/integration/delete.bats index c5ed215..ea8a3bb 100644 --- a/tests/integration/delete.bats +++ b/tests/integration/delete.bats @@ -3,51 +3,156 @@ load helpers function setup() { - teardown_busybox - setup_busybox + setup_busybox } function teardown() { - teardown_busybox + teardown_bundle } @test "runc delete" { - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + runc run -d --console-socket "$CONSOLE_SOCKET" testbusyboxdelete + [ "$status" -eq 0 ] - # check state - testcontainer test_busybox running + testcontainer testbusyboxdelete running - runc kill test_busybox KILL - [ "$status" -eq 0 ] - # wait for busybox to be in the destroyed state - retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" + runc kill testbusyboxdelete KILL + [ "$status" -eq 0 ] + wait_for_container 10 1 testbusyboxdelete stopped - # delete test_busybox - runc delete test_busybox - [ "$status" -eq 0 ] + runc delete testbusyboxdelete + [ "$status" -eq 0 ] - runc state test_busybox - [ "$status" -ne 0 ] + runc state testbusyboxdelete + [ "$status" -ne 0 ] + + output=$(find /sys/fs/cgroup -wholename '*testbusyboxdelete*' -type d) + [ "$output" = "" ] || fail "cgroup not cleaned up correctly: $output" } @test "runc delete --force" { - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run 
-d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - # check state - testcontainer test_busybox running + # check state + testcontainer test_busybox running - # force delete test_busybox - runc delete --force test_busybox + # force delete test_busybox + runc delete --force test_busybox - runc state test_busybox - [ "$status" -ne 0 ] + runc state test_busybox + [ "$status" -ne 0 ] } @test "runc delete --force ignore not exist" { - runc delete --force notexists - [ "$status" -eq 0 ] + runc delete --force notexists + [ "$status" -eq 0 ] +} + +@test "runc delete --force [paused container]" { + runc run -d --console-socket "$CONSOLE_SOCKET" ct1 + [ "$status" -eq 0 ] + testcontainer ct1 running + + runc pause ct1 + runc delete --force ct1 + [ "$status" -eq 0 ] +} + +@test "runc delete --force in cgroupv1 with subcgroups" { + requires cgroups_v1 root cgroupns + set_cgroups_path + set_cgroup_mount_writable + # enable cgroupns + update_config '.linux.namespaces += [{"type": "cgroup"}]' + + local subsystems="memory freezer" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + __runc exec -d test_busybox sleep 1d + + # find the pid of sleep + pid=$(__runc exec test_busybox ps -a | grep 1d | awk '{print $1}') + [[ ${pid} =~ [0-9]+ ]] + + # create a sub-cgroup + cat < tasks + cat tasks +done +EOF + [ "$status" -eq 0 ] + [[ "$output" =~ [0-9]+ ]] + + for s in ${subsystems}; do + name=CGROUP_${s^^}_BASE_PATH + eval path=\$"${name}${REL_CGROUPS_PATH}/foo" + # shellcheck disable=SC2154 + [ -d "${path}" ] || fail "test failed to create memory sub-cgroup ($path not found)" + done + + runc delete --force test_busybox + + runc state test_busybox + [ "$status" -ne 0 ] + + output=$(find /sys/fs/cgroup -wholename '*testbusyboxdelete*' -type d) + [ "$output" = "" ] || fail "cgroup not cleaned up correctly: $output" +} + +@test "runc delete --force in cgroupv2 with subcgroups" { + requires 
cgroups_v2 root + set_cgroups_path + set_cgroup_mount_writable + + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + # create a sub process + __runc exec -d test_busybox sleep 1d + + # find the pid of sleep + pid=$(__runc exec test_busybox ps -a | grep 1d | awk '{print $1}') + [[ ${pid} =~ [0-9]+ ]] + + # create subcgroups + cat <nest.sh + set -e -u -x + cd /sys/fs/cgroup + echo +pids > cgroup.subtree_control + mkdir foo + cd foo + echo threaded > cgroup.type + echo ${pid} > cgroup.threads + cat cgroup.threads +EOF + runc exec test_busybox sh events.log) & - ( - retry 10 1 eval "grep -q 'test_busybox' events.log" - teardown_running_container test_busybox - ) & - wait # wait for the above sub shells to finish + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - [ -e events.log ] + # Spawn two subshells: + # 1. Event logger that sends stats events to events.log. + (__runc events ${interval:+ --interval "$interval"} test_busybox >events.log) & + # 2. Waits for an event that includes test_busybox then kills the + # test_busybox container which causes the event logger to exit. + ( + retry 10 "$retry_every" grep -q test_busybox events.log + __runc delete -f test_busybox + ) & + wait # for both subshells to finish - run cat events.log - [ "$status" -eq 0 ] - [[ "${lines[0]}" == [\{]"\"type\""[:]"\"stats\""[,]"\"id\""[:]"\"test_busybox\""[,]* ]] - [[ "${lines[0]}" == *"data"* ]] + [ -e events.log ] + + output=$(head -1 events.log) + [[ "$output" == [\{]"\"type\""[:]"\"stats\""[,]"\"id\""[:]"\"test_busybox\""[,]* ]] + [[ "$output" == *"data"* ]] } -@test "events --interval 1s " { - # XXX: currently cgroups require root containers. 
- requires root - - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] - - # spawn two sub processes (shells) - # the first sub process is an event logger that sends stats events to events.log once a second - # the second sub process tries 3 times for an event that incudes test_busybox - # pausing 1s between each attempt then kills the test_busybox container which - # causes the event logger to exit - (__runc events --interval 1s test_busybox > events.log) & - ( - retry 3 1 eval "grep -q 'test_busybox' events.log" - teardown_running_container test_busybox - ) & - wait # wait for the above sub shells to finish - - [ -e events.log ] - - run eval "grep -q 'test_busybox' events.log" - [ "$status" -eq 0 ] +@test "events --interval default" { + test_events } -@test "events --interval 100ms " { - # XXX: currently cgroups require root containers. - requires root - - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] - - #prove there is no carry over of events.log from a prior test - [ ! -e events.log ] - - # spawn two sub processes (shells) - # the first sub process is an event logger that sends stats events to events.log once every 100ms - # the second sub process tries 3 times for an event that incudes test_busybox - # pausing 100s between each attempt then kills the test_busybox container which - # causes the event logger to exit - (__runc events --interval 100ms test_busybox > events.log) & - ( - retry 3 0.100 eval "grep -q 'test_busybox' events.log" - teardown_running_container test_busybox - ) & - wait # wait for the above sub shells to finish - - [ -e events.log ] - - run eval "grep -q 'test_busybox' events.log" - [ "$status" -eq 0 ] +@test "events --interval 1s" { + test_events 1s 1 +} + +@test "events --interval 100ms" { + test_events 100ms 0.1 +} + +@test "events oom" { + # XXX: currently cgroups require root containers. 
+ requires root cgroups_swap + init_cgroup_paths + + # we need the container to hit OOM, so disable swap + update_config '(.. | select(.resources? != null)) .resources.memory |= {"limit": 33554432, "swap": 33554432}' + + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # spawn two sub processes (shells) + # the first sub process is an event logger that sends stats events to events.log + # the second sub process execs a memory hog process to cause an oom condition + # and waits for an oom event + (__runc events test_busybox >events.log) & + ( + retry 10 1 grep -q test_busybox events.log + # shellcheck disable=SC2016 + __runc exec -d test_busybox sh -c 'test=$(dd if=/dev/urandom ibs=5120k)' + retry 10 1 grep -q oom events.log + __runc delete -f test_busybox + ) & + wait # wait for the above sub shells to finish + + grep -q '{"type":"oom","id":"test_busybox"}' events.log } diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats index 19647c1..24444d1 100644 --- a/tests/integration/exec.bats +++ b/tests/integration/exec.bats @@ -3,138 +3,298 @@ load helpers function setup() { - teardown_busybox - setup_busybox + setup_busybox } function teardown() { - teardown_busybox + teardown_bundle } @test "runc exec" { - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - runc exec test_busybox echo Hello from exec - [ "$status" -eq 0 ] - echo text echoed = "'""${output}""'" - [[ "${output}" == *"Hello from exec"* ]] + runc exec test_busybox echo Hello from exec + [ "$status" -eq 0 ] + echo text echoed = "'""${output}""'" + [[ "${output}" == *"Hello from exec"* ]] +} + +@test "runc exec [exit codes]" { + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox false + [ "$status" -eq 1 ] + + 
runc exec test_busybox sh -c "exit 42" + [ "$status" -eq 42 ] + + runc exec --pid-file /non-existent/directory test_busybox true + [ "$status" -eq 255 ] + + runc exec test_busybox no-such-binary + [ "$status" -eq 255 ] + + runc exec no_such_container true + [ "$status" -eq 255 ] } @test "runc exec --pid-file" { - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - runc exec --pid-file pid.txt test_busybox echo Hello from exec - [ "$status" -eq 0 ] - echo text echoed = "'""${output}""'" - [[ "${output}" == *"Hello from exec"* ]] + runc exec --pid-file pid.txt test_busybox echo Hello from exec + [ "$status" -eq 0 ] + echo text echoed = "'""${output}""'" + [[ "${output}" == *"Hello from exec"* ]] - # check pid.txt was generated - [ -e pid.txt ] + # check pid.txt was generated + [ -e pid.txt ] - run cat pid.txt - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ [0-9]+ ]] - [[ ${lines[0]} != $(__runc state test_busybox | jq '.pid') ]] + output=$(cat pid.txt) + [[ "$output" =~ [0-9]+ ]] + [[ "$output" != $(__runc state test_busybox | jq '.pid') ]] } @test "runc exec --pid-file with new CWD" { - # create pid_file directory as the CWD - run mkdir pid_file - [ "$status" -eq 0 ] - run cd pid_file - [ "$status" -eq 0 ] + bundle="$(pwd)" + # create pid_file directory as the CWD + mkdir pid_file + cd pid_file - # run busybox detached - runc run -d -b $BUSYBOX_BUNDLE --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d -b "$bundle" --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - runc exec --pid-file pid.txt test_busybox echo Hello from exec - [ "$status" -eq 0 ] - echo text echoed = "'""${output}""'" - [[ "${output}" == *"Hello from exec"* ]] + runc exec --pid-file pid.txt test_busybox echo Hello from exec + [ "$status" -eq 0 ] + echo text echoed = 
"'""${output}""'" + [[ "${output}" == *"Hello from exec"* ]] - # check pid.txt was generated - [ -e pid.txt ] + # check pid.txt was generated + [ -e pid.txt ] - run cat pid.txt - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ [0-9]+ ]] - [[ ${lines[0]} != $(__runc state test_busybox | jq '.pid') ]] + output=$(cat pid.txt) + [[ "$output" =~ [0-9]+ ]] + [[ "$output" != $(__runc state test_busybox | jq '.pid') ]] } @test "runc exec ls -la" { - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - runc exec test_busybox ls -la - [ "$status" -eq 0 ] - [[ ${lines[0]} == *"total"* ]] - [[ ${lines[1]} == *"."* ]] - [[ ${lines[2]} == *".."* ]] + runc exec test_busybox ls -la + [ "$status" -eq 0 ] + [[ ${lines[0]} == *"total"* ]] + [[ ${lines[1]} == *"."* ]] + [[ ${lines[2]} == *".."* ]] } @test "runc exec ls -la with --cwd" { - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - runc exec --cwd /bin test_busybox pwd - [ "$status" -eq 0 ] - [[ ${output} == "/bin"* ]] + runc exec --cwd /bin test_busybox pwd + [ "$status" -eq 0 ] + [[ ${output} == "/bin"* ]] } @test "runc exec --env" { - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - runc exec --env RUNC_EXEC_TEST=true test_busybox env - [ "$status" -eq 0 ] + runc exec --env RUNC_EXEC_TEST=true test_busybox env + [ "$status" -eq 0 ] - [[ ${output} == *"RUNC_EXEC_TEST=true"* ]] + [[ ${output} == *"RUNC_EXEC_TEST=true"* ]] } @test "runc exec --user" { - # --user can't work in rootless containers that don't have idmap. 
- [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + # --user can't work in rootless containers that don't have idmap. + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - runc exec --user 1000:1000 test_busybox id - [ "$status" -eq 0 ] + runc exec --user 1000:1000 test_busybox id + [ "$status" -eq 0 ] - [[ "${output}" == "uid=1000 gid=1000"* ]] + [[ "${output}" == "uid=1000 gid=1000"* ]] } @test "runc exec --additional-gids" { - requires root + requires root - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - wait_for_container 15 1 test_busybox + wait_for_container 15 1 test_busybox - runc exec --user 1000:1000 --additional-gids 100 --additional-gids 65534 test_busybox id - [ "$status" -eq 0 ] + runc exec --user 1000:1000 --additional-gids 100 --additional-gids 65534 test_busybox id -G + [ "$status" -eq 0 ] - [[ ${output} == "uid=1000 gid=1000 groups=100(users),65534(nogroup)" ]] + [[ ${output} == "1000 100 65534" ]] } @test "runc exec --preserve-fds" { - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - run bash -c "cat hello > preserve-fds.test; exec 3preserve-fds.test + # fd 3 is used by bats, so we use 4 + exec 4&2 + # check expected debug output was sent to log.out + output=$(cat log.out) + [[ "${output}" == *"level=debug"* ]] + check_exec_debug "$output" +} + +@test "runc exec --cgroup sub-cgroups [v1]" { + requires root cgroups_v1 + + set_cgroups_path + set_cgroup_mount_writable + + __runc run -d --console-socket 
"$CONSOLE_SOCKET" test_busybox + testcontainer test_busybox running + + # Check we can't join non-existing subcgroup. + runc exec --cgroup nonexistent test_busybox cat /proc/self/cgroup + [ "$status" -ne 0 ] + [[ "$output" == *" adding pid "*"/nonexistent/cgroup.procs: no such file "* ]] + + # Check we can't join non-existing subcgroup (for a particular controller). + runc exec --cgroup cpu:nonexistent test_busybox cat /proc/self/cgroup + [ "$status" -ne 0 ] + [[ "$output" == *" adding pid "*"/nonexistent/cgroup.procs: no such file "* ]] + + # Check we can't specify non-existent controller. + runc exec --cgroup whaaat:/ test_busybox true + [ "$status" -ne 0 ] + [[ "$output" == *"unknown controller "* ]] + + # Check we can join top-level cgroup (implicit). + runc exec test_busybox cat /proc/self/cgroup + [ "$status" -eq 0 ] + ! grep -v ":$REL_CGROUPS_PATH\$" <<<"$output" + + # Check we can join top-level cgroup (explicit). + runc exec --cgroup / test_busybox cat /proc/self/cgroup + [ "$status" -eq 0 ] + ! grep -v ":$REL_CGROUPS_PATH\$" <<<"$output" + + # Create a few subcgroups. + # Note that cpu,cpuacct may be mounted together or separate. + runc exec test_busybox sh -euc "mkdir -p /sys/fs/cgroup/memory/submem /sys/fs/cgroup/cpu/subcpu /sys/fs/cgroup/cpuacct/subcpu" + [ "$status" -eq 0 ] + + # Check that explicit --cgroup works. + runc exec --cgroup memory:submem --cgroup cpu,cpuacct:subcpu test_busybox cat /proc/self/cgroup + [ "$status" -eq 0 ] + [[ "$output" == *":memory:$REL_CGROUPS_PATH/submem"* ]] + [[ "$output" == *":cpu"*":$REL_CGROUPS_PATH/subcpu"* ]] +} + +@test "runc exec --cgroup subcgroup [v2]" { + requires root cgroups_v2 + + set_cgroups_path + set_cgroup_mount_writable + + __runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + testcontainer test_busybox running + + # Check we can't join non-existing subcgroup. 
+ runc exec --cgroup nonexistent test_busybox cat /proc/self/cgroup + [ "$status" -ne 0 ] + [[ "$output" == *" adding pid "*"/nonexistent/cgroup.procs: no such file "* ]] + + # Check we can join top-level cgroup (implicit). + runc exec test_busybox grep '^0::/$' /proc/self/cgroup + [ "$status" -eq 0 ] + + # Check we can join top-level cgroup (explicit). + runc exec --cgroup / test_busybox grep '^0::/$' /proc/self/cgroup + [ "$status" -eq 0 ] + + # Now move "init" to a subcgroup, and check it was moved. + runc exec test_busybox sh -euc "mkdir /sys/fs/cgroup/foobar \ + && echo 1 > /sys/fs/cgroup/foobar/cgroup.procs \ + && grep -w foobar /proc/1/cgroup" + [ "$status" -eq 0 ] + + # The following part is taken from + # @test "runc exec (cgroup v2 + init process in non-root cgroup) succeeds" + + # The init process is now in "/foo", but an exec process can still + # join "/" because we haven't enabled any domain controller yet. + runc exec test_busybox grep '^0::/$' /proc/self/cgroup + [ "$status" -eq 0 ] + + # Turn on a domain controller (memory). + runc exec test_busybox sh -euc 'echo $$ > /sys/fs/cgroup/foobar/cgroup.procs; echo +memory > /sys/fs/cgroup/cgroup.subtree_control' + [ "$status" -eq 0 ] + + # An exec process can no longer join "/" after turning on a domain + # controller. Check that cgroup v2 fallback to init cgroup works. + runc exec test_busybox sh -euc "cat /proc/self/cgroup && grep '^0::/foobar$' /proc/self/cgroup" + [ "$status" -eq 0 ] + + # Check that --cgroup / disables the init cgroup fallback. + runc exec --cgroup / test_busybox true + [ "$status" -ne 0 ] + [[ "$output" == *" adding pid "*" to cgroups"*"/cgroup.procs: device or resource busy"* ]] + + # Check that explicit --cgroup foobar works. + runc exec --cgroup foobar test_busybox grep '^0::/foobar$' /proc/self/cgroup + [ "$status" -eq 0 ] + + # Check all processes is in foobar (this check is redundant). + runc exec --cgroup foobar test_busybox sh -euc '! 
grep -vwH foobar /proc/*/cgroup' + [ "$status" -eq 0 ] + + # Add a second subcgroup, check we're in it. + runc exec --cgroup foobar test_busybox mkdir /sys/fs/cgroup/second + [ "$status" -eq 0 ] + runc exec --cgroup second test_busybox grep -w second /proc/self/cgroup + [ "$status" -eq 0 ] } diff --git a/tests/integration/get-images.sh b/tests/integration/get-images.sh new file mode 100755 index 0000000..9f1bf96 --- /dev/null +++ b/tests/integration/get-images.sh @@ -0,0 +1,70 @@ +#!/bin/bash + +# This script checks if container images needed for tests (currently +# busybox and Debian 10 "Buster") are available locally, and downloads +# them to testdata directory if not. +# +# The script is self-contained/standalone and is used from a few places +# that need to ensure the images are downloaded. Its output is suitable +# for consumption by shell via eval (see helpers.bash). +# +# XXX: Latest available images are fetched. Theoretically, +# this can bring some instability in case of a broken image. +# In this case, images will need to be pinned to a checksum +# on a per-image and per-architecture basis. + +set -e -u -o pipefail + +# Root directory of integration tests. +INTEGRATION_ROOT=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")") +# Test data path. +TESTDATA="${INTEGRATION_ROOT}/testdata" +# Sanity check: $TESTDATA directory must exist. +if [ ! -d "$TESTDATA" ]; then + echo "Bad TESTDATA directory: $TESTDATA. Aborting" >&2 + exit 1 +fi + +function get() { + local dest="$1" url="$2" + + [ -e "$dest" ] && return + + # Sanity check: $TESTDATA directory must be writable. + if [ ! -w "$TESTDATA" ]; then + echo "TESTDATA directory ($TESTDATA) not writable. Aborting" >&2 + exit 1 + fi + + if ! curl -o "$dest" -fsSL --retry 5 "$url"; then + echo "Failed to get $url" 1>&2 + exit 1 + fi +} + +arch=$(go env GOARCH) +# Convert from GOARCH to whatever the URLs below are using. 
+case $arch in +arm64) + arch=arm64v8 + ;; +386) + arch=i386 + ;; +esac + +# busybox +BUSYBOX_IMAGE="$TESTDATA/busybox-${arch}.tar.xz" +get "$BUSYBOX_IMAGE" \ + "https://github.com/docker-library/busybox/raw/dist-${arch}/stable/glibc/busybox.tar.xz" +echo "BUSYBOX_IMAGE=$BUSYBOX_IMAGE" + +# debian +DEBIAN_IMAGE="$TESTDATA/debian-${arch}.tar.xz" +get "$DEBIAN_IMAGE" \ + "https://github.com/debuerreotype/docker-debian-artifacts/raw/dist-${arch}/buster/slim/rootfs.tar.xz" +echo "DEBIAN_IMAGE=$DEBIAN_IMAGE" + +# hello-world is local, no need to download. +HELLO_IMAGE="$TESTDATA/hello-world-${arch}.tar" +echo "HELLO_IMAGE=$HELLO_IMAGE" diff --git a/tests/integration/help.bats b/tests/integration/help.bats index 163de2d..c512d7a 100644 --- a/tests/integration/help.bats +++ b/tests/integration/help.bats @@ -3,85 +3,85 @@ load helpers @test "runc -h" { - runc -h - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ NAME:+ ]] - [[ ${lines[1]} =~ runc\ '-'\ Open\ Container\ Initiative\ runtime+ ]] + runc -h + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ NAME:+ ]] + [[ ${lines[1]} =~ runc\ '-'\ Open\ Container\ Initiative\ runtime+ ]] - runc --help - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ NAME:+ ]] - [[ ${lines[1]} =~ runc\ '-'\ Open\ Container\ Initiative\ runtime+ ]] + runc --help + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ NAME:+ ]] + [[ ${lines[1]} =~ runc\ '-'\ Open\ Container\ Initiative\ runtime+ ]] } @test "runc command -h" { - runc checkpoint -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ checkpoint+ ]] + runc checkpoint -h + [ "$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ checkpoint+ ]] - runc delete -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ delete+ ]] + runc delete -h + [ "$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ delete+ ]] - runc events -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ events+ ]] + runc events -h + [ "$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ events+ ]] - runc exec -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ exec+ ]] + runc exec -h + [ 
"$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ exec+ ]] - runc kill -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ kill+ ]] + runc kill -h + [ "$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ kill+ ]] - runc list -h - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ NAME:+ ]] - [[ ${lines[1]} =~ runc\ list+ ]] + runc list -h + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ NAME:+ ]] + [[ ${lines[1]} =~ runc\ list+ ]] - runc list --help - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ NAME:+ ]] - [[ ${lines[1]} =~ runc\ list+ ]] + runc list --help + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ NAME:+ ]] + [[ ${lines[1]} =~ runc\ list+ ]] - runc pause -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ pause+ ]] + runc pause -h + [ "$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ pause+ ]] - runc restore -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ restore+ ]] + runc restore -h + [ "$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ restore+ ]] - runc resume -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ resume+ ]] + runc resume -h + [ "$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ resume+ ]] - # We don't use runc_spec here, because we're just testing the help page. - runc spec -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ spec+ ]] + # We don't use runc_spec here, because we're just testing the help page. 
+ runc spec -h + [ "$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ spec+ ]] - runc start -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ start+ ]] + runc start -h + [ "$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ start+ ]] - runc run -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ run+ ]] + runc run -h + [ "$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ run+ ]] - runc state -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ state+ ]] + runc state -h + [ "$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ state+ ]] - runc update -h - [ "$status" -eq 0 ] - [[ ${lines[1]} =~ runc\ update+ ]] + runc update -h + [ "$status" -eq 0 ] + [[ ${lines[1]} =~ runc\ update+ ]] } @test "runc foo -h" { - runc foo -h - [ "$status" -ne 0 ] - [[ "${output}" == *"No help topic for 'foo'"* ]] + runc foo -h + [ "$status" -ne 0 ] + [[ "${output}" == *"No help topic for 'foo'"* ]] } diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index 8862dcb..aaa68dd 100644 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -1,28 +1,30 @@ #!/bin/bash +# bats-core v1.2.1 defines BATS_RUN_TMPDIR +if [ -z "$BATS_RUN_TMPDIR" ]; then + echo "bats >= v1.2.1 is required. Aborting." >&2 + exit 1 +fi + # Root directory of integration tests. -INTEGRATION_ROOT=$(dirname "$(readlink -f "$BASH_SOURCE")") +INTEGRATION_ROOT=$(dirname "$(readlink -f "${BASH_SOURCE[0]}")") -. ${INTEGRATION_ROOT}/multi-arch.bash +# Download images, get *_IMAGE variables. +IMAGES=$("${INTEGRATION_ROOT}"/get-images.sh) +eval "$IMAGES" +unset IMAGES -RUNC="${INTEGRATION_ROOT}/../../runc" +: "${RUNC:="${INTEGRATION_ROOT}/../../runc"}" RECVTTY="${INTEGRATION_ROOT}/../../contrib/cmd/recvtty/recvtty" -GOPATH="$(mktemp -d --tmpdir runc-integration-gopath.XXXXXX)" +SD_HELPER="${INTEGRATION_ROOT}/../../contrib/cmd/sd-helper/sd-helper" +SECCOMP_AGENT="${INTEGRATION_ROOT}/../../contrib/cmd/seccompagent/seccompagent" # Test data path. 
+# shellcheck disable=SC2034 TESTDATA="${INTEGRATION_ROOT}/testdata" -# Busybox image -BUSYBOX_IMAGE="$BATS_TMPDIR/busybox.tar" -BUSYBOX_BUNDLE="$BATS_TMPDIR/busyboxtest" - -# hello-world in tar format -HELLO_FILE=`get_hello` -HELLO_IMAGE="$TESTDATA/$HELLO_FILE" -HELLO_BUNDLE="$BATS_TMPDIR/hello-world" - # CRIU PATH -CRIU="$(which criu || true)" +CRIU="$(which criu 2>/dev/null || true)" # Kernel version KERNEL_VERSION="$(uname -r)" @@ -30,25 +32,10 @@ KERNEL_MAJOR="${KERNEL_VERSION%%.*}" KERNEL_MINOR="${KERNEL_VERSION#$KERNEL_MAJOR.}" KERNEL_MINOR="${KERNEL_MINOR%%.*}" -# Root state path. -ROOT=$(mktemp -d "$BATS_TMPDIR/runc.XXXXXX") +ARCH=$(uname -m) -# Path to console socket. -CONSOLE_SOCKET="$BATS_TMPDIR/console.sock" - -# Cgroup paths -CGROUP_MEMORY_BASE_PATH=$(grep "cgroup" /proc/self/mountinfo | gawk 'toupper($NF) ~ /\/ { print $5; exit }') -CGROUP_CPU_BASE_PATH=$(grep "cgroup" /proc/self/mountinfo | gawk 'toupper($NF) ~ /\/ { print $5; exit }') -if [[ -n "${RUNC_USE_SYSTEMD}" ]] ; then - CGROUPS_PATH="/machine.slice/runc-cgroups-integration-test.scope" -else - CGROUPS_PATH="/runc-cgroups-integration-test/test-cgroup" -fi -CGROUP_MEMORY="${CGROUP_MEMORY_BASE_PATH}${CGROUPS_PATH}" - -# CONFIG_MEMCG_KMEM support -KMEM="${CGROUP_MEMORY_BASE_PATH}/memory.kmem.limit_in_bytes" -RT_PERIOD="${CGROUP_CPU_BASE_PATH}/cpu.rt_period_us" +# Seccomp agent socket. +SECCCOMP_AGENT_SOCKET="$BATS_TMPDIR/seccomp-agent.sock" # Check if we're in rootless mode. ROOTLESS=$(id -u) @@ -59,80 +46,290 @@ function runc() { # Some debug information to make life easier. bats will only print it if the # test failed, in which case the output is useful. - echo "runc $@ (status=$status):" >&2 + # shellcheck disable=SC2154 + echo "$(basename "$RUNC") $* (status=$status):" >&2 + # shellcheck disable=SC2154 echo "$output" >&2 } # Raw wrapper for runc. 
function __runc() { - "$RUNC" ${RUNC_USE_SYSTEMD+--systemd-cgroup} --root "$ROOT" "$@" + "$RUNC" ${RUNC_USE_SYSTEMD+--systemd-cgroup} --root "$ROOT/state" "$@" } -# Wrapper for runc spec, which takes only one argument (the bundle path). +# Wrapper for runc spec. function runc_spec() { - ! [[ "$#" > 1 ]] - local args=() - local bundle="" - if [ "$ROOTLESS" -ne 0 ]; then args+=("--rootless") fi - if [ "$#" -ne 0 ]; then - bundle="$1" - args+=("--bundle" "$bundle") - fi runc spec "${args[@]}" # Always add additional mappings if we have idmaps. if [[ "$ROOTLESS" -ne 0 ]] && [[ "$ROOTLESS_FEATURES" == *"idmap"* ]]; then - runc_rootless_idmap "$bundle" + runc_rootless_idmap fi +} - # Ensure config.json contains linux.resources - if [[ "$ROOTLESS" -ne 0 ]] && [[ "$ROOTLESS_FEATURES" == *"cgroup"* ]]; then - runc_rootless_cgroup "$bundle" - fi +# Helper function to reformat config.json file. Input uses jq syntax. +function update_config() { + jq "$@" "./config.json" | awk 'BEGIN{RS="";getline<"-";print>ARGV[1]}' "./config.json" } # Shortcut to add additional uids and gids, based on the values set as part of # a rootless configuration. function runc_rootless_idmap() { - bundle="${1:-.}" - cat "$bundle/config.json" \ - | jq '.mounts |= map((select(.type == "devpts") | .options += ["gid=5"]) // .)' \ - | jq '.linux.uidMappings |= .+ [{"hostID": '"$ROOTLESS_UIDMAP_START"', "containerID": 1000, "size": '"$ROOTLESS_UIDMAP_LENGTH"'}]' \ - | jq '.linux.gidMappings |= .+ [{"hostID": '"$ROOTLESS_GIDMAP_START"', "containerID": 100, "size": 1}]' \ - | jq '.linux.gidMappings |= .+ [{"hostID": '"$(($ROOTLESS_GIDMAP_START+10))"', "containerID": 1, "size": 20}]' \ - | jq '.linux.gidMappings |= .+ [{"hostID": '"$(($ROOTLESS_GIDMAP_START+100))"', "containerID": 1000, "size": '"$(($ROOTLESS_GIDMAP_LENGTH-1000))"'}]' \ - >"$bundle/config.json.tmp" - mv "$bundle/config.json"{.tmp,} + update_config ' .mounts |= map((select(.type == "devpts") | .options += ["gid=5"]) // .) 
+ | .linux.uidMappings += [{"hostID": '"$ROOTLESS_UIDMAP_START"', "containerID": 1000, "size": '"$ROOTLESS_UIDMAP_LENGTH"'}] + | .linux.gidMappings += [{"hostID": '"$ROOTLESS_GIDMAP_START"', "containerID": 100, "size": 1}] + | .linux.gidMappings += [{"hostID": '"$((ROOTLESS_GIDMAP_START + 10))"', "containerID": 1, "size": 20}] + | .linux.gidMappings += [{"hostID": '"$((ROOTLESS_GIDMAP_START + 100))"', "containerID": 1000, "size": '"$((ROOTLESS_GIDMAP_LENGTH - 1000))"'}]' } -# Shortcut to add empty resources as part of a rootless configuration. -function runc_rootless_cgroup() { - bundle="${1:-.}" - cat "$bundle/config.json" \ - | jq '.linux.resources |= .+ {"memory":{},"cpu":{},"blockio":{},"pids":{}}' \ - >"$bundle/config.json.tmp" - mv "$bundle/config.json"{.tmp,} +# Returns systemd version as a number (-1 if systemd is not enabled/supported). +function systemd_version() { + if [ -n "${RUNC_USE_SYSTEMD}" ]; then + systemctl --version | awk '/^systemd / {print $2; exit}' + return + fi + + echo "-1" } -# Helper function to set cgroupsPath to the value of $CGROUPS_PATH +function init_cgroup_paths() { + # init once + test -n "$CGROUP_UNIFIED" && return + + if stat -f -c %t /sys/fs/cgroup | grep -qFw 63677270; then + CGROUP_UNIFIED=yes + local controllers="/sys/fs/cgroup/cgroup.controllers" + # For rootless + systemd case, controllers delegation is required, + # so check the controllers that the current user has, not the top one. + # NOTE: delegation of cpuset requires systemd >= 244 (Fedora >= 32, Ubuntu >= 20.04). + if [[ "$ROOTLESS" -ne 0 && -n "$RUNC_USE_SYSTEMD" ]]; then + controllers="/sys/fs/cgroup/user.slice/user-$(id -u).slice/user@$(id -u).service/cgroup.controllers" + fi + + # "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers. + # - devices (since kernel 4.15) we must assume to be supported because + # it's quite hard to test. 
+		# - freezer (since kernel 5.2) we can auto-detect by looking for the
+		# "cgroup.freeze" file in a *non-root* cgroup.
+		CGROUP_SUBSYSTEMS=$(
+			cat "$controllers"
+			echo devices
+		)
+		CGROUP_BASE_PATH=/sys/fs/cgroup
+
+		# Find any cgroup.freeze files...
+		if [ -n "$(find "$CGROUP_BASE_PATH" -type f -name "cgroup.freeze" -print -quit)" ]; then
+			CGROUP_SUBSYSTEMS+=" freezer"
+		fi
+	else
+		if stat -f -c %t /sys/fs/cgroup/unified | grep -qFw 63677270; then
+			CGROUP_HYBRID=yes
+		fi
+		CGROUP_UNIFIED=no
+		CGROUP_SUBSYSTEMS=$(awk '!/^#/ {print $1}' /proc/cgroups)
+		local g base_path
+		for g in ${CGROUP_SUBSYSTEMS}; do
+			base_path=$(gawk '$(NF-2) == "cgroup" && $NF ~ /\<'"${g}"'\>/ { print $5; exit }' /proc/self/mountinfo)
+			test -z "$base_path" && continue
+			eval CGROUP_"${g^^}"_BASE_PATH="${base_path}"
+		done
+	fi
+}
+
+function create_parent() {
+	if [ -n "$RUNC_USE_SYSTEMD" ]; then
+		[ -z "$SD_PARENT_NAME" ] && return
+		"$SD_HELPER" --parent machine.slice start "$SD_PARENT_NAME"
+	else
+		[ -z "$REL_PARENT_PATH" ] && return
+		if [ "$CGROUP_UNIFIED" == "yes" ]; then
+			mkdir "/sys/fs/cgroup$REL_PARENT_PATH"
+		else
+			local subsys
+			for subsys in ${CGROUP_SUBSYSTEMS}; do
+				# Have to ignore EEXIST (-p) as some subsystems
+				# are mounted together (e.g. cpu,cpuacct), so
+				# the path is created more than once.
+ mkdir -p "/sys/fs/cgroup/$subsys$REL_PARENT_PATH" + done + fi + fi +} + +function remove_parent() { + if [ -n "$RUNC_USE_SYSTEMD" ]; then + [ -z "$SD_PARENT_NAME" ] && return + "$SD_HELPER" --parent machine.slice stop "$SD_PARENT_NAME" + else + [ -z "$REL_PARENT_PATH" ] && return + if [ "$CGROUP_UNIFIED" == "yes" ]; then + rmdir "/sys/fs/cgroup/$REL_PARENT_PATH" + else + local subsys + for subsys in ${CGROUP_SUBSYSTEMS} systemd; do + rmdir "/sys/fs/cgroup/$subsys/$REL_PARENT_PATH" + done + fi + fi + unset SD_PARENT_NAME + unset REL_PARENT_PATH +} + +function set_parent_systemd_properties() { + [ -z "$SD_PARENT_NAME" ] && return + local user + [ "$(id -u)" != "0" ] && user="--user" + systemctl set-property $user "$SD_PARENT_NAME" "$@" +} + +# Randomize cgroup path(s), and update cgroupsPath in config.json. +# This function sets a few cgroup-related variables. +# +# Optional parameter $1 is a pod/parent name. If set, a parent/pod cgroup is +# created, and variables $REL_PARENT_PATH and $SD_PARENT_NAME can be used to +# refer to it. function set_cgroups_path() { - bundle="${1:-.}" - cgroups_path="/runc-cgroups-integration-test/test-cgroup" - if [[ -n "${RUNC_USE_SYSTEMD}" ]] ; then - cgroups_path="machine.slice:runc-cgroups:integration-test" - fi - sed -i 's#\("linux": {\)#\1\n "cgroupsPath": "'"${cgroups_path}"'",#' "$bundle/config.json" + init_cgroup_paths + local pod dash_pod slash_pod pod_slice + if [ "$#" -ne 0 ] && [ "$1" != "" ]; then + # Set up a parent/pod cgroup. 
+ pod="$1" + dash_pod="-$pod" + slash_pod="/$pod" + SD_PARENT_NAME="machine-${pod}.slice" + pod_slice="/$SD_PARENT_NAME" + fi + + local rnd="$RANDOM" + if [ -n "${RUNC_USE_SYSTEMD}" ]; then + SD_UNIT_NAME="runc-cgroups-integration-test-${rnd}.scope" + if [ "$(id -u)" = "0" ]; then + REL_PARENT_PATH="/machine.slice${pod_slice}" + OCI_CGROUPS_PATH="machine${dash_pod}.slice:runc-cgroups:integration-test-${rnd}" + else + REL_PARENT_PATH="/user.slice/user-$(id -u).slice/user@$(id -u).service/machine.slice${pod_slice}" + # OCI path doesn't contain "/user.slice/user-$(id -u).slice/user@$(id -u).service/" prefix + OCI_CGROUPS_PATH="machine${dash_pod}.slice:runc-cgroups:integration-test-${rnd}" + fi + REL_CGROUPS_PATH="$REL_PARENT_PATH/$SD_UNIT_NAME" + else + REL_PARENT_PATH="/runc-cgroups-integration-test${slash_pod}" + REL_CGROUPS_PATH="$REL_PARENT_PATH/test-cgroup-${rnd}" + OCI_CGROUPS_PATH=$REL_CGROUPS_PATH + fi + + # Absolute path to container's cgroup v2. + if [ "$CGROUP_UNIFIED" == "yes" ]; then + CGROUP_PATH=${CGROUP_BASE_PATH}${REL_CGROUPS_PATH} + fi + + [ -n "$pod" ] && create_parent + + update_config '.linux.cgroupsPath |= "'"${OCI_CGROUPS_PATH}"'"' +} + +# Get a value from a cgroup file. +function get_cgroup_value() { + local source=$1 + local cgroup var current + + if [ "$CGROUP_UNIFIED" = "yes" ]; then + cgroup=$CGROUP_PATH + else + var=${source%%.*} # controller name (e.g. memory) + var=CGROUP_${var^^}_BASE_PATH # variable name (e.g. CGROUP_MEMORY_BASE_PATH) + eval cgroup=\$"${var}${REL_CGROUPS_PATH}" + fi + cat "$cgroup/$source" +} + +# Helper to check a if value in a cgroup file matches the expected one. +function check_cgroup_value() { + local current + current="$(get_cgroup_value "$1")" + local expected=$2 + + echo "current $current !? $expected" + [ "$current" = "$expected" ] +} + +# Helper to check a value in systemd. 
+function check_systemd_value() { + [ -z "${RUNC_USE_SYSTEMD}" ] && return + local source="$1" + [ "$source" = "unsupported" ] && return + local expected="$2" + local expected2="$3" + local user="" + [ "$(id -u)" != "0" ] && user="--user" + + current=$(systemctl show $user --property "$source" "$SD_UNIT_NAME" | awk -F= '{print $2}') + echo "systemd $source: current $current !? $expected $expected2" + [ "$current" = "$expected" ] || [[ -n "$expected2" && "$current" = "$expected2" ]] +} + +function check_cpu_quota() { + local quota=$1 + local period=$2 + local sd_quota=$3 + + if [ "$CGROUP_UNIFIED" = "yes" ]; then + if [ "$quota" = "-1" ]; then + quota="max" + fi + check_cgroup_value "cpu.max" "$quota $period" + else + check_cgroup_value "cpu.cfs_quota_us" $quota + check_cgroup_value "cpu.cfs_period_us" "$period" + fi + # systemd values are the same for v1 and v2 + check_systemd_value "CPUQuotaPerSecUSec" "$sd_quota" + + # CPUQuotaPeriodUSec requires systemd >= v242 + [ "$(systemd_version)" -lt 242 ] && return + + local sd_period=$((period / 1000))ms + [ "$sd_period" = "1000ms" ] && sd_period="1s" + local sd_infinity="" + # 100ms is the default value, and if not set, shown as infinity + [ "$sd_period" = "100ms" ] && sd_infinity="infinity" + check_systemd_value "CPUQuotaPeriodUSec" $sd_period $sd_infinity +} + +# Works for cgroup v1 and v2, accepts v1 shares as an argument. +function check_cpu_shares() { + local shares=$1 + + if [ "$CGROUP_UNIFIED" = "yes" ]; then + local weight=$((1 + ((shares - 2) * 9999) / 262142)) + check_cpu_weight "$weight" + else + check_cgroup_value "cpu.shares" "$shares" + check_systemd_value "CPUShares" "$shares" + fi +} + +# Works only for cgroup v2, accept v2 weight. 
+function check_cpu_weight() { + local weight=$1 + + check_cgroup_value "cpu.weight" "$weight" + check_systemd_value "CPUWeight" "$weight" } # Helper function to set a resources limit function set_resources_limit() { - bundle="${1:-.}" - sed -i 's/\("linux": {\)/\1\n "resources": { "pids": { "limit": 100 } },/' "$bundle/config.json" + update_config '.linux.resources.pids.limit |= 100' +} + +# Helper function to make /sys/fs/cgroup writable +function set_cgroup_mount_writable() { + update_config '.mounts |= map((select(.type == "cgroup") | .options -= ["ro"]) // .)' } # Fails the current test, providing the error given. @@ -141,55 +338,123 @@ function fail() { exit 1 } +# Check whether rootless runc can use cgroups. +function rootless_cgroup() { + [[ "$ROOTLESS_FEATURES" == *"cgroup"* || -n "$RUNC_USE_SYSTEMD" ]] +} + # Allows a test to specify what things it requires. If the environment can't # support it, the test is skipped with a message. function requires() { for var in "$@"; do + local skip_me case $var in criu) if [ ! -e "$CRIU" ]; then - skip "test requires ${var}" + skip_me=1 fi ;; root) if [ "$ROOTLESS" -ne 0 ]; then - skip "test requires ${var}" + skip_me=1 fi ;; rootless) if [ "$ROOTLESS" -eq 0 ]; then - skip "test requires ${var}" + skip_me=1 fi ;; rootless_idmap) if [[ "$ROOTLESS_FEATURES" != *"idmap"* ]]; then - skip "test requires ${var}" + skip_me=1 fi ;; rootless_cgroup) - if [[ "$ROOTLESS_FEATURES" != *"cgroup"* ]]; then - skip "test requires ${var}" + if ! rootless_cgroup; then + skip_me=1 fi ;; rootless_no_cgroup) - if [[ "$ROOTLESS_FEATURES" == *"cgroup"* ]]; then - skip "test requires ${var}" + if rootless_cgroup; then + skip_me=1 fi ;; - cgroups_kmem) - if [ ! -e "$KMEM" ]; then - skip "Test requires ${var}" + rootless_no_features) + if [ "$ROOTLESS_FEATURES" != "" ]; then + skip_me=1 fi ;; cgroups_rt) - if [ ! -e "$RT_PERIOD" ]; then - skip "Test requires ${var}" + init_cgroup_paths + if [ ! 
-e "${CGROUP_CPU_BASE_PATH}/cpu.rt_period_us" ]; then + skip_me=1 + fi + ;; + cgroups_swap) + init_cgroup_paths + if [ $CGROUP_UNIFIED = "no" ] && [ ! -e "${CGROUP_MEMORY_BASE_PATH}/memory.memsw.limit_in_bytes" ]; then + skip_me=1 + fi + ;; + cgroupns) + if [ ! -e "/proc/self/ns/cgroup" ]; then + skip_me=1 + fi + ;; + cgroups_v1) + init_cgroup_paths + if [ "$CGROUP_UNIFIED" != "no" ]; then + skip_me=1 + fi + ;; + cgroups_v2) + init_cgroup_paths + if [ "$CGROUP_UNIFIED" != "yes" ]; then + skip_me=1 + fi + ;; + cgroups_hybrid) + init_cgroup_paths + if [ "$CGROUP_HYBRID" != "yes" ]; then + skip_me=1 + fi + ;; + cgroups_*) + init_cgroup_paths + var=${var#cgroups_} + if [[ "$CGROUP_SUBSYSTEMS" != *"$var"* ]]; then + skip_me=1 + fi + ;; + smp) + local cpus + cpus=$(grep -c '^processor' /proc/cpuinfo) + if [ "$cpus" -lt 2 ]; then + skip_me=1 + fi + ;; + systemd) + if [ -z "${RUNC_USE_SYSTEMD}" ]; then + skip_me=1 + fi + ;; + no_systemd) + if [ -n "${RUNC_USE_SYSTEMD}" ]; then + skip_me=1 + fi + ;; + arch_x86_64) + if [ "$ARCH" != "x86_64" ]; then + skip_me=1 fi ;; *) - fail "BUG: Invalid requires ${var}." + fail "BUG: Invalid requires $var." ;; esac + if [ -n "$skip_me" ]; then + skip "test requires $var" + fi done } @@ -206,141 +471,124 @@ function retry() { if [[ "$status" -eq 0 ]]; then return 0 fi - sleep $delay + sleep "$delay" done - echo "Command \"$@\" failed $attempts times. Output: $output" + echo "Command \"$*\" failed $attempts times. Output: $output" false } # retry until the given container has state function wait_for_container() { - local attempts=$1 - local delay=$2 - local cid=$3 - # optionally wait for a specific status - local wait_for_status="${4:-}" - local i - - for ((i = 0; i < attempts; i++)); do - runc state $cid - if [[ "$status" -eq 0 ]]; then - if [[ "${output}" == *"${wait_for_status}"* ]]; then - return 0 - fi - fi - sleep $delay - done - - echo "runc state failed to return state $statecheck $attempts times. 
Output: $output" - false -} - -# retry until the given container has state -function wait_for_container_inroot() { - local attempts=$1 - local delay=$2 - local cid=$3 - # optionally wait for a specific status - local wait_for_status="${4:-}" - local i - - for ((i = 0; i < attempts; i++)); do - ROOT=$4 runc state $cid - if [[ "$status" -eq 0 ]]; then - if [[ "${output}" == *"${wait_for_status}"* ]]; then - return 0 - fi - fi - sleep $delay - done - - echo "runc state failed to return state $statecheck $attempts times. Output: $output" - false + if [ $# -eq 3 ]; then + retry "$1" "$2" __runc state "$3" + elif [ $# -eq 4 ]; then + retry "$1" "$2" eval "__runc state $3 | grep -qw $4" + else + echo "Usage: wait_for_container ATTEMPTS DELAY ID [STATUS]" 1>&2 + return 1 + fi } function testcontainer() { # test state of container - runc state $1 + runc state "$1" + if [ "$2" == "checkpointed" ]; then + [ "$status" -eq 1 ] + return + fi [ "$status" -eq 0 ] [[ "${output}" == *"$2"* ]] } function setup_recvtty() { + [ -z "$ROOT" ] && return 1 # must not be called without ROOT set + local dir="$ROOT/tty" + + mkdir "$dir" + export CONSOLE_SOCKET="$dir/sock" + # We need to start recvtty in the background, so we double fork in the shell. - ("$RECVTTY" --pid-file "$BATS_TMPDIR/recvtty.pid" --mode null "$CONSOLE_SOCKET" &) & + ("$RECVTTY" --pid-file "$dir/pid" --mode null "$CONSOLE_SOCKET" &) & } function teardown_recvtty() { + [ -z "$ROOT" ] && return 0 # nothing to teardown + local dir="$ROOT/tty" + # When we kill recvtty, the container will also be killed. - if [ -f "$BATS_TMPDIR/recvtty.pid" ]; then - kill -9 $(cat "$BATS_TMPDIR/recvtty.pid") + if [ -f "$dir/pid" ]; then + kill -9 "$(cat "$dir/pid")" fi # Clean up the files that might be left over. 
- rm -f "$BATS_TMPDIR/recvtty.pid" - rm -f "$CONSOLE_SOCKET" + rm -rf "$dir" +} + +function setup_seccompagent() { + ("${SECCOMP_AGENT}" -socketfile="$SECCCOMP_AGENT_SOCKET" -pid-file "$BATS_TMPDIR/seccompagent.pid" &) & +} + +function teardown_seccompagent() { + if [ -f "$BATS_TMPDIR/seccompagent.pid" ]; then + kill -9 "$(cat "$BATS_TMPDIR/seccompagent.pid")" + fi + rm -f "$BATS_TMPDIR/seccompagent.pid" + rm -f "$SECCCOMP_AGENT_SOCKET" +} + +function setup_bundle() { + local image="$1" + + # Root for various container directories (state, tty, bundle). + ROOT=$(mktemp -d "$BATS_RUN_TMPDIR/runc.XXXXXX") + mkdir -p "$ROOT/state" "$ROOT/bundle/rootfs" + + # Directories created by mktemp -d have 0700 permission bits. Tests + # running inside userns (see userns.bats) need to access the directory + # as a different user to mount the rootfs. Since kernel v5.12, parent + # directories are also checked. Give a+x for these tests to work. + chmod a+x "$ROOT" "$BATS_RUN_TMPDIR" + + setup_recvtty + cd "$ROOT/bundle" || return + + tar --exclude './dev/*' -C rootfs -xf "$image" + + runc_spec } function setup_busybox() { - setup_recvtty - run mkdir "$BUSYBOX_BUNDLE" - run mkdir "$BUSYBOX_BUNDLE"/rootfs - if [ -e "/testdata/busybox.tar" ]; then - BUSYBOX_IMAGE="/testdata/busybox.tar" - fi - if [ ! -e $BUSYBOX_IMAGE ]; then - curl -o $BUSYBOX_IMAGE -sSL `get_busybox` - fi - tar --exclude './dev/*' -C "$BUSYBOX_BUNDLE"/rootfs -xf "$BUSYBOX_IMAGE" - cd "$BUSYBOX_BUNDLE" - runc_spec + setup_bundle "$BUSYBOX_IMAGE" } function setup_hello() { - setup_recvtty - run mkdir "$HELLO_BUNDLE" - run mkdir "$HELLO_BUNDLE"/rootfs - tar --exclude './dev/*' -C "$HELLO_BUNDLE"/rootfs -xf "$HELLO_IMAGE" - cd "$HELLO_BUNDLE" - runc_spec - sed -i 's;"sh";"/hello";' config.json + setup_bundle "$HELLO_IMAGE" + update_config '(.. | select(.? 
== "sh")) |= "/hello"' } -function teardown_running_container() { - runc list - # $1 should be a container name such as "test_busybox" - # here we detect "test_busybox "(with one extra blank) to avoid conflict prefix - # e.g. "test_busybox" and "test_busybox_update" - if [[ "${output}" == *"$1 "* ]]; then - runc kill $1 KILL - retry 10 1 eval "__runc state '$1' | grep -q 'stopped'" - runc delete $1 +function setup_debian() { + setup_bundle "$DEBIAN_IMAGE" +} + +function teardown_bundle() { + [ -z "$ROOT" ] && return 0 # nothing to teardown + + cd "$INTEGRATION_ROOT" || return + teardown_recvtty + local ct + for ct in $(__runc list -q); do + __runc delete -f "$ct" + done + rm -rf "$ROOT" + remove_parent +} + +function requires_kernel() { + local major_required minor_required + major_required=$(echo "$1" | cut -d. -f1) + minor_required=$(echo "$1" | cut -d. -f2) + if [[ "$KERNEL_MAJOR" -lt $major_required || ("$KERNEL_MAJOR" -eq $major_required && "$KERNEL_MINOR" -lt $minor_required) ]]; then + skip "requires kernel $1" fi } - -function teardown_running_container_inroot() { - ROOT=$2 runc list - # $1 should be a container name such as "test_busybox" - # here we detect "test_busybox "(with one extra blank) to avoid conflict prefix - # e.g. 
"test_busybox" and "test_busybox_update" - if [[ "${output}" == *"$1 "* ]]; then - ROOT=$2 runc kill $1 KILL - retry 10 1 eval "ROOT='$2' __runc state '$1' | grep -q 'stopped'" - ROOT=$2 runc delete $1 - fi -} - -function teardown_busybox() { - cd "$INTEGRATION_ROOT" - teardown_recvtty - teardown_running_container test_busybox - run rm -f -r "$BUSYBOX_BUNDLE" -} - -function teardown_hello() { - cd "$INTEGRATION_ROOT" - teardown_recvtty - teardown_running_container test_hello - run rm -f -r "$HELLO_BUNDLE" -} diff --git a/tests/integration/hooks.bats b/tests/integration/hooks.bats new file mode 100644 index 0000000..25c08aa --- /dev/null +++ b/tests/integration/hooks.bats @@ -0,0 +1,58 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + requires root no_systemd + + setup_debian + # CR = CreateRuntime, CC = CreateContainer + HOOKLIBCR=librunc-hooks-create-runtime.so + HOOKLIBCC=librunc-hooks-create-container.so + LIBPATH="$(pwd)/rootfs/lib/" +} + +function teardown() { + if [ -n "$LIBPATH" ]; then + umount "$LIBPATH"/$HOOKLIBCR.1.0.0 &>/dev/null || true + umount "$LIBPATH"/$HOOKLIBCC.1.0.0 &>/dev/null || true + rm -f $HOOKLIBCR.1.0.0 $HOOKLIBCC.1.0.0 + fi + teardown_bundle +} + +@test "runc run (hooks library tests)" { + # setup some dummy libs + gcc -shared -Wl,-soname,librunc-hooks-create-runtime.so.1 -o "$HOOKLIBCR.1.0.0" + gcc -shared -Wl,-soname,librunc-hooks-create-container.so.1 -o "$HOOKLIBCC.1.0.0" + + bundle=$(pwd) + + # To mount $HOOKLIBCR we need to do that in the container namespace + create_runtime_hook=$( + cat <<-EOF + pid=\$(cat - | jq -r '.pid') + touch "$LIBPATH/$HOOKLIBCR.1.0.0" + nsenter -m \$ns -t \$pid mount --bind "$bundle/$HOOKLIBCR.1.0.0" "$LIBPATH/$HOOKLIBCR.1.0.0" + EOF + ) + + create_container_hook="touch ./lib/$HOOKLIBCC.1.0.0 && mount --bind $bundle/$HOOKLIBCC.1.0.0 ./lib/$HOOKLIBCC.1.0.0" + + # shellcheck disable=SC2016 + update_config --arg create_runtime_hook "$create_runtime_hook" --arg create_container_hook 
"$create_container_hook" ' + .hooks |= . + {"createRuntime": [{"path": "/bin/sh", "args": ["/bin/sh", "-c", $create_runtime_hook]}]} | + .hooks |= . + {"createContainer": [{"path": "/bin/sh", "args": ["/bin/sh", "-c", $create_container_hook]}]} | + .hooks |= . + {"startContainer": [{"path": "/bin/sh", "args": ["/bin/sh", "-c", "ldconfig"]}]} | + .root.readonly |= false | + .process.args = ["/bin/sh", "-c", "ldconfig -p | grep librunc"]' + + runc run test_debian + [ "$status" -eq 0 ] + + echo "Checking create-runtime library" + echo "$output" | grep $HOOKLIBCR + + echo "Checking create-container library" + echo "$output" | grep $HOOKLIBCC +} diff --git a/tests/integration/kill.bats b/tests/integration/kill.bats index d9afe92..590ddd5 100644 --- a/tests/integration/kill.bats +++ b/tests/integration/kill.bats @@ -3,28 +3,29 @@ load helpers function setup() { - teardown_busybox - setup_busybox + setup_busybox } function teardown() { - teardown_busybox + teardown_bundle } - @test "kill detached busybox" { - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - # check state - testcontainer test_busybox running + # check state + testcontainer test_busybox running - runc kill test_busybox KILL - [ "$status" -eq 0 ] + runc kill test_busybox KILL + [ "$status" -eq 0 ] + wait_for_container 10 1 test_busybox stopped - retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" + # we should ensure kill work after the container stopped + runc kill -a test_busybox 0 + [ "$status" -eq 0 ] - runc delete test_busybox - [ "$status" -eq 0 ] + runc delete test_busybox + [ "$status" -eq 0 ] } diff --git a/tests/integration/list.bats b/tests/integration/list.bats index 0a938c0..9d25dfd 100644 --- a/tests/integration/list.bats +++ b/tests/integration/list.bats @@ -3,54 +3,53 @@ load helpers function setup() { - 
teardown_running_container_inroot test_box1 $HELLO_BUNDLE - teardown_running_container_inroot test_box2 $HELLO_BUNDLE - teardown_running_container_inroot test_box3 $HELLO_BUNDLE - teardown_busybox - setup_busybox + setup_busybox + ALT_ROOT="$ROOT/alt" + mkdir -p "$ALT_ROOT/state" } function teardown() { - teardown_running_container_inroot test_box1 $HELLO_BUNDLE - teardown_running_container_inroot test_box2 $HELLO_BUNDLE - teardown_running_container_inroot test_box3 $HELLO_BUNDLE - teardown_busybox + if [ -n "$ALT_ROOT" ]; then + ROOT="$ALT_ROOT" teardown_bundle + fi + teardown_bundle } @test "list" { - # run a few busyboxes detached - ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_box1 - [ "$status" -eq 0 ] + bundle=$(pwd) + # run a few busyboxes detached + ROOT=$ALT_ROOT runc run -d --console-socket "$CONSOLE_SOCKET" test_box1 + [ "$status" -eq 0 ] - ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_box2 - [ "$status" -eq 0 ] + ROOT=$ALT_ROOT runc run -d --console-socket "$CONSOLE_SOCKET" test_box2 + [ "$status" -eq 0 ] - ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_box3 - [ "$status" -eq 0 ] + ROOT=$ALT_ROOT runc run -d --console-socket "$CONSOLE_SOCKET" test_box3 + [ "$status" -eq 0 ] - ROOT=$HELLO_BUNDLE runc list - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ ID\ +PID\ +STATUS\ +BUNDLE\ +CREATED+ ]] - [[ "${lines[1]}" == *"test_box1"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] - [[ "${lines[2]}" == *"test_box2"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] - [[ "${lines[3]}" == *"test_box3"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + ROOT=$ALT_ROOT runc list + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ ID\ +PID\ +STATUS\ +BUNDLE\ +CREATED+ ]] + [[ "${lines[1]}" == *"test_box1"*[0-9]*"running"*$bundle*[0-9]* ]] + [[ "${lines[2]}" == *"test_box2"*[0-9]*"running"*$bundle*[0-9]* ]] + [[ "${lines[3]}" == *"test_box3"*[0-9]*"running"*$bundle*[0-9]* ]] - ROOT=$HELLO_BUNDLE runc list -q - [ "$status" -eq 0 ] - [[ 
"${lines[0]}" == "test_box1" ]] - [[ "${lines[1]}" == "test_box2" ]] - [[ "${lines[2]}" == "test_box3" ]] + ROOT=$ALT_ROOT runc list -q + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "test_box1" ]] + [[ "${lines[1]}" == "test_box2" ]] + [[ "${lines[2]}" == "test_box3" ]] - ROOT=$HELLO_BUNDLE runc list --format table - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ ID\ +PID\ +STATUS\ +BUNDLE\ +CREATED+ ]] - [[ "${lines[1]}" == *"test_box1"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] - [[ "${lines[2]}" == *"test_box2"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] - [[ "${lines[3]}" == *"test_box3"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + ROOT=$ALT_ROOT runc list --format table + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ ID\ +PID\ +STATUS\ +BUNDLE\ +CREATED+ ]] + [[ "${lines[1]}" == *"test_box1"*[0-9]*"running"*$bundle*[0-9]* ]] + [[ "${lines[2]}" == *"test_box2"*[0-9]*"running"*$bundle*[0-9]* ]] + [[ "${lines[3]}" == *"test_box3"*[0-9]*"running"*$bundle*[0-9]* ]] - ROOT=$HELLO_BUNDLE runc list --format json - [ "$status" -eq 0 ] - [[ "${lines[0]}" == [\[][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box1\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}]* ]] - [[ "${lines[0]}" == *[,][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box2\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}]* ]] - [[ "${lines[0]}" == *[,][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box3\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}][\]] ]] + ROOT=$ALT_ROOT runc list --format json + [ "$status" -eq 0 ] + [[ "${lines[0]}" == 
[\[][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box1\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$bundle*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}]* ]] + [[ "${lines[0]}" == *[,][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box2\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$bundle*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}]* ]] + [[ "${lines[0]}" == *[,][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box3\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$bundle*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}][\]] ]] } diff --git a/tests/integration/mask.bats b/tests/integration/mask.bats index aaa8042..b5f2967 100644 --- a/tests/integration/mask.bats +++ b/tests/integration/mask.bats @@ -3,24 +3,23 @@ load helpers function setup() { - teardown_busybox setup_busybox # Create fake rootfs. mkdir rootfs/testdir - echo "Forbidden information!" > rootfs/testfile + echo "Forbidden information!" >rootfs/testfile # add extra masked paths - sed -i 's;"maskedPaths": \[;"maskedPaths": \["/testdir","/testfile",;g' config.json + update_config '(.. | select(.maskedPaths? 
!= null)) .maskedPaths += ["/testdir", "/testfile"]' } function teardown() { - teardown_busybox + teardown_bundle } @test "mask paths [file]" { # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox [ "$status" -eq 0 ] runc exec test_busybox cat /testfile @@ -38,7 +37,7 @@ function teardown() { @test "mask paths [directory]" { # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox [ "$status" -eq 0 ] runc exec test_busybox ls /testdir diff --git a/tests/integration/mounts.bats b/tests/integration/mounts.bats old mode 100755 new mode 100644 index c35b3c5..1ec675a --- a/tests/integration/mounts.bats +++ b/tests/integration/mounts.bats @@ -3,19 +3,63 @@ load helpers function setup() { - teardown_busybox setup_busybox } function teardown() { - teardown_busybox + teardown_bundle } @test "runc run [bind mount]" { - CONFIG=$(jq '.mounts |= . 
+ [{"source": ".", "destination": "/tmp/bind", "options": ["bind"]}] | .process.args = ["ls", "/tmp/bind/config.json"]' config.json) - echo "${CONFIG}" >config.json + update_config ' .mounts += [{ + source: ".", + destination: "/tmp/bind", + options: ["bind"] + }] + | .process.args |= ["ls", "/tmp/bind/config.json"]' - runc run test_bind_mount + runc run test_busybox + [ "$status" -eq 0 ] + [[ "${lines[0]}" == *'/tmp/bind/config.json'* ]] +} + +# https://github.com/opencontainers/runc/issues/2246 +@test "runc run [ro tmpfs mount]" { + update_config ' .mounts += [{ + source: "tmpfs", + destination: "/mnt", + type: "tmpfs", + options: ["ro", "nodev", "nosuid", "mode=755"] + }] + | .process.args |= ["grep", "^tmpfs /mnt", "/proc/mounts"]' + + runc run test_busybox + [ "$status" -eq 0 ] + [[ "${lines[0]}" == *'ro,'* ]] +} + +# https://github.com/opencontainers/runc/issues/3248 +@test "runc run [ro /dev mount]" { + update_config ' .mounts |= map((select(.destination == "/dev") | .options += ["ro"]) // .) 
+ | .process.args |= ["grep", "^tmpfs /dev", "/proc/mounts"]' + + runc run test_busybox + [ "$status" -eq 0 ] + [[ "${lines[0]}" == *'ro,'* ]] +} + +# https://github.com/opencontainers/runc/issues/2683 +@test "runc run [tmpfs mount with absolute symlink]" { + # in container, /conf -> /real/conf + mkdir -p rootfs/real/conf + ln -s /real/conf rootfs/conf + update_config ' .mounts += [{ + type: "tmpfs", + source: "tmpfs", + destination: "/conf/stack", + options: ["ro", "nodev", "nosuid"] + }] + | .process.args |= ["true"]' + runc run test_busybox [ "$status" -eq 0 ] - [[ "${lines[0]}" =~ '/tmp/bind/config.json' ]] } diff --git a/tests/integration/mounts_recursive.bats b/tests/integration/mounts_recursive.bats new file mode 100644 index 0000000..b3ce579 --- /dev/null +++ b/tests/integration/mounts_recursive.bats @@ -0,0 +1,78 @@ +#!/usr/bin/env bats + +load helpers + +TESTVOLUME="${BATS_RUN_TMPDIR}/mounts_recursive" + +function setup_volume() { + # requires root (in the current user namespace) to mount tmpfs outside runc + requires root + + mkdir -p "${TESTVOLUME}" + mount -t tmpfs none "${TESTVOLUME}" + echo "foo" >"${TESTVOLUME}/foo" + + mkdir "${TESTVOLUME}/subvol" + mount -t tmpfs none "${TESTVOLUME}/subvol" + echo "bar" >"${TESTVOLUME}/subvol/bar" +} + +function teardown_volume() { + umount -R "${TESTVOLUME}" +} + +function setup() { + setup_volume + setup_busybox +} + +function teardown() { + teardown_volume + teardown_bundle +} + +@test "runc run [rbind,ro mount is read-only but not recursively]" { + update_config ".mounts += [{source: \"${TESTVOLUME}\" , destination: \"/mnt\", options: [\"rbind\",\"ro\"]}]" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_rbind_ro + [ "$status" -eq 0 ] + + runc exec test_rbind_ro touch /mnt/foo + [ "$status" -eq 1 ] + [[ "${output}" == *"Read-only file system"* ]] + + runc exec test_rbind_ro touch /mnt/subvol/bar + [ "$status" -eq 0 ] +} + +@test "runc run [rbind,rro mount is recursively read-only]" { + requires_kernel 
5.12 + update_config ".mounts += [{source: \"${TESTVOLUME}\" , destination: \"/mnt\", options: [\"rbind\",\"rro\"]}]" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_rbind_rro + [ "$status" -eq 0 ] + + runc exec test_rbind_rro touch /mnt/foo + [ "$status" -eq 1 ] + [[ "${output}" == *"Read-only file system"* ]] + + runc exec test_rbind_rro touch /mnt/subvol/bar + [ "$status" -eq 1 ] + [[ "${output}" == *"Read-only file system"* ]] +} + +@test "runc run [rbind,ro,rro mount is recursively read-only too]" { + requires_kernel 5.12 + update_config ".mounts += [{source: \"${TESTVOLUME}\" , destination: \"/mnt\", options: [\"rbind\",\"ro\",\"rro\"]}]" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_rbind_ro_rro + [ "$status" -eq 0 ] + + runc exec test_rbind_ro_rro touch /mnt/foo + [ "$status" -eq 1 ] + [[ "${output}" == *"Read-only file system"* ]] + + runc exec test_rbind_ro_rro touch /mnt/subvol/bar + [ "$status" -eq 1 ] + [[ "${output}" == *"Read-only file system"* ]] +} diff --git a/tests/integration/mounts_sshfs.bats b/tests/integration/mounts_sshfs.bats new file mode 100644 index 0000000..abf8235 --- /dev/null +++ b/tests/integration/mounts_sshfs.bats @@ -0,0 +1,40 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + # Create a ro fuse-sshfs mount; skip the test if it's not working. + local sshfs="sshfs + -o UserKnownHostsFile=/dev/null + -o StrictHostKeyChecking=no + -o PasswordAuthentication=no" + + DIR="$BATS_RUN_TMPDIR/fuse-sshfs" + mkdir -p "$DIR" + + if ! $sshfs -o ro rootless@localhost: "$DIR"; then + skip "test requires working sshfs mounts" + fi + + setup_hello +} + +function teardown() { + # New distros (Fedora 35) do not have fusermount installed + # as a dependency of fuse-sshfs, and good ol' umount works. 
+ fusermount -u "$DIR" || umount "$DIR" + + teardown_bundle +} + +@test "runc run [rw bind mount of a ro fuse sshfs mount]" { + update_config ' .mounts += [{ + type: "bind", + source: "'"$DIR"'", + destination: "/mnt", + options: ["rw", "rprivate", "nosuid", "nodev", "rbind"] + }]' + + runc run test_busybox + [ "$status" -eq 0 ] +} diff --git a/tests/integration/multi-arch.bash b/tests/integration/multi-arch.bash deleted file mode 100644 index 5616bf7..0000000 --- a/tests/integration/multi-arch.bash +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -get_busybox(){ - case $(go env GOARCH) in - arm64) - echo 'https://github.com/docker-library/busybox/raw/dist-arm64v8/glibc/busybox.tar.xz' - ;; - *) - echo 'https://github.com/docker-library/busybox/raw/dist-amd64/glibc/busybox.tar.xz' - ;; - esac -} - -get_hello(){ - case $(go env GOARCH) in - arm64) - echo 'hello-world-aarch64.tar' - ;; - *) - echo 'hello-world.tar' - ;; - esac -} diff --git a/tests/integration/no_pivot.bats b/tests/integration/no_pivot.bats new file mode 100644 index 0000000..30dbe7f --- /dev/null +++ b/tests/integration/no_pivot.bats @@ -0,0 +1,23 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + setup_busybox +} + +function teardown() { + teardown_bundle +} + +@test "runc run --no-pivot must not expose bare /proc" { + requires root + + update_config ' .process.args |= ["unshare", "-mrpf", "sh", "-euxc", "mount -t proc none /proc && echo h > /proc/sysrq-trigger"] + | .process.capabilities.bounding += ["CAP_SETFCAP"] + | .process.capabilities.permitted += ["CAP_SETFCAP"]' + + runc run --no-pivot test_no_pivot + [ "$status" -eq 1 ] + [[ "$output" == *"mount: permission denied"* ]] +} diff --git a/tests/integration/pause.bats b/tests/integration/pause.bats index 4e25e59..c2daa1f 100644 --- a/tests/integration/pause.bats +++ b/tests/integration/pause.bats @@ -3,70 +3,75 @@ load helpers function setup() { - teardown_busybox - setup_busybox + setup_busybox } function teardown() { - 
teardown_busybox + teardown_bundle } @test "runc pause and resume" { - # XXX: currently cgroups require root containers. - requires root + if [[ "$ROOTLESS" -ne 0 ]]; then + requires rootless_cgroup + set_cgroups_path + fi + requires cgroups_freezer - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox running + testcontainer test_busybox running - # pause busybox - runc pause test_busybox - [ "$status" -eq 0 ] + # pause busybox + runc pause test_busybox + [ "$status" -eq 0 ] - # test state of busybox is paused - testcontainer test_busybox paused + # test state of busybox is paused + testcontainer test_busybox paused - # resume busybox - runc resume test_busybox - [ "$status" -eq 0 ] + # resume busybox + runc resume test_busybox + [ "$status" -eq 0 ] - # test state of busybox is back to running - testcontainer test_busybox running + # test state of busybox is back to running + testcontainer test_busybox running } @test "runc pause and resume with nonexist container" { - # XXX: currently cgroups require root containers. 
- requires root + if [[ "$ROOTLESS" -ne 0 ]]; then + requires rootless_cgroup + set_cgroups_path + fi + requires cgroups_freezer - # run test_busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run test_busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox running + testcontainer test_busybox running - # pause test_busybox and nonexistent container - runc pause test_busybox - [ "$status" -eq 0 ] - runc pause nonexistent - [ "$status" -ne 0 ] + # pause test_busybox and nonexistent container + runc pause test_busybox + [ "$status" -eq 0 ] + runc pause nonexistent + [ "$status" -ne 0 ] - # test state of test_busybox is paused - testcontainer test_busybox paused + # test state of test_busybox is paused + testcontainer test_busybox paused - # resume test_busybox and nonexistent container - runc resume test_busybox - [ "$status" -eq 0 ] - runc resume nonexistent - [ "$status" -ne 0 ] + # resume test_busybox and nonexistent container + runc resume test_busybox + [ "$status" -eq 0 ] + runc resume nonexistent + [ "$status" -ne 0 ] - # test state of test_busybox is back to running - testcontainer test_busybox running + # test state of test_busybox is back to running + testcontainer test_busybox running - # delete test_busybox - runc delete --force test_busybox + # delete test_busybox + runc delete --force test_busybox - runc state test_busybox - [ "$status" -ne 0 ] + runc state test_busybox + [ "$status" -ne 0 ] } diff --git a/tests/integration/ps.bats b/tests/integration/ps.bats index 646b5ab..c28abfc 100644 --- a/tests/integration/ps.bats +++ b/tests/integration/ps.bats @@ -3,60 +3,82 @@ load helpers function setup() { - teardown_busybox - setup_busybox + setup_busybox } function teardown() { - teardown_busybox + teardown_bundle } @test "ps" { - # ps is not supported, it requires cgroups - requires root + # ps is not supported, it requires cgroups 
+ requires root - # start busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # start busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - # check state - testcontainer test_busybox running + # check state + testcontainer test_busybox running - runc ps test_busybox - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ UID\ +PID\ +PPID\ +C\ +STIME\ +TTY\ +TIME\ +CMD+ ]] - [[ "${lines[1]}" == *"$(id -un 2>/dev/null)"*[0-9]* ]] + runc ps test_busybox + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ UID\ +PID\ +PPID\ +C\ +STIME\ +TTY\ +TIME\ +CMD+ ]] + [[ "${lines[1]}" == *"$(id -un 2>/dev/null)"*[0-9]* ]] } @test "ps -f json" { - # ps is not supported, it requires cgroups - requires root + # ps is not supported, it requires cgroups + requires root - # start busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # start busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - # check state - testcontainer test_busybox running + # check state + testcontainer test_busybox running - runc ps -f json test_busybox - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ [0-9]+ ]] + runc ps -f json test_busybox + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ [0-9]+ ]] } @test "ps -e -x" { - # ps is not supported, it requires cgroups - requires root + # ps is not supported, it requires cgroups + requires root - # start busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # start busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - # check state - testcontainer test_busybox running + # check state + testcontainer test_busybox running - runc ps test_busybox -e -x - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ \ +PID\ +TTY\ +STAT\ +TIME\ +COMMAND+ ]] - [[ "${lines[1]}" =~ [0-9]+ ]] + runc ps test_busybox -e -x + [ "$status" -eq 0 ] + [[ 
${lines[0]} =~ \ +PID\ +TTY\ +STAT\ +TIME\ +COMMAND+ ]] + [[ "${lines[1]}" =~ [0-9]+ ]] +} + +@test "ps after the container stopped" { + # ps requires cgroups + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + set_cgroups_path + + # start busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc ps test_busybox + [ "$status" -eq 0 ] + + runc kill test_busybox KILL + [ "$status" -eq 0 ] + wait_for_container 10 1 test_busybox stopped + + runc ps test_busybox + [ "$status" -eq 0 ] } diff --git a/tests/integration/root.bats b/tests/integration/root.bats index 90b53b4..53bd1ce 100644 --- a/tests/integration/root.bats +++ b/tests/integration/root.bats @@ -3,48 +3,51 @@ load helpers function setup() { - teardown_running_container_inroot test_dotbox $HELLO_BUNDLE - teardown_busybox - setup_busybox + setup_busybox + ALT_ROOT="$ROOT/alt" + mkdir -p "$ALT_ROOT/state" } function teardown() { - teardown_running_container_inroot test_dotbox $HELLO_BUNDLE - teardown_busybox + if [ -n "$ALT_ROOT" ]; then + ROOT=$ALT_ROOT __runc delete -f test_dotbox + rm -rf "$ALT_ROOT" + fi + teardown_bundle } @test "global --root" { - # run busybox detached using $HELLO_BUNDLE for state - ROOT=$HELLO_BUNDLE runc run -d --console-socket $CONSOLE_SOCKET test_dotbox - [ "$status" -eq 0 ] + # run busybox detached using $ALT_ROOT for state + ROOT=$ALT_ROOT runc run -d --console-socket "$CONSOLE_SOCKET" test_dotbox + [ "$status" -eq 0 ] - # run busybox detached in default root - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached in default root + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - runc state test_busybox - [ "$status" -eq 0 ] - [[ "${output}" == *"running"* ]] + runc state test_busybox + [ "$status" -eq 0 ] + [[ "${output}" == *"running"* ]] - ROOT=$HELLO_BUNDLE runc state test_dotbox - [ 
"$status" -eq 0 ] - [[ "${output}" == *"running"* ]] + ROOT=$ALT_ROOT runc state test_dotbox + [ "$status" -eq 0 ] + [[ "${output}" == *"running"* ]] - ROOT=$HELLO_BUNDLE runc state test_busybox - [ "$status" -ne 0 ] + ROOT=$ALT_ROOT runc state test_busybox + [ "$status" -ne 0 ] - runc state test_dotbox - [ "$status" -ne 0 ] + runc state test_dotbox + [ "$status" -ne 0 ] - runc kill test_busybox KILL - [ "$status" -eq 0 ] - retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" - runc delete test_busybox - [ "$status" -eq 0 ] + runc kill test_busybox KILL + [ "$status" -eq 0 ] + wait_for_container 10 1 test_busybox stopped + runc delete test_busybox + [ "$status" -eq 0 ] - ROOT=$HELLO_BUNDLE runc kill test_dotbox KILL - [ "$status" -eq 0 ] - retry 10 1 eval "ROOT='$HELLO_BUNDLE' __runc state test_dotbox | grep -q 'stopped'" - ROOT=$HELLO_BUNDLE runc delete test_dotbox - [ "$status" -eq 0 ] + ROOT=$ALT_ROOT runc kill test_dotbox KILL + [ "$status" -eq 0 ] + ROOT=$ALT_ROOT wait_for_container 10 1 test_dotbox stopped + ROOT=$ALT_ROOT runc delete test_dotbox + [ "$status" -eq 0 ] } diff --git a/tests/integration/run.bats b/tests/integration/run.bats new file mode 100644 index 0000000..63be89d --- /dev/null +++ b/tests/integration/run.bats @@ -0,0 +1,59 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + setup_hello +} + +function teardown() { + teardown_bundle +} + +@test "runc run" { + runc run test_hello + [ "$status" -eq 0 ] + + runc state test_hello + [ "$status" -ne 0 ] +} + +@test "runc run --keep" { + runc run --keep test_run_keep + [ "$status" -eq 0 ] + + testcontainer test_run_keep stopped + + runc state test_run_keep + [ "$status" -eq 0 ] + + runc delete test_run_keep + + runc state test_run_keep + [ "$status" -ne 0 ] +} + +@test "runc run --keep (check cgroup exists)" { + # for systemd driver, the unit's cgroup path will be auto removed if container's all processes exited + requires no_systemd + + [[ "$ROOTLESS" -ne 0 ]] && requires 
rootless_cgroup + + set_cgroups_path + + runc run --keep test_run_keep + [ "$status" -eq 0 ] + + testcontainer test_run_keep stopped + + runc state test_run_keep + [ "$status" -eq 0 ] + + # check that cgroup exists + check_cgroup_value "pids.max" "max" + + runc delete test_run_keep + + runc state test_run_keep + [ "$status" -ne 0 ] +} diff --git a/tests/integration/seccomp-notify-compat.bats b/tests/integration/seccomp-notify-compat.bats new file mode 100644 index 0000000..8d663ed --- /dev/null +++ b/tests/integration/seccomp-notify-compat.bats @@ -0,0 +1,35 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + if [[ "$KERNEL_MAJOR" -gt 5 || ("$KERNEL_MAJOR" -eq 5 && "$KERNEL_MINOR" -ge 6) ]]; then + skip "requires kernel less than 5.6" + fi + + requires arch_x86_64 + + setup_seccompagent + setup_busybox +} + +function teardown() { + teardown_seccompagent + teardown_bundle +} + +# Support for seccomp notify requires Linux > 5.6, check that on older kernels +# return an error. +@test "runc run [seccomp] (SCMP_ACT_NOTIFY old kernel)" { + # Use just any seccomp profile with a notify action. + update_config ' .linux.seccomp = { + "defaultAction": "SCMP_ACT_ALLOW", + "listenerPath": "'"$SECCCOMP_AGENT_SOCKET"'", + "architectures": [ "SCMP_ARCH_X86","SCMP_ARCH_X32", "SCMP_ARCH_X86_64" ], + "syscalls": [{ "names": [ "mkdir" ], "action": "SCMP_ACT_NOTIFY" }] + }' + + runc run test_busybox + [ "$status" -ne 0 ] + [[ "$output" == *"seccomp notify unsupported:"* ]] +} diff --git a/tests/integration/seccomp-notify.bats b/tests/integration/seccomp-notify.bats new file mode 100644 index 0000000..d5dac52 --- /dev/null +++ b/tests/integration/seccomp-notify.bats @@ -0,0 +1,216 @@ +#!/usr/bin/env bats + +load helpers + +# Support for seccomp notify requires Linux > 5.6 because +# runc uses the pidfd_getfd system call to fetch the seccomp fd. 
+# https://github.com/torvalds/linux/commit/8649c322f75c96e7ced2fec201e123b2b073bf09 +# We also require arch x86_64, to not make this fail when people run tests +# locally on other archs. +function setup() { + requires_kernel 5.6 + requires arch_x86_64 + + setup_seccompagent + setup_busybox +} + +function teardown() { + teardown_seccompagent + teardown_bundle +} + +# Create config.json template with SCMP_ACT_NOTIFY actions +# $1: command to run +# $2: noNewPrivileges (false/true) +# $3: list of syscalls +function scmp_act_notify_template() { + # The agent intercepts mkdir syscalls and creates the folder appending + # "-bar" (listenerMetadata below) to the name. + update_config ' .process.args = ["/bin/sh", "-c", "'"$1"'"] + | .process.noNewPrivileges = '"$2"' + | .linux.seccomp = { + "defaultAction":"SCMP_ACT_ALLOW", + "listenerPath": "'"$SECCCOMP_AGENT_SOCKET"'", + "listenerMetadata": "bar", + "architectures": [ "SCMP_ARCH_X86","SCMP_ARCH_X32", "SCMP_ARCH_X86_64" ], + "syscalls": [{ "names": ['"$3"'], "action": "SCMP_ACT_NOTIFY" }] + }' +} + +# The call to seccomp is done at different places according to the value of +# noNewPrivileges, for this reason many of the following cases are tested with +# both values. 
+ +@test "runc run [seccomp] (SCMP_ACT_NOTIFY noNewPrivileges false)" { + scmp_act_notify_template "mkdir /dev/shm/foo && stat /dev/shm/foo-bar" false '"mkdir"' + + runc run test_busybox + [ "$status" -eq 0 ] +} + +@test "runc run [seccomp] (SCMP_ACT_NOTIFY noNewPrivileges true)" { + scmp_act_notify_template "mkdir /dev/shm/foo && stat /dev/shm/foo-bar" true '"mkdir"' + + runc run test_busybox + [ "$status" -eq 0 ] +} + +@test "runc exec [seccomp] (SCMP_ACT_NOTIFY noNewPrivileges false)" { + requires root + + scmp_act_notify_template "sleep infinity" false '"mkdir"' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox /bin/sh -c "mkdir /dev/shm/foo && stat /dev/shm/foo-bar" + [ "$status" -eq 0 ] +} + +@test "runc exec [seccomp] (SCMP_ACT_NOTIFY noNewPrivileges true)" { + requires root + + scmp_act_notify_template "sleep infinity" true '"mkdir"' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + runc exec test_busybox /bin/sh -c "mkdir /dev/shm/foo && stat /dev/shm/foo-bar" + [ "$status" -eq 0 ] +} + +@test "runc run [seccomp] (SCMP_ACT_NOTIFY important syscalls noNewPrivileges false)" { + scmp_act_notify_template "/bin/true" false '"execve","openat","open","read","close"' + + runc run test_busybox + [ "$status" -eq 0 ] +} + +@test "runc run [seccomp] (SCMP_ACT_NOTIFY important syscalls noNewPrivileges true)" { + scmp_act_notify_template "/bin/true" true '"execve","openat","open","read","close"' + + runc run test_busybox + [ "$status" -eq 0 ] +} + +@test "runc run [seccomp] (empty listener path)" { + update_config ' .process.args = ["/bin/sh", "-c", "mkdir /dev/shm/foo && stat /dev/shm/foo"] + | .linux.seccomp = { + "defaultAction":"SCMP_ACT_ALLOW", + "listenerPath": "'"$SECCCOMP_AGENT_SOCKET"'", + "listenerMetadata": "bar", + }' + + runc run test_busybox + [ "$status" -eq 0 ] +} + +@test "runc run [seccomp] (SCMP_ACT_NOTIFY empty listener path)" { + scmp_act_notify_template "/bin/true" false 
'"mkdir"' + update_config '.linux.seccomp.listenerPath = ""' + + runc run test_busybox + [ "$status" -ne 0 ] +} + +@test "runc run [seccomp] (SCMP_ACT_NOTIFY wrong listener path)" { + scmp_act_notify_template "/bin/true" false '"mkdir"' + update_config '.linux.seccomp.listenerPath = "/some-non-existing-listener-path.sock"' + + runc run test_busybox + [ "$status" -ne 0 ] +} + +@test "runc run [seccomp] (SCMP_ACT_NOTIFY abstract listener path)" { + scmp_act_notify_template "/bin/true" false '"mkdir"' + update_config '.linux.seccomp.listenerPath = "@mysocketishere"' + + runc run test_busybox + [ "$status" -ne 0 ] +} + +# Check that killing the seccompagent doesn't block syscalls in +# the container. They should return ENOSYS instead. +@test "runc run [seccomp] (SCMP_ACT_NOTIFY kill seccompagent)" { + scmp_act_notify_template "sleep 4 && mkdir /dev/shm/foo" false '"mkdir"' + + sleep 2 && teardown_seccompagent & + runc run test_busybox + [ "$status" -ne 0 ] + [[ "$output" == *"mkdir:"*"/dev/shm/foo"*"Function not implemented"* ]] +} + +# Check that starting with no seccomp agent running fails with a clear error. +@test "runc run [seccomp] (SCMP_ACT_NOTIFY no seccompagent)" { + teardown_seccompagent + + scmp_act_notify_template "/bin/true" false '"mkdir"' + + runc run test_busybox + [ "$status" -ne 0 ] + [[ "$output" == *"failed to connect with seccomp agent"* ]] +} + +# Check that agent-returned error for the syscall works. +@test "runc run [seccomp] (SCMP_ACT_NOTIFY error chmod)" { + scmp_act_notify_template "touch /dev/shm/foo && chmod 777 /dev/shm/foo" false '"chmod", "fchmod", "fchmodat"' + + runc run test_busybox + [ "$status" -ne 0 ] + [[ "$output" == *"chmod:"*"/dev/shm/foo"*"No medium found"* ]] +} + +# check that trying to use SCMP_ACT_NOTIFY with write() gives a meaningful error. 
+@test "runc run [seccomp] (SCMP_ACT_NOTIFY write)" { + scmp_act_notify_template "/bin/true" false '"write"' + + runc run test_busybox + [ "$status" -ne 0 ] + [[ "$output" == *"SCMP_ACT_NOTIFY cannot be used for the write syscall"* ]] +} + +# check that a startContainer hook doesn't get any extra file descriptor. +@test "runc run [seccomp] (SCMP_ACT_NOTIFY startContainer hook)" { + # shellcheck disable=SC2016 + # We use single quotes to properly delimit the $1 param to + # update_config(), but this shellshcheck is quite silly and fails if the + # multi-line string includes some $var (even when it is properly outside of the + # single quotes) or when we use this syntax to execute commands in the + # string: $(command). + # So, just disable this check for our usage of update_config(). + update_config ' .process.args = ["/bin/true"] + | .linux.seccomp = { + "defaultAction":"SCMP_ACT_ALLOW", + "listenerPath": "'"$SECCCOMP_AGENT_SOCKET"'", + "architectures": [ "SCMP_ARCH_X86", "SCMP_ARCH_X32", "SCMP_ARCH_X86_64" ], + "syscalls":[{ "names": [ "mkdir" ], "action": "SCMP_ACT_NOTIFY" }] + } + |.hooks = { + "startContainer": [ { + "path": "/bin/sh", + "args": [ + "sh", + "-c", + "if [ $(ls /proc/self/fd/ | wc -l) -ne 4 ]; then echo \"File descriptors is not 4\". && ls /proc/self/fd/ | wc -l && exit 1; fi" + ], + } ] + }' + + runc run test_busybox + [ "$status" -eq 0 ] +} + +# Check that example config in the seccomp agent dir works. +@test "runc run [seccomp] (SCMP_ACT_NOTIFY example config)" { + # Run the script used in the seccomp agent example. + # This takes a bare config.json and modifies it to run an example. + "${INTEGRATION_ROOT}/../../contrib/cmd/seccompagent/gen-seccomp-example-cfg.sh" + + # The listenerPath the previous command uses is the default used by the + # seccomp agent. However, inside bats the socket is in a bats tmp dir. 
+ update_config '.linux.seccomp.listenerPath = "'"$SECCCOMP_AGENT_SOCKET"'"' + + runc run test_busybox + + [ "$status" -eq 0 ] + [[ "$output" == *"chmod:"*"test-file"*"No medium found"* ]] +} diff --git a/tests/integration/seccomp.bats b/tests/integration/seccomp.bats new file mode 100644 index 0000000..e81beca --- /dev/null +++ b/tests/integration/seccomp.bats @@ -0,0 +1,101 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + setup_busybox +} + +function teardown() { + teardown_bundle +} + +@test "runc run [seccomp -ENOSYS handling]" { + TEST_NAME="seccomp_syscall_test1" + + # Compile the test binary and update the config to run it. + gcc -static -o rootfs/seccomp_test "${TESTDATA}/${TEST_NAME}.c" + update_config ".linux.seccomp = $(<"${TESTDATA}/${TEST_NAME}.json")" + update_config '.process.args = ["/seccomp_test"]' + + runc run test_busybox + [ "$status" -eq 0 ] +} + +@test "runc run [seccomp defaultErrnoRet=ENXIO]" { + TEST_NAME="seccomp_syscall_test2" + + # Compile the test binary and update the config to run it. + gcc -static -o rootfs/seccomp_test2 "${TESTDATA}/${TEST_NAME}.c" + update_config ".linux.seccomp = $(<"${TESTDATA}/${TEST_NAME}.json")" + update_config '.process.args = ["/seccomp_test2"]' + + runc run test_busybox + [ "$status" -eq 0 ] +} + +# TODO: +# - Test other actions like SCMP_ACT_TRAP, SCMP_ACT_TRACE, SCMP_ACT_LOG. +# - Test args (index, value, valueTwo, etc). 
+ +@test "runc run [seccomp] (SCMP_ACT_ERRNO default)" { + update_config ' .process.args = ["/bin/sh", "-c", "mkdir /dev/shm/foo"] + | .process.noNewPrivileges = false + | .linux.seccomp = { + "defaultAction":"SCMP_ACT_ALLOW", + "architectures":["SCMP_ARCH_X86","SCMP_ARCH_X32"], + "syscalls":[{"names":["mkdir"], "action":"SCMP_ACT_ERRNO"}] + }' + + runc run test_busybox + [ "$status" -ne 0 ] + [[ "$output" == *"mkdir:"*"/dev/shm/foo"*"Operation not permitted"* ]] +} + +@test "runc run [seccomp] (SCMP_ACT_ERRNO explicit errno)" { + update_config ' .process.args = ["/bin/sh", "-c", "mkdir /dev/shm/foo"] + | .process.noNewPrivileges = false + | .linux.seccomp = { + "defaultAction":"SCMP_ACT_ALLOW", + "architectures":["SCMP_ARCH_X86","SCMP_ARCH_X32"], + "syscalls":[{"names":["mkdir"], "action":"SCMP_ACT_ERRNO", "errnoRet": 100}] + }' + + runc run test_busybox + [ "$status" -ne 0 ] + [[ "$output" == *"Network is down"* ]] +} + +@test "runc run [seccomp] (SCMP_ACT_KILL)" { + update_config ' .process.args = ["/bin/sh", "-c", "mkdir /dev/shm/foo"] + | .process.noNewPrivileges = false + | .linux.seccomp = { + "defaultAction":"SCMP_ACT_ALLOW", + "architectures":["SCMP_ARCH_X86","SCMP_ARCH_X32"], + "syscalls":[{"names":["mkdir"], "action":"SCMP_ACT_KILL"}] + }' + + runc run test_busybox + [ "$status" -ne 0 ] +} + +# check that a startContainer hook is run with the seccomp filters applied +@test "runc run [seccomp] (startContainer hook)" { + update_config ' .process.args = ["/bin/true"] + | .linux.seccomp = { + "defaultAction":"SCMP_ACT_ALLOW", + "architectures":["SCMP_ARCH_X86","SCMP_ARCH_X32"], + "syscalls":[{"names":["mkdir"], "action":"SCMP_ACT_KILL"}] + } + | .hooks = { + "startContainer": [ { + "path": "/bin/sh", + "args": ["sh", "-c", "mkdir /dev/shm/foo"] + } ] + }' + + runc run test_busybox + [ "$status" -ne 0 ] + [[ "$output" == *"error running hook"* ]] + [[ "$output" == *"bad system call"* ]] +} diff --git a/tests/integration/spec.bats b/tests/integration/spec.bats 
index 5df8f70..b24c6b7 100644 --- a/tests/integration/spec.bats +++ b/tests/integration/spec.bats @@ -3,94 +3,38 @@ load helpers function setup() { - # initial cleanup in case a prior test exited and did not cleanup - cd "$INTEGRATION_ROOT" - run rm -f -r "$HELLO_BUNDLE" - - # setup hello-world for spec generation testing - run mkdir "$HELLO_BUNDLE" - run mkdir "$HELLO_BUNDLE"/rootfs - run tar -C "$HELLO_BUNDLE"/rootfs -xf "$HELLO_IMAGE" + setup_hello } function teardown() { - cd "$INTEGRATION_ROOT" - run rm -f -r "$HELLO_BUNDLE" + teardown_bundle } @test "spec generation cwd" { - cd "$HELLO_BUNDLE" - # note this test runs from the bundle not the integration root - - # test that config.json does not exist after the above partial setup - [ ! -e config.json ] - - # test generation of spec does not return an error - runc_spec - [ "$status" -eq 0 ] - - # test generation of spec created our config.json (spec) - [ -e config.json ] - - # test existence of required args parameter in the generated config.json - run bash -c "grep -A2 'args' config.json | grep 'sh'" - [[ "${output}" == *"sh"* ]] - - # change the default args parameter from sh to hello - sed -i 's;"sh";"/hello";' config.json - - # ensure the generated spec works by running hello-world - runc run test_hello - [ "$status" -eq 0 ] + runc run test_hello + [ "$status" -eq 0 ] } @test "spec generation --bundle" { - # note this test runs from the integration root not the bundle - - # test that config.json does not exist after the above partial setup - [ ! 
-e "$HELLO_BUNDLE"/config.json ] - - # test generation of spec does not return an error - runc_spec "$HELLO_BUNDLE" - [ "$status" -eq 0 ] - - # test generation of spec created our config.json (spec) - [ -e "$HELLO_BUNDLE"/config.json ] - - # change the default args parameter from sh to hello - sed -i 's;"sh";"/hello";' "$HELLO_BUNDLE"/config.json - - # ensure the generated spec works by running hello-world - runc run --bundle "$HELLO_BUNDLE" test_hello - [ "$status" -eq 0 ] + runc run --bundle "$(pwd)" test_hello + [ "$status" -eq 0 ] } @test "spec validator" { - TESTDIR=$(pwd) - cd "$HELLO_BUNDLE" + requires rootless_no_features - run git clone https://github.com/opencontainers/runtime-spec.git src/runtime-spec - [ "$status" -eq 0 ] + SPEC_VERSION=$(awk '$1 == "github.com/opencontainers/runtime-spec" {print $2}' "$BATS_TEST_DIRNAME"/../../go.mod) + # Will look like this when not pinned to specific tag: "v0.0.0-20190207185410-29686dbc5559", otherwise "v1.0.0" + SPEC_COMMIT=$(cut -d "-" -f 3 <<<"$SPEC_VERSION") + SPEC_REF=$([[ -z "$SPEC_COMMIT" ]] && echo "$SPEC_VERSION" || echo "$SPEC_COMMIT") - SPEC_COMMIT=$(grep '^github.com/opencontainers/runtime-spec' ${TESTDIR}/../../vendor.conf | tr -s ' ' | cut -d ' ' -f 2) - run git -C src/runtime-spec reset --hard "${SPEC_COMMIT}" + git clone https://github.com/opencontainers/runtime-spec.git + (cd runtime-spec && git reset --hard "$SPEC_REF") + SCHEMA='runtime-spec/schema/config-schema.json' + [ -e "$SCHEMA" ] - [ "$status" -eq 0 ] - [ -e src/runtime-spec/schema/config-schema.json ] + GO111MODULE=auto go get github.com/xeipuuv/gojsonschema + GO111MODULE=auto go build runtime-spec/schema/validate.go - run bash -c "GOPATH='$GOPATH' go get github.com/xeipuuv/gojsonschema" - [ "$status" -eq 0 ] - - run git -C "${GOPATH}/src/github.com/xeipuuv/gojsonschema" reset --hard 6637feb73ee44cd4640bb3def285c29774234c7f - [ "$status" -eq 0 ] - - GOPATH="$GOPATH" go build src/runtime-spec/schema/validate.go - [ -e ./validate ] - - runc 
spec - [ -e config.json ] - - run ./validate src/runtime-spec/schema/config-schema.json config.json - [ "$status" -eq 0 ] - [[ "${lines[0]}" == *"The document is valid"* ]] + ./validate "$SCHEMA" config.json } diff --git a/tests/integration/start.bats b/tests/integration/start.bats index 1f0ea8e..1c844b2 100644 --- a/tests/integration/start.bats +++ b/tests/integration/start.bats @@ -3,29 +3,28 @@ load helpers function setup() { - teardown_busybox - setup_busybox + setup_busybox } function teardown() { - teardown_busybox + teardown_bundle } @test "runc start" { - runc create --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + runc create --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox created + testcontainer test_busybox created - # start container test_busybox - runc start test_busybox - [ "$status" -eq 0 ] + # start container test_busybox + runc start test_busybox + [ "$status" -eq 0 ] - testcontainer test_busybox running + testcontainer test_busybox running - # delete test_busybox - runc delete --force test_busybox + # delete test_busybox + runc delete --force test_busybox - runc state test_busybox - [ "$status" -ne 0 ] + runc state test_busybox + [ "$status" -ne 0 ] } diff --git a/tests/integration/start_detached.bats b/tests/integration/start_detached.bats index 7f177b8..0f1c233 100644 --- a/tests/integration/start_detached.bats +++ b/tests/integration/start_detached.bats @@ -3,74 +3,68 @@ load helpers function setup() { - teardown_busybox - setup_busybox + setup_busybox } function teardown() { - teardown_busybox + teardown_bundle } @test "runc run detached" { - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - # check state - testcontainer test_busybox running + # check state + testcontainer test_busybox running } @test "runc 
run detached ({u,g}id != 0)" { - # cannot start containers as another user in rootless setup without idmap - [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + # cannot start containers as another user in rootless setup without idmap + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap - # replace "uid": 0 with "uid": 1000 - # and do a similar thing for gid. - sed -i 's;"uid": 0;"uid": 1000;g' config.json - sed -i 's;"gid": 0;"gid": 100;g' config.json + # replace "uid": 0 with "uid": 1000 + # and do a similar thing for gid. + update_config ' (.. | select(.uid? == 0)) .uid |= 1000 + | (.. | select(.gid? == 0)) .gid |= 100' - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - # check state - testcontainer test_busybox running + # check state + testcontainer test_busybox running } @test "runc run detached --pid-file" { - # run busybox detached - runc run --pid-file pid.txt -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run --pid-file pid.txt -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - # check state - testcontainer test_busybox running + # check state + testcontainer test_busybox running - # check pid.txt was generated - [ -e pid.txt ] + # check pid.txt was generated + [ -e pid.txt ] - run cat pid.txt - [ "$status" -eq 0 ] - [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]] + [[ "$(cat pid.txt)" == $(__runc state test_busybox | jq '.pid') ]] } @test "runc run detached --pid-file with new CWD" { - # create pid_file directory as the CWD - run mkdir pid_file - [ "$status" -eq 0 ] - run cd pid_file - [ "$status" -eq 0 ] + bundle="$(pwd)" + # create pid_file directory as the CWD + mkdir pid_file + cd pid_file - # run busybox detached - runc run --pid-file pid.txt -d -b $BUSYBOX_BUNDLE --console-socket 
$CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run --pid-file pid.txt -d -b "$bundle" --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - # check state - testcontainer test_busybox running + # check state + testcontainer test_busybox running - # check pid.txt was generated - [ -e pid.txt ] + # check pid.txt was generated + [ -e pid.txt ] - run cat pid.txt - [ "$status" -eq 0 ] - [[ ${lines[0]} == $(__runc state test_busybox | jq '.pid') ]] + [[ "$(cat pid.txt)" == $(__runc state test_busybox | jq '.pid') ]] } diff --git a/tests/integration/start_hello.bats b/tests/integration/start_hello.bats index a706be2..1c3c906 100644 --- a/tests/integration/start_hello.bats +++ b/tests/integration/start_hello.bats @@ -3,62 +3,88 @@ load helpers function setup() { - teardown_hello - setup_hello + setup_hello } function teardown() { - teardown_hello + teardown_bundle } @test "runc run" { - # run hello-world - runc run test_hello - [ "$status" -eq 0 ] + # run hello-world + runc run test_hello + [ "$status" -eq 0 ] - # check expected output - [[ "${output}" == *"Hello"* ]] + # check expected output + [[ "${output}" == *"Hello"* ]] } @test "runc run ({u,g}id != 0)" { - # cannot start containers as another user in rootless setup without idmap - [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + # cannot start containers as another user in rootless setup without idmap + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap - # replace "uid": 0 with "uid": 1000 - # and do a similar thing for gid. - sed -i 's;"uid": 0;"uid": 1000;g' config.json - sed -i 's;"gid": 0;"gid": 100;g' config.json + # replace "uid": 0 with "uid": 1000 + # and do a similar thing for gid. + update_config ' (.. | select(.uid? == 0)) .uid |= 1000 + | (.. | select(.gid? 
== 0)) .gid |= 100' - # run hello-world - runc run test_hello - [ "$status" -eq 0 ] + # run hello-world + runc run test_hello + [ "$status" -eq 0 ] - # check expected output - [[ "${output}" == *"Hello"* ]] + # check expected output + [[ "${output}" == *"Hello"* ]] } @test "runc run with rootfs set to ." { - cp config.json rootfs/. - rm config.json - cd rootfs - sed -i 's;"rootfs";".";' config.json + cp config.json rootfs/. + rm config.json + cd rootfs + update_config '(.. | select(. == "rootfs")) |= "."' - # run hello-world - runc run test_hello - [ "$status" -eq 0 ] - [[ "${output}" == *"Hello"* ]] + # run hello-world + runc run test_hello + [ "$status" -eq 0 ] + [[ "${output}" == *"Hello"* ]] } @test "runc run --pid-file" { - # run hello-world - runc run --pid-file pid.txt test_hello - [ "$status" -eq 0 ] - [[ "${output}" == *"Hello"* ]] + # run hello-world + runc run --pid-file pid.txt test_hello + [ "$status" -eq 0 ] + [[ "${output}" == *"Hello"* ]] - # check pid.txt was generated - [ -e pid.txt ] + # check pid.txt was generated + [ -e pid.txt ] - run cat pid.txt - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ [0-9]+ ]] + [[ "$(cat pid.txt)" =~ [0-9]+ ]] +} + +# https://github.com/opencontainers/runc/pull/2897 +@test "runc run [rootless with host pidns]" { + requires rootless_no_features + + # Remove pid namespace, and replace /proc mount + # with a bind mount from the host. 
+ update_config ' .linux.namespaces -= [{"type": "pid"}] + | .mounts |= map((select(.type == "proc") + | .type = "none" + | .source = "/proc" + | .options = ["rbind", "nosuid", "nodev", "noexec"] + ) // .)' + + runc run test_hello + [ "$status" -eq 0 ] +} + +@test "runc run [redundant seccomp rules]" { + update_config ' .linux.seccomp = { + "defaultAction": "SCMP_ACT_ALLOW", + "syscalls": [{ + "names": ["bdflush"], + "action": "SCMP_ACT_ALLOW", + }] + }' + runc run test_hello + [ "$status" -eq 0 ] } diff --git a/tests/integration/state.bats b/tests/integration/state.bats index 68dae38..5e88a3e 100644 --- a/tests/integration/state.bats +++ b/tests/integration/state.bats @@ -3,64 +3,61 @@ load helpers function setup() { - teardown_busybox - setup_busybox + setup_busybox } function teardown() { - teardown_busybox + teardown_bundle } @test "state (kill + delete)" { - runc state test_busybox - [ "$status" -ne 0 ] + runc state test_busybox + [ "$status" -ne 0 ] - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - # check state - testcontainer test_busybox running + # check state + testcontainer test_busybox running - runc kill test_busybox KILL - [ "$status" -eq 0 ] + runc kill test_busybox KILL + [ "$status" -eq 0 ] + wait_for_container 10 1 test_busybox stopped - # wait for busybox to be in the destroyed state - retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" + # delete test_busybox + runc delete test_busybox + [ "$status" -eq 0 ] - # delete test_busybox - runc delete test_busybox - [ "$status" -eq 0 ] - - runc state test_busybox - [ "$status" -ne 0 ] + runc state test_busybox + [ "$status" -ne 0 ] } @test "state (pause + resume)" { - # XXX: pause and resume require cgroups. - requires root + # XXX: pause and resume require cgroups. 
+ requires root - runc state test_busybox - [ "$status" -ne 0 ] + runc state test_busybox + [ "$status" -ne 0 ] - # run busybox detached - runc run -d --console-socket $CONSOLE_SOCKET test_busybox - [ "$status" -eq 0 ] + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] - # check state - testcontainer test_busybox running + # check state + testcontainer test_busybox running - # pause busybox - runc pause test_busybox - [ "$status" -eq 0 ] + # pause busybox + runc pause test_busybox + [ "$status" -eq 0 ] - # test state of busybox is paused - testcontainer test_busybox paused + # test state of busybox is paused + testcontainer test_busybox paused - # resume busybox - runc resume test_busybox - [ "$status" -eq 0 ] + # resume busybox + runc resume test_busybox + [ "$status" -eq 0 ] - # test state of busybox is back to running - testcontainer test_busybox running + # test state of busybox is back to running + testcontainer test_busybox running } diff --git a/tests/integration/testdata/.gitignore b/tests/integration/testdata/.gitignore new file mode 100644 index 0000000..70f5582 --- /dev/null +++ b/tests/integration/testdata/.gitignore @@ -0,0 +1,2 @@ +busybox-*.tar.xz +debian-*.tar.xz diff --git a/tests/integration/testdata/dev_access_test.c b/tests/integration/testdata/dev_access_test.c new file mode 100644 index 0000000..705096c --- /dev/null +++ b/tests/integration/testdata/dev_access_test.c @@ -0,0 +1,17 @@ +#include +#include + +int main(int argc, char *argv[]) +{ + const char *dev_name = "/dev/kmsg"; + + if (argc > 1) + dev_name = argv[1]; + + if (access(dev_name, F_OK) < 0) { + perror(dev_name); + return 1; + } + + return 0; +} diff --git a/tests/integration/testdata/hello-world.tar b/tests/integration/testdata/hello-world-amd64.tar similarity index 100% rename from tests/integration/testdata/hello-world.tar rename to tests/integration/testdata/hello-world-amd64.tar diff --git 
a/tests/integration/testdata/hello-world-aarch64.tar b/tests/integration/testdata/hello-world-arm64v8.tar similarity index 100% rename from tests/integration/testdata/hello-world-aarch64.tar rename to tests/integration/testdata/hello-world-arm64v8.tar diff --git a/tests/integration/testdata/seccomp_syscall_test1.c b/tests/integration/testdata/seccomp_syscall_test1.c new file mode 100644 index 0000000..d62598a --- /dev/null +++ b/tests/integration/testdata/seccomp_syscall_test1.c @@ -0,0 +1,79 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static int exit_code = 0; + +/* + * We need raw wrappers around each syscall so that glibc won't rewrite the + * errno value when it is returned from the seccomp filter (glibc has a habit + * of hiding -ENOSYS if possible -- which counters what we're trying to test). + */ +#define raw(name, ...) \ + syscall(SYS_ ## name, ##__VA_ARGS__) + +#define syscall_assert(sval, rval) \ + do { \ + int L = (sval), R = (rval); \ + if (L < 0) \ + L = -errno; \ + if (L != R) { \ + printf("syscall_assert(%s == %s) failed: %d != %d\n", #sval, #rval, L, R); \ + exit_code = 32; \ + } \ + } while (0) + +int main(void) +{ + // Basic permitted syscalls. + syscall_assert(write(-1, NULL, 0), -EBADF); + + // Basic syscall with masked rules. + syscall_assert(raw(socket, AF_UNIX, SOCK_STREAM, 0x000), 3); + syscall_assert(raw(socket, AF_UNIX, SOCK_STREAM, 0x0FF), -EPROTONOSUPPORT); + syscall_assert(raw(socket, AF_UNIX, SOCK_STREAM, 0x001), 4); + syscall_assert(raw(socket, AF_UNIX, SOCK_STREAM, 0x100), -EPERM); + syscall_assert(raw(socket, AF_UNIX, SOCK_STREAM, 0xC00), -EPERM); + + // Multiple arguments with OR rules. 
+ syscall_assert(raw(process_vm_readv, 100, NULL, 0, NULL, 0, ~0), -EINVAL); + syscall_assert(raw(process_vm_readv, 9001, NULL, 0, NULL, 0, ~0), -EINVAL); + syscall_assert(raw(process_vm_readv, 0, NULL, 0, NULL, 0, ~0), -EPERM); + syscall_assert(raw(process_vm_readv, 0, NULL, 0, NULL, 0, ~0), -EPERM); + + // Multiple arguments with OR rules -- rule is ERRNO(-ENOANO). + syscall_assert(raw(process_vm_writev, 1337, NULL, 0, NULL, 0, ~0), -ENOANO); + syscall_assert(raw(process_vm_writev, 2020, NULL, 0, NULL, 0, ~0), -ENOANO); + syscall_assert(raw(process_vm_writev, 0, NULL, 0, NULL, 0, ~0), -EPERM); + syscall_assert(raw(process_vm_writev, 0, NULL, 0, NULL, 0, ~0), -EPERM); + + // Multiple arguments with AND rules. + syscall_assert(raw(kcmp, 0, 1337, 0, 0, 0), -ESRCH); + syscall_assert(raw(kcmp, 0, 0, 0, 0, 0), -EPERM); + syscall_assert(raw(kcmp, 500, 1337, 0, 0, 0), -EPERM); + syscall_assert(raw(kcmp, 500, 500, 0, 0, 0), -EPERM); + + // Multiple rules for the same syscall. + syscall_assert(raw(dup3, 0, -100, 0xFFFF), -EPERM); + syscall_assert(raw(dup3, 1, -100, 0xFFFF), -EINVAL); + syscall_assert(raw(dup3, 2, -100, 0xFFFF), -EPERM); + syscall_assert(raw(dup3, 3, -100, 0xFFFF), -EINVAL); + + // Explicitly denied syscalls (those in Linux 3.0) get -EPERM. + syscall_assert(raw(unshare, 0), -EPERM); + syscall_assert(raw(setns, 0, 0), -EPERM); + + // Out-of-bounds fake syscall. 
+ syscall_assert(syscall(1000, 0xDEADBEEF, 0xCAFEFEED, 0x1337), -ENOSYS); + + return exit_code; +} diff --git a/tests/integration/testdata/seccomp_syscall_test1.json b/tests/integration/testdata/seccomp_syscall_test1.json new file mode 100644 index 0000000..c48ceae --- /dev/null +++ b/tests/integration/testdata/seccomp_syscall_test1.json @@ -0,0 +1,464 @@ +{ + "defaultAction": "SCMP_ACT_ERRNO", + "architectures": [ + "SCMP_ARCH_X86", + "SCMP_ARCH_X32", + "SCMP_ARCH_X86_64", + "SCMP_ARCH_AARCH64", + "SCMP_ARCH_ARM" + ], + "syscalls": [ + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "accept", + "accept4", + "access", + "adjtimex", + "alarm", + "arch_prctl", + "bind", + "brk", + "capget", + "capset", + "chdir", + "chmod", + "chown", + "chown32", + "clock_adjtime", + "clock_adjtime64", + "clock_getres", + "clock_getres_time64", + "clock_gettime", + "clock_gettime64", + "clock_nanosleep", + "clock_nanosleep_time64", + "close", + "connect", + "copy_file_range", + "creat", + "dup", + "dup2", + "epoll_create", + "epoll_create1", + "epoll_ctl", + "epoll_ctl_old", + "epoll_pwait", + "epoll_wait", + "epoll_wait_old", + "eventfd", + "eventfd2", + "execve", + "execveat", + "exit", + "exit_group", + "faccessat", + "faccessat2", + "fadvise64", + "fadvise64_64", + "fallocate", + "fanotify_mark", + "fchdir", + "fchmod", + "fchmodat", + "fchown", + "fchown32", + "fchownat", + "fcntl", + "fcntl64", + "fdatasync", + "fgetxattr", + "flistxattr", + "flock", + "fork", + "fremovexattr", + "fsetxattr", + "fstat", + "fstat64", + "fstatat64", + "fstatfs", + "fstatfs64", + "fsync", + "ftruncate", + "ftruncate64", + "futex", + "futex_time64", + "futimesat", + "getcpu", + "getcwd", + "getdents", + "getdents64", + "getegid", + "getegid32", + "geteuid", + "geteuid32", + "getgid", + "getgid32", + "getgroups", + "getgroups32", + "getitimer", + "getpeername", + "getpgid", + "getpgrp", + "getpid", + "getppid", + "getpriority", + "getrandom", + "getresgid", + "getresgid32", + "getresuid", + 
"getresuid32", + "getrlimit", + "get_robust_list", + "getrusage", + "getsid", + "getsockname", + "getsockopt", + "get_thread_area", + "gettid", + "gettimeofday", + "getuid", + "getuid32", + "getxattr", + "inotify_add_watch", + "inotify_init", + "inotify_init1", + "inotify_rm_watch", + "io_cancel", + "ioctl", + "io_destroy", + "io_getevents", + "io_pgetevents", + "io_pgetevents_time64", + "ioprio_get", + "ioprio_set", + "io_setup", + "io_submit", + "io_uring_enter", + "io_uring_register", + "io_uring_setup", + "ipc", + "kill", + "lchown", + "lchown32", + "lgetxattr", + "link", + "linkat", + "listen", + "listxattr", + "llistxattr", + "_llseek", + "lremovexattr", + "lseek", + "lsetxattr", + "lstat", + "lstat64", + "madvise", + "membarrier", + "memfd_create", + "mincore", + "mkdir", + "mkdirat", + "mknod", + "mknodat", + "mlock", + "mlock2", + "mlockall", + "mmap", + "mmap2", + "modify_ldt", + "mprotect", + "mq_getsetattr", + "mq_notify", + "mq_open", + "mq_timedreceive", + "mq_timedreceive_time64", + "mq_timedsend", + "mq_timedsend_time64", + "mq_unlink", + "mremap", + "msgctl", + "msgget", + "msgrcv", + "msgsnd", + "msync", + "munlock", + "munlockall", + "munmap", + "nanosleep", + "newfstatat", + "_newselect", + "open", + "openat", + "openat2", + "pause", + "pidfd_open", + "pidfd_send_signal", + "pipe", + "pipe2", + "poll", + "ppoll", + "ppoll_time64", + "prctl", + "pread64", + "preadv", + "preadv2", + "prlimit64", + "pselect6", + "pselect6_time64", + "pwrite64", + "pwritev", + "pwritev2", + "read", + "readahead", + "readlink", + "readlinkat", + "readv", + "recv", + "recvfrom", + "recvmmsg", + "recvmmsg_time64", + "recvmsg", + "remap_file_pages", + "removexattr", + "rename", + "renameat", + "renameat2", + "restart_syscall", + "rmdir", + "rseq", + "rt_sigaction", + "rt_sigpending", + "rt_sigprocmask", + "rt_sigqueueinfo", + "rt_sigreturn", + "rt_sigsuspend", + "rt_sigtimedwait", + "rt_sigtimedwait_time64", + "rt_tgsigqueueinfo", + "sched_getaffinity", + 
"sched_getattr", + "sched_getparam", + "sched_get_priority_max", + "sched_get_priority_min", + "sched_getscheduler", + "sched_rr_get_interval", + "sched_rr_get_interval_time64", + "sched_setaffinity", + "sched_setattr", + "sched_setparam", + "sched_setscheduler", + "sched_yield", + "seccomp", + "select", + "semctl", + "semget", + "semop", + "semtimedop", + "semtimedop_time64", + "send", + "sendfile", + "sendfile64", + "sendmmsg", + "sendmsg", + "sendto", + "setfsgid", + "setfsgid32", + "setfsuid", + "setfsuid32", + "setgid", + "setgid32", + "setgroups", + "setgroups32", + "setitimer", + "setpgid", + "setpriority", + "setregid", + "setregid32", + "setresgid", + "setresgid32", + "setresuid", + "setresuid32", + "setreuid", + "setreuid32", + "setrlimit", + "set_robust_list", + "setsid", + "setsockopt", + "set_thread_area", + "set_tid_address", + "setuid", + "setuid32", + "setxattr", + "shmat", + "shmctl", + "shmdt", + "shmget", + "shutdown", + "sigaltstack", + "signalfd", + "signalfd4", + "sigprocmask", + "sigreturn", + "socketcall", + "socketpair", + "splice", + "stat", + "stat64", + "statfs", + "statfs64", + "statx", + "symlink", + "symlinkat", + "sync", + "sync_file_range", + "syncfs", + "sysinfo", + "tee", + "tgkill", + "time", + "timer_create", + "timer_delete", + "timer_getoverrun", + "timer_gettime", + "timer_gettime64", + "timer_settime", + "timer_settime64", + "timerfd_create", + "timerfd_gettime", + "timerfd_gettime64", + "timerfd_settime", + "timerfd_settime64", + "times", + "tkill", + "truncate", + "truncate64", + "ugetrlimit", + "umask", + "uname", + "unlink", + "unlinkat", + "utime", + "utimensat", + "utimensat_time64", + "utimes", + "vfork", + "vmsplice", + "wait4", + "waitid", + "waitpid", + "write", + "writev" + ] + }, + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "dup3" + ], + "args": [ + { + "index": 0, + "value": 1, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "dup3" + ], + "args": [ + { + "index": 0, + 
"value": 2, + "op": "SCMP_CMP_GT" + } + ] + }, + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "kcmp" + ], + "args": [ + { + "index": 0, + "value": 0, + "op": "SCMP_CMP_EQ" + }, + { + "index": 1, + "value": 1337, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "process_vm_readv" + ], + "args": [ + { + "index": 0, + "value": 100, + "op": "SCMP_CMP_EQ" + }, + { + "index": 0, + "value": 9001, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "action": "SCMP_ACT_ERRNO", + "errnoRet": 55, + "names": [ + "process_vm_writev" + ], + "args": [ + { + "index": 0, + "value": 1337, + "op": "SCMP_CMP_EQ" + }, + { + "index": 0, + "value": 2020, + "op": "SCMP_CMP_EQ" + } + ] + }, + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "clone" + ], + "args": [ + { + "index": 0, + "value": 2114060288, + "op": "SCMP_CMP_MASKED_EQ" + } + ] + }, + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "socket" + ], + "args": [ + { + "index": 2, + "value": 3840, + "valueTwo": 0, + "op": "SCMP_CMP_MASKED_EQ" + } + ] + } + ] +} + diff --git a/tests/integration/testdata/seccomp_syscall_test2.c b/tests/integration/testdata/seccomp_syscall_test2.c new file mode 100644 index 0000000..71d4a1c --- /dev/null +++ b/tests/integration/testdata/seccomp_syscall_test2.c @@ -0,0 +1,12 @@ +#include +#include +#include +#include + +int main() +{ + if (chdir("/") < 0 && errno == ENXIO) + exit(EXIT_SUCCESS); + fprintf(stderr, "got errno=%m\n"); + exit(EXIT_FAILURE); +} diff --git a/tests/integration/testdata/seccomp_syscall_test2.json b/tests/integration/testdata/seccomp_syscall_test2.json new file mode 100644 index 0000000..f9fb11a --- /dev/null +++ b/tests/integration/testdata/seccomp_syscall_test2.json @@ -0,0 +1,356 @@ +{ + "defaultAction": "SCMP_ACT_ERRNO", + "defaultErrnoRet": 6, + "architectures": [ + "SCMP_ARCH_X86", + "SCMP_ARCH_X32", + "SCMP_ARCH_X86_64", + "SCMP_ARCH_AARCH64", + "SCMP_ARCH_ARM" + ], + "syscalls": [ + { + "action": "SCMP_ACT_ALLOW", + "names": [ + "accept", + 
"accept4", + "access", + "adjtimex", + "alarm", + "arch_prctl", + "bind", + "brk", + "capget", + "capset", + "chmod", + "chown", + "chown32", + "clock_adjtime", + "clock_adjtime64", + "clock_getres", + "clock_getres_time64", + "clock_gettime", + "clock_gettime64", + "clock_nanosleep", + "clock_nanosleep_time64", + "close", + "connect", + "copy_file_range", + "creat", + "dup", + "dup2", + "epoll_create", + "epoll_create1", + "epoll_ctl", + "epoll_ctl_old", + "epoll_pwait", + "epoll_wait", + "epoll_wait_old", + "eventfd", + "eventfd2", + "execve", + "execveat", + "exit", + "exit_group", + "faccessat", + "faccessat2", + "fadvise64", + "fadvise64_64", + "fallocate", + "fanotify_mark", + "fchdir", + "fchmod", + "fchmodat", + "fchown", + "fchown32", + "fchownat", + "fcntl", + "fcntl64", + "fdatasync", + "fgetxattr", + "flistxattr", + "flock", + "fork", + "fremovexattr", + "fsetxattr", + "fstat", + "fstat64", + "fstatat64", + "fstatfs", + "fstatfs64", + "fsync", + "ftruncate", + "ftruncate64", + "futex", + "futex_time64", + "futimesat", + "getcpu", + "getcwd", + "getdents", + "getdents64", + "getegid", + "getegid32", + "geteuid", + "geteuid32", + "getgid", + "getgid32", + "getgroups", + "getgroups32", + "getitimer", + "getpeername", + "getpgid", + "getpgrp", + "getpid", + "getppid", + "getpriority", + "getrandom", + "getresgid", + "getresgid32", + "getresuid", + "getresuid32", + "getrlimit", + "get_robust_list", + "getrusage", + "getsid", + "getsockname", + "getsockopt", + "get_thread_area", + "gettid", + "gettimeofday", + "getuid", + "getuid32", + "getxattr", + "inotify_add_watch", + "inotify_init", + "inotify_init1", + "inotify_rm_watch", + "io_cancel", + "ioctl", + "io_destroy", + "io_getevents", + "io_pgetevents", + "io_pgetevents_time64", + "ioprio_get", + "ioprio_set", + "io_setup", + "io_submit", + "io_uring_enter", + "io_uring_register", + "io_uring_setup", + "ipc", + "kill", + "lchown", + "lchown32", + "lgetxattr", + "link", + "linkat", + "listen", + "listxattr", 
+ "llistxattr", + "_llseek", + "lremovexattr", + "lseek", + "lsetxattr", + "lstat", + "lstat64", + "madvise", + "membarrier", + "memfd_create", + "mincore", + "mkdir", + "mkdirat", + "mknod", + "mknodat", + "mlock", + "mlock2", + "mlockall", + "mmap", + "mmap2", + "modify_ldt", + "mprotect", + "mq_getsetattr", + "mq_notify", + "mq_open", + "mq_timedreceive", + "mq_timedreceive_time64", + "mq_timedsend", + "mq_timedsend_time64", + "mq_unlink", + "mremap", + "msgctl", + "msgget", + "msgrcv", + "msgsnd", + "msync", + "munlock", + "munlockall", + "munmap", + "nanosleep", + "newfstatat", + "_newselect", + "open", + "openat", + "openat2", + "pause", + "pidfd_open", + "pidfd_send_signal", + "pipe", + "pipe2", + "poll", + "ppoll", + "ppoll_time64", + "prctl", + "pread64", + "preadv", + "preadv2", + "prlimit64", + "pselect6", + "pselect6_time64", + "pwrite64", + "pwritev", + "pwritev2", + "read", + "readahead", + "readlink", + "readlinkat", + "readv", + "recv", + "recvfrom", + "recvmmsg", + "recvmmsg_time64", + "recvmsg", + "remap_file_pages", + "removexattr", + "rename", + "renameat", + "renameat2", + "restart_syscall", + "rmdir", + "rseq", + "rt_sigaction", + "rt_sigpending", + "rt_sigprocmask", + "rt_sigqueueinfo", + "rt_sigreturn", + "rt_sigsuspend", + "rt_sigtimedwait", + "rt_sigtimedwait_time64", + "rt_tgsigqueueinfo", + "sched_getaffinity", + "sched_getattr", + "sched_getparam", + "sched_get_priority_max", + "sched_get_priority_min", + "sched_getscheduler", + "sched_rr_get_interval", + "sched_rr_get_interval_time64", + "sched_setaffinity", + "sched_setattr", + "sched_setparam", + "sched_setscheduler", + "sched_yield", + "seccomp", + "select", + "semctl", + "semget", + "semop", + "semtimedop", + "semtimedop_time64", + "send", + "sendfile", + "sendfile64", + "sendmmsg", + "sendmsg", + "sendto", + "setfsgid", + "setfsgid32", + "setfsuid", + "setfsuid32", + "setgid", + "setgid32", + "setgroups", + "setgroups32", + "setitimer", + "setpgid", + "setpriority", + "setregid", 
+ "setregid32", + "setresgid", + "setresgid32", + "setresuid", + "setresuid32", + "setreuid", + "setreuid32", + "setrlimit", + "set_robust_list", + "setsid", + "setsockopt", + "set_thread_area", + "set_tid_address", + "setuid", + "setuid32", + "setxattr", + "shmat", + "shmctl", + "shmdt", + "shmget", + "shutdown", + "sigaltstack", + "signalfd", + "signalfd4", + "sigprocmask", + "sigreturn", + "socketcall", + "socketpair", + "splice", + "stat", + "stat64", + "statfs", + "statfs64", + "statx", + "symlink", + "symlinkat", + "sync", + "sync_file_range", + "syncfs", + "sysinfo", + "tee", + "tgkill", + "time", + "timer_create", + "timer_delete", + "timer_getoverrun", + "timer_gettime", + "timer_gettime64", + "timer_settime", + "timer_settime64", + "timerfd_create", + "timerfd_gettime", + "timerfd_gettime64", + "timerfd_settime", + "timerfd_settime64", + "times", + "tkill", + "truncate", + "truncate64", + "ugetrlimit", + "umask", + "uname", + "unlink", + "unlinkat", + "utime", + "utimensat", + "utimensat_time64", + "utimes", + "vfork", + "vmsplice", + "wait4", + "waitid", + "waitpid", + "write", + "writev" + ] + } + ] +} + diff --git a/tests/integration/tty.bats b/tests/integration/tty.bats index 688875d..8bbb476 100644 --- a/tests/integration/tty.bats +++ b/tests/integration/tty.bats @@ -3,17 +3,25 @@ load helpers function setup() { - teardown_busybox setup_busybox } function teardown() { - teardown_busybox + teardown_bundle +} + +@test "runc run [stdin not a tty]" { + # stty size fails without a tty + update_config '(.. | select(.[]? 
== "sh")) += ["-c", "stty size"]' + # note that stdout/stderr are already redirected by bats' run + runc run test_busybox / { print $5; exit }') - eval CGROUP_${g}="${base_path}${CGROUPS_PATH}" - done + check_cgroup_value $MEM_RESERVE 25165824 + check_systemd_value $SD_MEM_RESERVE 25165824 - CGROUP_SYSTEM_MEMORY=$(grep "cgroup" /proc/self/mountinfo | gawk 'toupper($NF) ~ /\<'MEMORY'\>/ { print $5; exit }') + check_cgroup_value "pids.max" 20 + check_systemd_value "TasksMax" 20 - # check that initial values were properly set - check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 1000000 - check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 500000 - check_cgroup_value $CGROUP_CPU "cpu.shares" 100 - check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 0 - check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 16777216 - check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 11534336 - check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 33554432 - check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 25165824 - check_cgroup_value $CGROUP_PIDS "pids.max" 20 + # update cpuset if possible (i.e. 
we're running on a multicore cpu) + cpu_count=$(grep -c '^processor' /proc/cpuinfo) + if [ "$cpu_count" -gt 1 ]; then + runc update test_update --cpuset-cpus "1" + [ "$status" -eq 0 ] + check_cgroup_value "cpuset.cpus" 1 + fi - # update cpu-period - runc update test_update --cpu-period 900000 - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 900000 + # update memory limit + runc update test_update --memory 67108864 + [ "$status" -eq 0 ] + check_cgroup_value $MEM_LIMIT 67108864 + check_systemd_value $SD_MEM_LIMIT 67108864 - # update cpu-quota - runc update test_update --cpu-quota 600000 - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 600000 + runc update test_update --memory 50M + [ "$status" -eq 0 ] + check_cgroup_value $MEM_LIMIT 52428800 + check_systemd_value $SD_MEM_LIMIT 52428800 - # update cpu-shares - runc update test_update --cpu-share 200 - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_CPU "cpu.shares" 200 + # update memory soft limit + runc update test_update --memory-reservation 33554432 + [ "$status" -eq 0 ] + check_cgroup_value "$MEM_RESERVE" 33554432 + check_systemd_value "$SD_MEM_RESERVE" 33554432 - # update cpuset if supported (i.e. 
we're running on a multicore cpu) - cpu_count=$(grep '^processor' /proc/cpuinfo | wc -l) - if [ $cpu_count -gt 1 ]; then - runc update test_update --cpuset-cpus "1" - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 1 - fi + # Run swap memory tests if swap is available + if [ "$HAVE_SWAP" = "yes" ]; then + # try to remove memory swap limit + runc update test_update --memory-swap -1 + [ "$status" -eq 0 ] + check_cgroup_value "$MEM_SWAP" $SYSTEM_MEM + check_systemd_value "$SD_MEM_SWAP" $SD_UNLIMITED - # update memory limit - runc update test_update --memory 67108864 - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 67108864 + # update memory swap + if [ "$CGROUP_UNIFIED" = "yes" ]; then + # for cgroupv2, memory and swap can only be set together + runc update test_update --memory 52428800 --memory-swap 96468992 + [ "$status" -eq 0 ] + # for cgroupv2, swap is a separate limit (it does not include mem) + check_cgroup_value "$MEM_SWAP" $((96468992 - 52428800)) + check_systemd_value "$SD_MEM_SWAP" $((96468992 - 52428800)) + else + runc update test_update --memory-swap 96468992 + [ "$status" -eq 0 ] + check_cgroup_value "$MEM_SWAP" 96468992 + check_systemd_value "$SD_MEM_SWAP" 96468992 + fi + fi - runc update test_update --memory 50M - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 52428800 + # try to remove memory limit + runc update test_update --memory -1 + [ "$status" -eq 0 ] - # update memory soft limit - runc update test_update --memory-reservation 33554432 - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 33554432 + # check memory limit is gone + check_cgroup_value $MEM_LIMIT $SYSTEM_MEM + check_systemd_value $SD_MEM_LIMIT $SD_UNLIMITED - # Run swap memory tests if swap is available - if [ -f "$CGROUP_MEMORY/memory.memsw.limit_in_bytes" ]; then - # try to remove memory swap limit - runc update test_update --memory-swap -1 - [ "$status" -eq 0 ] - 
# Get System memory swap limit - SYSTEM_MEMORY_SW=$(cat "${CGROUP_SYSTEM_MEMORY}/memory.memsw.limit_in_bytes") - check_cgroup_value $CGROUP_MEMORY "memory.memsw.limit_in_bytes" ${SYSTEM_MEMORY_SW} + # check swap memory limited is gone + if [ "$HAVE_SWAP" = "yes" ]; then + check_cgroup_value $MEM_SWAP $SYSTEM_MEM + check_systemd_value "$SD_MEM_SWAP" $SD_UNLIMITED + fi - # update memory swap - runc update test_update --memory-swap 96468992 - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_MEMORY "memory.memsw.limit_in_bytes" 96468992 - fi; + # update pids limit + runc update test_update --pids-limit 10 + [ "$status" -eq 0 ] + check_cgroup_value "pids.max" 10 + check_systemd_value "TasksMax" 10 - # try to remove memory limit - runc update test_update --memory -1 - [ "$status" -eq 0 ] + # unlimited + runc update test_update --pids-limit -1 + [ "$status" -eq 0 ] + check_cgroup_value "pids.max" max + check_systemd_value "TasksMax" $SD_UNLIMITED - # Get System memory limit - SYSTEM_MEMORY=$(cat "${CGROUP_SYSTEM_MEMORY}/memory.limit_in_bytes") - # check memory limited is gone - check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" ${SYSTEM_MEMORY} - - # check swap memory limited is gone - if [ -f "$CGROUP_MEMORY/memory.memsw.limit_in_bytes" ]; then - check_cgroup_value $CGROUP_MEMORY "memory.memsw.limit_in_bytes" ${SYSTEM_MEMORY} - fi - - # update kernel memory limit - runc update test_update --kernel-memory 50331648 - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 50331648 - - # update kernel memory tcp limit - runc update test_update --kernel-memory-tcp 41943040 - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 41943040 - - # update pids limit - runc update test_update --pids-limit 10 - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_PIDS "pids.max" 10 - - # Revert to the test initial value via json on stding - runc update -r - test_update 
<"$BATS_RUN_TMPDIR"/runc-cgroups-integration-test.json { "memory": { "limit": 33554432, - "reservation": 25165824, - "kernel": 16777216, - "kernelTCP": 11534336 + "reservation": 25165824 }, "cpu": { "shares": 100, @@ -224,34 +209,408 @@ EOF } } EOF -) - echo $DATA > $BATS_TMPDIR/runc-cgroups-integration-test.json - runc update -r $BATS_TMPDIR/runc-cgroups-integration-test.json test_update - [ "$status" -eq 0 ] - check_cgroup_value $CGROUP_CPU "cpu.cfs_period_us" 1000000 - check_cgroup_value $CGROUP_CPU "cpu.cfs_quota_us" 500000 - check_cgroup_value $CGROUP_CPU "cpu.shares" 100 - check_cgroup_value $CGROUP_CPUSET "cpuset.cpus" 0 - check_cgroup_value $CGROUP_MEMORY "memory.kmem.limit_in_bytes" 16777216 - check_cgroup_value $CGROUP_MEMORY "memory.kmem.tcp.limit_in_bytes" 11534336 - check_cgroup_value $CGROUP_MEMORY "memory.limit_in_bytes" 33554432 - check_cgroup_value $CGROUP_MEMORY "memory.soft_limit_in_bytes" 25165824 - check_cgroup_value $CGROUP_PIDS "pids.max" 20 + runc update -r "$BATS_RUN_TMPDIR"/runc-cgroups-integration-test.json test_update + [ "$status" -eq 0 ] + check_cgroup_value "cpuset.cpus" 0 + + check_cgroup_value $MEM_LIMIT 33554432 + check_systemd_value $SD_MEM_LIMIT 33554432 + + check_cgroup_value $MEM_RESERVE 25165824 + check_systemd_value $SD_MEM_RESERVE 25165824 + + check_cgroup_value "pids.max" 20 + check_systemd_value "TasksMax" 20 + + if [ "$HAVE_SWAP" = "yes" ]; then + # Test case for https://github.com/opencontainers/runc/pull/592, + # checking libcontainer/cgroups/fs/memory.go:setMemoryAndSwap. 
+ + runc update test_update --memory 30M --memory-swap 50M + [ "$status" -eq 0 ] + + check_cgroup_value $MEM_LIMIT $((30 * 1024 * 1024)) + check_systemd_value $SD_MEM_LIMIT $((30 * 1024 * 1024)) + + if [ "$CGROUP_UNIFIED" = "yes" ]; then + # for cgroupv2, swap does not include mem + check_cgroup_value "$MEM_SWAP" $((20 * 1024 * 1024)) + check_systemd_value "$SD_MEM_SWAP" $((20 * 1024 * 1024)) + else + check_cgroup_value "$MEM_SWAP" $((50 * 1024 * 1024)) + check_systemd_value "$SD_MEM_SWAP" $((50 * 1024 * 1024)) + fi + + # Now, set new memory to more than old swap + runc update test_update --memory 60M --memory-swap 80M + [ "$status" -eq 0 ] + + check_cgroup_value $MEM_LIMIT $((60 * 1024 * 1024)) + check_systemd_value $SD_MEM_LIMIT $((60 * 1024 * 1024)) + + if [ "$CGROUP_UNIFIED" = "yes" ]; then + # for cgroupv2, swap does not include mem + check_cgroup_value "$MEM_SWAP" $((20 * 1024 * 1024)) + check_systemd_value "$SD_MEM_SWAP" $((20 * 1024 * 1024)) + else + check_cgroup_value "$MEM_SWAP" $((80 * 1024 * 1024)) + check_systemd_value "$SD_MEM_SWAP" $((80 * 1024 * 1024)) + fi + fi +} + +@test "update cgroup cpu limits" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + # run a few busyboxes detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # check that initial values were properly set + check_cpu_quota 500000 1000000 "500ms" + check_cpu_shares 100 + + # update cpu period + runc update test_update --cpu-period 900000 + [ "$status" -eq 0 ] + check_cpu_quota 500000 900000 "560ms" + + # update cpu quota + runc update test_update --cpu-quota 600000 + [ "$status" -eq 0 ] + check_cpu_quota 600000 900000 "670ms" + + # remove cpu quota + runc update test_update --cpu-quota -1 + [ "$status" -eq 0 ] + check_cpu_quota -1 900000 "infinity" + + # update cpu-shares + runc update test_update --cpu-share 200 + [ "$status" -eq 0 ] + check_cpu_shares 200 + + # Revert to the test initial value via json on stding + runc update -r - 
test_update <"$BATS_RUN_TMPDIR"/runc-cgroups-integration-test.json +{ + "cpu": { + "shares": 100, + "quota": 500000, + "period": 1000000 + } +} +EOF + [ "$status" -eq 0 ] + + runc update -r "$BATS_RUN_TMPDIR"/runc-cgroups-integration-test.json test_update + [ "$status" -eq 0 ] + check_cpu_quota 500000 1000000 "500ms" + check_cpu_shares 100 +} + +@test "set cpu period with no quota" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + update_config '.linux.resources.cpu |= { "period": 1000000 }' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + check_cpu_quota -1 1000000 "infinity" +} + +@test "set cpu period with no quota (invalid period)" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + update_config '.linux.resources.cpu |= { "period": 100 }' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 1 ] +} + +@test "set cpu quota with no period" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + update_config '.linux.resources.cpu |= { "quota": 5000 }' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + check_cpu_quota 5000 100000 "50ms" +} + +@test "update cpu period with no previous period/quota set" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + update_config '.linux.resources.cpu |= {}' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # update the period alone, no old values were set + runc update --cpu-period 50000 test_update + [ "$status" -eq 0 ] + check_cpu_quota -1 50000 "infinity" +} + +@test "update cpu quota with no previous period/quota set" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + update_config '.linux.resources.cpu |= {}' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # update the quota alone, no old values were set + runc update --cpu-quota 30000 test_update + [ "$status" -eq 0 ] + check_cpu_quota 30000 100000 
"300ms" +} + +@test "update cpu period in a pod cgroup with pod limit set" { + requires cgroups_v1 + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + set_cgroups_path "pod_${RANDOM}" + + # Set parent/pod CPU quota limit to 50%. + if [ -n "${RUNC_USE_SYSTEMD}" ]; then + set_parent_systemd_properties CPUQuota="50%" + else + echo 50000 >"/sys/fs/cgroup/cpu/$REL_PARENT_PATH/cpu.cfs_quota_us" + fi + # Sanity checks. + run cat "/sys/fs/cgroup/cpu$REL_PARENT_PATH/cpu.cfs_period_us" + [ "$output" -eq 100000 ] + run cat "/sys/fs/cgroup/cpu$REL_PARENT_PATH/cpu.cfs_quota_us" + [ "$output" -eq 50000 ] + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + # Get the current period. + local cur + cur=$(get_cgroup_value cpu.cfs_period_us) + + # Sanity check: as the parent cgroup sets the limit to 50%, + # setting a higher limit (e.g. 60%) is expected to fail. + runc update --cpu-quota $((cur * 6 / 10)) test_update + [ "$status" -eq 1 ] + + # Finally, the test itself: set 30% limit but with lower period. 
+ runc update --cpu-period 10000 --cpu-quota 3000 test_update + [ "$status" -eq 0 ] + check_cpu_quota 3000 10000 "300ms" +} + +@test "update cgroup v2 resources via unified map" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + requires cgroups_v2 + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # check that initial values were properly set + check_cpu_quota 500000 1000000 "500ms" + # initial cpu shares of 100 corresponds to weight of 4 + check_cpu_weight 4 + check_systemd_value "TasksMax" 20 + + runc update -r - test_update <= v244 + if [ "$(systemd_version)" -lt 244 ]; then + # a hack to skip checks, see check_systemd_value() + AllowedCPUs='unsupported' + AllowedMemoryNodes='unsupported' + fi + + update_config ' .linux.resources.CPU |= { + "Cpus": "0", + "Mems": "0" + }' + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # check that initial values were properly set + check_systemd_value "$AllowedCPUs" 0 + check_systemd_value "$AllowedMemoryNodes" 0 + + runc update -r - test_update <= v244 + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + requires cgroups_v2 smp cgroups_cpuset + + update_config ' .linux.resources.unified |= { + "cpuset.cpus": "0", + "cpuset.mems": "0" + }' + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # check that initial values were properly set + check_systemd_value "AllowedCPUs" 0 + check_systemd_value "AllowedMemoryNodes" 0 + + runc update -r - test_update <"$target_period" + target_runtime="${target}/cpu.rt_runtime_us" + echo "Writing ${root_runtime} to ${target_runtime}" + echo "$root_runtime" >"$target_runtime" + done - runc update -r - test_update_rt < /dev/null; done"]' + + # Set up a temporary console socket and recvtty so we can get the stdio. 
+ TMP_RECVTTY_DIR="$(mktemp -d "$BATS_RUN_TMPDIR/runc-tmp-recvtty.XXXXXX")" + TMP_RECVTTY_PID="$TMP_RECVTTY_DIR/recvtty.pid" + TMP_CONSOLE_SOCKET="$TMP_RECVTTY_DIR/console.sock" + CONTAINER_OUTPUT="$TMP_RECVTTY_DIR/output" + ("$RECVTTY" --no-stdin --pid-file "$TMP_RECVTTY_PID" \ + --mode single "$TMP_CONSOLE_SOCKET" &>"$CONTAINER_OUTPUT") & + retry 10 0.1 [ -e "$TMP_CONSOLE_SOCKET" ] + + # Run the container in the background. + runc run -d --console-socket "$TMP_CONSOLE_SOCKET" test_update + cat "$CONTAINER_OUTPUT" + [ "$status" -eq 0 ] + + # Trigger an update. This update doesn't actually change the device rules, + # but it will trigger the devices cgroup code to reapply the current rules. + # We trigger the update a few times to make sure we hit the race. + for _ in {1..30}; do + # TODO: Update "runc update" so we can change the device rules. + runc update --pids-limit 30 test_update + [ "$status" -eq 0 ] + done + + # Kill recvtty. + kill -9 "$(<"$TMP_RECVTTY_PID")" + + # There should've been no output from the container. + cat "$CONTAINER_OUTPUT" + [ -z "$(<"$CONTAINER_OUTPUT")" ] +} + +@test "update paused container" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + requires cgroups_freezer + + # Run the container in the background. + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # Pause the container. + runc pause test_update + [ "$status" -eq 0 ] + + # Trigger an unrelated update. + runc update --pids-limit 30 test_update + [ "$status" -eq 0 ] + + # The container should still be paused. + testcontainer test_update paused + + # Resume the container. 
+ runc resume test_update + [ "$status" -eq 0 ] } diff --git a/tests/integration/userns.bats b/tests/integration/userns.bats new file mode 100644 index 0000000..b118859 --- /dev/null +++ b/tests/integration/userns.bats @@ -0,0 +1,66 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + setup_busybox + + # Prepare source folders for bind mount + mkdir -p source-{accessible,inaccessible-1,inaccessible-2}/dir + touch source-{accessible,inaccessible-1,inaccessible-2}/dir/foo.txt + + # Permissions only to the owner, it is inaccessible to group/others + chmod 700 source-inaccessible-{1,2} + + mkdir -p rootfs/{proc,sys,tmp} + mkdir -p rootfs/tmp/mount-{1,2} + + if [ "$ROOTLESS" -eq 0 ]; then + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] ' + fi +} + +function teardown() { + teardown_bundle +} + +@test "userns with simple mount" { + update_config ' .process.args += ["-c", "stat /tmp/mount-1/foo.txt"] + | .mounts += [{"source": "source-accessible/dir", "destination": "/tmp/mount-1", "options": ["bind"]}] ' + + runc run test_busybox + [ "$status" -eq 0 ] +} + +# We had bugs where 1 mount worked but not 2+, test with 2 as it is a more +# general case. +@test "userns with 2 inaccessible mounts" { + update_config ' .process.args += ["-c", "stat /tmp/mount-1/foo.txt /tmp/mount-2/foo.txt"] + | .mounts += [ { "source": "source-inaccessible-1/dir", "destination": "/tmp/mount-1", "options": ["bind"] }, + { "source": "source-inaccessible-2/dir", "destination": "/tmp/mount-2", "options": ["bind"] } + ]' + + # When not running rootless, this should work: while + # "source-inaccessible-1" can't be read by the uid in the userns, the fd + # is opened before changing to the userns and sent over via SCM_RIGHTs + # (with env var _LIBCONTAINER_MOUNT_FDS). Idem for + # source-inaccessible-2. 
+ # On rootless, the owner is the same so it is accessible. + runc run test_busybox + [ "$status" -eq 0 ] +} + +# exec + bindmounts + user ns is a special case in the code. Test that it works. +@test "userns with inaccessible mount + exec" { + update_config ' .mounts += [ { "source": "source-inaccessible-1/dir", "destination": "/tmp/mount-1", "options": ["bind"] }, + { "source": "source-inaccessible-2/dir", "destination": "/tmp/mount-2", "options": ["bind"] } + ]' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox stat /tmp/mount-1/foo.txt /tmp/mount-2/foo.txt + [ "$status" -eq 0 ] +} diff --git a/tests/integration/version.bats b/tests/integration/version.bats index ab77769..56667e4 100644 --- a/tests/integration/version.bats +++ b/tests/integration/version.bats @@ -3,9 +3,9 @@ load helpers @test "runc version" { - runc -v - [ "$status" -eq 0 ] - [[ ${lines[0]} =~ runc\ version\ [0-9]+\.[0-9]+\.[0-9]+ ]] - [[ ${lines[1]} =~ commit:+ ]] - [[ ${lines[2]} =~ spec:\ [0-9]+\.[0-9]+\.[0-9]+ ]] + runc -v + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ runc\ version\ [0-9]+\.[0-9]+\.[0-9]+ ]] + [[ ${lines[1]} =~ commit:+ ]] + [[ ${lines[2]} =~ spec:\ [0-9]+\.[0-9]+\.[0-9]+ ]] } diff --git a/tests/rootless.sh b/tests/rootless.sh index 847c286..952a6dd 100755 --- a/tests/rootless.sh +++ b/tests/rootless.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash -x # Copyright (C) 2017 SUSE LLC # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,8 +19,16 @@ # a new feature, please match the existing style. Add an entry to $ALL_FEATURES, # and add an enable_* and disable_* hook. 
+set -e -u -o pipefail +: "${RUNC_USE_SYSTEMD:=}" +: "${ROOTLESS_TESTPATH:=}" + ALL_FEATURES=("idmap" "cgroup") -ROOT="$(readlink -f "$(dirname "${BASH_SOURCE}")/..")" +# cgroup is managed by systemd when RUNC_USE_SYSTEMD is set +if [[ -n "${RUNC_USE_SYSTEMD}" ]]; then + ALL_FEATURES=("idmap") +fi +ROOT="$(readlink -f "$(dirname "${BASH_SOURCE[0]}")/..")" # FEATURE: Opportunistic new{uid,gid}map support, allowing a rootless container # to be set up with the usage of helper setuid binaries. @@ -31,15 +39,31 @@ function enable_idmap() { # Set up sub{uid,gid} mappings. [ -e /etc/subuid.tmp ] && mv /etc/subuid{.tmp,} - ( grep -v '^rootless' /etc/subuid ; echo "rootless:$ROOTLESS_UIDMAP_START:$ROOTLESS_UIDMAP_LENGTH" ) > /etc/subuid.tmp + ( + grep -v '^rootless' /etc/subuid + echo "rootless:$ROOTLESS_UIDMAP_START:$ROOTLESS_UIDMAP_LENGTH" + ) >/etc/subuid.tmp mv /etc/subuid{.tmp,} [ -e /etc/subgid.tmp ] && mv /etc/subgid{.tmp,} - ( grep -v '^rootless' /etc/subgid ; echo "rootless:$ROOTLESS_GIDMAP_START:$ROOTLESS_GIDMAP_LENGTH" ) > /etc/subgid.tmp + ( + grep -v '^rootless' /etc/subgid + echo "rootless:$ROOTLESS_GIDMAP_START:$ROOTLESS_GIDMAP_LENGTH" + ) >/etc/subgid.tmp mv /etc/subgid{.tmp,} # Reactivate new{uid,gid}map helpers if applicable. [ -e /usr/bin/unused-newuidmap ] && mv /usr/bin/{unused-,}newuidmap [ -e /usr/bin/unused-newgidmap ] && mv /usr/bin/{unused-,}newgidmap + + # Create a directory owned by $AUX_UID inside container, to be used + # by a test case in cwd.bats. This setup can't be done by the test itself, + # as it needs root for chown. + export AUX_UID=1024 + AUX_DIR="$(mktemp -d)" + # 1000 is linux.uidMappings.containerID value, + # as set by runc_rootless_idmap + chown "$((ROOTLESS_UIDMAP_START - 1000 + AUX_UID))" "$AUX_DIR" + export AUX_DIR } function disable_idmap() { @@ -53,6 +77,15 @@ function disable_idmap() { # Deactivate new{uid,gid}map helpers. setuid is preserved with mv(1). 
[ -e /usr/bin/newuidmap ] && mv /usr/bin/{,unused-}newuidmap [ -e /usr/bin/newgidmap ] && mv /usr/bin/{,unused-}newgidmap + + return 0 +} + +function cleanup() { + if [ -v AUX_DIR ]; then + rmdir "$AUX_DIR" + unset AUX_DIR + fi +} # FEATURE: Opportunistic cgroups support, allowing a rootless container to set @@ -61,14 +94,14 @@ function disable_idmap() { # List of cgroups. We handle name= cgroups as well as combined # (comma-separated) cgroups and correctly split and/or strip them. -ALL_CGROUPS=( $(cat /proc/self/cgroup | cut -d: -f2 | sed -E '{s/^name=//;s/,/\n/;/^$/D}') ) +# shellcheck disable=SC2207 +ALL_CGROUPS=($(cut -d: -f2 "$CGROUP_MOUNT/cgroup.subtree_control"; done + set +x + # Create the cgroup. + mkdir -p "$CGROUP_MOUNT/$CGROUP_PATH" + # chown/chmod dir + cgroup.subtree_control + cgroup.procs + parent's cgroup.procs. + # See https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#delegation-containment + chown root:rootless "$CGROUP_MOUNT/$CGROUP_PATH" "$CGROUP_MOUNT/$CGROUP_PATH/cgroup.subtree_control" "$CGROUP_MOUNT/$CGROUP_PATH/cgroup.procs" "$CGROUP_MOUNT/cgroup.procs" + chmod g+rwx "$CGROUP_MOUNT/$CGROUP_PATH" + chmod g+rw "$CGROUP_MOUNT/$CGROUP_PATH/cgroup.subtree_control" "$CGROUP_MOUNT/$CGROUP_PATH/cgroup.procs" "$CGROUP_MOUNT/cgroup.procs" + fi } function disable_cgroup() { # Remove cgroups used in rootless containers. - for cg in "${ALL_CGROUPS[@]}" - do + for cg in "${ALL_CGROUPS[@]}"; do [ -d "$CGROUP_MOUNT/$cg$CGROUP_PATH" ] && rmdir "$CGROUP_MOUNT/$cg$CGROUP_PATH" done + # cgroup v2 + [ -d "$CGROUP_MOUNT/$CGROUP_PATH" ] && rmdir "$CGROUP_MOUNT/$CGROUP_PATH" + + return 0 } # Create a powerset of $ALL_FEATURES (the set of all subsets of $ALL_FEATURES). @@ -97,29 +157,38 @@ function disable_cgroup() { # feature knobs this shouldn't take too long -- but the number of tested # combinations is O(2^n)).
function powerset() { - eval printf '%s' $(printf '{,%s+}' "$@"): + eval printf '%s' "$(printf '{,%s+}' "$@")": } features_powerset="$(powerset "${ALL_FEATURES[@]}")" +# Make sure we have container images downloaded, as otherwise +# rootless user won't be able to write to $TESTDATA. +"$ROOT"/tests/integration/get-images.sh >/dev/null + # Iterate over the powerset of all features. IFS=: -for enabled_features in $features_powerset -do - idx="$(($idx+1))" - echo "[$(printf '%.2d' "$idx")] run rootless tests ... (${enabled_features%%+})" +idx=0 +for enabled_features in $features_powerset; do + ((++idx)) + printf "[%.2d] run rootless tests ... (${enabled_features%%+})\n" "$idx" unset IFS - for feature in "${ALL_FEATURES[@]}" - do + for feature in "${ALL_FEATURES[@]}"; do hook_func="disable_$feature" - grep -E "(^|\+)$feature(\+|$)" <<<$enabled_features &>/dev/null && hook_func="enable_$feature" + grep -E "(^|\+)$feature(\+|$)" <<<"$enabled_features" &>/dev/null && hook_func="enable_$feature" "$hook_func" done # Run the test suite! - set -e - echo path: $PATH + echo "path: $PATH" export ROOTLESS_FEATURES="$enabled_features" - sudo -HE -u rootless PATH="$PATH" bats -t "$ROOT/tests/integration$TESTFLAGS" - set +e + if [[ -n "${RUNC_USE_SYSTEMD}" ]]; then + # We use `ssh rootless@localhost` instead of `sudo -u rootless` for creating systemd user session. 
+ # Alternatively we could use `machinectl shell`, but it is known not to work well on SELinux-enabled hosts as of April 2020: + # https://bugzilla.redhat.com/show_bug.cgi?id=1788616 + ssh -t -t -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i "$HOME/rootless.key" rootless@localhost -- PATH="$PATH" RUNC_USE_SYSTEMD="$RUNC_USE_SYSTEMD" bats -t "$ROOT/tests/integration$ROOTLESS_TESTPATH" + else + sudo -HE -u rootless PATH="$PATH" "$(which bats)" -t "$ROOT/tests/integration$ROOTLESS_TESTPATH" + fi + cleanup done diff --git a/tty.go b/tty.go index 6106c2d..fba3e02 100644 --- a/tty.go +++ b/tty.go @@ -1,8 +1,7 @@ -// +build linux - package main import ( + "errors" "fmt" "io" "os" @@ -15,19 +14,19 @@ import ( ) type tty struct { - epoller *console.Epoller - console *console.EpollConsole - stdin console.Console - closers []io.Closer - postStart []io.Closer - wg sync.WaitGroup - consoleC chan error + epoller *console.Epoller + console *console.EpollConsole + hostConsole console.Console + closers []io.Closer + postStart []io.Closer + wg sync.WaitGroup + consoleC chan error } func (t *tty) copyIO(w io.Writer, r io.ReadCloser) { defer t.wg.Done() - io.Copy(w, r) - r.Close() + _, _ = io.Copy(w, r) + _ = r.Close() } // setup pipes for the process so that advanced features like c/r are able to easily checkpoint @@ -55,8 +54,8 @@ func setupProcessPipes(p *libcontainer.Process, rootuid, rootgid int) (*tty, err } } go func() { - io.Copy(i.Stdin, os.Stdin) - i.Stdin.Close() + _, _ = io.Copy(i.Stdin, os.Stdin) + _ = i.Stdin.Close() }() t.wg.Add(2) go t.copyIO(os.Stdout, i.Stdout) @@ -64,14 +63,43 @@ func setupProcessPipes(p *libcontainer.Process, rootuid, rootgid int) (*tty, err return t, nil } -func inheritStdio(process *libcontainer.Process) error { +func inheritStdio(process *libcontainer.Process) { process.Stdin = os.Stdin process.Stdout = os.Stdout process.Stderr = os.Stderr +} + +func (t *tty) initHostConsole() error { + // Usually all three (stdin, stdout, and 
stderr) streams are open to + // the terminal, but they might be redirected, so try them all. + for _, s := range []*os.File{os.Stderr, os.Stdout, os.Stdin} { + c, err := console.ConsoleFromFile(s) + if err == nil { + t.hostConsole = c + return nil + } + if errors.Is(err, console.ErrNotAConsole) { + continue + } + // should not happen + return fmt.Errorf("unable to get console: %w", err) + } + // If all streams are redirected, but we still have a controlling + // terminal, it can be obtained by opening /dev/tty. + tty, err := os.Open("/dev/tty") + if err != nil { + return err + } + c, err := console.ConsoleFromFile(tty) + if err != nil { + return fmt.Errorf("unable to get console: %w", err) + } + + t.hostConsole = c return nil } -func (t *tty) recvtty(process *libcontainer.Process, socket *os.File) (Err error) { +func (t *tty) recvtty(socket *os.File) (Err error) { f, err := utils.RecvFd(socket) if err != nil { return err @@ -80,7 +108,10 @@ func (t *tty) recvtty(process *libcontainer.Process, socket *os.File) (Err error if err != nil { return err } - console.ClearONLCR(cons.Fd()) + err = console.ClearONLCR(cons.Fd()) + if err != nil { + return err + } epoller, err := console.NewEpoller() if err != nil { return err @@ -91,26 +122,21 @@ func (t *tty) recvtty(process *libcontainer.Process, socket *os.File) (Err error } defer func() { if Err != nil { - epollConsole.Close() + _ = epollConsole.Close() } }() - go epoller.Wait() - go io.Copy(epollConsole, os.Stdin) + go func() { _ = epoller.Wait() }() + go func() { _, _ = io.Copy(epollConsole, os.Stdin) }() t.wg.Add(1) go t.copyIO(os.Stdout, epollConsole) - // set raw mode to stdin and also handle interrupt - stdin, err := console.ConsoleFromFile(os.Stdin) - if err != nil { - return err + // Set raw mode for the controlling terminal. 
+ if err := t.hostConsole.SetRaw(); err != nil { + return fmt.Errorf("failed to set the terminal from the stdin: %w", err) } - if err := stdin.SetRaw(); err != nil { - return fmt.Errorf("failed to set the terminal from the stdin: %v", err) - } - go handleInterrupt(stdin) + go handleInterrupt(t.hostConsole) t.epoller = epoller - t.stdin = stdin t.console = epollConsole t.closers = []io.Closer{epollConsole} return nil @@ -120,7 +146,7 @@ func handleInterrupt(c console.Console) { sigchan := make(chan os.Signal, 1) signal.Notify(sigchan, os.Interrupt) <-sigchan - c.Reset() + _ = c.Reset() os.Exit(0) } @@ -133,38 +159,36 @@ func (t *tty) waitConsole() error { // ClosePostStart closes any fds that are provided to the container and dup2'd // so that we no longer have copy in our process. -func (t *tty) ClosePostStart() error { +func (t *tty) ClosePostStart() { for _, c := range t.postStart { - c.Close() + _ = c.Close() } - return nil } // Close closes all open fds for the tty and/or restores the original // stdin state to what it was prior to the container execution -func (t *tty) Close() error { +func (t *tty) Close() { // ensure that our side of the fds are always closed for _, c := range t.postStart { - c.Close() + _ = c.Close() } // the process is gone at this point, shutting down the console if we have // one and wait for all IO to be finished if t.console != nil && t.epoller != nil { - t.console.Shutdown(t.epoller.CloseConsole) + _ = t.console.Shutdown(t.epoller.CloseConsole) } t.wg.Wait() for _, c := range t.closers { - c.Close() + _ = c.Close() } - if t.stdin != nil { - t.stdin.Reset() + if t.hostConsole != nil { + _ = t.hostConsole.Reset() } - return nil } func (t *tty) resize() error { - if t.console == nil { + if t.console == nil || t.hostConsole == nil { return nil } - return t.console.ResizeFrom(console.Current()) + return t.console.ResizeFrom(t.hostConsole) } diff --git a/types/events.go b/types/events.go index c6f0e97..81bde82 100644 --- a/types/events.go 
+++ b/types/events.go @@ -1,5 +1,7 @@ package types +import "github.com/opencontainers/runc/libcontainer/intelrdt" + // Event struct for encoding the event data to json. type Event struct { Type string `json:"type"` @@ -10,6 +12,7 @@ type Event struct { // stats is the runc specific stats structure for stability when encoding and decoding stats. type Stats struct { CPU Cpu `json:"cpu"` + CPUSet CPUSet `json:"cpuset"` Memory Memory `json:"memory"` Pids Pids `json:"pids"` Blkio Blkio `json:"blkio"` @@ -55,10 +58,12 @@ type Throttling struct { type CpuUsage struct { // Units: nanoseconds. - Total uint64 `json:"total,omitempty"` - Percpu []uint64 `json:"percpu,omitempty"` - Kernel uint64 `json:"kernel"` - User uint64 `json:"user"` + Total uint64 `json:"total,omitempty"` + Percpu []uint64 `json:"percpu,omitempty"` + PercpuKernel []uint64 `json:"percpu_kernel,omitempty"` + PercpuUser []uint64 `json:"percpu_user,omitempty"` + Kernel uint64 `json:"kernel"` + User uint64 `json:"user"` } type Cpu struct { @@ -66,6 +71,20 @@ type Cpu struct { Throttling Throttling `json:"throttling,omitempty"` } +type CPUSet struct { + CPUs []uint16 `json:"cpus,omitempty"` + CPUExclusive uint64 `json:"cpu_exclusive"` + Mems []uint16 `json:"mems,omitempty"` + MemHardwall uint64 `json:"mem_hardwall"` + MemExclusive uint64 `json:"mem_exclusive"` + MemoryMigrate uint64 `json:"memory_migrate"` + MemorySpreadPage uint64 `json:"memory_spread_page"` + MemorySpreadSlab uint64 `json:"memory_spread_slab"` + MemoryPressure uint64 `json:"memory_pressure"` + SchedLoadBalance uint64 `json:"sched_load_balance"` + SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"` +} + type MemoryEntry struct { Limit uint64 `json:"limit"` Usage uint64 `json:"usage,omitempty"` @@ -113,6 +132,12 @@ type IntelRdt struct { // The memory bandwidth schema in 'container_id' group MemBwSchema string `json:"mem_bw_schema,omitempty"` + + // The memory bandwidth monitoring statistics from NUMA nodes in 'container_id' group + 
MBMStats *[]intelrdt.MBMNumaNodeStats `json:"mbm_stats,omitempty"` + + // The cache monitoring technology statistics from NUMA nodes in 'container_id' group + CMTStats *[]intelrdt.CMTNumaNodeStats `json:"cmt_stats,omitempty"` } type NetworkInterface struct { diff --git a/types/features/features.go b/types/features/features.go new file mode 100644 index 0000000..c6269ca --- /dev/null +++ b/types/features/features.go @@ -0,0 +1,131 @@ +// Package features provides the JSON structure that is printed by `runc features` (since runc v1.1.0). +// The types in this package are experimental and subject to change. +package features + +// Features represents the supported features of the runtime. +type Features struct { + // OCIVersionMin is the minimum OCI Runtime Spec version recognized by the runtime, e.g., "1.0.0". + OCIVersionMin string `json:"ociVersionMin,omitempty"` + + // OCIVersionMax is the maximum OCI Runtime Spec version recognized by the runtime, e.g., "1.0.2-dev". + OCIVersionMax string `json:"ociVersionMax,omitempty"` + + // Hooks is the list of the recognized hook names, e.g., "createRuntime". + // Nil value means "unknown", not "no support for any hook". + Hooks []string `json:"hooks,omitempty"` + + // MountOptions is the list of the recognized mount options, e.g., "ro". + // Nil value means "unknown", not "no support for any mount option". + // This list does not contain filesystem-specific options passed to mount(2) syscall as (const void *). + MountOptions []string `json:"mountOptions,omitempty"` + + // Linux is specific to Linux. + Linux *Linux `json:"linux,omitempty"` + + // Annotations contains implementation-specific annotation strings, + // such as the implementation version, and third-party extensions. + Annotations map[string]string `json:"annotations,omitempty"` +} + +// Linux is specific to Linux. +type Linux struct { + // Namespaces is the list of the recognized namespaces, e.g., "mount". 
+ // Nil value means "unknown", not "no support for any namespace". + Namespaces []string `json:"namespaces,omitempty"` + + // Capabilities is the list of the recognized capabilities , e.g., "CAP_SYS_ADMIN". + // Nil value means "unknown", not "no support for any capability". + Capabilities []string `json:"capabilities,omitempty"` + + Cgroup *Cgroup `json:"cgroup,omitempty"` + Seccomp *Seccomp `json:"seccomp,omitempty"` + Apparmor *Apparmor `json:"apparmor,omitempty"` + Selinux *Selinux `json:"selinux,omitempty"` +} + +// Seccomp represents the "seccomp" field. +type Seccomp struct { + // Enabled is true if seccomp support is compiled in. + // Nil value means "unknown", not "false". + Enabled *bool `json:"enabled,omitempty"` + + // Actions is the list of the recognized actions, e.g., "SCMP_ACT_NOTIFY". + // Nil value means "unknown", not "no support for any action". + Actions []string `json:"actions,omitempty"` + + // Operators is the list of the recognized actions, e.g., "SCMP_CMP_NE". + // Nil value means "unknown", not "no support for any operator". + Operators []string `json:"operators,omitempty"` + + // Operators is the list of the recognized archs, e.g., "SCMP_ARCH_X86_64". + // Nil value means "unknown", not "no support for any arch". + Archs []string `json:"archs,omitempty"` +} + +// Apparmor represents the "apparmor" field. +type Apparmor struct { + // Enabled is true if AppArmor support is compiled in. + // Unrelated to whether the host supports AppArmor or not. + // Nil value means "unknown", not "false". + // Always true in the current version of runc. + Enabled *bool `json:"enabled,omitempty"` +} + +// Selinux represents the "selinux" field. +type Selinux struct { + // Enabled is true if SELinux support is compiled in. + // Unrelated to whether the host supports SELinux or not. + // Nil value means "unknown", not "false". + // Always true in the current version of runc. 
+ Enabled *bool `json:"enabled,omitempty"` +} + +// Cgroup represents the "cgroup" field. +type Cgroup struct { + // V1 represents whether Cgroup v1 support is compiled in. + // Unrelated to whether the host uses cgroup v1 or not. + // Nil value means "unknown", not "false". + // Always true in the current version of runc. + V1 *bool `json:"v1,omitempty"` + + // V2 represents whether Cgroup v2 support is compiled in. + // Unrelated to whether the host uses cgroup v2 or not. + // Nil value means "unknown", not "false". + // Always true in the current version of runc. + V2 *bool `json:"v2,omitempty"` + + // Systemd represents whether systemd-cgroup support is compiled in. + // Unrelated to whether the host uses systemd or not. + // Nil value means "unknown", not "false". + // Always true in the current version of runc. + Systemd *bool `json:"systemd,omitempty"` + + // SystemdUser represents whether user-scoped systemd-cgroup support is compiled in. + // Unrelated to whether the host uses systemd or not. + // Nil value means "unknown", not "false". + // Always true in the current version of runc. + SystemdUser *bool `json:"systemdUser,omitempty"` +} + +const ( + // AnnotationRuncVersion represents the version of runc, e.g., "1.2.3", "1.2.3+dev", "1.2.3-rc.4.", "1.2.3-rc.4+dev". + // Third party implementations such as crun and runsc MAY use this annotation to report the most compatible runc version, + // however, parsing this annotation value is discouraged. + AnnotationRuncVersion = "org.opencontainers.runc.version" + + // AnnotationRuncCommit corresponds to the output of `git describe --dirty --long --always` in the runc repo. + // Third party implementations such as crun and runsc SHOULD NOT use this annotation, as their repo is different from the runc repo. + // Parsing this annotation value is discouraged. + AnnotationRuncCommit = "org.opencontainers.runc.commit" + + // AnnotationRuncCheckpointEnabled is set to "true" if CRIU-based checkpointing is supported. 
+ // Unrelated to whether the host supports CRIU or not. + // Always set to "true" in the current version of runc. + // This is defined as an annotation because checkpointing is a runc-specific feature that is not defined in the OCI Runtime Spec. + // Third party implementations such as crun and runsc MAY use this annotation. + AnnotationRuncCheckpointEnabled = "org.opencontainers.runc.checkpoint.enabled" + + // AnnotationLibseccompVersion is the version of libseccomp, e.g., "2.5.1". + // Note that the runtime MAY support seccomp even when this annotation is not present. + AnnotationLibseccompVersion = "io.github.seccomp.libseccomp.version" +) diff --git a/update.go b/update.go index 05dc4b5..d02e7af 100644 --- a/update.go +++ b/update.go @@ -1,13 +1,15 @@ -// +build linux - package main import ( "encoding/json" + "errors" "fmt" "os" "strconv" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/sirupsen/logrus" + "github.com/docker/go-units" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" @@ -35,9 +37,7 @@ The accepted format is as follow (unchanged values can be omitted): "memory": { "limit": 0, "reservation": 0, - "swap": 0, - "kernel": 0, - "kernelTCP": 0 + "swap": 0 }, "cpu": { "shares": 0, @@ -91,12 +91,14 @@ other options are ignored. Usage: "Memory node(s) to use", }, cli.StringFlag{ - Name: "kernel-memory", - Usage: "Kernel memory limit (in bytes)", + Name: "kernel-memory", + Usage: "(obsoleted; do not use)", + Hidden: true, }, cli.StringFlag{ - Name: "kernel-memory-tcp", - Usage: "Kernel memory limit (in bytes) for tcp buffer", + Name: "kernel-memory-tcp", + Usage: "(obsoleted; do not use)", + Hidden: true, }, cli.StringFlag{ Name: "memory", @@ -201,7 +203,7 @@ other options are ignored. 
var err error *pair.dest, err = strconv.ParseUint(val, 10, 64) if err != nil { - return fmt.Errorf("invalid value for %s: %s", pair.opt, err) + return fmt.Errorf("invalid value for %s: %w", pair.opt, err) } } } @@ -217,7 +219,7 @@ other options are ignored. var err error *pair.dest, err = strconv.ParseInt(val, 10, 64) if err != nil { - return fmt.Errorf("invalid value for %s: %s", pair.opt, err) + return fmt.Errorf("invalid value for %s: %w", pair.opt, err) } } } @@ -237,7 +239,7 @@ other options are ignored. if val != "-1" { v, err = units.RAMInBytes(val) if err != nil { - return fmt.Errorf("invalid value for %s: %s", pair.opt, err) + return fmt.Errorf("invalid value for %s: %w", pair.opt, err) } } else { v = -1 @@ -245,34 +247,66 @@ other options are ignored. *pair.dest = v } } + r.Pids.Limit = int64(context.Int("pids-limit")) } - // Update the value + if *r.Memory.Kernel != 0 || *r.Memory.KernelTCP != 0 { + logrus.Warn("Kernel memory settings are ignored and will be removed") + } + + // Update the values config.Cgroups.Resources.BlkioWeight = *r.BlockIO.Weight - config.Cgroups.Resources.CpuPeriod = *r.CPU.Period - config.Cgroups.Resources.CpuQuota = *r.CPU.Quota + + // Setting CPU quota and period independently does not make much sense, + // but historically runc allowed it and this needs to be supported + // to not break compatibility. + // + // For systemd cgroup drivers to set CPU quota/period correctly, + // it needs to know both values. For fs2 cgroup driver to be compatible + // with the fs driver, it also needs to know both values. + // + // Here in update, previously set values are available from config. + // If only one of {quota,period} is set and the other is not, leave + // the unset parameter at the old value (don't overwrite config). 
+ p, q := *r.CPU.Period, *r.CPU.Quota + if (p == 0 && q == 0) || (p != 0 && q != 0) { + // both values are either set or unset (0) + config.Cgroups.Resources.CpuPeriod = p + config.Cgroups.Resources.CpuQuota = q + } else { + // one is set and the other is not + if p != 0 { + // set new period, leave quota at old value + config.Cgroups.Resources.CpuPeriod = p + } else if q != 0 { + // set new quota, leave period at old value + config.Cgroups.Resources.CpuQuota = q + } + } + config.Cgroups.Resources.CpuShares = *r.CPU.Shares + // CpuWeight is used for cgroupv2 and should be converted + config.Cgroups.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(*r.CPU.Shares) config.Cgroups.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod config.Cgroups.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime config.Cgroups.Resources.CpusetCpus = r.CPU.Cpus config.Cgroups.Resources.CpusetMems = r.CPU.Mems - config.Cgroups.Resources.KernelMemory = *r.Memory.Kernel - config.Cgroups.Resources.KernelMemoryTCP = *r.Memory.KernelTCP config.Cgroups.Resources.Memory = *r.Memory.Limit config.Cgroups.Resources.MemoryReservation = *r.Memory.Reservation config.Cgroups.Resources.MemorySwap = *r.Memory.Swap config.Cgroups.Resources.PidsLimit = r.Pids.Limit + config.Cgroups.Resources.Unified = r.Unified // Update Intel RDT l3CacheSchema := context.String("l3-cache-schema") memBwSchema := context.String("mem-bw-schema") - if l3CacheSchema != "" && !intelrdt.IsCatEnabled() { - return fmt.Errorf("Intel RDT/CAT: l3 cache schema is not enabled") + if l3CacheSchema != "" && !intelrdt.IsCATEnabled() { + return errors.New("Intel RDT/CAT: l3 cache schema is not enabled") } - if memBwSchema != "" && !intelrdt.IsMbaEnabled() { - return fmt.Errorf("Intel RDT/MBA: memory bandwidth schema is not enabled") + if memBwSchema != "" && !intelrdt.IsMBAEnabled() { + return errors.New("Intel RDT/MBA: memory bandwidth schema is not enabled") } if l3CacheSchema != "" || memBwSchema != "" { @@ -286,11 +320,7 @@ other 
options are ignored. return err } config.IntelRdt = &configs.IntelRdt{} - intelRdtManager := intelrdt.IntelRdtManager{ - Config: &config, - Id: container.ID(), - Path: state.IntelRdtPath, - } + intelRdtManager := intelrdt.NewManager(&config, container.ID(), state.IntelRdtPath) if err := intelRdtManager.Apply(state.InitProcessPid); err != nil { return err } @@ -299,6 +329,13 @@ other options are ignored. config.IntelRdt.MemBwSchema = memBwSchema } + // XXX(kolyshkin@): currently "runc update" is unable to change + // device configuration, so add this to skip device update. + // This helps in case an extra plugin (nvidia GPU) applies some + // configuration on top of what runc does. + // Note this field is not saved into container's state.json. + config.Cgroups.SkipDevices = true + return container.Set(config) }, } diff --git a/utils.go b/utils.go index 5165336..32ab33e 100644 --- a/utils.go +++ b/utils.go @@ -39,19 +39,31 @@ func checkArgs(context *cli.Context, expected, checkType int) error { if err != nil { fmt.Printf("Incorrect Usage.\n\n") - cli.ShowCommandHelp(context, cmdName) + _ = cli.ShowCommandHelp(context, cmdName) return err } return nil } +func logrusToStderr() bool { + l, ok := logrus.StandardLogger().Out.(*os.File) + return ok && l.Fd() == os.Stderr.Fd() +} + // fatal prints the error's details if it is a libcontainer specific error type // then exits the program with an exit status of 1. func fatal(err error) { - // make sure the error is written to the logger + fatalWithCode(err, 1) +} + +func fatalWithCode(err error, ret int) { + // Make sure the error is written to the logger. 
logrus.Error(err) - fmt.Fprintln(os.Stderr, err) - os.Exit(1) + if !logrusToStderr() { + fmt.Fprintln(os.Stderr, err) + } + + os.Exit(ret) } // setupSpec performs initial setup based on the cli.Context for the container @@ -84,6 +96,21 @@ func revisePidFile(context *cli.Context) error { return context.Set("pid-file", pidFile) } +// reviseRootDir convert the root to absolute path +func reviseRootDir(context *cli.Context) error { + root := context.GlobalString("root") + if root == "" { + return nil + } + + root, err := filepath.Abs(root) + if err != nil { + return err + } + + return context.GlobalSet("root", root) +} + // parseBoolOrAuto returns (nil, nil) if s is empty or "auto" func parseBoolOrAuto(s string) (*bool, error) { if s == "" || strings.ToLower(s) == "auto" { diff --git a/utils_linux.go b/utils_linux.go index 984e6b0..a9badf2 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -1,8 +1,7 @@ -// +build linux - package main import ( + "errors" "fmt" "net" "os" @@ -10,20 +9,17 @@ import ( "path/filepath" "strconv" - "github.com/opencontainers/runc/libcontainer" - "github.com/opencontainers/runc/libcontainer/cgroups/systemd" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/intelrdt" - "github.com/opencontainers/runc/libcontainer/specconv" - "github.com/opencontainers/runc/libcontainer/utils" + "github.com/coreos/go-systemd/v22/activation" "github.com/opencontainers/runtime-spec/specs-go" selinux "github.com/opencontainers/selinux/go-selinux" - - "github.com/coreos/go-systemd/activation" - "github.com/pkg/errors" "github.com/sirupsen/logrus" "github.com/urfave/cli" "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/specconv" + "github.com/opencontainers/runc/libcontainer/utils" ) var errEmptyID = errors.New("container id cannot be empty") @@ -36,28 +32,7 @@ func loadFactory(context 
*cli.Context) (libcontainer.Factory, error) { return nil, err } - // We default to cgroupfs, and can only use systemd if the system is a - // systemd box. - cgroupManager := libcontainer.Cgroupfs - rootlessCg, err := shouldUseRootlessCgroupManager(context) - if err != nil { - return nil, err - } - if rootlessCg { - cgroupManager = libcontainer.RootlessCgroupfs - } - if context.GlobalBool("systemd-cgroup") { - if systemd.UseSystemd() { - cgroupManager = libcontainer.SystemdCgroups - } else { - return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available") - } - } - intelRdtManager := libcontainer.IntelRdtFs - if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() { - intelRdtManager = nil - } // We resolve the paths for {newuidmap,newgidmap} from the context of runc, // to avoid doing a path lookup in the nsexec context. TODO: The binary @@ -71,7 +46,7 @@ func loadFactory(context *cli.Context) (libcontainer.Factory, error) { newgidmap = "" } - return libcontainer.New(abs, cgroupManager, intelRdtManager, + return libcontainer.New(abs, intelRdtManager, libcontainer.CriuPath(context.GlobalString("criu")), libcontainer.NewuidmapPath(newuidmap), libcontainer.NewgidmapPath(newgidmap)) @@ -91,11 +66,7 @@ func getContainer(context *cli.Context) (libcontainer.Container, error) { return factory.Load(id) } -func fatalf(t string, v ...interface{}) { - fatal(fmt.Errorf(t, v...)) -} - -func getDefaultImagePath(context *cli.Context) string { +func getDefaultImagePath() string { cwd, err := os.Getwd() if err != nil { panic(err) @@ -105,7 +76,7 @@ func getDefaultImagePath(context *cli.Context) string { // newProcess returns a new libcontainer Process with the arguments from the // spec and stdio from the current process. 
-func newProcess(p specs.Process, init bool, logLevel string) (*libcontainer.Process, error) { +func newProcess(p specs.Process) (*libcontainer.Process, error) { lp := &libcontainer.Process{ Args: p.Args, Env: p.Env, @@ -115,8 +86,6 @@ func newProcess(p specs.Process, init bool, logLevel string) (*libcontainer.Proc Label: p.SelinuxLabel, NoNewPrivileges: &p.NoNewPrivileges, AppArmorProfile: p.ApparmorProfile, - Init: init, - LogLevel: logLevel, } if p.ConsoleSize != nil { @@ -159,6 +128,9 @@ func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, det process.Stderr = nil t := &tty{} if !detach { + if err := t.initHostConsole(); err != nil { + return nil, err + } parent, child, err := utils.NewSockPair("console") if err != nil { return nil, err @@ -167,10 +139,7 @@ func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, det t.postStart = append(t.postStart, parent, child) t.consoleC = make(chan error, 1) go func() { - if err := t.recvtty(process, parent); err != nil { - t.consoleC <- err - } - t.consoleC <- nil + t.consoleC <- t.recvtty(parent) }() } else { // the caller of runc will handle receiving the console master @@ -180,7 +149,7 @@ func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, det } uc, ok := conn.(*net.UnixConn) if !ok { - return nil, fmt.Errorf("casting to UnixConn failed") + return nil, errors.New("casting to UnixConn failed") } t.postStart = append(t.postStart, uc) socket, err := uc.File() @@ -195,9 +164,7 @@ func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, det // when runc will detach the caller provides the stdio to runc via runc's 0,1,2 // and the container's process inherits runc's stdio. 
if detach { - if err := inheritStdio(process); err != nil { - return nil, err - } + inheritStdio(process) return &tty{}, nil } return setupProcessPipes(process, rootuid, rootgid) @@ -213,13 +180,13 @@ func createPidFile(path string, process *libcontainer.Process) error { } var ( tmpDir = filepath.Dir(path) - tmpName = filepath.Join(tmpDir, fmt.Sprintf(".%s", filepath.Base(path))) + tmpName = filepath.Join(tmpDir, "."+filepath.Base(path)) ) - f, err := os.OpenFile(tmpName, os.O_RDWR|os.O_CREATE|os.O_EXCL|os.O_SYNC, 0666) + f, err := os.OpenFile(tmpName, os.O_RDWR|os.O_CREATE|os.O_EXCL|os.O_SYNC, 0o666) if err != nil { return err } - _, err = fmt.Fprintf(f, "%d", pid) + _, err = f.WriteString(strconv.Itoa(pid)) f.Close() if err != nil { return err @@ -265,7 +232,7 @@ type runner struct { action CtAct notifySocket *notifySocket criuOpts *libcontainer.CriuOpts - logLevel string + subCgroupPaths map[string]string } func (r *runner) run(config *specs.Process) (int, error) { @@ -278,19 +245,23 @@ func (r *runner) run(config *specs.Process) (int, error) { if err = r.checkTerminal(config); err != nil { return -1, err } - process, err := newProcess(*config, r.init, r.logLevel) + process, err := newProcess(*config) if err != nil { return -1, err } + process.LogLevel = strconv.Itoa(int(logrus.GetLevel())) + // Populate the fields that come from runner. + process.Init = r.init + process.SubCgroupPaths = r.subCgroupPaths if len(r.listenFDs) > 0 { - process.Env = append(process.Env, fmt.Sprintf("LISTEN_FDS=%d", len(r.listenFDs)), "LISTEN_PID=1") + process.Env = append(process.Env, "LISTEN_FDS="+strconv.Itoa(len(r.listenFDs)), "LISTEN_PID=1") process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...) 
} baseFd := 3 + len(process.ExtraFiles) for i := baseFd; i < baseFd+r.preserveFDs; i++ { - _, err = os.Stat(fmt.Sprintf("/proc/self/fd/%d", i)) + _, err = os.Stat("/proc/self/fd/" + strconv.Itoa(i)) if err != nil { - return -1, errors.Wrapf(err, "please check that preserved-fd %d (of %d) is present", i-baseFd, r.preserveFDs) + return -1, fmt.Errorf("unable to stat preserved-fd %d (of %d): %w", i-baseFd, r.preserveFDs, err) } process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i))) } @@ -302,9 +273,7 @@ func (r *runner) run(config *specs.Process) (int, error) { if err != nil { return -1, err } - var ( - detach = r.detach || (r.action == CT_ACT_CREATE) - ) + detach := r.detach || (r.action == CT_ACT_CREATE) // Setting up IO is a two stage process. We need to modify process to deal // with detaching containers, and then we get a tty after the container has // started. @@ -332,10 +301,7 @@ func (r *runner) run(config *specs.Process) (int, error) { r.terminate(process) return -1, err } - if err = tty.ClosePostStart(); err != nil { - r.terminate(process) - return -1, err - } + tty.ClosePostStart() if r.pidFile != "" { if err = createPidFile(r.pidFile, process); err != nil { r.terminate(process) @@ -349,7 +315,9 @@ func (r *runner) run(config *specs.Process) (int, error) { if detach { return 0, nil } - r.destroy() + if err == nil { + r.destroy() + } return status, err } @@ -368,26 +336,29 @@ func (r *runner) checkTerminal(config *specs.Process) error { detach := r.detach || (r.action == CT_ACT_CREATE) // Check command-line for sanity. 
if detach && config.Terminal && r.consoleSocket == "" { - return fmt.Errorf("cannot allocate tty if runc will detach without setting console socket") + return errors.New("cannot allocate tty if runc will detach without setting console socket") } if (!detach || !config.Terminal) && r.consoleSocket != "" { - return fmt.Errorf("cannot use console socket if runc will not detach or allocate tty") + return errors.New("cannot use console socket if runc will not detach or allocate tty") } return nil } func validateProcessSpec(spec *specs.Process) error { + if spec == nil { + return errors.New("process property must not be empty") + } if spec.Cwd == "" { - return fmt.Errorf("Cwd property must not be empty") + return errors.New("Cwd property must not be empty") } if !filepath.IsAbs(spec.Cwd) { - return fmt.Errorf("Cwd must be an absolute path") + return errors.New("Cwd must be an absolute path") } if len(spec.Args) == 0 { - return fmt.Errorf("args must not be empty") + return errors.New("args must not be empty") } if spec.SelinuxLabel != "" && !selinux.GetEnabled() { - return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported") + return errors.New("selinux label is specified in config, but selinux is disabled or not supported") } return nil } @@ -400,7 +371,15 @@ const ( CT_ACT_RESTORE ) -func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) { +func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.CriuOpts) (int, error) { + if err := revisePidFile(context); err != nil { + return -1, err + } + spec, err := setupSpec(context) + if err != nil { + return -1, err + } + id := context.Args().First() if id == "" { return -1, errEmptyID @@ -408,7 +387,7 @@ func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOp notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id) if notifySocket != nil { - 
notifySocket.setupSpec(context, spec) + notifySocket.setupSpec(spec) } container, err := createContainer(context, id, spec) @@ -417,10 +396,14 @@ func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOp } if notifySocket != nil { - err := notifySocket.setupSocket() - if err != nil { + if err := notifySocket.setupSocketDirectory(); err != nil { return -1, err } + if action == CT_ACT_RUN { + if err := notifySocket.bindSocket(); err != nil { + return -1, err + } + } } // Support on-demand socket activation by passing file descriptors into the container init process. @@ -429,14 +412,9 @@ func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOp listenFDs = activation.Files(false) } - logLevel := "info" - if context.GlobalBool("debug") { - logLevel = "debug" - } - r := &runner{ enableSubreaper: !context.Bool("no-subreaper"), - shouldDestroy: true, + shouldDestroy: !context.Bool("keep"), container: container, listenFDs: listenFDs, notifySocket: notifySocket, @@ -447,7 +425,6 @@ func startContainer(context *cli.Context, spec *specs.Spec, action CtAct, criuOp action: action, criuOpts: criuOpts, init: true, - logLevel: logLevel, } return r.run(spec.Process) } diff --git a/vendor.conf b/vendor.conf deleted file mode 100644 index dd51785..0000000 --- a/vendor.conf +++ /dev/null @@ -1,31 +0,0 @@ -# OCI runtime-spec. When updating this, make sure you use a version tag rather -# than a commit ID so it's much more obvious what version of the spec we are -# using. -github.com/opencontainers/runtime-spec 29686dbc5559d93fb1ef402eeda3e35c38d75af4 # v1.0.1-59-g29686db - -# Core libcontainer functionality. 
-github.com/checkpoint-restore/go-criu 17b0214f6c48980c45dc47ecb0cfd6d9e02df723 # v3.11 -github.com/mrunalp/fileutils 7d4729fb36185a7c1719923406c9d40e54fb93c7 -github.com/opencontainers/selinux 5215b1806f52b1fcc2070a8826c542c9d33cd3cf # v1.3.0 (+ CVE-2019-16884) -github.com/seccomp/libseccomp-golang 689e3c1541a84461afc49c1c87352a6cedf72e9c # v0.9.1 -github.com/sirupsen/logrus 8bdbc7bcc01dcbb8ec23dc8a28e332258d25251f # v1.4.1 -github.com/syndtr/gocapability d98352740cb2c55f81556b63d4a1ec64c5a319c2 -github.com/vishvananda/netlink 1e2e08e8a2dcdacaae3f14ac44c5cfa31361f270 - -# systemd integration. -github.com/coreos/go-systemd 95778dfbb74eb7e4dbaf43bf7d71809650ef8076 # v19 -github.com/godbus/dbus 2ff6f7ffd60f0f2410b3105864bdd12c7894f844 # v5.0.1 -github.com/golang/protobuf 925541529c1fa6821df4e44ce2723319eb2be768 # v1.0.0 - -# Command-line interface. -github.com/cyphar/filepath-securejoin a261ee33d7a517f054effbf451841abaafe3e0fd # v0.2.2 -github.com/docker/go-units 47565b4f722fb6ceae66b95f853feed578a4a51c # v0.3.3 -github.com/urfave/cli cfb38830724cc34fedffe9a2a29fb54fa9169cd1 # v1.20.0 -golang.org/x/sys 9eafafc0a87e0fd0aeeba439a4573537970c44c7 https://github.com/golang/sys - -# console dependencies -github.com/containerd/console 0650fd9eeb50bab4fc99dceb9f2e14cf58f36e7f -github.com/pkg/errors ba968bfe8b2f7e042a574c888954fccecfa385b4 # v0.8.1 - -# ebpf dependencies -github.com/cilium/ebpf 95b36a581eed7b0f127306ed1d16cc0ddc06cf67 diff --git a/vendor/github.com/checkpoint-restore/go-criu/rpc/rpc.pb.go b/vendor/github.com/checkpoint-restore/go-criu/rpc/rpc.pb.go deleted file mode 100644 index 230faac..0000000 --- a/vendor/github.com/checkpoint-restore/go-criu/rpc/rpc.pb.go +++ /dev/null @@ -1,1211 +0,0 @@ -// Code generated by protoc-gen-go. DO NOT EDIT. -// source: rpc/rpc.proto - -/* -Package rpc is a generated protocol buffer package. 
- -It is generated from these files: - rpc/rpc.proto - -It has these top-level messages: - CriuPageServerInfo - CriuVethPair - ExtMountMap - JoinNamespace - InheritFd - CgroupRoot - UnixSk - CriuOpts - CriuDumpResp - CriuRestoreResp - CriuNotify - CriuFeatures - CriuReq - CriuResp - CriuVersion -*/ -package rpc - -import proto "github.com/golang/protobuf/proto" -import fmt "fmt" -import math "math" - -// Reference imports to suppress errors if they are not otherwise used. -var _ = proto.Marshal -var _ = fmt.Errorf -var _ = math.Inf - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the proto package it is being compiled against. -// A compilation error at this line likely means your copy of the -// proto package needs to be updated. -const _ = proto.ProtoPackageIsVersion2 // please upgrade the proto package - -type CriuCgMode int32 - -const ( - CriuCgMode_IGNORE CriuCgMode = 0 - CriuCgMode_CG_NONE CriuCgMode = 1 - CriuCgMode_PROPS CriuCgMode = 2 - CriuCgMode_SOFT CriuCgMode = 3 - CriuCgMode_FULL CriuCgMode = 4 - CriuCgMode_STRICT CriuCgMode = 5 - CriuCgMode_DEFAULT CriuCgMode = 6 -) - -var CriuCgMode_name = map[int32]string{ - 0: "IGNORE", - 1: "CG_NONE", - 2: "PROPS", - 3: "SOFT", - 4: "FULL", - 5: "STRICT", - 6: "DEFAULT", -} -var CriuCgMode_value = map[string]int32{ - "IGNORE": 0, - "CG_NONE": 1, - "PROPS": 2, - "SOFT": 3, - "FULL": 4, - "STRICT": 5, - "DEFAULT": 6, -} - -func (x CriuCgMode) Enum() *CriuCgMode { - p := new(CriuCgMode) - *p = x - return p -} -func (x CriuCgMode) String() string { - return proto.EnumName(CriuCgMode_name, int32(x)) -} -func (x *CriuCgMode) UnmarshalJSON(data []byte) error { - value, err := proto.UnmarshalJSONEnum(CriuCgMode_value, data, "CriuCgMode") - if err != nil { - return err - } - *x = CriuCgMode(value) - return nil -} -func (CriuCgMode) EnumDescriptor() ([]byte, []int) { return fileDescriptor0, []int{0} } - -type CriuReqType int32 - -const ( - CriuReqType_EMPTY CriuReqType = 0 - 
CriuReqType_DUMP CriuReqType = 1 - CriuReqType_RESTORE CriuReqType = 2 - CriuReqType_CHECK CriuReqType = 3 - CriuReqType_PRE_DUMP CriuReqType = 4 - CriuReqType_PAGE_SERVER CriuReqType = 5 - CriuReqType_NOTIFY CriuReqType = 6 - CriuReqType_CPUINFO_DUMP CriuReqType = 7 - CriuReqType_CPUINFO_CHECK CriuReqType = 8 - CriuReqType_FEATURE_CHECK CriuReqType = 9 - CriuReqType_VERSION CriuReqType = 10 - CriuReqType_WAIT_PID CriuReqType = 11 - CriuReqType_PAGE_SERVER_CHLD CriuReqType = 12 -) - -var CriuReqType_name = map[int32]string{ - 0: "EMPTY", - 1: "DUMP", - 2: "RESTORE", - 3: "CHECK", - 4: "PRE_DUMP", - 5: "PAGE_SERVER", - 6: "NOTIFY", - 7: "CPUINFO_DUMP", - 8: "CPUINFO_CHECK", - 9: "FEATURE_CHECK", - 10: "VERSION", - 11: "WAIT_PID", - 12: "PAGE_SERVER_CHLD", -} -var CriuReqType_value = map[string]int32{ - "EMPTY": 0, - "DUMP": 1, - "RESTORE": 2, - "CHECK": 3, - "PRE_DUMP": 4, - "PAGE_SERVER": 5, - "NOTIFY": 6, - "CPUINFO_DUMP": 7, - "CPUINFO_CHECK": 8, - "FEATURE_CHECK": 9, - "VERSION": 10, - "WAIT_PID": 11, - "PAGE_SERVER_CHLD": 12, -} - -func (x CriuReqType) Enum() *CriuReqType { - p := new(CriuReqType) - *p = x - return p -} -func (x CriuReqType) String() string { - return proto.EnumName(CriuReqType_name, int32(x)) -} -func (x *CriuReqType) UnmarshalJSON(data []byte) error { - value, err := proto.UnmarshalJSONEnum(CriuReqType_value, data, "CriuReqType") - if err != nil { - return err - } - *x = CriuReqType(value) - return nil -} -func (CriuReqType) EnumDescriptor() ([]byte, []int) { return fileDescriptor0, []int{1} } - -type CriuPageServerInfo struct { - Address *string `protobuf:"bytes,1,opt,name=address" json:"address,omitempty"` - Port *int32 `protobuf:"varint,2,opt,name=port" json:"port,omitempty"` - Pid *int32 `protobuf:"varint,3,opt,name=pid" json:"pid,omitempty"` - Fd *int32 `protobuf:"varint,4,opt,name=fd" json:"fd,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *CriuPageServerInfo) Reset() { *m = CriuPageServerInfo{} } -func (m 
*CriuPageServerInfo) String() string { return proto.CompactTextString(m) } -func (*CriuPageServerInfo) ProtoMessage() {} -func (*CriuPageServerInfo) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{0} } - -func (m *CriuPageServerInfo) GetAddress() string { - if m != nil && m.Address != nil { - return *m.Address - } - return "" -} - -func (m *CriuPageServerInfo) GetPort() int32 { - if m != nil && m.Port != nil { - return *m.Port - } - return 0 -} - -func (m *CriuPageServerInfo) GetPid() int32 { - if m != nil && m.Pid != nil { - return *m.Pid - } - return 0 -} - -func (m *CriuPageServerInfo) GetFd() int32 { - if m != nil && m.Fd != nil { - return *m.Fd - } - return 0 -} - -type CriuVethPair struct { - IfIn *string `protobuf:"bytes,1,req,name=if_in,json=ifIn" json:"if_in,omitempty"` - IfOut *string `protobuf:"bytes,2,req,name=if_out,json=ifOut" json:"if_out,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *CriuVethPair) Reset() { *m = CriuVethPair{} } -func (m *CriuVethPair) String() string { return proto.CompactTextString(m) } -func (*CriuVethPair) ProtoMessage() {} -func (*CriuVethPair) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{1} } - -func (m *CriuVethPair) GetIfIn() string { - if m != nil && m.IfIn != nil { - return *m.IfIn - } - return "" -} - -func (m *CriuVethPair) GetIfOut() string { - if m != nil && m.IfOut != nil { - return *m.IfOut - } - return "" -} - -type ExtMountMap struct { - Key *string `protobuf:"bytes,1,req,name=key" json:"key,omitempty"` - Val *string `protobuf:"bytes,2,req,name=val" json:"val,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *ExtMountMap) Reset() { *m = ExtMountMap{} } -func (m *ExtMountMap) String() string { return proto.CompactTextString(m) } -func (*ExtMountMap) ProtoMessage() {} -func (*ExtMountMap) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{2} } - -func (m *ExtMountMap) GetKey() string { - if m != nil && m.Key != nil { - return *m.Key - } - return "" 
-} - -func (m *ExtMountMap) GetVal() string { - if m != nil && m.Val != nil { - return *m.Val - } - return "" -} - -type JoinNamespace struct { - Ns *string `protobuf:"bytes,1,req,name=ns" json:"ns,omitempty"` - NsFile *string `protobuf:"bytes,2,req,name=ns_file,json=nsFile" json:"ns_file,omitempty"` - ExtraOpt *string `protobuf:"bytes,3,opt,name=extra_opt,json=extraOpt" json:"extra_opt,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *JoinNamespace) Reset() { *m = JoinNamespace{} } -func (m *JoinNamespace) String() string { return proto.CompactTextString(m) } -func (*JoinNamespace) ProtoMessage() {} -func (*JoinNamespace) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{3} } - -func (m *JoinNamespace) GetNs() string { - if m != nil && m.Ns != nil { - return *m.Ns - } - return "" -} - -func (m *JoinNamespace) GetNsFile() string { - if m != nil && m.NsFile != nil { - return *m.NsFile - } - return "" -} - -func (m *JoinNamespace) GetExtraOpt() string { - if m != nil && m.ExtraOpt != nil { - return *m.ExtraOpt - } - return "" -} - -type InheritFd struct { - Key *string `protobuf:"bytes,1,req,name=key" json:"key,omitempty"` - Fd *int32 `protobuf:"varint,2,req,name=fd" json:"fd,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *InheritFd) Reset() { *m = InheritFd{} } -func (m *InheritFd) String() string { return proto.CompactTextString(m) } -func (*InheritFd) ProtoMessage() {} -func (*InheritFd) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{4} } - -func (m *InheritFd) GetKey() string { - if m != nil && m.Key != nil { - return *m.Key - } - return "" -} - -func (m *InheritFd) GetFd() int32 { - if m != nil && m.Fd != nil { - return *m.Fd - } - return 0 -} - -type CgroupRoot struct { - Ctrl *string `protobuf:"bytes,1,opt,name=ctrl" json:"ctrl,omitempty"` - Path *string `protobuf:"bytes,2,req,name=path" json:"path,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *CgroupRoot) Reset() { *m = CgroupRoot{} } 
-func (m *CgroupRoot) String() string { return proto.CompactTextString(m) } -func (*CgroupRoot) ProtoMessage() {} -func (*CgroupRoot) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{5} } - -func (m *CgroupRoot) GetCtrl() string { - if m != nil && m.Ctrl != nil { - return *m.Ctrl - } - return "" -} - -func (m *CgroupRoot) GetPath() string { - if m != nil && m.Path != nil { - return *m.Path - } - return "" -} - -type UnixSk struct { - Inode *uint32 `protobuf:"varint,1,req,name=inode" json:"inode,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *UnixSk) Reset() { *m = UnixSk{} } -func (m *UnixSk) String() string { return proto.CompactTextString(m) } -func (*UnixSk) ProtoMessage() {} -func (*UnixSk) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{6} } - -func (m *UnixSk) GetInode() uint32 { - if m != nil && m.Inode != nil { - return *m.Inode - } - return 0 -} - -type CriuOpts struct { - ImagesDirFd *int32 `protobuf:"varint,1,req,name=images_dir_fd,json=imagesDirFd" json:"images_dir_fd,omitempty"` - Pid *int32 `protobuf:"varint,2,opt,name=pid" json:"pid,omitempty"` - LeaveRunning *bool `protobuf:"varint,3,opt,name=leave_running,json=leaveRunning" json:"leave_running,omitempty"` - ExtUnixSk *bool `protobuf:"varint,4,opt,name=ext_unix_sk,json=extUnixSk" json:"ext_unix_sk,omitempty"` - TcpEstablished *bool `protobuf:"varint,5,opt,name=tcp_established,json=tcpEstablished" json:"tcp_established,omitempty"` - EvasiveDevices *bool `protobuf:"varint,6,opt,name=evasive_devices,json=evasiveDevices" json:"evasive_devices,omitempty"` - ShellJob *bool `protobuf:"varint,7,opt,name=shell_job,json=shellJob" json:"shell_job,omitempty"` - FileLocks *bool `protobuf:"varint,8,opt,name=file_locks,json=fileLocks" json:"file_locks,omitempty"` - LogLevel *int32 `protobuf:"varint,9,opt,name=log_level,json=logLevel,def=2" json:"log_level,omitempty"` - LogFile *string `protobuf:"bytes,10,opt,name=log_file,json=logFile" json:"log_file,omitempty"` - Ps 
*CriuPageServerInfo `protobuf:"bytes,11,opt,name=ps" json:"ps,omitempty"` - NotifyScripts *bool `protobuf:"varint,12,opt,name=notify_scripts,json=notifyScripts" json:"notify_scripts,omitempty"` - Root *string `protobuf:"bytes,13,opt,name=root" json:"root,omitempty"` - ParentImg *string `protobuf:"bytes,14,opt,name=parent_img,json=parentImg" json:"parent_img,omitempty"` - TrackMem *bool `protobuf:"varint,15,opt,name=track_mem,json=trackMem" json:"track_mem,omitempty"` - AutoDedup *bool `protobuf:"varint,16,opt,name=auto_dedup,json=autoDedup" json:"auto_dedup,omitempty"` - WorkDirFd *int32 `protobuf:"varint,17,opt,name=work_dir_fd,json=workDirFd" json:"work_dir_fd,omitempty"` - LinkRemap *bool `protobuf:"varint,18,opt,name=link_remap,json=linkRemap" json:"link_remap,omitempty"` - Veths []*CriuVethPair `protobuf:"bytes,19,rep,name=veths" json:"veths,omitempty"` - CpuCap *uint32 `protobuf:"varint,20,opt,name=cpu_cap,json=cpuCap,def=4294967295" json:"cpu_cap,omitempty"` - ForceIrmap *bool `protobuf:"varint,21,opt,name=force_irmap,json=forceIrmap" json:"force_irmap,omitempty"` - ExecCmd []string `protobuf:"bytes,22,rep,name=exec_cmd,json=execCmd" json:"exec_cmd,omitempty"` - ExtMnt []*ExtMountMap `protobuf:"bytes,23,rep,name=ext_mnt,json=extMnt" json:"ext_mnt,omitempty"` - ManageCgroups *bool `protobuf:"varint,24,opt,name=manage_cgroups,json=manageCgroups" json:"manage_cgroups,omitempty"` - CgRoot []*CgroupRoot `protobuf:"bytes,25,rep,name=cg_root,json=cgRoot" json:"cg_root,omitempty"` - RstSibling *bool `protobuf:"varint,26,opt,name=rst_sibling,json=rstSibling" json:"rst_sibling,omitempty"` - InheritFd []*InheritFd `protobuf:"bytes,27,rep,name=inherit_fd,json=inheritFd" json:"inherit_fd,omitempty"` - AutoExtMnt *bool `protobuf:"varint,28,opt,name=auto_ext_mnt,json=autoExtMnt" json:"auto_ext_mnt,omitempty"` - ExtSharing *bool `protobuf:"varint,29,opt,name=ext_sharing,json=extSharing" json:"ext_sharing,omitempty"` - ExtMasters *bool 
`protobuf:"varint,30,opt,name=ext_masters,json=extMasters" json:"ext_masters,omitempty"` - SkipMnt []string `protobuf:"bytes,31,rep,name=skip_mnt,json=skipMnt" json:"skip_mnt,omitempty"` - EnableFs []string `protobuf:"bytes,32,rep,name=enable_fs,json=enableFs" json:"enable_fs,omitempty"` - UnixSkIno []*UnixSk `protobuf:"bytes,33,rep,name=unix_sk_ino,json=unixSkIno" json:"unix_sk_ino,omitempty"` - ManageCgroupsMode *CriuCgMode `protobuf:"varint,34,opt,name=manage_cgroups_mode,json=manageCgroupsMode,enum=CriuCgMode" json:"manage_cgroups_mode,omitempty"` - GhostLimit *uint32 `protobuf:"varint,35,opt,name=ghost_limit,json=ghostLimit,def=1048576" json:"ghost_limit,omitempty"` - IrmapScanPaths []string `protobuf:"bytes,36,rep,name=irmap_scan_paths,json=irmapScanPaths" json:"irmap_scan_paths,omitempty"` - External []string `protobuf:"bytes,37,rep,name=external" json:"external,omitempty"` - EmptyNs *uint32 `protobuf:"varint,38,opt,name=empty_ns,json=emptyNs" json:"empty_ns,omitempty"` - JoinNs []*JoinNamespace `protobuf:"bytes,39,rep,name=join_ns,json=joinNs" json:"join_ns,omitempty"` - CgroupProps *string `protobuf:"bytes,41,opt,name=cgroup_props,json=cgroupProps" json:"cgroup_props,omitempty"` - CgroupPropsFile *string `protobuf:"bytes,42,opt,name=cgroup_props_file,json=cgroupPropsFile" json:"cgroup_props_file,omitempty"` - CgroupDumpController []string `protobuf:"bytes,43,rep,name=cgroup_dump_controller,json=cgroupDumpController" json:"cgroup_dump_controller,omitempty"` - FreezeCgroup *string `protobuf:"bytes,44,opt,name=freeze_cgroup,json=freezeCgroup" json:"freeze_cgroup,omitempty"` - Timeout *uint32 `protobuf:"varint,45,opt,name=timeout" json:"timeout,omitempty"` - TcpSkipInFlight *bool `protobuf:"varint,46,opt,name=tcp_skip_in_flight,json=tcpSkipInFlight" json:"tcp_skip_in_flight,omitempty"` - WeakSysctls *bool `protobuf:"varint,47,opt,name=weak_sysctls,json=weakSysctls" json:"weak_sysctls,omitempty"` - LazyPages *bool 
`protobuf:"varint,48,opt,name=lazy_pages,json=lazyPages" json:"lazy_pages,omitempty"` - StatusFd *int32 `protobuf:"varint,49,opt,name=status_fd,json=statusFd" json:"status_fd,omitempty"` - OrphanPtsMaster *bool `protobuf:"varint,50,opt,name=orphan_pts_master,json=orphanPtsMaster" json:"orphan_pts_master,omitempty"` - ConfigFile *string `protobuf:"bytes,51,opt,name=config_file,json=configFile" json:"config_file,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *CriuOpts) Reset() { *m = CriuOpts{} } -func (m *CriuOpts) String() string { return proto.CompactTextString(m) } -func (*CriuOpts) ProtoMessage() {} -func (*CriuOpts) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{7} } - -const Default_CriuOpts_LogLevel int32 = 2 -const Default_CriuOpts_CpuCap uint32 = 4294967295 -const Default_CriuOpts_GhostLimit uint32 = 1048576 - -func (m *CriuOpts) GetImagesDirFd() int32 { - if m != nil && m.ImagesDirFd != nil { - return *m.ImagesDirFd - } - return 0 -} - -func (m *CriuOpts) GetPid() int32 { - if m != nil && m.Pid != nil { - return *m.Pid - } - return 0 -} - -func (m *CriuOpts) GetLeaveRunning() bool { - if m != nil && m.LeaveRunning != nil { - return *m.LeaveRunning - } - return false -} - -func (m *CriuOpts) GetExtUnixSk() bool { - if m != nil && m.ExtUnixSk != nil { - return *m.ExtUnixSk - } - return false -} - -func (m *CriuOpts) GetTcpEstablished() bool { - if m != nil && m.TcpEstablished != nil { - return *m.TcpEstablished - } - return false -} - -func (m *CriuOpts) GetEvasiveDevices() bool { - if m != nil && m.EvasiveDevices != nil { - return *m.EvasiveDevices - } - return false -} - -func (m *CriuOpts) GetShellJob() bool { - if m != nil && m.ShellJob != nil { - return *m.ShellJob - } - return false -} - -func (m *CriuOpts) GetFileLocks() bool { - if m != nil && m.FileLocks != nil { - return *m.FileLocks - } - return false -} - -func (m *CriuOpts) GetLogLevel() int32 { - if m != nil && m.LogLevel != nil { - return *m.LogLevel - } - return 
Default_CriuOpts_LogLevel -} - -func (m *CriuOpts) GetLogFile() string { - if m != nil && m.LogFile != nil { - return *m.LogFile - } - return "" -} - -func (m *CriuOpts) GetPs() *CriuPageServerInfo { - if m != nil { - return m.Ps - } - return nil -} - -func (m *CriuOpts) GetNotifyScripts() bool { - if m != nil && m.NotifyScripts != nil { - return *m.NotifyScripts - } - return false -} - -func (m *CriuOpts) GetRoot() string { - if m != nil && m.Root != nil { - return *m.Root - } - return "" -} - -func (m *CriuOpts) GetParentImg() string { - if m != nil && m.ParentImg != nil { - return *m.ParentImg - } - return "" -} - -func (m *CriuOpts) GetTrackMem() bool { - if m != nil && m.TrackMem != nil { - return *m.TrackMem - } - return false -} - -func (m *CriuOpts) GetAutoDedup() bool { - if m != nil && m.AutoDedup != nil { - return *m.AutoDedup - } - return false -} - -func (m *CriuOpts) GetWorkDirFd() int32 { - if m != nil && m.WorkDirFd != nil { - return *m.WorkDirFd - } - return 0 -} - -func (m *CriuOpts) GetLinkRemap() bool { - if m != nil && m.LinkRemap != nil { - return *m.LinkRemap - } - return false -} - -func (m *CriuOpts) GetVeths() []*CriuVethPair { - if m != nil { - return m.Veths - } - return nil -} - -func (m *CriuOpts) GetCpuCap() uint32 { - if m != nil && m.CpuCap != nil { - return *m.CpuCap - } - return Default_CriuOpts_CpuCap -} - -func (m *CriuOpts) GetForceIrmap() bool { - if m != nil && m.ForceIrmap != nil { - return *m.ForceIrmap - } - return false -} - -func (m *CriuOpts) GetExecCmd() []string { - if m != nil { - return m.ExecCmd - } - return nil -} - -func (m *CriuOpts) GetExtMnt() []*ExtMountMap { - if m != nil { - return m.ExtMnt - } - return nil -} - -func (m *CriuOpts) GetManageCgroups() bool { - if m != nil && m.ManageCgroups != nil { - return *m.ManageCgroups - } - return false -} - -func (m *CriuOpts) GetCgRoot() []*CgroupRoot { - if m != nil { - return m.CgRoot - } - return nil -} - -func (m *CriuOpts) GetRstSibling() bool { - if m != nil 
&& m.RstSibling != nil { - return *m.RstSibling - } - return false -} - -func (m *CriuOpts) GetInheritFd() []*InheritFd { - if m != nil { - return m.InheritFd - } - return nil -} - -func (m *CriuOpts) GetAutoExtMnt() bool { - if m != nil && m.AutoExtMnt != nil { - return *m.AutoExtMnt - } - return false -} - -func (m *CriuOpts) GetExtSharing() bool { - if m != nil && m.ExtSharing != nil { - return *m.ExtSharing - } - return false -} - -func (m *CriuOpts) GetExtMasters() bool { - if m != nil && m.ExtMasters != nil { - return *m.ExtMasters - } - return false -} - -func (m *CriuOpts) GetSkipMnt() []string { - if m != nil { - return m.SkipMnt - } - return nil -} - -func (m *CriuOpts) GetEnableFs() []string { - if m != nil { - return m.EnableFs - } - return nil -} - -func (m *CriuOpts) GetUnixSkIno() []*UnixSk { - if m != nil { - return m.UnixSkIno - } - return nil -} - -func (m *CriuOpts) GetManageCgroupsMode() CriuCgMode { - if m != nil && m.ManageCgroupsMode != nil { - return *m.ManageCgroupsMode - } - return CriuCgMode_IGNORE -} - -func (m *CriuOpts) GetGhostLimit() uint32 { - if m != nil && m.GhostLimit != nil { - return *m.GhostLimit - } - return Default_CriuOpts_GhostLimit -} - -func (m *CriuOpts) GetIrmapScanPaths() []string { - if m != nil { - return m.IrmapScanPaths - } - return nil -} - -func (m *CriuOpts) GetExternal() []string { - if m != nil { - return m.External - } - return nil -} - -func (m *CriuOpts) GetEmptyNs() uint32 { - if m != nil && m.EmptyNs != nil { - return *m.EmptyNs - } - return 0 -} - -func (m *CriuOpts) GetJoinNs() []*JoinNamespace { - if m != nil { - return m.JoinNs - } - return nil -} - -func (m *CriuOpts) GetCgroupProps() string { - if m != nil && m.CgroupProps != nil { - return *m.CgroupProps - } - return "" -} - -func (m *CriuOpts) GetCgroupPropsFile() string { - if m != nil && m.CgroupPropsFile != nil { - return *m.CgroupPropsFile - } - return "" -} - -func (m *CriuOpts) GetCgroupDumpController() []string { - if m != nil { - return 
m.CgroupDumpController - } - return nil -} - -func (m *CriuOpts) GetFreezeCgroup() string { - if m != nil && m.FreezeCgroup != nil { - return *m.FreezeCgroup - } - return "" -} - -func (m *CriuOpts) GetTimeout() uint32 { - if m != nil && m.Timeout != nil { - return *m.Timeout - } - return 0 -} - -func (m *CriuOpts) GetTcpSkipInFlight() bool { - if m != nil && m.TcpSkipInFlight != nil { - return *m.TcpSkipInFlight - } - return false -} - -func (m *CriuOpts) GetWeakSysctls() bool { - if m != nil && m.WeakSysctls != nil { - return *m.WeakSysctls - } - return false -} - -func (m *CriuOpts) GetLazyPages() bool { - if m != nil && m.LazyPages != nil { - return *m.LazyPages - } - return false -} - -func (m *CriuOpts) GetStatusFd() int32 { - if m != nil && m.StatusFd != nil { - return *m.StatusFd - } - return 0 -} - -func (m *CriuOpts) GetOrphanPtsMaster() bool { - if m != nil && m.OrphanPtsMaster != nil { - return *m.OrphanPtsMaster - } - return false -} - -func (m *CriuOpts) GetConfigFile() string { - if m != nil && m.ConfigFile != nil { - return *m.ConfigFile - } - return "" -} - -type CriuDumpResp struct { - Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *CriuDumpResp) Reset() { *m = CriuDumpResp{} } -func (m *CriuDumpResp) String() string { return proto.CompactTextString(m) } -func (*CriuDumpResp) ProtoMessage() {} -func (*CriuDumpResp) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{8} } - -func (m *CriuDumpResp) GetRestored() bool { - if m != nil && m.Restored != nil { - return *m.Restored - } - return false -} - -type CriuRestoreResp struct { - Pid *int32 `protobuf:"varint,1,req,name=pid" json:"pid,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *CriuRestoreResp) Reset() { *m = CriuRestoreResp{} } -func (m *CriuRestoreResp) String() string { return proto.CompactTextString(m) } -func (*CriuRestoreResp) ProtoMessage() {} -func (*CriuRestoreResp) 
Descriptor() ([]byte, []int) { return fileDescriptor0, []int{9} } - -func (m *CriuRestoreResp) GetPid() int32 { - if m != nil && m.Pid != nil { - return *m.Pid - } - return 0 -} - -type CriuNotify struct { - Script *string `protobuf:"bytes,1,opt,name=script" json:"script,omitempty"` - Pid *int32 `protobuf:"varint,2,opt,name=pid" json:"pid,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *CriuNotify) Reset() { *m = CriuNotify{} } -func (m *CriuNotify) String() string { return proto.CompactTextString(m) } -func (*CriuNotify) ProtoMessage() {} -func (*CriuNotify) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{10} } - -func (m *CriuNotify) GetScript() string { - if m != nil && m.Script != nil { - return *m.Script - } - return "" -} - -func (m *CriuNotify) GetPid() int32 { - if m != nil && m.Pid != nil { - return *m.Pid - } - return 0 -} - -// -// List of features which can queried via -// CRIU_REQ_TYPE__FEATURE_CHECK -type CriuFeatures struct { - MemTrack *bool `protobuf:"varint,1,opt,name=mem_track,json=memTrack" json:"mem_track,omitempty"` - LazyPages *bool `protobuf:"varint,2,opt,name=lazy_pages,json=lazyPages" json:"lazy_pages,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *CriuFeatures) Reset() { *m = CriuFeatures{} } -func (m *CriuFeatures) String() string { return proto.CompactTextString(m) } -func (*CriuFeatures) ProtoMessage() {} -func (*CriuFeatures) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{11} } - -func (m *CriuFeatures) GetMemTrack() bool { - if m != nil && m.MemTrack != nil { - return *m.MemTrack - } - return false -} - -func (m *CriuFeatures) GetLazyPages() bool { - if m != nil && m.LazyPages != nil { - return *m.LazyPages - } - return false -} - -type CriuReq struct { - Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` - Opts *CriuOpts `protobuf:"bytes,2,opt,name=opts" json:"opts,omitempty"` - NotifySuccess *bool 
`protobuf:"varint,3,opt,name=notify_success,json=notifySuccess" json:"notify_success,omitempty"` - // - // When set service won't close the connection but - // will wait for more req-s to appear. Works not - // for all request types. - KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open,json=keepOpen" json:"keep_open,omitempty"` - // - // 'features' can be used to query which features - // are supported by the installed criu/kernel - // via RPC. - Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"` - // 'pid' is used for WAIT_PID - Pid *uint32 `protobuf:"varint,6,opt,name=pid" json:"pid,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *CriuReq) Reset() { *m = CriuReq{} } -func (m *CriuReq) String() string { return proto.CompactTextString(m) } -func (*CriuReq) ProtoMessage() {} -func (*CriuReq) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{12} } - -func (m *CriuReq) GetType() CriuReqType { - if m != nil && m.Type != nil { - return *m.Type - } - return CriuReqType_EMPTY -} - -func (m *CriuReq) GetOpts() *CriuOpts { - if m != nil { - return m.Opts - } - return nil -} - -func (m *CriuReq) GetNotifySuccess() bool { - if m != nil && m.NotifySuccess != nil { - return *m.NotifySuccess - } - return false -} - -func (m *CriuReq) GetKeepOpen() bool { - if m != nil && m.KeepOpen != nil { - return *m.KeepOpen - } - return false -} - -func (m *CriuReq) GetFeatures() *CriuFeatures { - if m != nil { - return m.Features - } - return nil -} - -func (m *CriuReq) GetPid() uint32 { - if m != nil && m.Pid != nil { - return *m.Pid - } - return 0 -} - -type CriuResp struct { - Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` - Success *bool `protobuf:"varint,2,req,name=success" json:"success,omitempty"` - Dump *CriuDumpResp `protobuf:"bytes,3,opt,name=dump" json:"dump,omitempty"` - Restore *CriuRestoreResp `protobuf:"bytes,4,opt,name=restore" json:"restore,omitempty"` - Notify 
*CriuNotify `protobuf:"bytes,5,opt,name=notify" json:"notify,omitempty"` - Ps *CriuPageServerInfo `protobuf:"bytes,6,opt,name=ps" json:"ps,omitempty"` - CrErrno *int32 `protobuf:"varint,7,opt,name=cr_errno,json=crErrno" json:"cr_errno,omitempty"` - Features *CriuFeatures `protobuf:"bytes,8,opt,name=features" json:"features,omitempty"` - CrErrmsg *string `protobuf:"bytes,9,opt,name=cr_errmsg,json=crErrmsg" json:"cr_errmsg,omitempty"` - Version *CriuVersion `protobuf:"bytes,10,opt,name=version" json:"version,omitempty"` - Status *int32 `protobuf:"varint,11,opt,name=status" json:"status,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *CriuResp) Reset() { *m = CriuResp{} } -func (m *CriuResp) String() string { return proto.CompactTextString(m) } -func (*CriuResp) ProtoMessage() {} -func (*CriuResp) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{13} } - -func (m *CriuResp) GetType() CriuReqType { - if m != nil && m.Type != nil { - return *m.Type - } - return CriuReqType_EMPTY -} - -func (m *CriuResp) GetSuccess() bool { - if m != nil && m.Success != nil { - return *m.Success - } - return false -} - -func (m *CriuResp) GetDump() *CriuDumpResp { - if m != nil { - return m.Dump - } - return nil -} - -func (m *CriuResp) GetRestore() *CriuRestoreResp { - if m != nil { - return m.Restore - } - return nil -} - -func (m *CriuResp) GetNotify() *CriuNotify { - if m != nil { - return m.Notify - } - return nil -} - -func (m *CriuResp) GetPs() *CriuPageServerInfo { - if m != nil { - return m.Ps - } - return nil -} - -func (m *CriuResp) GetCrErrno() int32 { - if m != nil && m.CrErrno != nil { - return *m.CrErrno - } - return 0 -} - -func (m *CriuResp) GetFeatures() *CriuFeatures { - if m != nil { - return m.Features - } - return nil -} - -func (m *CriuResp) GetCrErrmsg() string { - if m != nil && m.CrErrmsg != nil { - return *m.CrErrmsg - } - return "" -} - -func (m *CriuResp) GetVersion() *CriuVersion { - if m != nil { - return m.Version - } - return nil 
-} - -func (m *CriuResp) GetStatus() int32 { - if m != nil && m.Status != nil { - return *m.Status - } - return 0 -} - -// Answer for criu_req_type.VERSION requests -type CriuVersion struct { - Major *int32 `protobuf:"varint,1,req,name=major" json:"major,omitempty"` - Minor *int32 `protobuf:"varint,2,req,name=minor" json:"minor,omitempty"` - Gitid *string `protobuf:"bytes,3,opt,name=gitid" json:"gitid,omitempty"` - Sublevel *int32 `protobuf:"varint,4,opt,name=sublevel" json:"sublevel,omitempty"` - Extra *int32 `protobuf:"varint,5,opt,name=extra" json:"extra,omitempty"` - Name *string `protobuf:"bytes,6,opt,name=name" json:"name,omitempty"` - XXX_unrecognized []byte `json:"-"` -} - -func (m *CriuVersion) Reset() { *m = CriuVersion{} } -func (m *CriuVersion) String() string { return proto.CompactTextString(m) } -func (*CriuVersion) ProtoMessage() {} -func (*CriuVersion) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{14} } - -func (m *CriuVersion) GetMajor() int32 { - if m != nil && m.Major != nil { - return *m.Major - } - return 0 -} - -func (m *CriuVersion) GetMinor() int32 { - if m != nil && m.Minor != nil { - return *m.Minor - } - return 0 -} - -func (m *CriuVersion) GetGitid() string { - if m != nil && m.Gitid != nil { - return *m.Gitid - } - return "" -} - -func (m *CriuVersion) GetSublevel() int32 { - if m != nil && m.Sublevel != nil { - return *m.Sublevel - } - return 0 -} - -func (m *CriuVersion) GetExtra() int32 { - if m != nil && m.Extra != nil { - return *m.Extra - } - return 0 -} - -func (m *CriuVersion) GetName() string { - if m != nil && m.Name != nil { - return *m.Name - } - return "" -} - -func init() { - proto.RegisterType((*CriuPageServerInfo)(nil), "criu_page_server_info") - proto.RegisterType((*CriuVethPair)(nil), "criu_veth_pair") - proto.RegisterType((*ExtMountMap)(nil), "ext_mount_map") - proto.RegisterType((*JoinNamespace)(nil), "join_namespace") - proto.RegisterType((*InheritFd)(nil), "inherit_fd") - 
proto.RegisterType((*CgroupRoot)(nil), "cgroup_root") - proto.RegisterType((*UnixSk)(nil), "unix_sk") - proto.RegisterType((*CriuOpts)(nil), "criu_opts") - proto.RegisterType((*CriuDumpResp)(nil), "criu_dump_resp") - proto.RegisterType((*CriuRestoreResp)(nil), "criu_restore_resp") - proto.RegisterType((*CriuNotify)(nil), "criu_notify") - proto.RegisterType((*CriuFeatures)(nil), "criu_features") - proto.RegisterType((*CriuReq)(nil), "criu_req") - proto.RegisterType((*CriuResp)(nil), "criu_resp") - proto.RegisterType((*CriuVersion)(nil), "criu_version") - proto.RegisterEnum("CriuCgMode", CriuCgMode_name, CriuCgMode_value) - proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value) -} - -func init() { proto.RegisterFile("rpc/rpc.proto", fileDescriptor0) } - -var fileDescriptor0 = []byte{ - // 1835 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x56, 0xeb, 0x72, 0x5b, 0xb7, - 0x11, 0x0e, 0x29, 0xf1, 0x06, 0x5e, 0x7c, 0x0c, 0x5f, 0x02, 0xc7, 0xb5, 0xad, 0xd0, 0x51, 0xa2, - 0x2a, 0x2e, 0x93, 0x30, 0x76, 0x5c, 0x67, 0xda, 0x1f, 0x1e, 0x8a, 0x74, 0xd8, 0x48, 0x22, 0x07, - 0xa4, 0xdc, 0xc9, 0x2f, 0xcc, 0xd1, 0x39, 0x20, 0x05, 0xf3, 0xdc, 0x0a, 0x80, 0x8a, 0xe4, 0x97, - 0xe8, 0xbf, 0x3e, 0x57, 0xde, 0xa4, 0xaf, 0xd0, 0xd9, 0x05, 0x28, 0x4b, 0x49, 0x66, 0xd2, 0x7f, - 0xd8, 0x0f, 0xbb, 0xc0, 0xde, 0x77, 0x49, 0x5b, 0x17, 0xd1, 0x57, 0xba, 0x88, 0x7a, 0x85, 0xce, - 0x6d, 0xde, 0x5d, 0x92, 0x7b, 0x91, 0x56, 0x6b, 0x51, 0x84, 0x4b, 0x29, 0x8c, 0xd4, 0xe7, 0x52, - 0x0b, 0x95, 0x2d, 0x72, 0xca, 0x48, 0x2d, 0x8c, 0x63, 0x2d, 0x8d, 0x61, 0xa5, 0x9d, 0xd2, 0x5e, - 0x83, 0x6f, 0x48, 0x4a, 0xc9, 0x76, 0x91, 0x6b, 0xcb, 0xca, 0x3b, 0xa5, 0xbd, 0x0a, 0xc7, 0x33, - 0x0d, 0xc8, 0x56, 0xa1, 0x62, 0xb6, 0x85, 0x10, 0x1c, 0x69, 0x87, 0x94, 0x17, 0x31, 0xdb, 0x46, - 0xa0, 0xbc, 0x88, 0xbb, 0x7f, 0x23, 0x1d, 0xfc, 0xe8, 0x5c, 0xda, 0x33, 0x51, 0x84, 0x4a, 0xd3, - 0x3b, 0xa4, 0xa2, 0x16, 0x42, 0x65, 0xac, 0xb4, 0x53, 0xde, 
0x6b, 0xf0, 0x6d, 0xb5, 0x18, 0x67, - 0xf4, 0x1e, 0xa9, 0xaa, 0x85, 0xc8, 0xd7, 0xf0, 0x3c, 0xa0, 0x15, 0xb5, 0x98, 0xac, 0x6d, 0xf7, - 0x5b, 0xd2, 0x96, 0x17, 0x56, 0xa4, 0xf9, 0x3a, 0xb3, 0x22, 0x0d, 0x0b, 0xf8, 0x70, 0x25, 0x2f, - 0xbd, 0x28, 0x1c, 0x01, 0x39, 0x0f, 0x13, 0x2f, 0x06, 0xc7, 0xee, 0x5b, 0xd2, 0x79, 0x97, 0xab, - 0x4c, 0x64, 0x61, 0x2a, 0x4d, 0x11, 0x46, 0x12, 0x94, 0xca, 0x8c, 0x17, 0x2a, 0x67, 0x86, 0x7e, - 0x4c, 0x6a, 0x99, 0x11, 0x0b, 0x95, 0x48, 0x2f, 0x57, 0xcd, 0xcc, 0x48, 0x25, 0x92, 0x3e, 0x24, - 0x0d, 0x79, 0x61, 0x75, 0x28, 0xf2, 0xc2, 0xa2, 0x55, 0x0d, 0x5e, 0x47, 0x60, 0x52, 0xd8, 0x6e, - 0x8f, 0x10, 0x95, 0x9d, 0x49, 0xad, 0xac, 0x58, 0xc4, 0xbf, 0xa3, 0x89, 0x33, 0x1d, 0x1e, 0x74, - 0xa6, 0xbf, 0x20, 0xcd, 0x68, 0xa9, 0xf3, 0x75, 0x21, 0x74, 0x9e, 0x5b, 0xf0, 0x5f, 0x64, 0x75, - 0xe2, 0xdd, 0x8a, 0x67, 0xf4, 0x69, 0x68, 0xcf, 0xbc, 0x16, 0x78, 0xee, 0x3e, 0x21, 0xb5, 0x75, - 0xa6, 0x2e, 0x84, 0x59, 0xd1, 0xbb, 0xa4, 0xa2, 0xb2, 0x3c, 0x96, 0xf8, 0x4b, 0x9b, 0x3b, 0xa2, - 0xfb, 0xdf, 0x36, 0x69, 0xa0, 0x4f, 0xf3, 0xc2, 0x1a, 0xda, 0x25, 0x6d, 0x95, 0x86, 0x4b, 0x69, - 0x44, 0xac, 0xb4, 0x58, 0xc4, 0xc8, 0x5b, 0xe1, 0x4d, 0x07, 0x1e, 0x28, 0x3d, 0x8a, 0x37, 0x61, - 0x2a, 0x7f, 0x08, 0xd3, 0x53, 0xd2, 0x4e, 0x64, 0x78, 0x2e, 0x85, 0x5e, 0x67, 0x99, 0xca, 0x96, - 0x68, 0x6c, 0x9d, 0xb7, 0x10, 0xe4, 0x0e, 0xa3, 0x8f, 0x49, 0x13, 0xbc, 0xef, 0xb5, 0xc1, 0xa0, - 0xd6, 0x39, 0x38, 0xe8, 0x24, 0x53, 0x17, 0xb3, 0x15, 0xfd, 0x82, 0xdc, 0xb2, 0x51, 0x21, 0xa4, - 0xb1, 0xe1, 0x69, 0xa2, 0xcc, 0x99, 0x8c, 0x59, 0x05, 0x79, 0x3a, 0x36, 0x2a, 0x86, 0x1f, 0x50, - 0x60, 0x94, 0xe7, 0xa1, 0x51, 0xe7, 0x52, 0xc4, 0xf2, 0x5c, 0x45, 0xd2, 0xb0, 0xaa, 0x63, 0xf4, - 0xf0, 0x81, 0x43, 0xc1, 0xff, 0xe6, 0x4c, 0x26, 0x89, 0x78, 0x97, 0x9f, 0xb2, 0x1a, 0xb2, 0xd4, - 0x11, 0xf8, 0x47, 0x7e, 0x4a, 0x1f, 0x11, 0x02, 0x21, 0x13, 0x49, 0x1e, 0xad, 0x0c, 0xab, 0x3b, - 0x6d, 0x00, 0x39, 0x04, 0x80, 0x3e, 0x26, 0x8d, 0x24, 0x5f, 0x8a, 0x44, 0x9e, 0xcb, 0x84, 0x35, - 
0xc0, 0xd4, 0xef, 0x4b, 0x7d, 0x5e, 0x4f, 0xf2, 0xe5, 0x21, 0x40, 0xf4, 0x01, 0x81, 0xb3, 0x8b, - 0x3a, 0x71, 0xa9, 0x9d, 0xe4, 0x4b, 0x0c, 0xfb, 0xe7, 0xa4, 0x5c, 0x18, 0xd6, 0xdc, 0x29, 0xed, - 0x35, 0xfb, 0xf7, 0x7b, 0xbf, 0x5b, 0x18, 0xbc, 0x5c, 0x18, 0xba, 0x4b, 0x3a, 0x59, 0x6e, 0xd5, - 0xe2, 0x52, 0x98, 0x48, 0xab, 0xc2, 0x1a, 0xd6, 0x42, 0x2d, 0xda, 0x0e, 0x9d, 0x39, 0x10, 0xa2, - 0x0a, 0x11, 0x67, 0x6d, 0x17, 0x69, 0x8c, 0xfe, 0x23, 0x42, 0x8a, 0x50, 0xcb, 0xcc, 0x0a, 0x95, - 0x2e, 0x59, 0x07, 0x6f, 0x1a, 0x0e, 0x19, 0xa7, 0x4b, 0x30, 0xdc, 0xea, 0x30, 0x5a, 0x89, 0x54, - 0xa6, 0xec, 0x96, 0x33, 0x1c, 0x81, 0x23, 0x99, 0x82, 0x6c, 0xb8, 0xb6, 0xb9, 0x88, 0x65, 0xbc, - 0x2e, 0x58, 0xe0, 0x0c, 0x07, 0xe4, 0x00, 0x00, 0x08, 0xd3, 0xcf, 0xb9, 0x5e, 0x6d, 0xe2, 0x7f, - 0x1b, 0xa3, 0xdc, 0x00, 0xc8, 0x45, 0xff, 0x11, 0x21, 0x89, 0xca, 0x56, 0x42, 0xcb, 0x34, 0x2c, - 0x18, 0x75, 0xe2, 0x80, 0x70, 0x00, 0xe8, 0x2e, 0xa9, 0x40, 0x71, 0x1a, 0x76, 0x67, 0x67, 0x6b, - 0xaf, 0xd9, 0xbf, 0xd5, 0xbb, 0x59, 0xaf, 0xdc, 0xdd, 0xd2, 0xa7, 0xa4, 0x16, 0x15, 0x6b, 0x11, - 0x85, 0x05, 0xbb, 0xbb, 0x53, 0xda, 0x6b, 0x7f, 0x4f, 0x9e, 0xf7, 0x5f, 0x3d, 0x7f, 0xf5, 0xdd, - 0xcb, 0xfe, 0xab, 0x17, 0xbc, 0x1a, 0x15, 0xeb, 0x41, 0x58, 0xd0, 0x27, 0xa4, 0xb9, 0xc8, 0x75, - 0x24, 0x85, 0xd2, 0xf0, 0xd7, 0x3d, 0xfc, 0x8b, 0x20, 0x34, 0x06, 0x04, 0x82, 0x20, 0x2f, 0x64, - 0x24, 0xa2, 0x34, 0x66, 0xf7, 0x77, 0xb6, 0x20, 0x08, 0x40, 0x0f, 0x52, 0x48, 0x92, 0x1a, 0xd6, - 0x7a, 0x66, 0xd9, 0xc7, 0xa8, 0x49, 0xa7, 0x77, 0xa3, 0xf6, 0x79, 0x55, 0x5e, 0xd8, 0xa3, 0xcc, - 0x42, 0x14, 0xd2, 0x30, 0x83, 0xf8, 0xb8, 0xf2, 0x32, 0x8c, 0xb9, 0x28, 0x38, 0x74, 0xe0, 0x40, - 0xba, 0x4b, 0x6a, 0xd1, 0x12, 0x4b, 0x8f, 0x3d, 0xc0, 0xf7, 0x5a, 0xbd, 0x6b, 0xe5, 0xc8, 0xab, - 0xd1, 0x92, 0x43, 0x60, 0x9e, 0x90, 0xa6, 0x36, 0x56, 0x18, 0x75, 0x9a, 0x40, 0x1d, 0x7c, 0xe2, - 0x54, 0xd6, 0xc6, 0xce, 0x1c, 0x42, 0xf7, 0xaf, 0x97, 0x3d, 0x7b, 0x88, 0x4f, 0x35, 0x7b, 0x1f, - 0x20, 0xde, 0xf0, 0xe7, 0x51, 0x4c, 
0x77, 0x48, 0x0b, 0x23, 0xb5, 0x31, 0xe4, 0x4f, 0xee, 0x35, - 0xc0, 0x86, 0x4e, 0xf9, 0x27, 0xae, 0xa6, 0xcc, 0x59, 0xa8, 0xe1, 0xbb, 0x47, 0x8e, 0x41, 0x5e, - 0xd8, 0x99, 0x43, 0x36, 0x0c, 0x69, 0x68, 0xac, 0xd4, 0x86, 0x3d, 0xbe, 0x62, 0x38, 0x72, 0x08, - 0xb8, 0xd0, 0xac, 0x54, 0x81, 0xef, 0x3f, 0x71, 0x2e, 0x04, 0x1a, 0x1e, 0x87, 0xf6, 0x95, 0x85, - 0xa7, 0x89, 0x14, 0x0b, 0xc3, 0x76, 0xf0, 0xae, 0xee, 0x80, 0x91, 0xa1, 0x7b, 0xa4, 0xe9, 0x2b, - 0x59, 0xa8, 0x2c, 0x67, 0x9f, 0xa2, 0x21, 0xf5, 0x9e, 0xc7, 0x78, 0x63, 0x8d, 0x45, 0x3d, 0xce, - 0x72, 0xfa, 0x77, 0x72, 0xe7, 0xa6, 0x83, 0x45, 0x0a, 0x4d, 0xa8, 0xbb, 0x53, 0xda, 0xeb, 0xf4, - 0xdb, 0x2e, 0x3f, 0xa2, 0x25, 0x82, 0xfc, 0xf6, 0x0d, 0xa7, 0x1f, 0xe5, 0xb1, 0x84, 0x8f, 0x96, - 0x67, 0xb9, 0xb1, 0x22, 0x51, 0xa9, 0xb2, 0xec, 0x29, 0x66, 0x4b, 0xed, 0x9b, 0xaf, 0x9f, 0xff, - 0xf5, 0xc5, 0xcb, 0xef, 0x38, 0xc1, 0xbb, 0x43, 0xb8, 0xa2, 0x7b, 0x24, 0xc0, 0x44, 0x11, 0x26, - 0x0a, 0x33, 0x01, 0xdd, 0xcf, 0xb0, 0xcf, 0x50, 0xed, 0x0e, 0xe2, 0xb3, 0x28, 0xcc, 0xa6, 0x80, - 0xd2, 0x4f, 0x20, 0x6f, 0xac, 0xd4, 0x59, 0x98, 0xb0, 0x5d, 0x6f, 0x98, 0xa7, 0x31, 0xa7, 0xd2, - 0xc2, 0x5e, 0x8a, 0xcc, 0xb0, 0xcf, 0xe1, 0x33, 0x5e, 0x43, 0xfa, 0x18, 0x6c, 0xae, 0xb9, 0x51, - 0x60, 0xd8, 0x17, 0x3e, 0xbb, 0x6f, 0x8e, 0x06, 0x5e, 0x05, 0xfa, 0xd8, 0xd0, 0x4f, 0x49, 0xcb, - 0x67, 0x47, 0xa1, 0xf3, 0xc2, 0xb0, 0x3f, 0x63, 0x85, 0xfa, 0x06, 0x3e, 0x05, 0x88, 0xee, 0x93, - 0xdb, 0xd7, 0x59, 0x5c, 0x27, 0xd9, 0x47, 0xbe, 0x5b, 0xd7, 0xf8, 0xb0, 0xa3, 0x3c, 0x27, 0xf7, - 0x3d, 0x6f, 0xbc, 0x4e, 0x0b, 0x11, 0xe5, 0x99, 0xd5, 0x79, 0x92, 0x48, 0xcd, 0xbe, 0x44, 0xed, - 0xef, 0xba, 0xdb, 0x83, 0x75, 0x5a, 0x0c, 0xae, 0xee, 0xa0, 0x2b, 0x2f, 0xb4, 0x94, 0xef, 0x37, - 0x8e, 0x67, 0xcf, 0xf0, 0xf5, 0x96, 0x03, 0x9d, 0x8f, 0x61, 0x42, 0x5b, 0x95, 0x4a, 0x98, 0x95, - 0x7f, 0x71, 0xd6, 0x7a, 0x92, 0x7e, 0x49, 0x28, 0xf4, 0x63, 0xcc, 0x0e, 0x95, 0x89, 0x45, 0xa2, - 0x96, 0x67, 0x96, 0xf5, 0x30, 0x83, 0xa0, 0x53, 0xcf, 0x56, 0xaa, 0x18, 
0x67, 0x23, 0x84, 0xc1, - 0xe0, 0x9f, 0x65, 0xb8, 0x12, 0xe6, 0xd2, 0x44, 0x36, 0x31, 0xec, 0x2b, 0x64, 0x6b, 0x02, 0x36, - 0x73, 0x10, 0x36, 0x8e, 0xf0, 0xfd, 0x25, 0xf6, 0x42, 0xc3, 0xbe, 0xf6, 0x8d, 0x23, 0x7c, 0x7f, - 0x39, 0x05, 0x00, 0x9b, 0xb5, 0x0d, 0xed, 0xda, 0x40, 0x5d, 0x7c, 0x83, 0x5d, 0xa7, 0xee, 0x80, - 0x51, 0x0c, 0xce, 0xca, 0x75, 0x71, 0x06, 0x61, 0xb5, 0xc6, 0x67, 0x33, 0xeb, 0x3b, 0x55, 0xdc, - 0xc5, 0xd4, 0x1a, 0x97, 0xd2, 0x90, 0xf2, 0x51, 0x9e, 0x2d, 0x94, 0x6f, 0xce, 0xdf, 0xa2, 0xd1, - 0xc4, 0x41, 0xe0, 0xcd, 0xee, 0x33, 0xbf, 0x44, 0xa0, 0x2f, 0xb5, 0x34, 0x05, 0xe4, 0x83, 0x96, - 0xc6, 0xe6, 0x5a, 0xc6, 0x38, 0x50, 0xeb, 0xfc, 0x8a, 0xee, 0xee, 0x92, 0xdb, 0xc8, 0xed, 0x01, - 0x27, 0xe0, 0x47, 0xa0, 0x1b, 0x8e, 0x70, 0xec, 0xbe, 0x24, 0x4d, 0x64, 0x73, 0xbd, 0x9b, 0xde, - 0x27, 0x55, 0xd7, 0xd4, 0xfd, 0x80, 0xf6, 0xd4, 0x6f, 0x67, 0x67, 0xf7, 0x47, 0xd2, 0x46, 0xc1, - 0x85, 0x0c, 0xed, 0x5a, 0x3b, 0x47, 0xa4, 0x32, 0x15, 0xd8, 0xaf, 0x37, 0xda, 0xa4, 0x32, 0x9d, - 0x03, 0xfd, 0x2b, 0x27, 0x96, 0x7f, 0xe5, 0xc4, 0xee, 0x2f, 0x25, 0x52, 0xf7, 0xda, 0xfe, 0x8b, - 0x76, 0xc9, 0xb6, 0xbd, 0x2c, 0xdc, 0xb8, 0xef, 0xf4, 0x3b, 0xbd, 0xcd, 0x85, 0x00, 0x94, 0xe3, - 0x1d, 0x7d, 0x4c, 0xb6, 0x61, 0xee, 0xe3, 0x4b, 0xcd, 0x3e, 0xe9, 0x5d, 0x6d, 0x02, 0x1c, 0xf1, - 0xeb, 0x33, 0x6a, 0x1d, 0x45, 0xb0, 0xc7, 0x6d, 0xdd, 0x98, 0x51, 0x0e, 0x04, 0x9d, 0x57, 0x52, - 0x16, 0x22, 0x2f, 0x64, 0xe6, 0x27, 0x7b, 0x1d, 0x80, 0x49, 0x21, 0x33, 0xba, 0x4f, 0xea, 0x1b, - 0xe3, 0x70, 0xa2, 0x37, 0x37, 0xba, 0x6c, 0x50, 0x7e, 0x75, 0xbf, 0xf1, 0x4f, 0x15, 0x53, 0x11, - 0xfd, 0xf3, 0xef, 0x2d, 0xbf, 0x9f, 0xa0, 0xe3, 0xff, 0x1f, 0x9b, 0x18, 0xa9, 0x6d, 0x94, 0x85, - 0x4d, 0xa8, 0xce, 0x37, 0x24, 0x7d, 0x4a, 0xb6, 0x21, 0xe8, 0x68, 0xc3, 0xd5, 0x6c, 0xba, 0x4a, - 0x03, 0x8e, 0x97, 0xf4, 0x19, 0xa9, 0xf9, 0x58, 0xa3, 0x25, 0xcd, 0x3e, 0xed, 0xfd, 0x26, 0x01, - 0xf8, 0x86, 0x85, 0x7e, 0x46, 0xaa, 0xce, 0x15, 0xde, 0xb4, 0x56, 0xef, 0x5a, 0x1a, 0x70, 0x7f, - 0xe7, 0x57, 
0x82, 0xea, 0x1f, 0xae, 0x04, 0x0f, 0x20, 0x7c, 0x42, 0x6a, 0x9d, 0xe5, 0xb8, 0xb0, - 0x54, 0x78, 0x2d, 0xd2, 0x43, 0x20, 0x6f, 0x78, 0xb1, 0xfe, 0x07, 0x5e, 0x7c, 0x08, 0x2e, 0x83, - 0x67, 0x52, 0xb3, 0xc4, 0xe5, 0xa5, 0xc1, 0xeb, 0xf8, 0x4e, 0x6a, 0x96, 0x30, 0x19, 0xcf, 0xa5, - 0x36, 0x2a, 0xcf, 0x70, 0x71, 0x69, 0x6e, 0x7a, 0xb0, 0x07, 0xf9, 0xe6, 0x16, 0x73, 0x18, 0x0b, - 0x10, 0x77, 0x99, 0x0a, 0xf7, 0x54, 0xf7, 0x3f, 0x25, 0xd2, 0xba, 0x2e, 0x01, 0x8b, 0x65, 0x1a, - 0xbe, 0xcb, 0xb5, 0xaf, 0x07, 0x47, 0x20, 0xaa, 0xb2, 0x5c, 0xfb, 0x1d, 0xd6, 0x11, 0x80, 0x2e, - 0x95, 0xf5, 0x5b, 0x7e, 0x83, 0x3b, 0x02, 0x0a, 0xd0, 0xac, 0x4f, 0xdd, 0xb2, 0xb5, 0xed, 0x6b, - 0xdf, 0xd3, 0x20, 0x81, 0x4b, 0x33, 0x3a, 0xb8, 0xc2, 0x1d, 0x01, 0x5b, 0x11, 0xb4, 0x5d, 0xf4, - 0x69, 0x83, 0xe3, 0x79, 0x5f, 0x78, 0xbd, 0xfc, 0x34, 0xa1, 0x84, 0x54, 0xc7, 0x6f, 0x8e, 0x27, - 0x7c, 0x18, 0x7c, 0x44, 0x9b, 0xa4, 0x36, 0x78, 0x23, 0x8e, 0x27, 0xc7, 0xc3, 0xa0, 0x44, 0x1b, - 0xa4, 0x32, 0xe5, 0x93, 0xe9, 0x2c, 0x28, 0xd3, 0x3a, 0xd9, 0x9e, 0x4d, 0x46, 0xf3, 0x60, 0x0b, - 0x4e, 0xa3, 0x93, 0xc3, 0xc3, 0x60, 0x1b, 0xe4, 0x66, 0x73, 0x3e, 0x1e, 0xcc, 0x83, 0x0a, 0xc8, - 0x1d, 0x0c, 0x47, 0xaf, 0x4f, 0x0e, 0xe7, 0x41, 0x75, 0xff, 0x97, 0x92, 0x2f, 0xd6, 0x4d, 0xc6, - 0xc1, 0x4b, 0xc3, 0xa3, 0xe9, 0xfc, 0xa7, 0xe0, 0x23, 0x90, 0x3f, 0x38, 0x39, 0x9a, 0x06, 0x25, - 0x90, 0xe1, 0xc3, 0xd9, 0x1c, 0x3e, 0x2e, 0x03, 0xc7, 0xe0, 0x87, 0xe1, 0xe0, 0xc7, 0x60, 0x8b, - 0xb6, 0x48, 0x7d, 0xca, 0x87, 0x02, 0xb9, 0xb6, 0xe9, 0x2d, 0xd2, 0x9c, 0xbe, 0x7e, 0x33, 0x14, - 0xb3, 0x21, 0x7f, 0x3b, 0xe4, 0x41, 0x05, 0xbe, 0x3d, 0x9e, 0xcc, 0xc7, 0xa3, 0x9f, 0x82, 0x2a, - 0x0d, 0x48, 0x6b, 0x30, 0x3d, 0x19, 0x1f, 0x8f, 0x26, 0x8e, 0xbd, 0x46, 0x6f, 0x93, 0xf6, 0x06, - 0x71, 0xef, 0xd5, 0x01, 0x1a, 0x0d, 0x5f, 0xcf, 0x4f, 0xf8, 0xd0, 0x43, 0x0d, 0xf8, 0xfa, 0xed, - 0x90, 0xcf, 0xc6, 0x93, 0xe3, 0x80, 0xc0, 0x7f, 0xff, 0x7c, 0x3d, 0x9e, 0x8b, 0xe9, 0xf8, 0x20, - 0x68, 0xd2, 0xbb, 0x24, 0xb8, 0xf6, 0x9f, 0x18, 
0xfc, 0x70, 0x78, 0x10, 0xb4, 0xfe, 0x17, 0x00, - 0x00, 0xff, 0xff, 0xf8, 0x9f, 0x0e, 0x7d, 0xca, 0x0d, 0x00, 0x00, -} diff --git a/vendor/github.com/checkpoint-restore/go-criu/test/main.go b/vendor/github.com/checkpoint-restore/go-criu/test/main.go deleted file mode 100644 index 418ebb8..0000000 --- a/vendor/github.com/checkpoint-restore/go-criu/test/main.go +++ /dev/null @@ -1,133 +0,0 @@ -package main - -import ( - "fmt" - "github.com/checkpoint-restore/go-criu" - "github.com/checkpoint-restore/go-criu/rpc" - "github.com/golang/protobuf/proto" - "os" - "strconv" -) - -// TestNfy struct -type TestNfy struct { - criu.NoNotify -} - -// PreDump test function -func (c TestNfy) PreDump() error { - fmt.Printf("TEST PRE DUMP\n") - return nil -} - -func doDump(c *criu.Criu, pidS string, imgDir string, pre bool, prevImg string) error { - fmt.Printf("Dumping\n") - pid, _ := strconv.Atoi(pidS) - img, err := os.Open(imgDir) - if err != nil { - return fmt.Errorf("can't open image dir (%s)", err) - } - defer img.Close() - - opts := rpc.CriuOpts{ - Pid: proto.Int32(int32(pid)), - ImagesDirFd: proto.Int32(int32(img.Fd())), - LogLevel: proto.Int32(4), - LogFile: proto.String("dump.log"), - } - - if prevImg != "" { - opts.ParentImg = proto.String(prevImg) - opts.TrackMem = proto.Bool(true) - } - - if pre { - err = c.PreDump(opts, TestNfy{}) - } else { - err = c.Dump(opts, TestNfy{}) - } - if err != nil { - return fmt.Errorf("dump fail (%s)", err) - } - - return nil -} - -// Usage: test $act $pid $images_dir -func main() { - c := criu.MakeCriu() - // Read out CRIU version - version, err := c.GetCriuVersion() - if err != nil { - fmt.Println(err) - os.Exit(1) - } - fmt.Println("CRIU version", version) - // Check if version at least 3.2 - result, err := c.IsCriuAtLeast(30200) - if err != nil { - fmt.Println(err) - os.Exit(1) - } - if !result { - fmt.Println("CRIU too old") - os.Exit(1) - } - act := os.Args[1] - switch act { - case "dump": - err := doDump(c, os.Args[2], os.Args[3], 
false, "") - if err != nil { - fmt.Print(err) - os.Exit(1) - } - case "dump2": - err := c.Prepare() - if err != nil { - fmt.Print(err) - os.Exit(1) - } - - err = doDump(c, os.Args[2], os.Args[3]+"/pre", true, "") - if err != nil { - fmt.Printf("pre-dump failed") - fmt.Print(err) - os.Exit(1) - } - err = doDump(c, os.Args[2], os.Args[3], false, "./pre") - if err != nil { - fmt.Printf("dump failed") - fmt.Print(err) - os.Exit(1) - } - - c.Cleanup() - case "restore": - fmt.Printf("Restoring\n") - img, err := os.Open(os.Args[2]) - if err != nil { - fmt.Printf("can't open image dir") - os.Exit(1) - } - defer img.Close() - - opts := rpc.CriuOpts{ - ImagesDirFd: proto.Int32(int32(img.Fd())), - LogLevel: proto.Int32(4), - LogFile: proto.String("restore.log"), - } - - err = c.Restore(opts, nil) - if err != nil { - fmt.Printf("Error:") - fmt.Print(err) - fmt.Printf("\n") - os.Exit(1) - } - default: - fmt.Printf("unknown action\n") - os.Exit(1) - } - - fmt.Printf("Success\n") -} diff --git a/vendor/github.com/checkpoint-restore/go-criu/test/phaul-main.go b/vendor/github.com/checkpoint-restore/go-criu/test/phaul-main.go deleted file mode 100644 index f1bec2c..0000000 --- a/vendor/github.com/checkpoint-restore/go-criu/test/phaul-main.go +++ /dev/null @@ -1,192 +0,0 @@ -package main - -import ( - "fmt" - "os" - "strconv" - "strings" - "syscall" - - "github.com/checkpoint-restore/go-criu" - "github.com/checkpoint-restore/go-criu/phaul" - "github.com/checkpoint-restore/go-criu/rpc" - "github.com/golang/protobuf/proto" -) - -type testLocal struct { - criu.NoNotify - r *testRemote -} - -type testRemote struct { - srv *phaul.Server -} - -/* Dir where test will put dump images */ -const imagesDir = "image" - -func prepareImages() error { - err := os.Mkdir(imagesDir, 0700) - if err != nil { - return err - } - - /* Work dir for PhaulClient */ - err = os.Mkdir(imagesDir+"/local", 0700) - if err != nil { - return err - } - - /* Work dir for PhaulServer */ - err = 
os.Mkdir(imagesDir+"/remote", 0700) - if err != nil { - return err - } - - /* Work dir for DumpCopyRestore */ - err = os.Mkdir(imagesDir+"/test", 0700) - if err != nil { - return err - } - - return nil -} - -func mergeImages(dumpDir, lastPreDumpDir string) error { - idir, err := os.Open(dumpDir) - if err != nil { - return err - } - - defer idir.Close() - - imgs, err := idir.Readdirnames(0) - if err != nil { - return err - } - - for _, fname := range imgs { - if !strings.HasSuffix(fname, ".img") { - continue - } - - fmt.Printf("\t%s -> %s/\n", fname, lastPreDumpDir) - err = syscall.Link(dumpDir+"/"+fname, lastPreDumpDir+"/"+fname) - if err != nil { - return err - } - } - - return nil -} - -func (r *testRemote) doRestore() error { - lastSrvImagesDir := r.srv.LastImagesDir() - /* - * In imagesDir we have images from dump, in the - * lastSrvImagesDir -- where server-side images - * (from page server, with pages and pagemaps) are. - * Need to put former into latter and restore from - * them. 
- */ - err := mergeImages(imagesDir+"/test", lastSrvImagesDir) - if err != nil { - return err - } - - imgDir, err := os.Open(lastSrvImagesDir) - if err != nil { - return err - } - defer imgDir.Close() - - opts := rpc.CriuOpts{ - LogLevel: proto.Int32(4), - LogFile: proto.String("restore.log"), - ImagesDirFd: proto.Int32(int32(imgDir.Fd())), - } - - cr := r.srv.GetCriu() - fmt.Printf("Do restore\n") - return cr.Restore(opts, nil) -} - -func (l *testLocal) PostDump() error { - return l.r.doRestore() -} - -func (l *testLocal) DumpCopyRestore(cr *criu.Criu, cfg phaul.Config, lastClnImagesDir string) error { - fmt.Printf("Final stage\n") - - imgDir, err := os.Open(imagesDir + "/test") - if err != nil { - return err - } - defer imgDir.Close() - - psi := rpc.CriuPageServerInfo{ - Fd: proto.Int32(int32(cfg.Memfd)), - } - - opts := rpc.CriuOpts{ - Pid: proto.Int32(int32(cfg.Pid)), - LogLevel: proto.Int32(4), - LogFile: proto.String("dump.log"), - ImagesDirFd: proto.Int32(int32(imgDir.Fd())), - TrackMem: proto.Bool(true), - ParentImg: proto.String(lastClnImagesDir), - Ps: &psi, - } - - fmt.Printf("Do dump\n") - return cr.Dump(opts, l) -} - -func main() { - pid, _ := strconv.Atoi(os.Args[1]) - fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM, 0) - if err != nil { - fmt.Printf("Can't make socketpair: %v\n", err) - os.Exit(1) - } - - err = prepareImages() - if err != nil { - fmt.Printf("Can't prepare dirs for images: %v\n", err) - os.Exit(1) - return - } - - fmt.Printf("Make server part (socket %d)\n", fds[1]) - srv, err := phaul.MakePhaulServer(phaul.Config{ - Pid: pid, - Memfd: fds[1], - Wdir: imagesDir + "/remote"}) - if err != nil { - fmt.Printf("Unable to run a server: %v", err) - os.Exit(1) - return - } - - r := &testRemote{srv} - - fmt.Printf("Make client part (socket %d)\n", fds[0]) - cln, err := phaul.MakePhaulClient(&testLocal{r: r}, srv, - phaul.Config{ - Pid: pid, - Memfd: fds[0], - Wdir: imagesDir + "/local"}) - if err != nil { - 
fmt.Printf("Unable to run a client: %v\n", err) - os.Exit(1) - } - - fmt.Printf("Migrate\n") - err = cln.Migrate() - if err != nil { - fmt.Printf("Failed: %v\n", err) - os.Exit(1) - } - - fmt.Printf("SUCCESS!\n") -} diff --git a/vendor/github.com/checkpoint-restore/go-criu/test/piggie.c b/vendor/github.com/checkpoint-restore/go-criu/test/piggie.c deleted file mode 100644 index 1dc0801..0000000 --- a/vendor/github.com/checkpoint-restore/go-criu/test/piggie.c +++ /dev/null @@ -1,57 +0,0 @@ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include - -#define STKS (4*4096) - -#ifndef CLONE_NEWPID -#define CLONE_NEWPID 0x20000000 -#endif - -static int do_test(void *logf) -{ - int fd, i = 0; - - setsid(); - - close(0); - close(1); - close(2); - - fd = open("/dev/null", O_RDONLY); - if (fd != 0) { - dup2(fd, 0); - close(fd); - } - - fd = open(logf, O_WRONLY | O_TRUNC | O_CREAT, 0600); - dup2(fd, 1); - dup2(fd, 2); - if (fd != 1 && fd != 2) - close(fd); - - while (1) { - sleep(1); - printf("%d\n", i++); - fflush(stdout); - } - - return 0; -} - -int main(int argc, char **argv) -{ - int pid; - void *stk; - - stk = mmap(NULL, STKS, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, 0, 0); - pid = clone(do_test, stk + STKS, SIGCHLD | CLONE_NEWPID, argv[1]); - printf("Child forked, pid %d\n", pid); - - return 0; -} diff --git a/vendor/github.com/checkpoint-restore/go-criu/v5/.gitignore b/vendor/github.com/checkpoint-restore/go-criu/v5/.gitignore new file mode 100644 index 0000000..1b87ff1 --- /dev/null +++ b/vendor/github.com/checkpoint-restore/go-criu/v5/.gitignore @@ -0,0 +1,6 @@ +test/test +test/test.coverage +test/piggie/piggie +test/phaul/phaul +test/phaul/phaul.coverage +image diff --git a/vendor/github.com/checkpoint-restore/go-criu/v5/.golangci.yml b/vendor/github.com/checkpoint-restore/go-criu/v5/.golangci.yml new file mode 100644 index 0000000..fbbac4b --- /dev/null +++ 
b/vendor/github.com/checkpoint-restore/go-criu/v5/.golangci.yml @@ -0,0 +1,12 @@ +run: + skip_dirs: + - rpc + - stats + +linters: + disable-all: false + presets: + - bugs + - performance + - unused + - format diff --git a/vendor/github.com/checkpoint-restore/go-criu/LICENSE b/vendor/github.com/checkpoint-restore/go-criu/v5/LICENSE similarity index 100% rename from vendor/github.com/checkpoint-restore/go-criu/LICENSE rename to vendor/github.com/checkpoint-restore/go-criu/v5/LICENSE diff --git a/vendor/github.com/checkpoint-restore/go-criu/v5/Makefile b/vendor/github.com/checkpoint-restore/go-criu/v5/Makefile new file mode 100644 index 0000000..67c43a0 --- /dev/null +++ b/vendor/github.com/checkpoint-restore/go-criu/v5/Makefile @@ -0,0 +1,107 @@ +SHELL = /bin/bash +GO ?= go +CC ?= gcc +COVERAGE_PATH ?= $(shell pwd)/.coverage +CRIU_FEATURE_MEM_TRACK = $(shell if criu check --feature mem_dirty_track > /dev/null; then echo 1; else echo 0; fi) +CRIU_FEATURE_LAZY_PAGES = $(shell if criu check --feature uffd-noncoop > /dev/null; then echo 1; else echo 0; fi) +CRIU_FEATURE_PIDFD_STORE = $(shell if criu check --feature pidfd_store > /dev/null; then echo 1; else echo 0; fi) + +export CRIU_FEATURE_MEM_TRACK CRIU_FEATURE_LAZY_PAGES CRIU_FEATURE_PIDFD_STORE + +all: build test phaul-test + +lint: + golangci-lint run ./... + +build: + $(GO) build -v ./... 
+ +TEST_PAYLOAD := test/piggie/piggie +TEST_BINARIES := test/test $(TEST_PAYLOAD) test/phaul/phaul +COVERAGE_BINARIES := test/test.coverage test/phaul/phaul.coverage +test-bin: $(TEST_BINARIES) + +test/piggie/piggie: test/piggie/piggie.c + $(CC) $^ -o $@ + +test/test: test/main.go + $(GO) build -v -o $@ $^ + +test: $(TEST_BINARIES) + mkdir -p image + PID=$$(test/piggie/piggie) && { \ + test/test dump $$PID image && \ + test/test restore image; \ + pkill -9 piggie; \ + } + rm -rf image + +test/phaul/phaul: test/phaul/main.go + $(GO) build -v -o $@ $^ + +phaul-test: $(TEST_BINARIES) + rm -rf image + PID=$$(test/piggie/piggie) && { \ + test/phaul/phaul $$PID; \ + pkill -9 piggie; \ + } + +test/test.coverage: test/*.go + $(GO) test \ + -covermode=count \ + -coverpkg=./... \ + -mod=vendor \ + -tags coverage \ + -buildmode=pie -c -o $@ $^ + +test/phaul/phaul.coverage: test/phaul/*.go + $(GO) test \ + -covermode=count \ + -coverpkg=./... \ + -mod=vendor \ + -tags coverage \ + -buildmode=pie -c -o $@ $^ + +coverage: $(COVERAGE_BINARIES) $(TEST_PAYLOAD) + mkdir -p $(COVERAGE_PATH) + mkdir -p image + PID=$$(test/piggie/piggie) && { \ + test/test.coverage -test.coverprofile=coverprofile.integration.$$RANDOM -test.outputdir=${COVERAGE_PATH} COVERAGE dump $$PID image && \ + test/test.coverage -test.coverprofile=coverprofile.integration.$$RANDOM -test.outputdir=${COVERAGE_PATH} COVERAGE restore image; \ + pkill -9 piggie; \ + } + rm -rf image + PID=$$(test/piggie/piggie) && { \ + test/phaul/phaul.coverage -test.coverprofile=coverprofile.integration.$$RANDOM -test.outputdir=${COVERAGE_PATH} COVERAGE $$PID; \ + pkill -9 piggie; \ + } + echo "mode: set" > .coverage/coverage.out && cat .coverage/coverprofile* | \ + grep -v mode: | sort -r | awk '{if($$1 != last) {print $$0;last=$$1}}' >> .coverage/coverage.out + +clean: + @rm -f $(TEST_BINARIES) $(COVERAGE_BINARIES) codecov + @rm -rf image $(COVERAGE_PATH) + +rpc/rpc.proto: + curl -sSL 
https://raw.githubusercontent.com/checkpoint-restore/criu/master/images/rpc.proto -o $@ + +stats/stats.proto: + curl -sSL https://raw.githubusercontent.com/checkpoint-restore/criu/master/images/stats.proto -o $@ + +rpc/rpc.pb.go: rpc/rpc.proto + protoc --go_out=. --go_opt=M$^=rpc/ $^ + +stats/stats.pb.go: stats/stats.proto + protoc --go_out=. --go_opt=M$^=stats/ $^ + +vendor: + GO111MODULE=on $(GO) mod tidy + GO111MODULE=on $(GO) mod vendor + GO111MODULE=on $(GO) mod verify + +codecov: + curl -Os https://uploader.codecov.io/latest/linux/codecov + chmod +x codecov + ./codecov -f '.coverage/coverage.out' + +.PHONY: build test phaul-test test-bin clean lint vendor coverage codecov diff --git a/vendor/github.com/checkpoint-restore/go-criu/README.md b/vendor/github.com/checkpoint-restore/go-criu/v5/README.md similarity index 53% rename from vendor/github.com/checkpoint-restore/go-criu/README.md rename to vendor/github.com/checkpoint-restore/go-criu/v5/README.md index 610756d..a748332 100644 --- a/vendor/github.com/checkpoint-restore/go-criu/README.md +++ b/vendor/github.com/checkpoint-restore/go-criu/v5/README.md @@ -1,8 +1,10 @@ -[![master](https://travis-ci.org/checkpoint-restore/go-criu.svg?branch=master)](https://travis-ci.org/checkpoint-restore/go-criu) +[![test](https://github.com/checkpoint-restore/go-criu/workflows/ci/badge.svg?branch=master)](https://github.com/checkpoint-restore/go-criu/actions?query=workflow%3Aci) +[![verify](https://github.com/checkpoint-restore/go-criu/workflows/verify/badge.svg?branch=master)](https://github.com/checkpoint-restore/go-criu/actions?query=workflow%3Averify) +[![Go Reference](https://pkg.go.dev/badge/github.com/checkpoint-restore/go-criu.svg)](https://pkg.go.dev/github.com/checkpoint-restore/go-criu) -## go-criu -- Go bindings for [CRIU](https://criu.org/) +## go-criu -- Go bindings for CRIU -This repository provides Go bindings for CRIU. 
The code is based on the Go based PHaul +This repository provides Go bindings for [CRIU](https://criu.org/). The code is based on the Go-based PHaul implementation from the CRIU repository. For easier inclusion into other Go projects the CRIU Go bindings have been moved to this repository. @@ -10,17 +12,48 @@ The Go bindings provide an easy way to use the CRIU RPC calls from Go without th to set up all the infrastructure to make the actual RPC connection to CRIU. The following example would print the version of CRIU: -``` +```go +import ( + "log" + + "github.com/checkpoint-restore/go-criu/v5" +) + +func main() { c := criu.MakeCriu() version, err := c.GetCriuVersion() - fmt.Println(version) + if err != nil { + log.Fatalln(err) + } + log.Println(version) +} ``` + or to just check if at least a certain CRIU version is installed: -``` + +```go c := criu.MakeCriu() result, err := c.IsCriuAtLeast(31100) ``` +## Releases + +The first go-criu release was 3.11 based on CRIU 3.11. The initial plan +was to follow CRIU so that go-criu would carry the same version number as +CRIU. + +As go-criu is imported in other projects and as Go modules are expected +to follow Semantic Versioning go-criu will also follow Semantic Versioning +starting with the 4.0.0 release. + +The following table shows the relation between go-criu and criu versions: + +| Major version | Latest release | CRIU version | +| -------------- | -------------- | ------------ | +| v5             | 5.2.0         | 3.16         | +| v5             | 5.0.0         | 3.15         | +| v4             | 4.1.0         | 3.14         | + ## How to contribute While bug fixes can first be identified via an "issue", that is not required. @@ -53,6 +86,11 @@ by adding a "Signed-off-by" line containing the contributor's name and e-mail to every commit message. Your signature certifies that you wrote the patch or otherwise have the right to pass it on as an open-source patch. 
-### License +### License and copyright -The license of go-criu is the Apache 2.0 license. +Unless mentioned otherwise in a specific file's header, all code in +this project is released under the Apache 2.0 license. + +The author of a change remains the copyright holder of their code +(no copyright assignment). The list of authors and contributors can be +retrieved from the git commit history and in some cases, the file headers. diff --git a/vendor/github.com/checkpoint-restore/go-criu/v5/features.go b/vendor/github.com/checkpoint-restore/go-criu/v5/features.go new file mode 100644 index 0000000..c7127f9 --- /dev/null +++ b/vendor/github.com/checkpoint-restore/go-criu/v5/features.go @@ -0,0 +1,45 @@ +package criu + +import ( + "fmt" + + "github.com/checkpoint-restore/go-criu/v5/rpc" +) + +// Feature checking in go-criu is based on the libcriu feature checking function. + +// Feature checking allows the user to check if CRIU supports +// certain features. There are CRIU features which do not depend +// on the version of CRIU but on kernel features or architecture. +// +// One example is memory tracking. Memory tracking can be disabled +// in the kernel or there are architectures which do not support +// it (aarch64 for example). By using the feature check a libcriu +// user can easily query CRIU if a certain feature is available. +// +// The features which should be checked can be marked in the +// structure 'struct criu_feature_check'. Each structure member +// that is set to true will result in CRIU checking for the +// availability of that feature in the current combination of +// CRIU/kernel/architecture. +// +// Available features will be set to true when the function +// returns successfully. Missing features will be set to false. 
+ +func (c *Criu) FeatureCheck(features *rpc.CriuFeatures) (*rpc.CriuFeatures, error) { + resp, err := c.doSwrkWithResp( + rpc.CriuReqType_FEATURE_CHECK, + nil, + nil, + features, + ) + if err != nil { + return nil, err + } + + if resp.GetType() != rpc.CriuReqType_FEATURE_CHECK { + return nil, fmt.Errorf("Unexpected CRIU RPC response") + } + + return features, nil +} diff --git a/vendor/github.com/checkpoint-restore/go-criu/v5/go.mod b/vendor/github.com/checkpoint-restore/go-criu/v5/go.mod new file mode 100644 index 0000000..cf4fea9 --- /dev/null +++ b/vendor/github.com/checkpoint-restore/go-criu/v5/go.mod @@ -0,0 +1,8 @@ +module github.com/checkpoint-restore/go-criu/v5 + +go 1.13 + +require ( + golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c + google.golang.org/protobuf v1.27.1 +) diff --git a/vendor/github.com/checkpoint-restore/go-criu/v5/go.sum b/vendor/github.com/checkpoint-restore/go-criu/v5/go.sum new file mode 100644 index 0000000..789fdcb --- /dev/null +++ b/vendor/github.com/checkpoint-restore/go-criu/v5/go.sum @@ -0,0 +1,10 @@ +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c h1:VwygUrnw9jn88c4u8GD3rZQbqrP/tgas88tPUbBxQrk= +golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.27.1 h1:SnqbnDw1V7RiZcXPx5MEeqPv2s79L9i7BJUlG/+RurQ= +google.golang.org/protobuf v1.27.1/go.mod 
h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= diff --git a/vendor/github.com/checkpoint-restore/go-criu/v5/main.go b/vendor/github.com/checkpoint-restore/go-criu/v5/main.go new file mode 100644 index 0000000..88b1b24 --- /dev/null +++ b/vendor/github.com/checkpoint-restore/go-criu/v5/main.go @@ -0,0 +1,264 @@ +package criu + +import ( + "errors" + "fmt" + "os" + "os/exec" + "strconv" + "syscall" + + "github.com/checkpoint-restore/go-criu/v5/rpc" + "google.golang.org/protobuf/proto" +) + +// Criu struct +type Criu struct { + swrkCmd *exec.Cmd + swrkSk *os.File + swrkPath string +} + +// MakeCriu returns the Criu object required for most operations +func MakeCriu() *Criu { + return &Criu{ + swrkPath: "criu", + } +} + +// SetCriuPath allows setting the path to the CRIU binary +// if it is in a non standard location +func (c *Criu) SetCriuPath(path string) { + c.swrkPath = path +} + +// Prepare sets up everything for the RPC communication to CRIU +func (c *Criu) Prepare() error { + fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET, 0) + if err != nil { + return err + } + + cln := os.NewFile(uintptr(fds[0]), "criu-xprt-cln") + syscall.CloseOnExec(fds[0]) + srv := os.NewFile(uintptr(fds[1]), "criu-xprt-srv") + defer srv.Close() + + args := []string{"swrk", strconv.Itoa(fds[1])} + // #nosec G204 + cmd := exec.Command(c.swrkPath, args...) 
+ + err = cmd.Start() + if err != nil { + cln.Close() + return err + } + + c.swrkCmd = cmd + c.swrkSk = cln + + return nil +} + +// Cleanup cleans up +func (c *Criu) Cleanup() { + if c.swrkCmd != nil { + c.swrkSk.Close() + c.swrkSk = nil + _ = c.swrkCmd.Wait() + c.swrkCmd = nil + } +} + +func (c *Criu) sendAndRecv(reqB []byte) ([]byte, int, error) { + cln := c.swrkSk + _, err := cln.Write(reqB) + if err != nil { + return nil, 0, err + } + + respB := make([]byte, 2*4096) + n, err := cln.Read(respB) + if err != nil { + return nil, 0, err + } + + return respB, n, nil +} + +func (c *Criu) doSwrk(reqType rpc.CriuReqType, opts *rpc.CriuOpts, nfy Notify) error { + resp, err := c.doSwrkWithResp(reqType, opts, nfy, nil) + if err != nil { + return err + } + respType := resp.GetType() + if respType != reqType { + return errors.New("unexpected CRIU RPC response") + } + + return nil +} + +func (c *Criu) doSwrkWithResp(reqType rpc.CriuReqType, opts *rpc.CriuOpts, nfy Notify, features *rpc.CriuFeatures) (*rpc.CriuResp, error) { + var resp *rpc.CriuResp + + req := rpc.CriuReq{ + Type: &reqType, + Opts: opts, + } + + if nfy != nil { + opts.NotifyScripts = proto.Bool(true) + } + + if features != nil { + req.Features = features + } + + if c.swrkCmd == nil { + err := c.Prepare() + if err != nil { + return nil, err + } + + defer c.Cleanup() + } + + for { + reqB, err := proto.Marshal(&req) + if err != nil { + return nil, err + } + + respB, respS, err := c.sendAndRecv(reqB) + if err != nil { + return nil, err + } + + resp = &rpc.CriuResp{} + err = proto.Unmarshal(respB[:respS], resp) + if err != nil { + return nil, err + } + + if !resp.GetSuccess() { + return resp, fmt.Errorf("operation failed (msg:%s err:%d)", + resp.GetCrErrmsg(), resp.GetCrErrno()) + } + + respType := resp.GetType() + if respType != rpc.CriuReqType_NOTIFY { + break + } + if nfy == nil { + return resp, errors.New("unexpected notify") + } + + notify := resp.GetNotify() + switch notify.GetScript() { + case "pre-dump": + 
err = nfy.PreDump() + case "post-dump": + err = nfy.PostDump() + case "pre-restore": + err = nfy.PreRestore() + case "post-restore": + err = nfy.PostRestore(notify.GetPid()) + case "network-lock": + err = nfy.NetworkLock() + case "network-unlock": + err = nfy.NetworkUnlock() + case "setup-namespaces": + err = nfy.SetupNamespaces(notify.GetPid()) + case "post-setup-namespaces": + err = nfy.PostSetupNamespaces() + case "post-resume": + err = nfy.PostResume() + default: + err = nil + } + + if err != nil { + return resp, err + } + + req = rpc.CriuReq{ + Type: &respType, + NotifySuccess: proto.Bool(true), + } + } + + return resp, nil +} + +// Dump dumps a process +func (c *Criu) Dump(opts *rpc.CriuOpts, nfy Notify) error { + return c.doSwrk(rpc.CriuReqType_DUMP, opts, nfy) +} + +// Restore restores a process +func (c *Criu) Restore(opts *rpc.CriuOpts, nfy Notify) error { + return c.doSwrk(rpc.CriuReqType_RESTORE, opts, nfy) +} + +// PreDump does a pre-dump +func (c *Criu) PreDump(opts *rpc.CriuOpts, nfy Notify) error { + return c.doSwrk(rpc.CriuReqType_PRE_DUMP, opts, nfy) +} + +// StartPageServer starts the page server +func (c *Criu) StartPageServer(opts *rpc.CriuOpts) error { + return c.doSwrk(rpc.CriuReqType_PAGE_SERVER, opts, nil) +} + +// StartPageServerChld starts the page server and returns PID and port +func (c *Criu) StartPageServerChld(opts *rpc.CriuOpts) (int, int, error) { + resp, err := c.doSwrkWithResp(rpc.CriuReqType_PAGE_SERVER_CHLD, opts, nil, nil) + if err != nil { + return 0, 0, err + } + + return int(resp.Ps.GetPid()), int(resp.Ps.GetPort()), nil +} + +// GetCriuVersion executes the VERSION RPC call and returns the version +// as an integer. 
Major * 10000 + Minor * 100 + SubLevel +func (c *Criu) GetCriuVersion() (int, error) { + resp, err := c.doSwrkWithResp(rpc.CriuReqType_VERSION, nil, nil, nil) + if err != nil { + return 0, err + } + + if resp.GetType() != rpc.CriuReqType_VERSION { + return 0, fmt.Errorf("Unexpected CRIU RPC response") + } + + version := int(*resp.GetVersion().MajorNumber) * 10000 + version += int(*resp.GetVersion().MinorNumber) * 100 + if resp.GetVersion().Sublevel != nil { + version += int(*resp.GetVersion().Sublevel) + } + + if resp.GetVersion().Gitid != nil { + // taken from runc: if it is a git release -> increase minor by 1 + version -= (version % 100) + version += 100 + } + + return version, nil +} + +// IsCriuAtLeast checks if the version is at least the same +// as the parameter version +func (c *Criu) IsCriuAtLeast(version int) (bool, error) { + criuVersion, err := c.GetCriuVersion() + if err != nil { + return false, err + } + + if criuVersion >= version { + return true, nil + } + + return false, nil +} diff --git a/vendor/github.com/checkpoint-restore/go-criu/v5/notify.go b/vendor/github.com/checkpoint-restore/go-criu/v5/notify.go new file mode 100644 index 0000000..a177f2b --- /dev/null +++ b/vendor/github.com/checkpoint-restore/go-criu/v5/notify.go @@ -0,0 +1,62 @@ +package criu + +// Notify interface +type Notify interface { + PreDump() error + PostDump() error + PreRestore() error + PostRestore(pid int32) error + NetworkLock() error + NetworkUnlock() error + SetupNamespaces(pid int32) error + PostSetupNamespaces() error + PostResume() error +} + +// NoNotify struct +type NoNotify struct{} + +// PreDump NoNotify +func (c NoNotify) PreDump() error { + return nil +} + +// PostDump NoNotify +func (c NoNotify) PostDump() error { + return nil +} + +// PreRestore NoNotify +func (c NoNotify) PreRestore() error { + return nil +} + +// PostRestore NoNotify +func (c NoNotify) PostRestore(pid int32) error { + return nil +} + +// NetworkLock NoNotify +func (c NoNotify) 
NetworkLock() error { + return nil +} + +// NetworkUnlock NoNotify +func (c NoNotify) NetworkUnlock() error { + return nil +} + +// SetupNamespaces NoNotify +func (c NoNotify) SetupNamespaces(pid int32) error { + return nil +} + +// PostSetupNamespaces NoNotify +func (c NoNotify) PostSetupNamespaces() error { + return nil +} + +// PostResume NoNotify +func (c NoNotify) PostResume() error { + return nil +} diff --git a/vendor/github.com/checkpoint-restore/go-criu/v5/rpc/rpc.pb.go b/vendor/github.com/checkpoint-restore/go-criu/v5/rpc/rpc.pb.go new file mode 100644 index 0000000..15e33fe --- /dev/null +++ b/vendor/github.com/checkpoint-restore/go-criu/v5/rpc/rpc.pb.go @@ -0,0 +1,2237 @@ +// SPDX-License-Identifier: MIT + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.27.1 +// protoc v3.12.4 +// source: rpc/rpc.proto + +package rpc + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type CriuCgMode int32 + +const ( + CriuCgMode_IGNORE CriuCgMode = 0 + CriuCgMode_CG_NONE CriuCgMode = 1 + CriuCgMode_PROPS CriuCgMode = 2 + CriuCgMode_SOFT CriuCgMode = 3 + CriuCgMode_FULL CriuCgMode = 4 + CriuCgMode_STRICT CriuCgMode = 5 + CriuCgMode_DEFAULT CriuCgMode = 6 +) + +// Enum value maps for CriuCgMode. 
+var ( + CriuCgMode_name = map[int32]string{ + 0: "IGNORE", + 1: "CG_NONE", + 2: "PROPS", + 3: "SOFT", + 4: "FULL", + 5: "STRICT", + 6: "DEFAULT", + } + CriuCgMode_value = map[string]int32{ + "IGNORE": 0, + "CG_NONE": 1, + "PROPS": 2, + "SOFT": 3, + "FULL": 4, + "STRICT": 5, + "DEFAULT": 6, + } +) + +func (x CriuCgMode) Enum() *CriuCgMode { + p := new(CriuCgMode) + *p = x + return p +} + +func (x CriuCgMode) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (CriuCgMode) Descriptor() protoreflect.EnumDescriptor { + return file_rpc_rpc_proto_enumTypes[0].Descriptor() +} + +func (CriuCgMode) Type() protoreflect.EnumType { + return &file_rpc_rpc_proto_enumTypes[0] +} + +func (x CriuCgMode) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Do not use. +func (x *CriuCgMode) UnmarshalJSON(b []byte) error { + num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) + if err != nil { + return err + } + *x = CriuCgMode(num) + return nil +} + +// Deprecated: Use CriuCgMode.Descriptor instead. +func (CriuCgMode) EnumDescriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{0} +} + +type CriuPreDumpMode int32 + +const ( + CriuPreDumpMode_SPLICE CriuPreDumpMode = 1 + CriuPreDumpMode_VM_READ CriuPreDumpMode = 2 +) + +// Enum value maps for CriuPreDumpMode. 
+var ( + CriuPreDumpMode_name = map[int32]string{ + 1: "SPLICE", + 2: "VM_READ", + } + CriuPreDumpMode_value = map[string]int32{ + "SPLICE": 1, + "VM_READ": 2, + } +) + +func (x CriuPreDumpMode) Enum() *CriuPreDumpMode { + p := new(CriuPreDumpMode) + *p = x + return p +} + +func (x CriuPreDumpMode) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (CriuPreDumpMode) Descriptor() protoreflect.EnumDescriptor { + return file_rpc_rpc_proto_enumTypes[1].Descriptor() +} + +func (CriuPreDumpMode) Type() protoreflect.EnumType { + return &file_rpc_rpc_proto_enumTypes[1] +} + +func (x CriuPreDumpMode) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Do not use. +func (x *CriuPreDumpMode) UnmarshalJSON(b []byte) error { + num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) + if err != nil { + return err + } + *x = CriuPreDumpMode(num) + return nil +} + +// Deprecated: Use CriuPreDumpMode.Descriptor instead. +func (CriuPreDumpMode) EnumDescriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{1} +} + +type CriuReqType int32 + +const ( + CriuReqType_EMPTY CriuReqType = 0 + CriuReqType_DUMP CriuReqType = 1 + CriuReqType_RESTORE CriuReqType = 2 + CriuReqType_CHECK CriuReqType = 3 + CriuReqType_PRE_DUMP CriuReqType = 4 + CriuReqType_PAGE_SERVER CriuReqType = 5 + CriuReqType_NOTIFY CriuReqType = 6 + CriuReqType_CPUINFO_DUMP CriuReqType = 7 + CriuReqType_CPUINFO_CHECK CriuReqType = 8 + CriuReqType_FEATURE_CHECK CriuReqType = 9 + CriuReqType_VERSION CriuReqType = 10 + CriuReqType_WAIT_PID CriuReqType = 11 + CriuReqType_PAGE_SERVER_CHLD CriuReqType = 12 +) + +// Enum value maps for CriuReqType. 
+var ( + CriuReqType_name = map[int32]string{ + 0: "EMPTY", + 1: "DUMP", + 2: "RESTORE", + 3: "CHECK", + 4: "PRE_DUMP", + 5: "PAGE_SERVER", + 6: "NOTIFY", + 7: "CPUINFO_DUMP", + 8: "CPUINFO_CHECK", + 9: "FEATURE_CHECK", + 10: "VERSION", + 11: "WAIT_PID", + 12: "PAGE_SERVER_CHLD", + } + CriuReqType_value = map[string]int32{ + "EMPTY": 0, + "DUMP": 1, + "RESTORE": 2, + "CHECK": 3, + "PRE_DUMP": 4, + "PAGE_SERVER": 5, + "NOTIFY": 6, + "CPUINFO_DUMP": 7, + "CPUINFO_CHECK": 8, + "FEATURE_CHECK": 9, + "VERSION": 10, + "WAIT_PID": 11, + "PAGE_SERVER_CHLD": 12, + } +) + +func (x CriuReqType) Enum() *CriuReqType { + p := new(CriuReqType) + *p = x + return p +} + +func (x CriuReqType) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (CriuReqType) Descriptor() protoreflect.EnumDescriptor { + return file_rpc_rpc_proto_enumTypes[2].Descriptor() +} + +func (CriuReqType) Type() protoreflect.EnumType { + return &file_rpc_rpc_proto_enumTypes[2] +} + +func (x CriuReqType) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Do not use. +func (x *CriuReqType) UnmarshalJSON(b []byte) error { + num, err := protoimpl.X.UnmarshalJSONEnum(x.Descriptor(), b) + if err != nil { + return err + } + *x = CriuReqType(num) + return nil +} + +// Deprecated: Use CriuReqType.Descriptor instead. 
+func (CriuReqType) EnumDescriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{2} +} + +type CriuPageServerInfo struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Address *string `protobuf:"bytes,1,opt,name=address" json:"address,omitempty"` + Port *int32 `protobuf:"varint,2,opt,name=port" json:"port,omitempty"` + Pid *int32 `protobuf:"varint,3,opt,name=pid" json:"pid,omitempty"` + Fd *int32 `protobuf:"varint,4,opt,name=fd" json:"fd,omitempty"` +} + +func (x *CriuPageServerInfo) Reset() { + *x = CriuPageServerInfo{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CriuPageServerInfo) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CriuPageServerInfo) ProtoMessage() {} + +func (x *CriuPageServerInfo) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CriuPageServerInfo.ProtoReflect.Descriptor instead. 
+func (*CriuPageServerInfo) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{0} +} + +func (x *CriuPageServerInfo) GetAddress() string { + if x != nil && x.Address != nil { + return *x.Address + } + return "" +} + +func (x *CriuPageServerInfo) GetPort() int32 { + if x != nil && x.Port != nil { + return *x.Port + } + return 0 +} + +func (x *CriuPageServerInfo) GetPid() int32 { + if x != nil && x.Pid != nil { + return *x.Pid + } + return 0 +} + +func (x *CriuPageServerInfo) GetFd() int32 { + if x != nil && x.Fd != nil { + return *x.Fd + } + return 0 +} + +type CriuVethPair struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + IfIn *string `protobuf:"bytes,1,req,name=if_in,json=ifIn" json:"if_in,omitempty"` + IfOut *string `protobuf:"bytes,2,req,name=if_out,json=ifOut" json:"if_out,omitempty"` +} + +func (x *CriuVethPair) Reset() { + *x = CriuVethPair{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CriuVethPair) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CriuVethPair) ProtoMessage() {} + +func (x *CriuVethPair) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CriuVethPair.ProtoReflect.Descriptor instead. 
+func (*CriuVethPair) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{1} +} + +func (x *CriuVethPair) GetIfIn() string { + if x != nil && x.IfIn != nil { + return *x.IfIn + } + return "" +} + +func (x *CriuVethPair) GetIfOut() string { + if x != nil && x.IfOut != nil { + return *x.IfOut + } + return "" +} + +type ExtMountMap struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Key *string `protobuf:"bytes,1,req,name=key" json:"key,omitempty"` + Val *string `protobuf:"bytes,2,req,name=val" json:"val,omitempty"` +} + +func (x *ExtMountMap) Reset() { + *x = ExtMountMap{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ExtMountMap) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ExtMountMap) ProtoMessage() {} + +func (x *ExtMountMap) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ExtMountMap.ProtoReflect.Descriptor instead. 
+func (*ExtMountMap) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{2} +} + +func (x *ExtMountMap) GetKey() string { + if x != nil && x.Key != nil { + return *x.Key + } + return "" +} + +func (x *ExtMountMap) GetVal() string { + if x != nil && x.Val != nil { + return *x.Val + } + return "" +} + +type JoinNamespace struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Ns *string `protobuf:"bytes,1,req,name=ns" json:"ns,omitempty"` + NsFile *string `protobuf:"bytes,2,req,name=ns_file,json=nsFile" json:"ns_file,omitempty"` + ExtraOpt *string `protobuf:"bytes,3,opt,name=extra_opt,json=extraOpt" json:"extra_opt,omitempty"` +} + +func (x *JoinNamespace) Reset() { + *x = JoinNamespace{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *JoinNamespace) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*JoinNamespace) ProtoMessage() {} + +func (x *JoinNamespace) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use JoinNamespace.ProtoReflect.Descriptor instead. 
+func (*JoinNamespace) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{3} +} + +func (x *JoinNamespace) GetNs() string { + if x != nil && x.Ns != nil { + return *x.Ns + } + return "" +} + +func (x *JoinNamespace) GetNsFile() string { + if x != nil && x.NsFile != nil { + return *x.NsFile + } + return "" +} + +func (x *JoinNamespace) GetExtraOpt() string { + if x != nil && x.ExtraOpt != nil { + return *x.ExtraOpt + } + return "" +} + +type InheritFd struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Key *string `protobuf:"bytes,1,req,name=key" json:"key,omitempty"` + Fd *int32 `protobuf:"varint,2,req,name=fd" json:"fd,omitempty"` +} + +func (x *InheritFd) Reset() { + *x = InheritFd{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *InheritFd) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*InheritFd) ProtoMessage() {} + +func (x *InheritFd) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[4] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use InheritFd.ProtoReflect.Descriptor instead. 
+func (*InheritFd) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{4} +} + +func (x *InheritFd) GetKey() string { + if x != nil && x.Key != nil { + return *x.Key + } + return "" +} + +func (x *InheritFd) GetFd() int32 { + if x != nil && x.Fd != nil { + return *x.Fd + } + return 0 +} + +type CgroupRoot struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Ctrl *string `protobuf:"bytes,1,opt,name=ctrl" json:"ctrl,omitempty"` + Path *string `protobuf:"bytes,2,req,name=path" json:"path,omitempty"` +} + +func (x *CgroupRoot) Reset() { + *x = CgroupRoot{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CgroupRoot) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CgroupRoot) ProtoMessage() {} + +func (x *CgroupRoot) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[5] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CgroupRoot.ProtoReflect.Descriptor instead. 
+func (*CgroupRoot) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{5} +} + +func (x *CgroupRoot) GetCtrl() string { + if x != nil && x.Ctrl != nil { + return *x.Ctrl + } + return "" +} + +func (x *CgroupRoot) GetPath() string { + if x != nil && x.Path != nil { + return *x.Path + } + return "" +} + +type UnixSk struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Inode *uint32 `protobuf:"varint,1,req,name=inode" json:"inode,omitempty"` +} + +func (x *UnixSk) Reset() { + *x = UnixSk{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[6] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *UnixSk) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*UnixSk) ProtoMessage() {} + +func (x *UnixSk) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[6] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use UnixSk.ProtoReflect.Descriptor instead. 
+func (*UnixSk) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{6} +} + +func (x *UnixSk) GetInode() uint32 { + if x != nil && x.Inode != nil { + return *x.Inode + } + return 0 +} + +type CriuOpts struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ImagesDirFd *int32 `protobuf:"varint,1,req,name=images_dir_fd,json=imagesDirFd" json:"images_dir_fd,omitempty"` + Pid *int32 `protobuf:"varint,2,opt,name=pid" json:"pid,omitempty"` // if not set on dump, will dump requesting process + LeaveRunning *bool `protobuf:"varint,3,opt,name=leave_running,json=leaveRunning" json:"leave_running,omitempty"` + ExtUnixSk *bool `protobuf:"varint,4,opt,name=ext_unix_sk,json=extUnixSk" json:"ext_unix_sk,omitempty"` + TcpEstablished *bool `protobuf:"varint,5,opt,name=tcp_established,json=tcpEstablished" json:"tcp_established,omitempty"` + EvasiveDevices *bool `protobuf:"varint,6,opt,name=evasive_devices,json=evasiveDevices" json:"evasive_devices,omitempty"` + ShellJob *bool `protobuf:"varint,7,opt,name=shell_job,json=shellJob" json:"shell_job,omitempty"` + FileLocks *bool `protobuf:"varint,8,opt,name=file_locks,json=fileLocks" json:"file_locks,omitempty"` + LogLevel *int32 `protobuf:"varint,9,opt,name=log_level,json=logLevel,def=2" json:"log_level,omitempty"` + LogFile *string `protobuf:"bytes,10,opt,name=log_file,json=logFile" json:"log_file,omitempty"` // No subdirs are allowed. 
Consider using work-dir + Ps *CriuPageServerInfo `protobuf:"bytes,11,opt,name=ps" json:"ps,omitempty"` + NotifyScripts *bool `protobuf:"varint,12,opt,name=notify_scripts,json=notifyScripts" json:"notify_scripts,omitempty"` + Root *string `protobuf:"bytes,13,opt,name=root" json:"root,omitempty"` + ParentImg *string `protobuf:"bytes,14,opt,name=parent_img,json=parentImg" json:"parent_img,omitempty"` + TrackMem *bool `protobuf:"varint,15,opt,name=track_mem,json=trackMem" json:"track_mem,omitempty"` + AutoDedup *bool `protobuf:"varint,16,opt,name=auto_dedup,json=autoDedup" json:"auto_dedup,omitempty"` + WorkDirFd *int32 `protobuf:"varint,17,opt,name=work_dir_fd,json=workDirFd" json:"work_dir_fd,omitempty"` + LinkRemap *bool `protobuf:"varint,18,opt,name=link_remap,json=linkRemap" json:"link_remap,omitempty"` + Veths []*CriuVethPair `protobuf:"bytes,19,rep,name=veths" json:"veths,omitempty"` // DEPRECATED, use external instead + CpuCap *uint32 `protobuf:"varint,20,opt,name=cpu_cap,json=cpuCap,def=4294967295" json:"cpu_cap,omitempty"` + ForceIrmap *bool `protobuf:"varint,21,opt,name=force_irmap,json=forceIrmap" json:"force_irmap,omitempty"` + ExecCmd []string `protobuf:"bytes,22,rep,name=exec_cmd,json=execCmd" json:"exec_cmd,omitempty"` + ExtMnt []*ExtMountMap `protobuf:"bytes,23,rep,name=ext_mnt,json=extMnt" json:"ext_mnt,omitempty"` // DEPRECATED, use external instead + ManageCgroups *bool `protobuf:"varint,24,opt,name=manage_cgroups,json=manageCgroups" json:"manage_cgroups,omitempty"` // backward compatibility + CgRoot []*CgroupRoot `protobuf:"bytes,25,rep,name=cg_root,json=cgRoot" json:"cg_root,omitempty"` + RstSibling *bool `protobuf:"varint,26,opt,name=rst_sibling,json=rstSibling" json:"rst_sibling,omitempty"` // swrk only + InheritFd []*InheritFd `protobuf:"bytes,27,rep,name=inherit_fd,json=inheritFd" json:"inherit_fd,omitempty"` // swrk only + AutoExtMnt *bool `protobuf:"varint,28,opt,name=auto_ext_mnt,json=autoExtMnt" json:"auto_ext_mnt,omitempty"` + ExtSharing 
*bool `protobuf:"varint,29,opt,name=ext_sharing,json=extSharing" json:"ext_sharing,omitempty"` + ExtMasters *bool `protobuf:"varint,30,opt,name=ext_masters,json=extMasters" json:"ext_masters,omitempty"` + SkipMnt []string `protobuf:"bytes,31,rep,name=skip_mnt,json=skipMnt" json:"skip_mnt,omitempty"` + EnableFs []string `protobuf:"bytes,32,rep,name=enable_fs,json=enableFs" json:"enable_fs,omitempty"` + UnixSkIno []*UnixSk `protobuf:"bytes,33,rep,name=unix_sk_ino,json=unixSkIno" json:"unix_sk_ino,omitempty"` // DEPRECATED, use external instead + ManageCgroupsMode *CriuCgMode `protobuf:"varint,34,opt,name=manage_cgroups_mode,json=manageCgroupsMode,enum=CriuCgMode" json:"manage_cgroups_mode,omitempty"` + GhostLimit *uint32 `protobuf:"varint,35,opt,name=ghost_limit,json=ghostLimit,def=1048576" json:"ghost_limit,omitempty"` + IrmapScanPaths []string `protobuf:"bytes,36,rep,name=irmap_scan_paths,json=irmapScanPaths" json:"irmap_scan_paths,omitempty"` + External []string `protobuf:"bytes,37,rep,name=external" json:"external,omitempty"` + EmptyNs *uint32 `protobuf:"varint,38,opt,name=empty_ns,json=emptyNs" json:"empty_ns,omitempty"` + JoinNs []*JoinNamespace `protobuf:"bytes,39,rep,name=join_ns,json=joinNs" json:"join_ns,omitempty"` + CgroupProps *string `protobuf:"bytes,41,opt,name=cgroup_props,json=cgroupProps" json:"cgroup_props,omitempty"` + CgroupPropsFile *string `protobuf:"bytes,42,opt,name=cgroup_props_file,json=cgroupPropsFile" json:"cgroup_props_file,omitempty"` + CgroupDumpController []string `protobuf:"bytes,43,rep,name=cgroup_dump_controller,json=cgroupDumpController" json:"cgroup_dump_controller,omitempty"` + FreezeCgroup *string `protobuf:"bytes,44,opt,name=freeze_cgroup,json=freezeCgroup" json:"freeze_cgroup,omitempty"` + Timeout *uint32 `protobuf:"varint,45,opt,name=timeout" json:"timeout,omitempty"` + TcpSkipInFlight *bool `protobuf:"varint,46,opt,name=tcp_skip_in_flight,json=tcpSkipInFlight" json:"tcp_skip_in_flight,omitempty"` + WeakSysctls *bool 
`protobuf:"varint,47,opt,name=weak_sysctls,json=weakSysctls" json:"weak_sysctls,omitempty"` + LazyPages *bool `protobuf:"varint,48,opt,name=lazy_pages,json=lazyPages" json:"lazy_pages,omitempty"` + StatusFd *int32 `protobuf:"varint,49,opt,name=status_fd,json=statusFd" json:"status_fd,omitempty"` + OrphanPtsMaster *bool `protobuf:"varint,50,opt,name=orphan_pts_master,json=orphanPtsMaster" json:"orphan_pts_master,omitempty"` + ConfigFile *string `protobuf:"bytes,51,opt,name=config_file,json=configFile" json:"config_file,omitempty"` + TcpClose *bool `protobuf:"varint,52,opt,name=tcp_close,json=tcpClose" json:"tcp_close,omitempty"` + LsmProfile *string `protobuf:"bytes,53,opt,name=lsm_profile,json=lsmProfile" json:"lsm_profile,omitempty"` + TlsCacert *string `protobuf:"bytes,54,opt,name=tls_cacert,json=tlsCacert" json:"tls_cacert,omitempty"` + TlsCacrl *string `protobuf:"bytes,55,opt,name=tls_cacrl,json=tlsCacrl" json:"tls_cacrl,omitempty"` + TlsCert *string `protobuf:"bytes,56,opt,name=tls_cert,json=tlsCert" json:"tls_cert,omitempty"` + TlsKey *string `protobuf:"bytes,57,opt,name=tls_key,json=tlsKey" json:"tls_key,omitempty"` + Tls *bool `protobuf:"varint,58,opt,name=tls" json:"tls,omitempty"` + TlsNoCnVerify *bool `protobuf:"varint,59,opt,name=tls_no_cn_verify,json=tlsNoCnVerify" json:"tls_no_cn_verify,omitempty"` + CgroupYard *string `protobuf:"bytes,60,opt,name=cgroup_yard,json=cgroupYard" json:"cgroup_yard,omitempty"` + PreDumpMode *CriuPreDumpMode `protobuf:"varint,61,opt,name=pre_dump_mode,json=preDumpMode,enum=CriuPreDumpMode,def=1" json:"pre_dump_mode,omitempty"` + PidfdStoreSk *int32 `protobuf:"varint,62,opt,name=pidfd_store_sk,json=pidfdStoreSk" json:"pidfd_store_sk,omitempty"` + LsmMountContext *string `protobuf:"bytes,63,opt,name=lsm_mount_context,json=lsmMountContext" json:"lsm_mount_context,omitempty"` // optional bool check_mounts = 128; +} + +// Default values for CriuOpts fields. 
+const ( + Default_CriuOpts_LogLevel = int32(2) + Default_CriuOpts_CpuCap = uint32(4294967295) + Default_CriuOpts_GhostLimit = uint32(1048576) + Default_CriuOpts_PreDumpMode = CriuPreDumpMode_SPLICE +) + +func (x *CriuOpts) Reset() { + *x = CriuOpts{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[7] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CriuOpts) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CriuOpts) ProtoMessage() {} + +func (x *CriuOpts) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[7] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CriuOpts.ProtoReflect.Descriptor instead. +func (*CriuOpts) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{7} +} + +func (x *CriuOpts) GetImagesDirFd() int32 { + if x != nil && x.ImagesDirFd != nil { + return *x.ImagesDirFd + } + return 0 +} + +func (x *CriuOpts) GetPid() int32 { + if x != nil && x.Pid != nil { + return *x.Pid + } + return 0 +} + +func (x *CriuOpts) GetLeaveRunning() bool { + if x != nil && x.LeaveRunning != nil { + return *x.LeaveRunning + } + return false +} + +func (x *CriuOpts) GetExtUnixSk() bool { + if x != nil && x.ExtUnixSk != nil { + return *x.ExtUnixSk + } + return false +} + +func (x *CriuOpts) GetTcpEstablished() bool { + if x != nil && x.TcpEstablished != nil { + return *x.TcpEstablished + } + return false +} + +func (x *CriuOpts) GetEvasiveDevices() bool { + if x != nil && x.EvasiveDevices != nil { + return *x.EvasiveDevices + } + return false +} + +func (x *CriuOpts) GetShellJob() bool { + if x != nil && x.ShellJob != nil { + return *x.ShellJob + } + return false +} + +func (x *CriuOpts) GetFileLocks() bool { + if x != nil && x.FileLocks != nil { + 
return *x.FileLocks + } + return false +} + +func (x *CriuOpts) GetLogLevel() int32 { + if x != nil && x.LogLevel != nil { + return *x.LogLevel + } + return Default_CriuOpts_LogLevel +} + +func (x *CriuOpts) GetLogFile() string { + if x != nil && x.LogFile != nil { + return *x.LogFile + } + return "" +} + +func (x *CriuOpts) GetPs() *CriuPageServerInfo { + if x != nil { + return x.Ps + } + return nil +} + +func (x *CriuOpts) GetNotifyScripts() bool { + if x != nil && x.NotifyScripts != nil { + return *x.NotifyScripts + } + return false +} + +func (x *CriuOpts) GetRoot() string { + if x != nil && x.Root != nil { + return *x.Root + } + return "" +} + +func (x *CriuOpts) GetParentImg() string { + if x != nil && x.ParentImg != nil { + return *x.ParentImg + } + return "" +} + +func (x *CriuOpts) GetTrackMem() bool { + if x != nil && x.TrackMem != nil { + return *x.TrackMem + } + return false +} + +func (x *CriuOpts) GetAutoDedup() bool { + if x != nil && x.AutoDedup != nil { + return *x.AutoDedup + } + return false +} + +func (x *CriuOpts) GetWorkDirFd() int32 { + if x != nil && x.WorkDirFd != nil { + return *x.WorkDirFd + } + return 0 +} + +func (x *CriuOpts) GetLinkRemap() bool { + if x != nil && x.LinkRemap != nil { + return *x.LinkRemap + } + return false +} + +func (x *CriuOpts) GetVeths() []*CriuVethPair { + if x != nil { + return x.Veths + } + return nil +} + +func (x *CriuOpts) GetCpuCap() uint32 { + if x != nil && x.CpuCap != nil { + return *x.CpuCap + } + return Default_CriuOpts_CpuCap +} + +func (x *CriuOpts) GetForceIrmap() bool { + if x != nil && x.ForceIrmap != nil { + return *x.ForceIrmap + } + return false +} + +func (x *CriuOpts) GetExecCmd() []string { + if x != nil { + return x.ExecCmd + } + return nil +} + +func (x *CriuOpts) GetExtMnt() []*ExtMountMap { + if x != nil { + return x.ExtMnt + } + return nil +} + +func (x *CriuOpts) GetManageCgroups() bool { + if x != nil && x.ManageCgroups != nil { + return *x.ManageCgroups + } + return false +} + +func 
(x *CriuOpts) GetCgRoot() []*CgroupRoot { + if x != nil { + return x.CgRoot + } + return nil +} + +func (x *CriuOpts) GetRstSibling() bool { + if x != nil && x.RstSibling != nil { + return *x.RstSibling + } + return false +} + +func (x *CriuOpts) GetInheritFd() []*InheritFd { + if x != nil { + return x.InheritFd + } + return nil +} + +func (x *CriuOpts) GetAutoExtMnt() bool { + if x != nil && x.AutoExtMnt != nil { + return *x.AutoExtMnt + } + return false +} + +func (x *CriuOpts) GetExtSharing() bool { + if x != nil && x.ExtSharing != nil { + return *x.ExtSharing + } + return false +} + +func (x *CriuOpts) GetExtMasters() bool { + if x != nil && x.ExtMasters != nil { + return *x.ExtMasters + } + return false +} + +func (x *CriuOpts) GetSkipMnt() []string { + if x != nil { + return x.SkipMnt + } + return nil +} + +func (x *CriuOpts) GetEnableFs() []string { + if x != nil { + return x.EnableFs + } + return nil +} + +func (x *CriuOpts) GetUnixSkIno() []*UnixSk { + if x != nil { + return x.UnixSkIno + } + return nil +} + +func (x *CriuOpts) GetManageCgroupsMode() CriuCgMode { + if x != nil && x.ManageCgroupsMode != nil { + return *x.ManageCgroupsMode + } + return CriuCgMode_IGNORE +} + +func (x *CriuOpts) GetGhostLimit() uint32 { + if x != nil && x.GhostLimit != nil { + return *x.GhostLimit + } + return Default_CriuOpts_GhostLimit +} + +func (x *CriuOpts) GetIrmapScanPaths() []string { + if x != nil { + return x.IrmapScanPaths + } + return nil +} + +func (x *CriuOpts) GetExternal() []string { + if x != nil { + return x.External + } + return nil +} + +func (x *CriuOpts) GetEmptyNs() uint32 { + if x != nil && x.EmptyNs != nil { + return *x.EmptyNs + } + return 0 +} + +func (x *CriuOpts) GetJoinNs() []*JoinNamespace { + if x != nil { + return x.JoinNs + } + return nil +} + +func (x *CriuOpts) GetCgroupProps() string { + if x != nil && x.CgroupProps != nil { + return *x.CgroupProps + } + return "" +} + +func (x *CriuOpts) GetCgroupPropsFile() string { + if x != nil && 
x.CgroupPropsFile != nil { + return *x.CgroupPropsFile + } + return "" +} + +func (x *CriuOpts) GetCgroupDumpController() []string { + if x != nil { + return x.CgroupDumpController + } + return nil +} + +func (x *CriuOpts) GetFreezeCgroup() string { + if x != nil && x.FreezeCgroup != nil { + return *x.FreezeCgroup + } + return "" +} + +func (x *CriuOpts) GetTimeout() uint32 { + if x != nil && x.Timeout != nil { + return *x.Timeout + } + return 0 +} + +func (x *CriuOpts) GetTcpSkipInFlight() bool { + if x != nil && x.TcpSkipInFlight != nil { + return *x.TcpSkipInFlight + } + return false +} + +func (x *CriuOpts) GetWeakSysctls() bool { + if x != nil && x.WeakSysctls != nil { + return *x.WeakSysctls + } + return false +} + +func (x *CriuOpts) GetLazyPages() bool { + if x != nil && x.LazyPages != nil { + return *x.LazyPages + } + return false +} + +func (x *CriuOpts) GetStatusFd() int32 { + if x != nil && x.StatusFd != nil { + return *x.StatusFd + } + return 0 +} + +func (x *CriuOpts) GetOrphanPtsMaster() bool { + if x != nil && x.OrphanPtsMaster != nil { + return *x.OrphanPtsMaster + } + return false +} + +func (x *CriuOpts) GetConfigFile() string { + if x != nil && x.ConfigFile != nil { + return *x.ConfigFile + } + return "" +} + +func (x *CriuOpts) GetTcpClose() bool { + if x != nil && x.TcpClose != nil { + return *x.TcpClose + } + return false +} + +func (x *CriuOpts) GetLsmProfile() string { + if x != nil && x.LsmProfile != nil { + return *x.LsmProfile + } + return "" +} + +func (x *CriuOpts) GetTlsCacert() string { + if x != nil && x.TlsCacert != nil { + return *x.TlsCacert + } + return "" +} + +func (x *CriuOpts) GetTlsCacrl() string { + if x != nil && x.TlsCacrl != nil { + return *x.TlsCacrl + } + return "" +} + +func (x *CriuOpts) GetTlsCert() string { + if x != nil && x.TlsCert != nil { + return *x.TlsCert + } + return "" +} + +func (x *CriuOpts) GetTlsKey() string { + if x != nil && x.TlsKey != nil { + return *x.TlsKey + } + return "" +} + +func (x 
*CriuOpts) GetTls() bool { + if x != nil && x.Tls != nil { + return *x.Tls + } + return false +} + +func (x *CriuOpts) GetTlsNoCnVerify() bool { + if x != nil && x.TlsNoCnVerify != nil { + return *x.TlsNoCnVerify + } + return false +} + +func (x *CriuOpts) GetCgroupYard() string { + if x != nil && x.CgroupYard != nil { + return *x.CgroupYard + } + return "" +} + +func (x *CriuOpts) GetPreDumpMode() CriuPreDumpMode { + if x != nil && x.PreDumpMode != nil { + return *x.PreDumpMode + } + return Default_CriuOpts_PreDumpMode +} + +func (x *CriuOpts) GetPidfdStoreSk() int32 { + if x != nil && x.PidfdStoreSk != nil { + return *x.PidfdStoreSk + } + return 0 +} + +func (x *CriuOpts) GetLsmMountContext() string { + if x != nil && x.LsmMountContext != nil { + return *x.LsmMountContext + } + return "" +} + +type CriuDumpResp struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"` +} + +func (x *CriuDumpResp) Reset() { + *x = CriuDumpResp{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[8] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CriuDumpResp) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CriuDumpResp) ProtoMessage() {} + +func (x *CriuDumpResp) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[8] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CriuDumpResp.ProtoReflect.Descriptor instead. 
+func (*CriuDumpResp) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{8} +} + +func (x *CriuDumpResp) GetRestored() bool { + if x != nil && x.Restored != nil { + return *x.Restored + } + return false +} + +type CriuRestoreResp struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Pid *int32 `protobuf:"varint,1,req,name=pid" json:"pid,omitempty"` +} + +func (x *CriuRestoreResp) Reset() { + *x = CriuRestoreResp{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[9] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CriuRestoreResp) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CriuRestoreResp) ProtoMessage() {} + +func (x *CriuRestoreResp) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[9] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CriuRestoreResp.ProtoReflect.Descriptor instead. 
+func (*CriuRestoreResp) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{9} +} + +func (x *CriuRestoreResp) GetPid() int32 { + if x != nil && x.Pid != nil { + return *x.Pid + } + return 0 +} + +type CriuNotify struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Script *string `protobuf:"bytes,1,opt,name=script" json:"script,omitempty"` + Pid *int32 `protobuf:"varint,2,opt,name=pid" json:"pid,omitempty"` +} + +func (x *CriuNotify) Reset() { + *x = CriuNotify{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[10] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CriuNotify) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CriuNotify) ProtoMessage() {} + +func (x *CriuNotify) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[10] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CriuNotify.ProtoReflect.Descriptor instead. 
+func (*CriuNotify) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{10} +} + +func (x *CriuNotify) GetScript() string { + if x != nil && x.Script != nil { + return *x.Script + } + return "" +} + +func (x *CriuNotify) GetPid() int32 { + if x != nil && x.Pid != nil { + return *x.Pid + } + return 0 +} + +// +// List of features which can queried via +// CRIU_REQ_TYPE__FEATURE_CHECK +type CriuFeatures struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + MemTrack *bool `protobuf:"varint,1,opt,name=mem_track,json=memTrack" json:"mem_track,omitempty"` + LazyPages *bool `protobuf:"varint,2,opt,name=lazy_pages,json=lazyPages" json:"lazy_pages,omitempty"` + PidfdStore *bool `protobuf:"varint,3,opt,name=pidfd_store,json=pidfdStore" json:"pidfd_store,omitempty"` +} + +func (x *CriuFeatures) Reset() { + *x = CriuFeatures{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[11] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CriuFeatures) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CriuFeatures) ProtoMessage() {} + +func (x *CriuFeatures) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[11] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CriuFeatures.ProtoReflect.Descriptor instead. 
+func (*CriuFeatures) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{11} +} + +func (x *CriuFeatures) GetMemTrack() bool { + if x != nil && x.MemTrack != nil { + return *x.MemTrack + } + return false +} + +func (x *CriuFeatures) GetLazyPages() bool { + if x != nil && x.LazyPages != nil { + return *x.LazyPages + } + return false +} + +func (x *CriuFeatures) GetPidfdStore() bool { + if x != nil && x.PidfdStore != nil { + return *x.PidfdStore + } + return false +} + +type CriuReq struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` + Opts *CriuOpts `protobuf:"bytes,2,opt,name=opts" json:"opts,omitempty"` + NotifySuccess *bool `protobuf:"varint,3,opt,name=notify_success,json=notifySuccess" json:"notify_success,omitempty"` + // + // When set service won't close the connection but + // will wait for more req-s to appear. Works not + // for all request types. + KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open,json=keepOpen" json:"keep_open,omitempty"` + // + // 'features' can be used to query which features + // are supported by the installed criu/kernel + // via RPC. 
+ Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"` + // 'pid' is used for WAIT_PID + Pid *uint32 `protobuf:"varint,6,opt,name=pid" json:"pid,omitempty"` +} + +func (x *CriuReq) Reset() { + *x = CriuReq{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[12] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CriuReq) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CriuReq) ProtoMessage() {} + +func (x *CriuReq) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[12] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CriuReq.ProtoReflect.Descriptor instead. +func (*CriuReq) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{12} +} + +func (x *CriuReq) GetType() CriuReqType { + if x != nil && x.Type != nil { + return *x.Type + } + return CriuReqType_EMPTY +} + +func (x *CriuReq) GetOpts() *CriuOpts { + if x != nil { + return x.Opts + } + return nil +} + +func (x *CriuReq) GetNotifySuccess() bool { + if x != nil && x.NotifySuccess != nil { + return *x.NotifySuccess + } + return false +} + +func (x *CriuReq) GetKeepOpen() bool { + if x != nil && x.KeepOpen != nil { + return *x.KeepOpen + } + return false +} + +func (x *CriuReq) GetFeatures() *CriuFeatures { + if x != nil { + return x.Features + } + return nil +} + +func (x *CriuReq) GetPid() uint32 { + if x != nil && x.Pid != nil { + return *x.Pid + } + return 0 +} + +type CriuResp struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"` + Success *bool `protobuf:"varint,2,req,name=success" 
json:"success,omitempty"` + Dump *CriuDumpResp `protobuf:"bytes,3,opt,name=dump" json:"dump,omitempty"` + Restore *CriuRestoreResp `protobuf:"bytes,4,opt,name=restore" json:"restore,omitempty"` + Notify *CriuNotify `protobuf:"bytes,5,opt,name=notify" json:"notify,omitempty"` + Ps *CriuPageServerInfo `protobuf:"bytes,6,opt,name=ps" json:"ps,omitempty"` + CrErrno *int32 `protobuf:"varint,7,opt,name=cr_errno,json=crErrno" json:"cr_errno,omitempty"` + Features *CriuFeatures `protobuf:"bytes,8,opt,name=features" json:"features,omitempty"` + CrErrmsg *string `protobuf:"bytes,9,opt,name=cr_errmsg,json=crErrmsg" json:"cr_errmsg,omitempty"` + Version *CriuVersion `protobuf:"bytes,10,opt,name=version" json:"version,omitempty"` + Status *int32 `protobuf:"varint,11,opt,name=status" json:"status,omitempty"` +} + +func (x *CriuResp) Reset() { + *x = CriuResp{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[13] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CriuResp) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CriuResp) ProtoMessage() {} + +func (x *CriuResp) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[13] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CriuResp.ProtoReflect.Descriptor instead. 
+func (*CriuResp) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{13} +} + +func (x *CriuResp) GetType() CriuReqType { + if x != nil && x.Type != nil { + return *x.Type + } + return CriuReqType_EMPTY +} + +func (x *CriuResp) GetSuccess() bool { + if x != nil && x.Success != nil { + return *x.Success + } + return false +} + +func (x *CriuResp) GetDump() *CriuDumpResp { + if x != nil { + return x.Dump + } + return nil +} + +func (x *CriuResp) GetRestore() *CriuRestoreResp { + if x != nil { + return x.Restore + } + return nil +} + +func (x *CriuResp) GetNotify() *CriuNotify { + if x != nil { + return x.Notify + } + return nil +} + +func (x *CriuResp) GetPs() *CriuPageServerInfo { + if x != nil { + return x.Ps + } + return nil +} + +func (x *CriuResp) GetCrErrno() int32 { + if x != nil && x.CrErrno != nil { + return *x.CrErrno + } + return 0 +} + +func (x *CriuResp) GetFeatures() *CriuFeatures { + if x != nil { + return x.Features + } + return nil +} + +func (x *CriuResp) GetCrErrmsg() string { + if x != nil && x.CrErrmsg != nil { + return *x.CrErrmsg + } + return "" +} + +func (x *CriuResp) GetVersion() *CriuVersion { + if x != nil { + return x.Version + } + return nil +} + +func (x *CriuResp) GetStatus() int32 { + if x != nil && x.Status != nil { + return *x.Status + } + return 0 +} + +// Answer for criu_req_type.VERSION requests +type CriuVersion struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + MajorNumber *int32 `protobuf:"varint,1,req,name=major_number,json=majorNumber" json:"major_number,omitempty"` + MinorNumber *int32 `protobuf:"varint,2,req,name=minor_number,json=minorNumber" json:"minor_number,omitempty"` + Gitid *string `protobuf:"bytes,3,opt,name=gitid" json:"gitid,omitempty"` + Sublevel *int32 `protobuf:"varint,4,opt,name=sublevel" json:"sublevel,omitempty"` + Extra *int32 `protobuf:"varint,5,opt,name=extra" json:"extra,omitempty"` + Name *string 
`protobuf:"bytes,6,opt,name=name" json:"name,omitempty"` +} + +func (x *CriuVersion) Reset() { + *x = CriuVersion{} + if protoimpl.UnsafeEnabled { + mi := &file_rpc_rpc_proto_msgTypes[14] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *CriuVersion) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CriuVersion) ProtoMessage() {} + +func (x *CriuVersion) ProtoReflect() protoreflect.Message { + mi := &file_rpc_rpc_proto_msgTypes[14] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use CriuVersion.ProtoReflect.Descriptor instead. +func (*CriuVersion) Descriptor() ([]byte, []int) { + return file_rpc_rpc_proto_rawDescGZIP(), []int{14} +} + +func (x *CriuVersion) GetMajorNumber() int32 { + if x != nil && x.MajorNumber != nil { + return *x.MajorNumber + } + return 0 +} + +func (x *CriuVersion) GetMinorNumber() int32 { + if x != nil && x.MinorNumber != nil { + return *x.MinorNumber + } + return 0 +} + +func (x *CriuVersion) GetGitid() string { + if x != nil && x.Gitid != nil { + return *x.Gitid + } + return "" +} + +func (x *CriuVersion) GetSublevel() int32 { + if x != nil && x.Sublevel != nil { + return *x.Sublevel + } + return 0 +} + +func (x *CriuVersion) GetExtra() int32 { + if x != nil && x.Extra != nil { + return *x.Extra + } + return 0 +} + +func (x *CriuVersion) GetName() string { + if x != nil && x.Name != nil { + return *x.Name + } + return "" +} + +var File_rpc_rpc_proto protoreflect.FileDescriptor + +var file_rpc_rpc_proto_rawDesc = []byte{ + 0x0a, 0x0d, 0x72, 0x70, 0x63, 0x2f, 0x72, 0x70, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, + 0x67, 0x0a, 0x15, 0x63, 0x72, 0x69, 0x75, 0x5f, 0x70, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x72, + 0x76, 0x65, 0x72, 0x5f, 0x69, 0x6e, 0x66, 0x6f, 0x12, 0x18, 0x0a, 0x07, 0x61, 
0x64, 0x64, 0x72, + 0x65, 0x73, 0x73, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x61, 0x64, 0x64, 0x72, 0x65, + 0x73, 0x73, 0x12, 0x12, 0x0a, 0x04, 0x70, 0x6f, 0x72, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, + 0x52, 0x04, 0x70, 0x6f, 0x72, 0x74, 0x12, 0x10, 0x0a, 0x03, 0x70, 0x69, 0x64, 0x18, 0x03, 0x20, + 0x01, 0x28, 0x05, 0x52, 0x03, 0x70, 0x69, 0x64, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x04, + 0x20, 0x01, 0x28, 0x05, 0x52, 0x02, 0x66, 0x64, 0x22, 0x3c, 0x0a, 0x0e, 0x63, 0x72, 0x69, 0x75, + 0x5f, 0x76, 0x65, 0x74, 0x68, 0x5f, 0x70, 0x61, 0x69, 0x72, 0x12, 0x13, 0x0a, 0x05, 0x69, 0x66, + 0x5f, 0x69, 0x6e, 0x18, 0x01, 0x20, 0x02, 0x28, 0x09, 0x52, 0x04, 0x69, 0x66, 0x49, 0x6e, 0x12, + 0x15, 0x0a, 0x06, 0x69, 0x66, 0x5f, 0x6f, 0x75, 0x74, 0x18, 0x02, 0x20, 0x02, 0x28, 0x09, 0x52, + 0x05, 0x69, 0x66, 0x4f, 0x75, 0x74, 0x22, 0x33, 0x0a, 0x0d, 0x65, 0x78, 0x74, 0x5f, 0x6d, 0x6f, + 0x75, 0x6e, 0x74, 0x5f, 0x6d, 0x61, 0x70, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, + 0x20, 0x02, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x76, 0x61, 0x6c, + 0x18, 0x02, 0x20, 0x02, 0x28, 0x09, 0x52, 0x03, 0x76, 0x61, 0x6c, 0x22, 0x56, 0x0a, 0x0e, 0x6a, + 0x6f, 0x69, 0x6e, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x12, 0x0e, 0x0a, + 0x02, 0x6e, 0x73, 0x18, 0x01, 0x20, 0x02, 0x28, 0x09, 0x52, 0x02, 0x6e, 0x73, 0x12, 0x17, 0x0a, + 0x07, 0x6e, 0x73, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x18, 0x02, 0x20, 0x02, 0x28, 0x09, 0x52, 0x06, + 0x6e, 0x73, 0x46, 0x69, 0x6c, 0x65, 0x12, 0x1b, 0x0a, 0x09, 0x65, 0x78, 0x74, 0x72, 0x61, 0x5f, + 0x6f, 0x70, 0x74, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x65, 0x78, 0x74, 0x72, 0x61, + 0x4f, 0x70, 0x74, 0x22, 0x2e, 0x0a, 0x0a, 0x69, 0x6e, 0x68, 0x65, 0x72, 0x69, 0x74, 0x5f, 0x66, + 0x64, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x02, 0x28, 0x09, 0x52, 0x03, + 0x6b, 0x65, 0x79, 0x12, 0x0e, 0x0a, 0x02, 0x66, 0x64, 0x18, 0x02, 0x20, 0x02, 0x28, 0x05, 0x52, + 0x02, 0x66, 0x64, 
0x22, 0x35, 0x0a, 0x0b, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x72, 0x6f, + 0x6f, 0x74, 0x12, 0x12, 0x0a, 0x04, 0x63, 0x74, 0x72, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x04, 0x63, 0x74, 0x72, 0x6c, 0x12, 0x12, 0x0a, 0x04, 0x70, 0x61, 0x74, 0x68, 0x18, 0x02, + 0x20, 0x02, 0x28, 0x09, 0x52, 0x04, 0x70, 0x61, 0x74, 0x68, 0x22, 0x1f, 0x0a, 0x07, 0x75, 0x6e, + 0x69, 0x78, 0x5f, 0x73, 0x6b, 0x12, 0x14, 0x0a, 0x05, 0x69, 0x6e, 0x6f, 0x64, 0x65, 0x18, 0x01, + 0x20, 0x02, 0x28, 0x0d, 0x52, 0x05, 0x69, 0x6e, 0x6f, 0x64, 0x65, 0x22, 0x8c, 0x11, 0x0a, 0x09, + 0x63, 0x72, 0x69, 0x75, 0x5f, 0x6f, 0x70, 0x74, 0x73, 0x12, 0x22, 0x0a, 0x0d, 0x69, 0x6d, 0x61, + 0x67, 0x65, 0x73, 0x5f, 0x64, 0x69, 0x72, 0x5f, 0x66, 0x64, 0x18, 0x01, 0x20, 0x02, 0x28, 0x05, + 0x52, 0x0b, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x73, 0x44, 0x69, 0x72, 0x46, 0x64, 0x12, 0x10, 0x0a, + 0x03, 0x70, 0x69, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, 0x70, 0x69, 0x64, 0x12, + 0x23, 0x0a, 0x0d, 0x6c, 0x65, 0x61, 0x76, 0x65, 0x5f, 0x72, 0x75, 0x6e, 0x6e, 0x69, 0x6e, 0x67, + 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0c, 0x6c, 0x65, 0x61, 0x76, 0x65, 0x52, 0x75, 0x6e, + 0x6e, 0x69, 0x6e, 0x67, 0x12, 0x1e, 0x0a, 0x0b, 0x65, 0x78, 0x74, 0x5f, 0x75, 0x6e, 0x69, 0x78, + 0x5f, 0x73, 0x6b, 0x18, 0x04, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x65, 0x78, 0x74, 0x55, 0x6e, + 0x69, 0x78, 0x53, 0x6b, 0x12, 0x27, 0x0a, 0x0f, 0x74, 0x63, 0x70, 0x5f, 0x65, 0x73, 0x74, 0x61, + 0x62, 0x6c, 0x69, 0x73, 0x68, 0x65, 0x64, 0x18, 0x05, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0e, 0x74, + 0x63, 0x70, 0x45, 0x73, 0x74, 0x61, 0x62, 0x6c, 0x69, 0x73, 0x68, 0x65, 0x64, 0x12, 0x27, 0x0a, + 0x0f, 0x65, 0x76, 0x61, 0x73, 0x69, 0x76, 0x65, 0x5f, 0x64, 0x65, 0x76, 0x69, 0x63, 0x65, 0x73, + 0x18, 0x06, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0e, 0x65, 0x76, 0x61, 0x73, 0x69, 0x76, 0x65, 0x44, + 0x65, 0x76, 0x69, 0x63, 0x65, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x68, 0x65, 0x6c, 0x6c, 0x5f, + 0x6a, 0x6f, 0x62, 0x18, 0x07, 0x20, 0x01, 0x28, 0x08, 
0x52, 0x08, 0x73, 0x68, 0x65, 0x6c, 0x6c, + 0x4a, 0x6f, 0x62, 0x12, 0x1d, 0x0a, 0x0a, 0x66, 0x69, 0x6c, 0x65, 0x5f, 0x6c, 0x6f, 0x63, 0x6b, + 0x73, 0x18, 0x08, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x66, 0x69, 0x6c, 0x65, 0x4c, 0x6f, 0x63, + 0x6b, 0x73, 0x12, 0x1e, 0x0a, 0x09, 0x6c, 0x6f, 0x67, 0x5f, 0x6c, 0x65, 0x76, 0x65, 0x6c, 0x18, + 0x09, 0x20, 0x01, 0x28, 0x05, 0x3a, 0x01, 0x32, 0x52, 0x08, 0x6c, 0x6f, 0x67, 0x4c, 0x65, 0x76, + 0x65, 0x6c, 0x12, 0x19, 0x0a, 0x08, 0x6c, 0x6f, 0x67, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x18, 0x0a, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x6c, 0x6f, 0x67, 0x46, 0x69, 0x6c, 0x65, 0x12, 0x26, 0x0a, + 0x02, 0x70, 0x73, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x16, 0x2e, 0x63, 0x72, 0x69, 0x75, + 0x5f, 0x70, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x5f, 0x69, 0x6e, 0x66, + 0x6f, 0x52, 0x02, 0x70, 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x6e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x5f, + 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x73, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0d, 0x6e, + 0x6f, 0x74, 0x69, 0x66, 0x79, 0x53, 0x63, 0x72, 0x69, 0x70, 0x74, 0x73, 0x12, 0x12, 0x0a, 0x04, + 0x72, 0x6f, 0x6f, 0x74, 0x18, 0x0d, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x72, 0x6f, 0x6f, 0x74, + 0x12, 0x1d, 0x0a, 0x0a, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x5f, 0x69, 0x6d, 0x67, 0x18, 0x0e, + 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x49, 0x6d, 0x67, 0x12, + 0x1b, 0x0a, 0x09, 0x74, 0x72, 0x61, 0x63, 0x6b, 0x5f, 0x6d, 0x65, 0x6d, 0x18, 0x0f, 0x20, 0x01, + 0x28, 0x08, 0x52, 0x08, 0x74, 0x72, 0x61, 0x63, 0x6b, 0x4d, 0x65, 0x6d, 0x12, 0x1d, 0x0a, 0x0a, + 0x61, 0x75, 0x74, 0x6f, 0x5f, 0x64, 0x65, 0x64, 0x75, 0x70, 0x18, 0x10, 0x20, 0x01, 0x28, 0x08, + 0x52, 0x09, 0x61, 0x75, 0x74, 0x6f, 0x44, 0x65, 0x64, 0x75, 0x70, 0x12, 0x1e, 0x0a, 0x0b, 0x77, + 0x6f, 0x72, 0x6b, 0x5f, 0x64, 0x69, 0x72, 0x5f, 0x66, 0x64, 0x18, 0x11, 0x20, 0x01, 0x28, 0x05, + 0x52, 0x09, 0x77, 0x6f, 0x72, 0x6b, 0x44, 0x69, 0x72, 0x46, 0x64, 0x12, 0x1d, 0x0a, 0x0a, 
0x6c, + 0x69, 0x6e, 0x6b, 0x5f, 0x72, 0x65, 0x6d, 0x61, 0x70, 0x18, 0x12, 0x20, 0x01, 0x28, 0x08, 0x52, + 0x09, 0x6c, 0x69, 0x6e, 0x6b, 0x52, 0x65, 0x6d, 0x61, 0x70, 0x12, 0x25, 0x0a, 0x05, 0x76, 0x65, + 0x74, 0x68, 0x73, 0x18, 0x13, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0f, 0x2e, 0x63, 0x72, 0x69, 0x75, + 0x5f, 0x76, 0x65, 0x74, 0x68, 0x5f, 0x70, 0x61, 0x69, 0x72, 0x52, 0x05, 0x76, 0x65, 0x74, 0x68, + 0x73, 0x12, 0x23, 0x0a, 0x07, 0x63, 0x70, 0x75, 0x5f, 0x63, 0x61, 0x70, 0x18, 0x14, 0x20, 0x01, + 0x28, 0x0d, 0x3a, 0x0a, 0x34, 0x32, 0x39, 0x34, 0x39, 0x36, 0x37, 0x32, 0x39, 0x35, 0x52, 0x06, + 0x63, 0x70, 0x75, 0x43, 0x61, 0x70, 0x12, 0x1f, 0x0a, 0x0b, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x5f, + 0x69, 0x72, 0x6d, 0x61, 0x70, 0x18, 0x15, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0a, 0x66, 0x6f, 0x72, + 0x63, 0x65, 0x49, 0x72, 0x6d, 0x61, 0x70, 0x12, 0x19, 0x0a, 0x08, 0x65, 0x78, 0x65, 0x63, 0x5f, + 0x63, 0x6d, 0x64, 0x18, 0x16, 0x20, 0x03, 0x28, 0x09, 0x52, 0x07, 0x65, 0x78, 0x65, 0x63, 0x43, + 0x6d, 0x64, 0x12, 0x27, 0x0a, 0x07, 0x65, 0x78, 0x74, 0x5f, 0x6d, 0x6e, 0x74, 0x18, 0x17, 0x20, + 0x03, 0x28, 0x0b, 0x32, 0x0e, 0x2e, 0x65, 0x78, 0x74, 0x5f, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x5f, + 0x6d, 0x61, 0x70, 0x52, 0x06, 0x65, 0x78, 0x74, 0x4d, 0x6e, 0x74, 0x12, 0x25, 0x0a, 0x0e, 0x6d, + 0x61, 0x6e, 0x61, 0x67, 0x65, 0x5f, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x18, 0x18, 0x20, + 0x01, 0x28, 0x08, 0x52, 0x0d, 0x6d, 0x61, 0x6e, 0x61, 0x67, 0x65, 0x43, 0x67, 0x72, 0x6f, 0x75, + 0x70, 0x73, 0x12, 0x25, 0x0a, 0x07, 0x63, 0x67, 0x5f, 0x72, 0x6f, 0x6f, 0x74, 0x18, 0x19, 0x20, + 0x03, 0x28, 0x0b, 0x32, 0x0c, 0x2e, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x72, 0x6f, 0x6f, + 0x74, 0x52, 0x06, 0x63, 0x67, 0x52, 0x6f, 0x6f, 0x74, 0x12, 0x1f, 0x0a, 0x0b, 0x72, 0x73, 0x74, + 0x5f, 0x73, 0x69, 0x62, 0x6c, 0x69, 0x6e, 0x67, 0x18, 0x1a, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0a, + 0x72, 0x73, 0x74, 0x53, 0x69, 0x62, 0x6c, 0x69, 0x6e, 0x67, 0x12, 0x2a, 0x0a, 0x0a, 0x69, 0x6e, + 0x68, 0x65, 0x72, 0x69, 0x74, 
0x5f, 0x66, 0x64, 0x18, 0x1b, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0b, + 0x2e, 0x69, 0x6e, 0x68, 0x65, 0x72, 0x69, 0x74, 0x5f, 0x66, 0x64, 0x52, 0x09, 0x69, 0x6e, 0x68, + 0x65, 0x72, 0x69, 0x74, 0x46, 0x64, 0x12, 0x20, 0x0a, 0x0c, 0x61, 0x75, 0x74, 0x6f, 0x5f, 0x65, + 0x78, 0x74, 0x5f, 0x6d, 0x6e, 0x74, 0x18, 0x1c, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0a, 0x61, 0x75, + 0x74, 0x6f, 0x45, 0x78, 0x74, 0x4d, 0x6e, 0x74, 0x12, 0x1f, 0x0a, 0x0b, 0x65, 0x78, 0x74, 0x5f, + 0x73, 0x68, 0x61, 0x72, 0x69, 0x6e, 0x67, 0x18, 0x1d, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0a, 0x65, + 0x78, 0x74, 0x53, 0x68, 0x61, 0x72, 0x69, 0x6e, 0x67, 0x12, 0x1f, 0x0a, 0x0b, 0x65, 0x78, 0x74, + 0x5f, 0x6d, 0x61, 0x73, 0x74, 0x65, 0x72, 0x73, 0x18, 0x1e, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0a, + 0x65, 0x78, 0x74, 0x4d, 0x61, 0x73, 0x74, 0x65, 0x72, 0x73, 0x12, 0x19, 0x0a, 0x08, 0x73, 0x6b, + 0x69, 0x70, 0x5f, 0x6d, 0x6e, 0x74, 0x18, 0x1f, 0x20, 0x03, 0x28, 0x09, 0x52, 0x07, 0x73, 0x6b, + 0x69, 0x70, 0x4d, 0x6e, 0x74, 0x12, 0x1b, 0x0a, 0x09, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, 0x5f, + 0x66, 0x73, 0x18, 0x20, 0x20, 0x03, 0x28, 0x09, 0x52, 0x08, 0x65, 0x6e, 0x61, 0x62, 0x6c, 0x65, + 0x46, 0x73, 0x12, 0x28, 0x0a, 0x0b, 0x75, 0x6e, 0x69, 0x78, 0x5f, 0x73, 0x6b, 0x5f, 0x69, 0x6e, + 0x6f, 0x18, 0x21, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x08, 0x2e, 0x75, 0x6e, 0x69, 0x78, 0x5f, 0x73, + 0x6b, 0x52, 0x09, 0x75, 0x6e, 0x69, 0x78, 0x53, 0x6b, 0x49, 0x6e, 0x6f, 0x12, 0x3d, 0x0a, 0x13, + 0x6d, 0x61, 0x6e, 0x61, 0x67, 0x65, 0x5f, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x5f, 0x6d, + 0x6f, 0x64, 0x65, 0x18, 0x22, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x0d, 0x2e, 0x63, 0x72, 0x69, 0x75, + 0x5f, 0x63, 0x67, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x52, 0x11, 0x6d, 0x61, 0x6e, 0x61, 0x67, 0x65, + 0x43, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x73, 0x4d, 0x6f, 0x64, 0x65, 0x12, 0x28, 0x0a, 0x0b, 0x67, + 0x68, 0x6f, 0x73, 0x74, 0x5f, 0x6c, 0x69, 0x6d, 0x69, 0x74, 0x18, 0x23, 0x20, 0x01, 0x28, 0x0d, + 0x3a, 0x07, 0x31, 0x30, 0x34, 0x38, 0x35, 0x37, 0x36, 0x52, 0x0a, 
0x67, 0x68, 0x6f, 0x73, 0x74, + 0x4c, 0x69, 0x6d, 0x69, 0x74, 0x12, 0x28, 0x0a, 0x10, 0x69, 0x72, 0x6d, 0x61, 0x70, 0x5f, 0x73, + 0x63, 0x61, 0x6e, 0x5f, 0x70, 0x61, 0x74, 0x68, 0x73, 0x18, 0x24, 0x20, 0x03, 0x28, 0x09, 0x52, + 0x0e, 0x69, 0x72, 0x6d, 0x61, 0x70, 0x53, 0x63, 0x61, 0x6e, 0x50, 0x61, 0x74, 0x68, 0x73, 0x12, + 0x1a, 0x0a, 0x08, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x61, 0x6c, 0x18, 0x25, 0x20, 0x03, 0x28, + 0x09, 0x52, 0x08, 0x65, 0x78, 0x74, 0x65, 0x72, 0x6e, 0x61, 0x6c, 0x12, 0x19, 0x0a, 0x08, 0x65, + 0x6d, 0x70, 0x74, 0x79, 0x5f, 0x6e, 0x73, 0x18, 0x26, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x65, + 0x6d, 0x70, 0x74, 0x79, 0x4e, 0x73, 0x12, 0x28, 0x0a, 0x07, 0x6a, 0x6f, 0x69, 0x6e, 0x5f, 0x6e, + 0x73, 0x18, 0x27, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0f, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x5f, 0x6e, + 0x61, 0x6d, 0x65, 0x73, 0x70, 0x61, 0x63, 0x65, 0x52, 0x06, 0x6a, 0x6f, 0x69, 0x6e, 0x4e, 0x73, + 0x12, 0x21, 0x0a, 0x0c, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x70, 0x72, 0x6f, 0x70, 0x73, + 0x18, 0x29, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0b, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x50, 0x72, + 0x6f, 0x70, 0x73, 0x12, 0x2a, 0x0a, 0x11, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x70, 0x72, + 0x6f, 0x70, 0x73, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x18, 0x2a, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0f, + 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x46, 0x69, 0x6c, 0x65, 0x12, + 0x34, 0x0a, 0x16, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5f, 0x64, 0x75, 0x6d, 0x70, 0x5f, 0x63, + 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x18, 0x2b, 0x20, 0x03, 0x28, 0x09, 0x52, + 0x14, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x44, 0x75, 0x6d, 0x70, 0x43, 0x6f, 0x6e, 0x74, 0x72, + 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x12, 0x23, 0x0a, 0x0d, 0x66, 0x72, 0x65, 0x65, 0x7a, 0x65, 0x5f, + 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x18, 0x2c, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x66, 0x72, + 0x65, 0x65, 0x7a, 0x65, 0x43, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x12, 0x18, 0x0a, 0x07, 0x74, 0x69, + 0x6d, 
0x65, 0x6f, 0x75, 0x74, 0x18, 0x2d, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x74, 0x69, 0x6d, + 0x65, 0x6f, 0x75, 0x74, 0x12, 0x2b, 0x0a, 0x12, 0x74, 0x63, 0x70, 0x5f, 0x73, 0x6b, 0x69, 0x70, + 0x5f, 0x69, 0x6e, 0x5f, 0x66, 0x6c, 0x69, 0x67, 0x68, 0x74, 0x18, 0x2e, 0x20, 0x01, 0x28, 0x08, + 0x52, 0x0f, 0x74, 0x63, 0x70, 0x53, 0x6b, 0x69, 0x70, 0x49, 0x6e, 0x46, 0x6c, 0x69, 0x67, 0x68, + 0x74, 0x12, 0x21, 0x0a, 0x0c, 0x77, 0x65, 0x61, 0x6b, 0x5f, 0x73, 0x79, 0x73, 0x63, 0x74, 0x6c, + 0x73, 0x18, 0x2f, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0b, 0x77, 0x65, 0x61, 0x6b, 0x53, 0x79, 0x73, + 0x63, 0x74, 0x6c, 0x73, 0x12, 0x1d, 0x0a, 0x0a, 0x6c, 0x61, 0x7a, 0x79, 0x5f, 0x70, 0x61, 0x67, + 0x65, 0x73, 0x18, 0x30, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x6c, 0x61, 0x7a, 0x79, 0x50, 0x61, + 0x67, 0x65, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x5f, 0x66, 0x64, + 0x18, 0x31, 0x20, 0x01, 0x28, 0x05, 0x52, 0x08, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x46, 0x64, + 0x12, 0x2a, 0x0a, 0x11, 0x6f, 0x72, 0x70, 0x68, 0x61, 0x6e, 0x5f, 0x70, 0x74, 0x73, 0x5f, 0x6d, + 0x61, 0x73, 0x74, 0x65, 0x72, 0x18, 0x32, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0f, 0x6f, 0x72, 0x70, + 0x68, 0x61, 0x6e, 0x50, 0x74, 0x73, 0x4d, 0x61, 0x73, 0x74, 0x65, 0x72, 0x12, 0x1f, 0x0a, 0x0b, + 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x5f, 0x66, 0x69, 0x6c, 0x65, 0x18, 0x33, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x0a, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, 0x69, 0x6c, 0x65, 0x12, 0x1b, 0x0a, + 0x09, 0x74, 0x63, 0x70, 0x5f, 0x63, 0x6c, 0x6f, 0x73, 0x65, 0x18, 0x34, 0x20, 0x01, 0x28, 0x08, + 0x52, 0x08, 0x74, 0x63, 0x70, 0x43, 0x6c, 0x6f, 0x73, 0x65, 0x12, 0x1f, 0x0a, 0x0b, 0x6c, 0x73, + 0x6d, 0x5f, 0x70, 0x72, 0x6f, 0x66, 0x69, 0x6c, 0x65, 0x18, 0x35, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x0a, 0x6c, 0x73, 0x6d, 0x50, 0x72, 0x6f, 0x66, 0x69, 0x6c, 0x65, 0x12, 0x1d, 0x0a, 0x0a, 0x74, + 0x6c, 0x73, 0x5f, 0x63, 0x61, 0x63, 0x65, 0x72, 0x74, 0x18, 0x36, 0x20, 0x01, 0x28, 0x09, 0x52, + 0x09, 0x74, 0x6c, 0x73, 0x43, 0x61, 0x63, 
0x65, 0x72, 0x74, 0x12, 0x1b, 0x0a, 0x09, 0x74, 0x6c, + 0x73, 0x5f, 0x63, 0x61, 0x63, 0x72, 0x6c, 0x18, 0x37, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x74, + 0x6c, 0x73, 0x43, 0x61, 0x63, 0x72, 0x6c, 0x12, 0x19, 0x0a, 0x08, 0x74, 0x6c, 0x73, 0x5f, 0x63, + 0x65, 0x72, 0x74, 0x18, 0x38, 0x20, 0x01, 0x28, 0x09, 0x52, 0x07, 0x74, 0x6c, 0x73, 0x43, 0x65, + 0x72, 0x74, 0x12, 0x17, 0x0a, 0x07, 0x74, 0x6c, 0x73, 0x5f, 0x6b, 0x65, 0x79, 0x18, 0x39, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x06, 0x74, 0x6c, 0x73, 0x4b, 0x65, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x74, + 0x6c, 0x73, 0x18, 0x3a, 0x20, 0x01, 0x28, 0x08, 0x52, 0x03, 0x74, 0x6c, 0x73, 0x12, 0x27, 0x0a, + 0x10, 0x74, 0x6c, 0x73, 0x5f, 0x6e, 0x6f, 0x5f, 0x63, 0x6e, 0x5f, 0x76, 0x65, 0x72, 0x69, 0x66, + 0x79, 0x18, 0x3b, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0d, 0x74, 0x6c, 0x73, 0x4e, 0x6f, 0x43, 0x6e, + 0x56, 0x65, 0x72, 0x69, 0x66, 0x79, 0x12, 0x1f, 0x0a, 0x0b, 0x63, 0x67, 0x72, 0x6f, 0x75, 0x70, + 0x5f, 0x79, 0x61, 0x72, 0x64, 0x18, 0x3c, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x63, 0x67, 0x72, + 0x6f, 0x75, 0x70, 0x59, 0x61, 0x72, 0x64, 0x12, 0x3f, 0x0a, 0x0d, 0x70, 0x72, 0x65, 0x5f, 0x64, + 0x75, 0x6d, 0x70, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x18, 0x3d, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x13, + 0x2e, 0x63, 0x72, 0x69, 0x75, 0x5f, 0x70, 0x72, 0x65, 0x5f, 0x64, 0x75, 0x6d, 0x70, 0x5f, 0x6d, + 0x6f, 0x64, 0x65, 0x3a, 0x06, 0x53, 0x50, 0x4c, 0x49, 0x43, 0x45, 0x52, 0x0b, 0x70, 0x72, 0x65, + 0x44, 0x75, 0x6d, 0x70, 0x4d, 0x6f, 0x64, 0x65, 0x12, 0x24, 0x0a, 0x0e, 0x70, 0x69, 0x64, 0x66, + 0x64, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x73, 0x6b, 0x18, 0x3e, 0x20, 0x01, 0x28, 0x05, + 0x52, 0x0c, 0x70, 0x69, 0x64, 0x66, 0x64, 0x53, 0x74, 0x6f, 0x72, 0x65, 0x53, 0x6b, 0x12, 0x2a, + 0x0a, 0x11, 0x6c, 0x73, 0x6d, 0x5f, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x5f, 0x63, 0x6f, 0x6e, 0x74, + 0x65, 0x78, 0x74, 0x18, 0x3f, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0f, 0x6c, 0x73, 0x6d, 0x4d, 0x6f, + 0x75, 0x6e, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x22, 0x2c, 0x0a, 
0x0e, 0x63, 0x72, + 0x69, 0x75, 0x5f, 0x64, 0x75, 0x6d, 0x70, 0x5f, 0x72, 0x65, 0x73, 0x70, 0x12, 0x1a, 0x0a, 0x08, + 0x72, 0x65, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x64, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x08, + 0x72, 0x65, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x64, 0x22, 0x25, 0x0a, 0x11, 0x63, 0x72, 0x69, 0x75, + 0x5f, 0x72, 0x65, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x72, 0x65, 0x73, 0x70, 0x12, 0x10, 0x0a, + 0x03, 0x70, 0x69, 0x64, 0x18, 0x01, 0x20, 0x02, 0x28, 0x05, 0x52, 0x03, 0x70, 0x69, 0x64, 0x22, + 0x37, 0x0a, 0x0b, 0x63, 0x72, 0x69, 0x75, 0x5f, 0x6e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x12, 0x16, + 0x0a, 0x06, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, + 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x12, 0x10, 0x0a, 0x03, 0x70, 0x69, 0x64, 0x18, 0x02, 0x20, + 0x01, 0x28, 0x05, 0x52, 0x03, 0x70, 0x69, 0x64, 0x22, 0x6c, 0x0a, 0x0d, 0x63, 0x72, 0x69, 0x75, + 0x5f, 0x66, 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x12, 0x1b, 0x0a, 0x09, 0x6d, 0x65, 0x6d, + 0x5f, 0x74, 0x72, 0x61, 0x63, 0x6b, 0x18, 0x01, 0x20, 0x01, 0x28, 0x08, 0x52, 0x08, 0x6d, 0x65, + 0x6d, 0x54, 0x72, 0x61, 0x63, 0x6b, 0x12, 0x1d, 0x0a, 0x0a, 0x6c, 0x61, 0x7a, 0x79, 0x5f, 0x70, + 0x61, 0x67, 0x65, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x09, 0x6c, 0x61, 0x7a, 0x79, + 0x50, 0x61, 0x67, 0x65, 0x73, 0x12, 0x1f, 0x0a, 0x0b, 0x70, 0x69, 0x64, 0x66, 0x64, 0x5f, 0x73, + 0x74, 0x6f, 0x72, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x52, 0x0a, 0x70, 0x69, 0x64, 0x66, + 0x64, 0x53, 0x74, 0x6f, 0x72, 0x65, 0x22, 0xd0, 0x01, 0x0a, 0x08, 0x63, 0x72, 0x69, 0x75, 0x5f, + 0x72, 0x65, 0x71, 0x12, 0x22, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, 0x01, 0x20, 0x02, 0x28, + 0x0e, 0x32, 0x0e, 0x2e, 0x63, 0x72, 0x69, 0x75, 0x5f, 0x72, 0x65, 0x71, 0x5f, 0x74, 0x79, 0x70, + 0x65, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x12, 0x1e, 0x0a, 0x04, 0x6f, 0x70, 0x74, 0x73, 0x18, + 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0a, 0x2e, 0x63, 0x72, 0x69, 0x75, 0x5f, 0x6f, 0x70, 0x74, + 0x73, 0x52, 0x04, 
0x6f, 0x70, 0x74, 0x73, 0x12, 0x25, 0x0a, 0x0e, 0x6e, 0x6f, 0x74, 0x69, 0x66, + 0x79, 0x5f, 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x52, + 0x0d, 0x6e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x53, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x12, 0x1b, + 0x0a, 0x09, 0x6b, 0x65, 0x65, 0x70, 0x5f, 0x6f, 0x70, 0x65, 0x6e, 0x18, 0x04, 0x20, 0x01, 0x28, + 0x08, 0x52, 0x08, 0x6b, 0x65, 0x65, 0x70, 0x4f, 0x70, 0x65, 0x6e, 0x12, 0x2a, 0x0a, 0x08, 0x66, + 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0e, 0x2e, + 0x63, 0x72, 0x69, 0x75, 0x5f, 0x66, 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x52, 0x08, 0x66, + 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x12, 0x10, 0x0a, 0x03, 0x70, 0x69, 0x64, 0x18, 0x06, + 0x20, 0x01, 0x28, 0x0d, 0x52, 0x03, 0x70, 0x69, 0x64, 0x22, 0x8f, 0x03, 0x0a, 0x09, 0x63, 0x72, + 0x69, 0x75, 0x5f, 0x72, 0x65, 0x73, 0x70, 0x12, 0x22, 0x0a, 0x04, 0x74, 0x79, 0x70, 0x65, 0x18, + 0x01, 0x20, 0x02, 0x28, 0x0e, 0x32, 0x0e, 0x2e, 0x63, 0x72, 0x69, 0x75, 0x5f, 0x72, 0x65, 0x71, + 0x5f, 0x74, 0x79, 0x70, 0x65, 0x52, 0x04, 0x74, 0x79, 0x70, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x73, + 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x18, 0x02, 0x20, 0x02, 0x28, 0x08, 0x52, 0x07, 0x73, 0x75, + 0x63, 0x63, 0x65, 0x73, 0x73, 0x12, 0x23, 0x0a, 0x04, 0x64, 0x75, 0x6d, 0x70, 0x18, 0x03, 0x20, + 0x01, 0x28, 0x0b, 0x32, 0x0f, 0x2e, 0x63, 0x72, 0x69, 0x75, 0x5f, 0x64, 0x75, 0x6d, 0x70, 0x5f, + 0x72, 0x65, 0x73, 0x70, 0x52, 0x04, 0x64, 0x75, 0x6d, 0x70, 0x12, 0x2c, 0x0a, 0x07, 0x72, 0x65, + 0x73, 0x74, 0x6f, 0x72, 0x65, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x12, 0x2e, 0x63, 0x72, + 0x69, 0x75, 0x5f, 0x72, 0x65, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x5f, 0x72, 0x65, 0x73, 0x70, 0x52, + 0x07, 0x72, 0x65, 0x73, 0x74, 0x6f, 0x72, 0x65, 0x12, 0x24, 0x0a, 0x06, 0x6e, 0x6f, 0x74, 0x69, + 0x66, 0x79, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0c, 0x2e, 0x63, 0x72, 0x69, 0x75, 0x5f, + 0x6e, 0x6f, 0x74, 0x69, 0x66, 0x79, 0x52, 0x06, 0x6e, 
0x6f, 0x74, 0x69, 0x66, 0x79, 0x12, 0x26, + 0x0a, 0x02, 0x70, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x16, 0x2e, 0x63, 0x72, 0x69, + 0x75, 0x5f, 0x70, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x5f, 0x69, 0x6e, + 0x66, 0x6f, 0x52, 0x02, 0x70, 0x73, 0x12, 0x19, 0x0a, 0x08, 0x63, 0x72, 0x5f, 0x65, 0x72, 0x72, + 0x6e, 0x6f, 0x18, 0x07, 0x20, 0x01, 0x28, 0x05, 0x52, 0x07, 0x63, 0x72, 0x45, 0x72, 0x72, 0x6e, + 0x6f, 0x12, 0x2a, 0x0a, 0x08, 0x66, 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x18, 0x08, 0x20, + 0x01, 0x28, 0x0b, 0x32, 0x0e, 0x2e, 0x63, 0x72, 0x69, 0x75, 0x5f, 0x66, 0x65, 0x61, 0x74, 0x75, + 0x72, 0x65, 0x73, 0x52, 0x08, 0x66, 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x12, 0x1b, 0x0a, + 0x09, 0x63, 0x72, 0x5f, 0x65, 0x72, 0x72, 0x6d, 0x73, 0x67, 0x18, 0x09, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x08, 0x63, 0x72, 0x45, 0x72, 0x72, 0x6d, 0x73, 0x67, 0x12, 0x27, 0x0a, 0x07, 0x76, 0x65, + 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x0a, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x0d, 0x2e, 0x63, 0x72, + 0x69, 0x75, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, + 0x69, 0x6f, 0x6e, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x0b, 0x20, + 0x01, 0x28, 0x05, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x22, 0xb0, 0x01, 0x0a, 0x0c, + 0x63, 0x72, 0x69, 0x75, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x21, 0x0a, 0x0c, + 0x6d, 0x61, 0x6a, 0x6f, 0x72, 0x5f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, 0x01, 0x20, 0x02, + 0x28, 0x05, 0x52, 0x0b, 0x6d, 0x61, 0x6a, 0x6f, 0x72, 0x4e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x12, + 0x21, 0x0a, 0x0c, 0x6d, 0x69, 0x6e, 0x6f, 0x72, 0x5f, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x18, + 0x02, 0x20, 0x02, 0x28, 0x05, 0x52, 0x0b, 0x6d, 0x69, 0x6e, 0x6f, 0x72, 0x4e, 0x75, 0x6d, 0x62, + 0x65, 0x72, 0x12, 0x14, 0x0a, 0x05, 0x67, 0x69, 0x74, 0x69, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x05, 0x67, 0x69, 0x74, 0x69, 0x64, 0x12, 0x1a, 0x0a, 0x08, 0x73, 0x75, 0x62, 
0x6c, + 0x65, 0x76, 0x65, 0x6c, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x08, 0x73, 0x75, 0x62, 0x6c, + 0x65, 0x76, 0x65, 0x6c, 0x12, 0x14, 0x0a, 0x05, 0x65, 0x78, 0x74, 0x72, 0x61, 0x18, 0x05, 0x20, + 0x01, 0x28, 0x05, 0x52, 0x05, 0x65, 0x78, 0x74, 0x72, 0x61, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, + 0x6d, 0x65, 0x18, 0x06, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x2a, 0x5f, + 0x0a, 0x0c, 0x63, 0x72, 0x69, 0x75, 0x5f, 0x63, 0x67, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x12, 0x0a, + 0x0a, 0x06, 0x49, 0x47, 0x4e, 0x4f, 0x52, 0x45, 0x10, 0x00, 0x12, 0x0b, 0x0a, 0x07, 0x43, 0x47, + 0x5f, 0x4e, 0x4f, 0x4e, 0x45, 0x10, 0x01, 0x12, 0x09, 0x0a, 0x05, 0x50, 0x52, 0x4f, 0x50, 0x53, + 0x10, 0x02, 0x12, 0x08, 0x0a, 0x04, 0x53, 0x4f, 0x46, 0x54, 0x10, 0x03, 0x12, 0x08, 0x0a, 0x04, + 0x46, 0x55, 0x4c, 0x4c, 0x10, 0x04, 0x12, 0x0a, 0x0a, 0x06, 0x53, 0x54, 0x52, 0x49, 0x43, 0x54, + 0x10, 0x05, 0x12, 0x0b, 0x0a, 0x07, 0x44, 0x45, 0x46, 0x41, 0x55, 0x4c, 0x54, 0x10, 0x06, 0x2a, + 0x2d, 0x0a, 0x12, 0x63, 0x72, 0x69, 0x75, 0x5f, 0x70, 0x72, 0x65, 0x5f, 0x64, 0x75, 0x6d, 0x70, + 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x12, 0x0a, 0x0a, 0x06, 0x53, 0x50, 0x4c, 0x49, 0x43, 0x45, 0x10, + 0x01, 0x12, 0x0b, 0x0a, 0x07, 0x56, 0x4d, 0x5f, 0x52, 0x45, 0x41, 0x44, 0x10, 0x02, 0x2a, 0xd0, + 0x01, 0x0a, 0x0d, 0x63, 0x72, 0x69, 0x75, 0x5f, 0x72, 0x65, 0x71, 0x5f, 0x74, 0x79, 0x70, 0x65, + 0x12, 0x09, 0x0a, 0x05, 0x45, 0x4d, 0x50, 0x54, 0x59, 0x10, 0x00, 0x12, 0x08, 0x0a, 0x04, 0x44, + 0x55, 0x4d, 0x50, 0x10, 0x01, 0x12, 0x0b, 0x0a, 0x07, 0x52, 0x45, 0x53, 0x54, 0x4f, 0x52, 0x45, + 0x10, 0x02, 0x12, 0x09, 0x0a, 0x05, 0x43, 0x48, 0x45, 0x43, 0x4b, 0x10, 0x03, 0x12, 0x0c, 0x0a, + 0x08, 0x50, 0x52, 0x45, 0x5f, 0x44, 0x55, 0x4d, 0x50, 0x10, 0x04, 0x12, 0x0f, 0x0a, 0x0b, 0x50, + 0x41, 0x47, 0x45, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x10, 0x05, 0x12, 0x0a, 0x0a, 0x06, + 0x4e, 0x4f, 0x54, 0x49, 0x46, 0x59, 0x10, 0x06, 0x12, 0x10, 0x0a, 0x0c, 0x43, 0x50, 0x55, 0x49, + 0x4e, 0x46, 0x4f, 0x5f, 0x44, 
0x55, 0x4d, 0x50, 0x10, 0x07, 0x12, 0x11, 0x0a, 0x0d, 0x43, 0x50, + 0x55, 0x49, 0x4e, 0x46, 0x4f, 0x5f, 0x43, 0x48, 0x45, 0x43, 0x4b, 0x10, 0x08, 0x12, 0x11, 0x0a, + 0x0d, 0x46, 0x45, 0x41, 0x54, 0x55, 0x52, 0x45, 0x5f, 0x43, 0x48, 0x45, 0x43, 0x4b, 0x10, 0x09, + 0x12, 0x0b, 0x0a, 0x07, 0x56, 0x45, 0x52, 0x53, 0x49, 0x4f, 0x4e, 0x10, 0x0a, 0x12, 0x0c, 0x0a, + 0x08, 0x57, 0x41, 0x49, 0x54, 0x5f, 0x50, 0x49, 0x44, 0x10, 0x0b, 0x12, 0x14, 0x0a, 0x10, 0x50, + 0x41, 0x47, 0x45, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x5f, 0x43, 0x48, 0x4c, 0x44, 0x10, + 0x0c, +} + +var ( + file_rpc_rpc_proto_rawDescOnce sync.Once + file_rpc_rpc_proto_rawDescData = file_rpc_rpc_proto_rawDesc +) + +func file_rpc_rpc_proto_rawDescGZIP() []byte { + file_rpc_rpc_proto_rawDescOnce.Do(func() { + file_rpc_rpc_proto_rawDescData = protoimpl.X.CompressGZIP(file_rpc_rpc_proto_rawDescData) + }) + return file_rpc_rpc_proto_rawDescData +} + +var file_rpc_rpc_proto_enumTypes = make([]protoimpl.EnumInfo, 3) +var file_rpc_rpc_proto_msgTypes = make([]protoimpl.MessageInfo, 15) +var file_rpc_rpc_proto_goTypes = []interface{}{ + (CriuCgMode)(0), // 0: criu_cg_mode + (CriuPreDumpMode)(0), // 1: criu_pre_dump_mode + (CriuReqType)(0), // 2: criu_req_type + (*CriuPageServerInfo)(nil), // 3: criu_page_server_info + (*CriuVethPair)(nil), // 4: criu_veth_pair + (*ExtMountMap)(nil), // 5: ext_mount_map + (*JoinNamespace)(nil), // 6: join_namespace + (*InheritFd)(nil), // 7: inherit_fd + (*CgroupRoot)(nil), // 8: cgroup_root + (*UnixSk)(nil), // 9: unix_sk + (*CriuOpts)(nil), // 10: criu_opts + (*CriuDumpResp)(nil), // 11: criu_dump_resp + (*CriuRestoreResp)(nil), // 12: criu_restore_resp + (*CriuNotify)(nil), // 13: criu_notify + (*CriuFeatures)(nil), // 14: criu_features + (*CriuReq)(nil), // 15: criu_req + (*CriuResp)(nil), // 16: criu_resp + (*CriuVersion)(nil), // 17: criu_version +} +var file_rpc_rpc_proto_depIdxs = []int32{ + 3, // 0: criu_opts.ps:type_name -> criu_page_server_info + 4, // 1: 
criu_opts.veths:type_name -> criu_veth_pair + 5, // 2: criu_opts.ext_mnt:type_name -> ext_mount_map + 8, // 3: criu_opts.cg_root:type_name -> cgroup_root + 7, // 4: criu_opts.inherit_fd:type_name -> inherit_fd + 9, // 5: criu_opts.unix_sk_ino:type_name -> unix_sk + 0, // 6: criu_opts.manage_cgroups_mode:type_name -> criu_cg_mode + 6, // 7: criu_opts.join_ns:type_name -> join_namespace + 1, // 8: criu_opts.pre_dump_mode:type_name -> criu_pre_dump_mode + 2, // 9: criu_req.type:type_name -> criu_req_type + 10, // 10: criu_req.opts:type_name -> criu_opts + 14, // 11: criu_req.features:type_name -> criu_features + 2, // 12: criu_resp.type:type_name -> criu_req_type + 11, // 13: criu_resp.dump:type_name -> criu_dump_resp + 12, // 14: criu_resp.restore:type_name -> criu_restore_resp + 13, // 15: criu_resp.notify:type_name -> criu_notify + 3, // 16: criu_resp.ps:type_name -> criu_page_server_info + 14, // 17: criu_resp.features:type_name -> criu_features + 17, // 18: criu_resp.version:type_name -> criu_version + 19, // [19:19] is the sub-list for method output_type + 19, // [19:19] is the sub-list for method input_type + 19, // [19:19] is the sub-list for extension type_name + 19, // [19:19] is the sub-list for extension extendee + 0, // [0:19] is the sub-list for field type_name +} + +func init() { file_rpc_rpc_proto_init() } +func file_rpc_rpc_proto_init() { + if File_rpc_rpc_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_rpc_rpc_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CriuPageServerInfo); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_rpc_rpc_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CriuVethPair); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + 
file_rpc_rpc_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*ExtMountMap); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_rpc_rpc_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*JoinNamespace); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_rpc_rpc_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*InheritFd); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_rpc_rpc_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CgroupRoot); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_rpc_rpc_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*UnixSk); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_rpc_rpc_proto_msgTypes[7].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CriuOpts); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_rpc_rpc_proto_msgTypes[8].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CriuDumpResp); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_rpc_rpc_proto_msgTypes[9].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CriuRestoreResp); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + 
file_rpc_rpc_proto_msgTypes[10].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CriuNotify); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_rpc_rpc_proto_msgTypes[11].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CriuFeatures); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_rpc_rpc_proto_msgTypes[12].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CriuReq); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_rpc_rpc_proto_msgTypes[13].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CriuResp); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_rpc_rpc_proto_msgTypes[14].Exporter = func(v interface{}, i int) interface{} { + switch v := v.(*CriuVersion); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_rpc_rpc_proto_rawDesc, + NumEnums: 3, + NumMessages: 15, + NumExtensions: 0, + NumServices: 0, + }, + GoTypes: file_rpc_rpc_proto_goTypes, + DependencyIndexes: file_rpc_rpc_proto_depIdxs, + EnumInfos: file_rpc_rpc_proto_enumTypes, + MessageInfos: file_rpc_rpc_proto_msgTypes, + }.Build() + File_rpc_rpc_proto = out.File + file_rpc_rpc_proto_rawDesc = nil + file_rpc_rpc_proto_goTypes = nil + file_rpc_rpc_proto_depIdxs = nil +} diff --git a/vendor/github.com/checkpoint-restore/go-criu/v5/rpc/rpc.proto b/vendor/github.com/checkpoint-restore/go-criu/v5/rpc/rpc.proto new file mode 100644 index 
0000000..61e1b24 --- /dev/null +++ b/vendor/github.com/checkpoint-restore/go-criu/v5/rpc/rpc.proto @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +message criu_page_server_info { + optional string address = 1; + optional int32 port = 2; + optional int32 pid = 3; + optional int32 fd = 4; +} + +message criu_veth_pair { + required string if_in = 1; + required string if_out = 2; +}; + +message ext_mount_map { + required string key = 1; + required string val = 2; +}; + +message join_namespace { + required string ns = 1; + required string ns_file = 2; + optional string extra_opt = 3; +} + +message inherit_fd { + required string key = 1; + required int32 fd = 2; +}; + +message cgroup_root { + optional string ctrl = 1; + required string path = 2; +}; + +message unix_sk { + required uint32 inode = 1; +}; + +enum criu_cg_mode { + IGNORE = 0; + CG_NONE = 1; + PROPS = 2; + SOFT = 3; + FULL = 4; + STRICT = 5; + DEFAULT = 6; +}; + +enum criu_pre_dump_mode { + SPLICE = 1; + VM_READ = 2; +}; + +message criu_opts { + required int32 images_dir_fd = 1; + optional int32 pid = 2; /* if not set on dump, will dump requesting process */ + + optional bool leave_running = 3; + optional bool ext_unix_sk = 4; + optional bool tcp_established = 5; + optional bool evasive_devices = 6; + optional bool shell_job = 7; + optional bool file_locks = 8; + optional int32 log_level = 9 [default = 2]; + optional string log_file = 10; /* No subdirs are allowed. 
Consider using work-dir */ + + optional criu_page_server_info ps = 11; + + optional bool notify_scripts = 12; + + optional string root = 13; + optional string parent_img = 14; + optional bool track_mem = 15; + optional bool auto_dedup = 16; + + optional int32 work_dir_fd = 17; + optional bool link_remap = 18; + repeated criu_veth_pair veths = 19; /* DEPRECATED, use external instead */ + + optional uint32 cpu_cap = 20 [default = 0xffffffff]; + optional bool force_irmap = 21; + repeated string exec_cmd = 22; + + repeated ext_mount_map ext_mnt = 23; /* DEPRECATED, use external instead */ + optional bool manage_cgroups = 24; /* backward compatibility */ + repeated cgroup_root cg_root = 25; + + optional bool rst_sibling = 26; /* swrk only */ + repeated inherit_fd inherit_fd = 27; /* swrk only */ + + optional bool auto_ext_mnt = 28; + optional bool ext_sharing = 29; + optional bool ext_masters = 30; + + repeated string skip_mnt = 31; + repeated string enable_fs = 32; + + repeated unix_sk unix_sk_ino = 33; /* DEPRECATED, use external instead */ + + optional criu_cg_mode manage_cgroups_mode = 34; + optional uint32 ghost_limit = 35 [default = 0x100000]; + repeated string irmap_scan_paths = 36; + repeated string external = 37; + optional uint32 empty_ns = 38; + repeated join_namespace join_ns = 39; + + optional string cgroup_props = 41; + optional string cgroup_props_file = 42; + repeated string cgroup_dump_controller = 43; + + optional string freeze_cgroup = 44; + optional uint32 timeout = 45; + optional bool tcp_skip_in_flight = 46; + optional bool weak_sysctls = 47; + optional bool lazy_pages = 48; + optional int32 status_fd = 49; + optional bool orphan_pts_master = 50; + optional string config_file = 51; + optional bool tcp_close = 52; + optional string lsm_profile = 53; + optional string tls_cacert = 54; + optional string tls_cacrl = 55; + optional string tls_cert = 56; + optional string tls_key = 57; + optional bool tls = 58; + optional bool tls_no_cn_verify = 59; + 
optional string cgroup_yard = 60; + optional criu_pre_dump_mode pre_dump_mode = 61 [default = SPLICE]; + optional int32 pidfd_store_sk = 62; + optional string lsm_mount_context = 63; +/* optional bool check_mounts = 128; */ +} + +message criu_dump_resp { + optional bool restored = 1; +} + +message criu_restore_resp { + required int32 pid = 1; +} + +message criu_notify { + optional string script = 1; + optional int32 pid = 2; +} + +enum criu_req_type { + EMPTY = 0; + DUMP = 1; + RESTORE = 2; + CHECK = 3; + PRE_DUMP = 4; + PAGE_SERVER = 5; + + NOTIFY = 6; + + CPUINFO_DUMP = 7; + CPUINFO_CHECK = 8; + + FEATURE_CHECK = 9; + + VERSION = 10; + + WAIT_PID = 11; + PAGE_SERVER_CHLD = 12; +} + +/* + * List of features which can queried via + * CRIU_REQ_TYPE__FEATURE_CHECK + */ +message criu_features { + optional bool mem_track = 1; + optional bool lazy_pages = 2; + optional bool pidfd_store = 3; +} + +/* + * Request -- each type corresponds to must-be-there + * request arguments of respective type + */ + +message criu_req { + required criu_req_type type = 1; + + optional criu_opts opts = 2; + optional bool notify_success = 3; + + /* + * When set service won't close the connection but + * will wait for more req-s to appear. Works not + * for all request types. + */ + optional bool keep_open = 4; + /* + * 'features' can be used to query which features + * are supported by the installed criu/kernel + * via RPC. 
+ */ + optional criu_features features = 5; + + /* 'pid' is used for WAIT_PID */ + optional uint32 pid = 6; +} + +/* + * Response -- it states whether the request was served + * and additional request-specific information + */ + +message criu_resp { + required criu_req_type type = 1; + required bool success = 2; + + optional criu_dump_resp dump = 3; + optional criu_restore_resp restore = 4; + optional criu_notify notify = 5; + optional criu_page_server_info ps = 6; + + optional int32 cr_errno = 7; + optional criu_features features = 8; + optional string cr_errmsg = 9; + optional criu_version version = 10; + + optional int32 status = 11; +} + +/* Answer for criu_req_type.VERSION requests */ +message criu_version { + required int32 major_number = 1; + required int32 minor_number = 2; + optional string gitid = 3; + optional int32 sublevel = 4; + optional int32 extra = 5; + optional string name = 6; +} diff --git a/vendor/github.com/cilium/ebpf/.clang-format b/vendor/github.com/cilium/ebpf/.clang-format new file mode 100644 index 0000000..4eb94b1 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/.clang-format @@ -0,0 +1,17 @@ +--- +Language: Cpp +BasedOnStyle: LLVM +AlignAfterOpenBracket: DontAlign +AlignConsecutiveAssignments: true +AlignEscapedNewlines: DontAlign +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortFunctionsOnASingleLine: false +BreakBeforeBraces: Attach +IndentWidth: 4 +KeepEmptyLinesAtTheStartOfBlocks: false +TabWidth: 4 +UseTab: ForContinuationAndIndentation +ColumnLimit: 1000 +... 
diff --git a/vendor/github.com/cilium/ebpf/.gitignore b/vendor/github.com/cilium/ebpf/.gitignore new file mode 100644 index 0000000..b46162b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/.gitignore @@ -0,0 +1,14 @@ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib +*.o +!*_bpf*.o + +# Test binary, build with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out diff --git a/vendor/github.com/cilium/ebpf/.golangci.yaml b/vendor/github.com/cilium/ebpf/.golangci.yaml new file mode 100644 index 0000000..dc62dd6 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/.golangci.yaml @@ -0,0 +1,28 @@ +--- +issues: + exclude-rules: + # syscall param structs will have unused fields in Go code. + - path: syscall.*.go + linters: + - structcheck + +linters: + disable-all: true + enable: + - deadcode + - errcheck + - goimports + - gosimple + - govet + - ineffassign + - misspell + - staticcheck + - structcheck + - typecheck + - unused + - varcheck + + # Could be enabled later: + # - gocyclo + # - maligned + # - gosec diff --git a/vendor/github.com/cilium/ebpf/ARCHITECTURE.md b/vendor/github.com/cilium/ebpf/ARCHITECTURE.md new file mode 100644 index 0000000..6cbb31b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/ARCHITECTURE.md @@ -0,0 +1,80 @@ +Architecture of the library +=== + + ELF -> Specifications -> Objects -> Links + +ELF +--- + +BPF is usually produced by using Clang to compile a subset of C. Clang outputs +an ELF file which contains program byte code (aka BPF), but also metadata for +maps used by the program. The metadata follows the conventions set by libbpf +shipped with the kernel. Certain ELF sections have special meaning +and contain structures defined by libbpf. Newer versions of clang emit +additional metadata in BPF Type Format (aka BTF). + +The library aims to be compatible with libbpf so that moving from a C toolchain +to a Go one creates little friction. 
To that end, the [ELF reader](elf_reader.go) +is tested against the Linux selftests and avoids introducing custom behaviour +if possible. + +The output of the ELF reader is a `CollectionSpec` which encodes +all of the information contained in the ELF in a form that is easy to work with +in Go. + +### BTF + +The BPF Type Format describes more than just the types used by a BPF program. It +includes debug aids like which source line corresponds to which instructions and +what global variables are used. + +[BTF parsing](internal/btf/) lives in a separate internal package since exposing +it would mean an additional maintenance burden, and because the API still +has sharp corners. The most important concept is the `btf.Type` interface, which +also describes things that aren't really types like `.rodata` or `.bss` sections. +`btf.Type`s can form cyclical graphs, which can easily lead to infinite loops if +one is not careful. Hopefully a safe pattern to work with `btf.Type` emerges as +we write more code that deals with it. + +Specifications +--- + +`CollectionSpec`, `ProgramSpec` and `MapSpec` are blueprints for in-kernel +objects and contain everything necessary to execute the relevant `bpf(2)` +syscalls. Since the ELF reader outputs a `CollectionSpec` it's possible to +modify clang-compiled BPF code, for example to rewrite constants. At the same +time the [asm](asm/) package provides an assembler that can be used to generate +`ProgramSpec` on the fly. + +Creating a spec should never require any privileges or be restricted in any way, +for example by only allowing programs in native endianness. This ensures that +the library stays flexible. + +Objects +--- + +`Program` and `Map` are the result of loading specs into the kernel. Sometimes +loading a spec will fail because the kernel is too old, or a feature is not +enabled. There are multiple ways the library deals with that: + +* Fallback: older kernels don't allow naming programs and maps. 
The library + automatically detects support for names, and omits them during load if + necessary. This works since name is primarily a debug aid. + +* Sentinel error: sometimes it's possible to detect that a feature isn't available. + In that case the library will return an error wrapping `ErrNotSupported`. + This is also useful to skip tests that can't run on the current kernel. + +Once program and map objects are loaded they expose the kernel's low-level API, +e.g. `NextKey`. Often this API is awkward to use in Go, so there are safer +wrappers on top of the low-level API, like `MapIterator`. The low-level API is +useful when our higher-level API doesn't support a particular use case. + +Links +--- + +BPF can be attached to many different points in the kernel and newer BPF hooks +tend to use bpf_link to do so. Older hooks unfortunately use a combination of +syscalls, netlink messages, etc. Adding support for a new link type should not +pull in large dependencies like netlink, so XDP programs or tracepoints are +out of scope. diff --git a/vendor/github.com/cilium/ebpf/CODE_OF_CONDUCT.md b/vendor/github.com/cilium/ebpf/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..8e42838 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at nathanjsweet at gmail dot com or i at lmb dot io. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/vendor/github.com/cilium/ebpf/CONTRIBUTING.md b/vendor/github.com/cilium/ebpf/CONTRIBUTING.md new file mode 100644 index 0000000..0d29eae --- /dev/null +++ b/vendor/github.com/cilium/ebpf/CONTRIBUTING.md @@ -0,0 +1,40 @@ +# How to contribute + +Development is on [GitHub](https://github.com/cilium/ebpf) and contributions in +the form of pull requests and issues reporting bugs or suggesting new features +are welcome. Please take a look at [the architecture](ARCHITECTURE.md) to get +a better understanding for the high-level goals. + +New features must be accompanied by tests. Before starting work on any large +feature, please [join](https://ebpf.io/slack) the +[#ebpf-go](https://cilium.slack.com/messages/ebpf-go) channel on Slack to +discuss the design first. 
+ +When submitting pull requests, consider writing details about what problem you +are solving and why the proposed approach solves that problem in commit messages +and/or pull request description to help future library users and maintainers to +reason about the proposed changes. + +## Running the tests + +Many of the tests require privileges to set resource limits and load eBPF code. +The easiest way to obtain these is to run the tests with `sudo`. + +To test the current package with your local kernel you can simply run: +``` +go test -exec sudo ./... +``` + +To test the current package with a different kernel version you can use the [run-tests.sh](run-tests.sh) script. +It requires [virtme](https://github.com/amluto/virtme) and qemu to be installed. + +Examples: + +```bash +# Run all tests on a 5.4 kernel +./run-tests.sh 5.4 + +# Run a subset of tests: +./run-tests.sh 5.4 go test ./link +``` + diff --git a/vendor/github.com/cilium/ebpf/Makefile b/vendor/github.com/cilium/ebpf/Makefile new file mode 100644 index 0000000..0bc15c0 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/Makefile @@ -0,0 +1,73 @@ +# The development version of clang is distributed as the 'clang' binary, +# while stable/released versions have a version number attached. +# Pin the default clang to a stable version. +CLANG ?= clang-12 +CFLAGS := -target bpf -O2 -g -Wall -Werror $(CFLAGS) + +# Obtain an absolute path to the directory of the Makefile. +# Assume the Makefile is in the root of the repository. 
+REPODIR := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) +UIDGID := $(shell stat -c '%u:%g' ${REPODIR}) + +IMAGE := $(shell cat ${REPODIR}/testdata/docker/IMAGE) +VERSION := $(shell cat ${REPODIR}/testdata/docker/VERSION) + +# clang <8 doesn't tag relocs properly (STT_NOTYPE) +# clang 9 is the first version emitting BTF +TARGETS := \ + testdata/loader-clang-7 \ + testdata/loader-clang-9 \ + testdata/loader-$(CLANG) \ + testdata/btf_map_init \ + testdata/invalid_map \ + testdata/raw_tracepoint \ + testdata/invalid_map_static \ + testdata/invalid_btf_map_init \ + testdata/strings \ + testdata/freplace \ + testdata/iproute2_map_compat \ + internal/btf/testdata/relocs + +.PHONY: all clean docker-all docker-shell + +.DEFAULT_TARGET = docker-all + +# Build all ELF binaries using a Dockerized LLVM toolchain. +docker-all: + docker run --rm --user "${UIDGID}" \ + -v "${REPODIR}":/ebpf -w /ebpf --env MAKEFLAGS \ + --env CFLAGS="-fdebug-prefix-map=/ebpf=." \ + "${IMAGE}:${VERSION}" \ + make all + +# (debug) Drop the user into a shell inside the Docker container as root. 
+docker-shell: + docker run --rm -ti \ + -v "${REPODIR}":/ebpf -w /ebpf \ + "${IMAGE}:${VERSION}" + +clean: + -$(RM) testdata/*.elf + -$(RM) internal/btf/testdata/*.elf + +all: $(addsuffix -el.elf,$(TARGETS)) $(addsuffix -eb.elf,$(TARGETS)) + ln -srf testdata/loader-$(CLANG)-el.elf testdata/loader-el.elf + ln -srf testdata/loader-$(CLANG)-eb.elf testdata/loader-eb.elf + +testdata/loader-%-el.elf: testdata/loader.c + $* $(CFLAGS) -mlittle-endian -c $< -o $@ + +testdata/loader-%-eb.elf: testdata/loader.c + $* $(CFLAGS) -mbig-endian -c $< -o $@ + +%-el.elf: %.c + $(CLANG) $(CFLAGS) -mlittle-endian -c $< -o $@ + +%-eb.elf : %.c + $(CLANG) $(CFLAGS) -mbig-endian -c $< -o $@ + +# Usage: make VMLINUX=/path/to/vmlinux vmlinux-btf +.PHONY: vmlinux-btf +vmlinux-btf: internal/btf/testdata/vmlinux-btf.gz +internal/btf/testdata/vmlinux-btf.gz: $(VMLINUX) + objcopy --dump-section .BTF=/dev/stdout "$<" /dev/null | gzip > "$@" diff --git a/vendor/github.com/cilium/ebpf/README.md b/vendor/github.com/cilium/ebpf/README.md new file mode 100644 index 0000000..01e2fff --- /dev/null +++ b/vendor/github.com/cilium/ebpf/README.md @@ -0,0 +1,70 @@ +# eBPF + +[![PkgGoDev](https://pkg.go.dev/badge/github.com/cilium/ebpf)](https://pkg.go.dev/github.com/cilium/ebpf) + +![HoneyGopher](.github/images/cilium-ebpf.png) + +eBPF is a pure Go library that provides utilities for loading, compiling, and +debugging eBPF programs. It has minimal external dependencies and is intended to +be used in long running processes. + +The library is maintained by [Cloudflare](https://www.cloudflare.com) and +[Cilium](https://www.cilium.io). + +See [ebpf.io](https://ebpf.io) for other projects from the eBPF ecosystem. + +## Getting Started + +A small collection of Go and eBPF programs that serve as examples for building +your own tools can be found under [examples/](examples/). 
+ +Contributions are highly encouraged, as they highlight certain use cases of +eBPF and the library, and help shape the future of the project. + +## Getting Help + +Please +[join](https://ebpf.io/slack) the +[#ebpf-go](https://cilium.slack.com/messages/ebpf-go) channel on Slack if you +have questions regarding the library. + +## Packages + +This library includes the following packages: + +* [asm](https://pkg.go.dev/github.com/cilium/ebpf/asm) contains a basic + assembler, allowing you to write eBPF assembly instructions directly + within your Go code. (You don't need to use this if you prefer to write your eBPF program in C.) +* [cmd/bpf2go](https://pkg.go.dev/github.com/cilium/ebpf/cmd/bpf2go) allows + compiling and embedding eBPF programs written in C within Go code. As well as + compiling the C code, it auto-generates Go code for loading and manipulating + the eBPF program and map objects. +* [link](https://pkg.go.dev/github.com/cilium/ebpf/link) allows attaching eBPF + to various hooks +* [perf](https://pkg.go.dev/github.com/cilium/ebpf/perf) allows reading from a + `PERF_EVENT_ARRAY` +* [ringbuf](https://pkg.go.dev/github.com/cilium/ebpf/ringbuf) allows reading from a + `BPF_MAP_TYPE_RINGBUF` map + + +## Requirements + +* A version of Go that is [supported by + upstream](https://golang.org/doc/devel/release.html#policy) +* Linux >= 4.9. CI is run against LTS releases. + +## Regenerating Testdata + +Run `make` in the root of this repository to rebuild testdata in all +subpackages. This requires Docker, as it relies on a standardized build +environment to keep the build output stable. + +The toolchain image build files are kept in [testdata/docker/](testdata/docker/). + +## License + +MIT + +### eBPF Gopher + +The eBPF honeygopher is based on the Go gopher designed by Renee French. 
diff --git a/vendor/github.com/cilium/ebpf/abi.go b/vendor/github.com/cilium/ebpf/abi.go deleted file mode 100644 index 999b8cc..0000000 --- a/vendor/github.com/cilium/ebpf/abi.go +++ /dev/null @@ -1,183 +0,0 @@ -package ebpf - -import ( - "github.com/pkg/errors" -) - -// CollectionABI describes the interface of an eBPF collection. -type CollectionABI struct { - Maps map[string]*MapABI - Programs map[string]*ProgramABI -} - -// CheckSpec verifies that all maps and programs mentioned -// in the ABI are present in the spec. -func (abi *CollectionABI) CheckSpec(cs *CollectionSpec) error { - for name := range abi.Maps { - if cs.Maps[name] == nil { - return errors.Errorf("missing map %s", name) - } - } - - for name := range abi.Programs { - if cs.Programs[name] == nil { - return errors.Errorf("missing program %s", name) - } - } - - return nil -} - -// Check verifies that all items in a collection conform to this ABI. -func (abi *CollectionABI) Check(coll *Collection) error { - for name, mapABI := range abi.Maps { - m := coll.Maps[name] - if m == nil { - return errors.Errorf("missing map %s", name) - } - if err := mapABI.Check(m); err != nil { - return errors.Wrapf(err, "map %s", name) - } - } - - for name, progABI := range abi.Programs { - p := coll.Programs[name] - if p == nil { - return errors.Errorf("missing program %s", name) - } - if err := progABI.Check(p); err != nil { - return errors.Wrapf(err, "program %s", name) - } - } - - return nil -} - -// MapABI describes a Map. -// -// Use it to assert that a Map matches what your code expects. 
-type MapABI struct { - Type MapType - KeySize uint32 - ValueSize uint32 - MaxEntries uint32 - InnerMap *MapABI -} - -func newMapABIFromSpec(spec *MapSpec) *MapABI { - var inner *MapABI - if spec.InnerMap != nil { - inner = newMapABIFromSpec(spec.InnerMap) - } - - return &MapABI{ - spec.Type, - spec.KeySize, - spec.ValueSize, - spec.MaxEntries, - inner, - } -} - -func newMapABIFromFd(fd *bpfFD) (*MapABI, error) { - info, err := bpfGetMapInfoByFD(fd) - if err != nil { - return nil, err - } - - mapType := MapType(info.mapType) - if mapType == ArrayOfMaps || mapType == HashOfMaps { - return nil, errors.New("can't get map info for nested maps") - } - - return &MapABI{ - mapType, - info.keySize, - info.valueSize, - info.maxEntries, - nil, - }, nil -} - -// Check verifies that a Map conforms to the ABI. -// -// Members of ABI which have the zero value of their type are not checked. -func (abi *MapABI) Check(m *Map) error { - return abi.check(&m.abi) -} - -func (abi *MapABI) check(other *MapABI) error { - if abi.Type != UnspecifiedMap && other.Type != abi.Type { - return errors.Errorf("expected map type %s, have %s", abi.Type, other.Type) - } - if err := checkUint32("key size", abi.KeySize, other.KeySize); err != nil { - return err - } - if err := checkUint32("value size", abi.ValueSize, other.ValueSize); err != nil { - return err - } - if err := checkUint32("max entries", abi.MaxEntries, other.MaxEntries); err != nil { - return err - } - - if abi.InnerMap == nil { - if abi.Type == ArrayOfMaps || abi.Type == HashOfMaps { - return errors.New("missing inner map ABI") - } - - return nil - } - - if other.InnerMap == nil { - return errors.New("missing inner map") - } - - return errors.Wrap(abi.InnerMap.check(other.InnerMap), "inner map") -} - -// ProgramABI describes a Program. -// -// Use it to assert that a Program matches what your code expects. 
-type ProgramABI struct { - Type ProgramType -} - -func newProgramABIFromSpec(spec *ProgramSpec) *ProgramABI { - return &ProgramABI{ - spec.Type, - } -} - -func newProgramABIFromFd(fd *bpfFD) (*ProgramABI, error) { - info, err := bpfGetProgInfoByFD(fd) - if err != nil { - return nil, err - } - - return newProgramABIFromInfo(info), nil -} - -func newProgramABIFromInfo(info *bpfProgInfo) *ProgramABI { - return &ProgramABI{ - Type: ProgramType(info.progType), - } -} - -// Check verifies that a Program conforms to the ABI. -// -// Members which have the zero value of their type -// are not checked. -func (abi *ProgramABI) Check(prog *Program) error { - if abi.Type != UnspecifiedProgram && prog.abi.Type != abi.Type { - return errors.Errorf("expected program type %s, have %s", abi.Type, prog.abi.Type) - } - - return nil -} - -func checkUint32(name string, want, have uint32) error { - if want != 0 && have != want { - return errors.Errorf("expected %s to be %d, have %d", name, want, have) - } - return nil -} diff --git a/vendor/github.com/cilium/ebpf/asm/func.go b/vendor/github.com/cilium/ebpf/asm/func.go index 97f794c..bfa5d59 100644 --- a/vendor/github.com/cilium/ebpf/asm/func.go +++ b/vendor/github.com/cilium/ebpf/asm/func.go @@ -7,7 +7,7 @@ type BuiltinFunc int32 // eBPF built-in functions // -// You can renegerate this list using the following gawk script: +// You can regenerate this list using the following gawk script: // // /FN\(.+\),/ { // match($1, /\((.+)\)/, r) @@ -132,6 +132,64 @@ const ( FnSkStorageDelete FnSendSignal FnTcpGenSyncookie + FnSkbOutput + FnProbeReadUser + FnProbeReadKernel + FnProbeReadUserStr + FnProbeReadKernelStr + FnTcpSendAck + FnSendSignalThread + FnJiffies64 + FnReadBranchRecords + FnGetNsCurrentPidTgid + FnXdpOutput + FnGetNetnsCookie + FnGetCurrentAncestorCgroupId + FnSkAssign + FnKtimeGetBootNs + FnSeqPrintf + FnSeqWrite + FnSkCgroupId + FnSkAncestorCgroupId + FnRingbufOutput + FnRingbufReserve + FnRingbufSubmit + FnRingbufDiscard + 
FnRingbufQuery + FnCsumLevel + FnSkcToTcp6Sock + FnSkcToTcpSock + FnSkcToTcpTimewaitSock + FnSkcToTcpRequestSock + FnSkcToUdp6Sock + FnGetTaskStack + FnLoadHdrOpt + FnStoreHdrOpt + FnReserveHdrOpt + FnInodeStorageGet + FnInodeStorageDelete + FnDPath + FnCopyFromUser + FnSnprintfBtf + FnSeqPrintfBtf + FnSkbCgroupClassid + FnRedirectNeigh + FnPerCpuPtr + FnThisCpuPtr + FnRedirectPeer + FnTaskStorageGet + FnTaskStorageDelete + FnGetCurrentTaskBtf + FnBprmOptsSet + FnKtimeGetCoarseNs + FnImaInodeHash + FnSockFromFile + FnCheckMtu + FnForEachMapElem + FnSnprintf + FnSysBpf + FnBtfFindByNameKind + FnSysClose ) // Call emits a function call. diff --git a/vendor/github.com/cilium/ebpf/asm/func_string.go b/vendor/github.com/cilium/ebpf/asm/func_string.go index 8860b9f..5a0e333 100644 --- a/vendor/github.com/cilium/ebpf/asm/func_string.go +++ b/vendor/github.com/cilium/ebpf/asm/func_string.go @@ -119,11 +119,69 @@ func _() { _ = x[FnSkStorageDelete-108] _ = x[FnSendSignal-109] _ = x[FnTcpGenSyncookie-110] + _ = x[FnSkbOutput-111] + _ = x[FnProbeReadUser-112] + _ = x[FnProbeReadKernel-113] + _ = x[FnProbeReadUserStr-114] + _ = x[FnProbeReadKernelStr-115] + _ = x[FnTcpSendAck-116] + _ = x[FnSendSignalThread-117] + _ = x[FnJiffies64-118] + _ = x[FnReadBranchRecords-119] + _ = x[FnGetNsCurrentPidTgid-120] + _ = x[FnXdpOutput-121] + _ = x[FnGetNetnsCookie-122] + _ = x[FnGetCurrentAncestorCgroupId-123] + _ = x[FnSkAssign-124] + _ = x[FnKtimeGetBootNs-125] + _ = x[FnSeqPrintf-126] + _ = x[FnSeqWrite-127] + _ = x[FnSkCgroupId-128] + _ = x[FnSkAncestorCgroupId-129] + _ = x[FnRingbufOutput-130] + _ = x[FnRingbufReserve-131] + _ = x[FnRingbufSubmit-132] + _ = x[FnRingbufDiscard-133] + _ = x[FnRingbufQuery-134] + _ = x[FnCsumLevel-135] + _ = x[FnSkcToTcp6Sock-136] + _ = x[FnSkcToTcpSock-137] + _ = x[FnSkcToTcpTimewaitSock-138] + _ = x[FnSkcToTcpRequestSock-139] + _ = x[FnSkcToUdp6Sock-140] + _ = x[FnGetTaskStack-141] + _ = x[FnLoadHdrOpt-142] + _ = x[FnStoreHdrOpt-143] + _ = 
x[FnReserveHdrOpt-144] + _ = x[FnInodeStorageGet-145] + _ = x[FnInodeStorageDelete-146] + _ = x[FnDPath-147] + _ = x[FnCopyFromUser-148] + _ = x[FnSnprintfBtf-149] + _ = x[FnSeqPrintfBtf-150] + _ = x[FnSkbCgroupClassid-151] + _ = x[FnRedirectNeigh-152] + _ = x[FnPerCpuPtr-153] + _ = x[FnThisCpuPtr-154] + _ = x[FnRedirectPeer-155] + _ = x[FnTaskStorageGet-156] + _ = x[FnTaskStorageDelete-157] + _ = x[FnGetCurrentTaskBtf-158] + _ = x[FnBprmOptsSet-159] + _ = x[FnKtimeGetCoarseNs-160] + _ = x[FnImaInodeHash-161] + _ = x[FnSockFromFile-162] + _ = x[FnCheckMtu-163] + _ = x[FnForEachMapElem-164] + _ = x[FnSnprintf-165] + _ = x[FnSysBpf-166] + _ = x[FnBtfFindByNameKind-167] + _ = x[FnSysClose-168] } -const _BuiltinFunc_name = "FnUnspecFnMapLookupElemFnMapUpdateElemFnMapDeleteElemFnProbeReadFnKtimeGetNsFnTracePrintkFnGetPrandomU32FnGetSmpProcessorIdFnSkbStoreBytesFnL3CsumReplaceFnL4CsumReplaceFnTailCallFnCloneRedirectFnGetCurrentPidTgidFnGetCurrentUidGidFnGetCurrentCommFnGetCgroupClassidFnSkbVlanPushFnSkbVlanPopFnSkbGetTunnelKeyFnSkbSetTunnelKeyFnPerfEventReadFnRedirectFnGetRouteRealmFnPerfEventOutputFnSkbLoadBytesFnGetStackidFnCsumDiffFnSkbGetTunnelOptFnSkbSetTunnelOptFnSkbChangeProtoFnSkbChangeTypeFnSkbUnderCgroupFnGetHashRecalcFnGetCurrentTaskFnProbeWriteUserFnCurrentTaskUnderCgroupFnSkbChangeTailFnSkbPullDataFnCsumUpdateFnSetHashInvalidFnGetNumaNodeIdFnSkbChangeHeadFnXdpAdjustHeadFnProbeReadStrFnGetSocketCookieFnGetSocketUidFnSetHashFnSetsockoptFnSkbAdjustRoomFnRedirectMapFnSkRedirectMapFnSockMapUpdateFnXdpAdjustMetaFnPerfEventReadValueFnPerfProgReadValueFnGetsockoptFnOverrideReturnFnSockOpsCbFlagsSetFnMsgRedirectMapFnMsgApplyBytesFnMsgCorkBytesFnMsgPullDataFnBindFnXdpAdjustTailFnSkbGetXfrmStateFnGetStackFnSkbLoadBytesRelativeFnFibLookupFnSockHashUpdateFnMsgRedirectHashFnSkRedirectHashFnLwtPushEncapFnLwtSeg6StoreBytesFnLwtSeg6AdjustSrhFnLwtSeg6ActionFnRcRepeatFnRcKeydownFnSkbCgroupIdFnGetCurrentCgroupIdFnGetLocalStorageFnSkSelectReuseportFnSkbAncestorCgroupIdFnSkLookupT
cpFnSkLookupUdpFnSkReleaseFnMapPushElemFnMapPopElemFnMapPeekElemFnMsgPushDataFnMsgPopDataFnRcPointerRelFnSpinLockFnSpinUnlockFnSkFullsockFnTcpSockFnSkbEcnSetCeFnGetListenerSockFnSkcLookupTcpFnTcpCheckSyncookieFnSysctlGetNameFnSysctlGetCurrentValueFnSysctlGetNewValueFnSysctlSetNewValueFnStrtolFnStrtoulFnSkStorageGetFnSkStorageDeleteFnSendSignalFnTcpGenSyncookie" +const _BuiltinFunc_name = "FnUnspecFnMapLookupElemFnMapUpdateElemFnMapDeleteElemFnProbeReadFnKtimeGetNsFnTracePrintkFnGetPrandomU32FnGetSmpProcessorIdFnSkbStoreBytesFnL3CsumReplaceFnL4CsumReplaceFnTailCallFnCloneRedirectFnGetCurrentPidTgidFnGetCurrentUidGidFnGetCurrentCommFnGetCgroupClassidFnSkbVlanPushFnSkbVlanPopFnSkbGetTunnelKeyFnSkbSetTunnelKeyFnPerfEventReadFnRedirectFnGetRouteRealmFnPerfEventOutputFnSkbLoadBytesFnGetStackidFnCsumDiffFnSkbGetTunnelOptFnSkbSetTunnelOptFnSkbChangeProtoFnSkbChangeTypeFnSkbUnderCgroupFnGetHashRecalcFnGetCurrentTaskFnProbeWriteUserFnCurrentTaskUnderCgroupFnSkbChangeTailFnSkbPullDataFnCsumUpdateFnSetHashInvalidFnGetNumaNodeIdFnSkbChangeHeadFnXdpAdjustHeadFnProbeReadStrFnGetSocketCookieFnGetSocketUidFnSetHashFnSetsockoptFnSkbAdjustRoomFnRedirectMapFnSkRedirectMapFnSockMapUpdateFnXdpAdjustMetaFnPerfEventReadValueFnPerfProgReadValueFnGetsockoptFnOverrideReturnFnSockOpsCbFlagsSetFnMsgRedirectMapFnMsgApplyBytesFnMsgCorkBytesFnMsgPullDataFnBindFnXdpAdjustTailFnSkbGetXfrmStateFnGetStackFnSkbLoadBytesRelativeFnFibLookupFnSockHashUpdateFnMsgRedirectHashFnSkRedirectHashFnLwtPushEncapFnLwtSeg6StoreBytesFnLwtSeg6AdjustSrhFnLwtSeg6ActionFnRcRepeatFnRcKeydownFnSkbCgroupIdFnGetCurrentCgroupIdFnGetLocalStorageFnSkSelectReuseportFnSkbAncestorCgroupIdFnSkLookupTcpFnSkLookupUdpFnSkReleaseFnMapPushElemFnMapPopElemFnMapPeekElemFnMsgPushDataFnMsgPopDataFnRcPointerRelFnSpinLockFnSpinUnlockFnSkFullsockFnTcpSockFnSkbEcnSetCeFnGetListenerSockFnSkcLookupTcpFnTcpCheckSyncookieFnSysctlGetNameFnSysctlGetCurrentValueFnSysctlGetNewValueFnSysctlSetNewValueFnStrtolFnStrtoulFnSkStorageGetFnSkStorageDeleteFnSen
dSignalFnTcpGenSyncookieFnSkbOutputFnProbeReadUserFnProbeReadKernelFnProbeReadUserStrFnProbeReadKernelStrFnTcpSendAckFnSendSignalThreadFnJiffies64FnReadBranchRecordsFnGetNsCurrentPidTgidFnXdpOutputFnGetNetnsCookieFnGetCurrentAncestorCgroupIdFnSkAssignFnKtimeGetBootNsFnSeqPrintfFnSeqWriteFnSkCgroupIdFnSkAncestorCgroupIdFnRingbufOutputFnRingbufReserveFnRingbufSubmitFnRingbufDiscardFnRingbufQueryFnCsumLevelFnSkcToTcp6SockFnSkcToTcpSockFnSkcToTcpTimewaitSockFnSkcToTcpRequestSockFnSkcToUdp6SockFnGetTaskStackFnLoadHdrOptFnStoreHdrOptFnReserveHdrOptFnInodeStorageGetFnInodeStorageDeleteFnDPathFnCopyFromUserFnSnprintfBtfFnSeqPrintfBtfFnSkbCgroupClassidFnRedirectNeighFnPerCpuPtrFnThisCpuPtrFnRedirectPeerFnTaskStorageGetFnTaskStorageDeleteFnGetCurrentTaskBtfFnBprmOptsSetFnKtimeGetCoarseNsFnImaInodeHashFnSockFromFileFnCheckMtuFnForEachMapElemFnSnprintfFnSysBpfFnBtfFindByNameKindFnSysClose" -var _BuiltinFunc_index = [...]uint16{0, 8, 23, 38, 53, 64, 76, 89, 104, 123, 138, 153, 168, 178, 193, 212, 230, 246, 264, 277, 289, 306, 323, 338, 348, 363, 380, 394, 406, 416, 433, 450, 466, 481, 497, 512, 528, 544, 568, 583, 596, 608, 624, 639, 654, 669, 683, 700, 714, 723, 735, 750, 763, 778, 793, 808, 828, 847, 859, 875, 894, 910, 925, 939, 952, 958, 973, 990, 1000, 1022, 1033, 1049, 1066, 1082, 1096, 1115, 1133, 1148, 1158, 1169, 1182, 1202, 1219, 1238, 1259, 1272, 1285, 1296, 1309, 1321, 1334, 1347, 1359, 1373, 1383, 1395, 1407, 1416, 1429, 1446, 1460, 1479, 1494, 1517, 1536, 1555, 1563, 1572, 1586, 1603, 1615, 1632} +var _BuiltinFunc_index = [...]uint16{0, 8, 23, 38, 53, 64, 76, 89, 104, 123, 138, 153, 168, 178, 193, 212, 230, 246, 264, 277, 289, 306, 323, 338, 348, 363, 380, 394, 406, 416, 433, 450, 466, 481, 497, 512, 528, 544, 568, 583, 596, 608, 624, 639, 654, 669, 683, 700, 714, 723, 735, 750, 763, 778, 793, 808, 828, 847, 859, 875, 894, 910, 925, 939, 952, 958, 973, 990, 1000, 1022, 1033, 1049, 1066, 1082, 1096, 1115, 1133, 1148, 1158, 1169, 1182, 1202, 1219, 1238, 1259, 1272, 
1285, 1296, 1309, 1321, 1334, 1347, 1359, 1373, 1383, 1395, 1407, 1416, 1429, 1446, 1460, 1479, 1494, 1517, 1536, 1555, 1563, 1572, 1586, 1603, 1615, 1632, 1643, 1658, 1675, 1693, 1713, 1725, 1743, 1754, 1773, 1794, 1805, 1821, 1849, 1859, 1875, 1886, 1896, 1908, 1928, 1943, 1959, 1974, 1990, 2004, 2015, 2030, 2044, 2066, 2087, 2102, 2116, 2128, 2141, 2156, 2173, 2193, 2200, 2214, 2227, 2241, 2259, 2274, 2285, 2297, 2311, 2327, 2346, 2365, 2378, 2396, 2410, 2424, 2434, 2450, 2460, 2468, 2487, 2497} func (i BuiltinFunc) String() string { if i < 0 || i >= BuiltinFunc(len(_BuiltinFunc_index)-1) { diff --git a/vendor/github.com/cilium/ebpf/asm/instruction.go b/vendor/github.com/cilium/ebpf/asm/instruction.go index c8ed6cf..64d717d 100644 --- a/vendor/github.com/cilium/ebpf/asm/instruction.go +++ b/vendor/github.com/cilium/ebpf/asm/instruction.go @@ -1,18 +1,29 @@ package asm import ( + "crypto/sha1" "encoding/binary" + "encoding/hex" + "errors" "fmt" "io" "math" "strings" - "github.com/pkg/errors" + "github.com/cilium/ebpf/internal/unix" ) // InstructionSize is the size of a BPF instruction in bytes const InstructionSize = 8 +// RawInstructionOffset is an offset in units of raw BPF instructions. +type RawInstructionOffset uint64 + +// Bytes returns the offset of an instruction in bytes. +func (rio RawInstructionOffset) Bytes() uint64 { + return uint64(rio) * InstructionSize +} + // Instruction is a single eBPF instruction. 
type Instruction struct { OpCode OpCode @@ -39,12 +50,14 @@ func (ins *Instruction) Unmarshal(r io.Reader, bo binary.ByteOrder) (uint64, err } ins.OpCode = bi.OpCode - ins.Dst = bi.Registers.Dst() - ins.Src = bi.Registers.Src() ins.Offset = bi.Offset ins.Constant = int64(bi.Constant) + ins.Dst, ins.Src, err = bi.Registers.Unmarshal(bo) + if err != nil { + return 0, fmt.Errorf("can't unmarshal registers: %s", err) + } - if !bi.OpCode.isDWordLoad() { + if !bi.OpCode.IsDWordLoad() { return InstructionSize, nil } @@ -67,7 +80,7 @@ func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error) return 0, errors.New("invalid opcode") } - isDWordLoad := ins.OpCode.isDWordLoad() + isDWordLoad := ins.OpCode.IsDWordLoad() cons := int32(ins.Constant) if isDWordLoad { @@ -75,9 +88,14 @@ func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error) cons = int32(uint32(ins.Constant)) } + regs, err := newBPFRegisters(ins.Dst, ins.Src, bo) + if err != nil { + return 0, fmt.Errorf("can't marshal registers: %s", err) + } + bpfi := bpfInstruction{ ins.OpCode, - newBPFRegisters(ins.Dst, ins.Src), + regs, ins.Offset, cons, } @@ -103,22 +121,77 @@ func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error) // RewriteMapPtr changes an instruction to use a new map fd. // -// Returns an error if the fd is invalid, or the instruction -// is incorrect. +// Returns an error if the instruction doesn't load a map. func (ins *Instruction) RewriteMapPtr(fd int) error { - if !ins.OpCode.isDWordLoad() { - return errors.Errorf("%s is not a 64 bit load", ins.OpCode) + if !ins.OpCode.IsDWordLoad() { + return fmt.Errorf("%s is not a 64 bit load", ins.OpCode) } - if fd < 0 { - return errors.New("invalid fd") + if ins.Src != PseudoMapFD && ins.Src != PseudoMapValue { + return errors.New("not a load from a map") } - ins.Src = R1 - ins.Constant = int64(fd) + // Preserve the offset value for direct map loads. 
+ offset := uint64(ins.Constant) & (math.MaxUint32 << 32) + rawFd := uint64(uint32(fd)) + ins.Constant = int64(offset | rawFd) return nil } +// MapPtr returns the map fd for this instruction. +// +// The result is undefined if the instruction is not a load from a map, +// see IsLoadFromMap. +func (ins *Instruction) MapPtr() int { + return int(int32(uint64(ins.Constant) & math.MaxUint32)) +} + +// RewriteMapOffset changes the offset of a direct load from a map. +// +// Returns an error if the instruction is not a direct load. +func (ins *Instruction) RewriteMapOffset(offset uint32) error { + if !ins.OpCode.IsDWordLoad() { + return fmt.Errorf("%s is not a 64 bit load", ins.OpCode) + } + + if ins.Src != PseudoMapValue { + return errors.New("not a direct load from a map") + } + + fd := uint64(ins.Constant) & math.MaxUint32 + ins.Constant = int64(uint64(offset)<<32 | fd) + return nil +} + +func (ins *Instruction) mapOffset() uint32 { + return uint32(uint64(ins.Constant) >> 32) +} + +// IsLoadFromMap returns true if the instruction loads from a map. +// +// This covers both loading the map pointer and direct map value loads. +func (ins *Instruction) IsLoadFromMap() bool { + return ins.OpCode == LoadImmOp(DWord) && (ins.Src == PseudoMapFD || ins.Src == PseudoMapValue) +} + +// IsFunctionCall returns true if the instruction calls another BPF function. +// +// This is not the same thing as a BPF helper call. +func (ins *Instruction) IsFunctionCall() bool { + return ins.OpCode.JumpOp() == Call && ins.Src == PseudoCall +} + +// IsBuiltinCall returns true if the instruction is a built-in call, i.e. BPF helper call. +func (ins *Instruction) IsBuiltinCall() bool { + return ins.OpCode.JumpOp() == Call && ins.Src == R0 && ins.Dst == R0 +} + +// IsConstantLoad returns true if the instruction loads a constant of the +// given size. 
+func (ins *Instruction) IsConstantLoad(size Size) bool { + return ins.OpCode == LoadImmOp(size) && ins.Src == R0 && ins.Offset == 0 +} + // Format implements fmt.Formatter. func (ins Instruction) Format(f fmt.State, c rune) { if c != 'v' { @@ -139,6 +212,19 @@ func (ins Instruction) Format(f fmt.State, c rune) { return } + if ins.IsLoadFromMap() { + fd := ins.MapPtr() + switch ins.Src { + case PseudoMapFD: + fmt.Fprintf(f, "LoadMapPtr dst: %s fd: %d", ins.Dst, fd) + + case PseudoMapValue: + fmt.Fprintf(f, "LoadMapValue dst: %s, fd: %d off: %d", ins.Dst, fd, ins.mapOffset()) + } + + goto ref + } + fmt.Fprintf(f, "%v ", op) switch cls := op.Class(); cls { case LdClass, LdXClass, StClass, StXClass: @@ -166,7 +252,7 @@ func (ins Instruction) Format(f fmt.State, c rune) { case JumpClass: switch jop := op.JumpOp(); jop { case Call: - if ins.Src == R1 { + if ins.Src == PseudoCall { // bpf-to-bpf call fmt.Fprint(f, ins.Constant) } else { @@ -183,6 +269,7 @@ func (ins Instruction) Format(f fmt.State, c rune) { } } +ref: if ins.Reference != "" { fmt.Fprintf(f, " <%s>", ins.Reference) } @@ -235,7 +322,7 @@ func (insns Instructions) SymbolOffsets() (map[string]int, error) { } if _, ok := offsets[ins.Symbol]; ok { - return nil, errors.Errorf("duplicate symbol %s", ins.Symbol) + return nil, fmt.Errorf("duplicate symbol %s", ins.Symbol) } offsets[ins.Symbol] = i @@ -260,34 +347,12 @@ func (insns Instructions) ReferenceOffsets() map[string][]int { return offsets } -func (insns Instructions) marshalledOffsets() (map[string]int, error) { - symbols := make(map[string]int) - - marshalledPos := 0 - for _, ins := range insns { - currentPos := marshalledPos - marshalledPos += ins.OpCode.marshalledInstructions() - - if ins.Symbol == "" { - continue - } - - if _, ok := symbols[ins.Symbol]; ok { - return nil, errors.Errorf("duplicate symbol %s", ins.Symbol) - } - - symbols[ins.Symbol] = currentPos - } - - return symbols, nil -} - // Format implements fmt.Formatter. 
// // You can control indentation of symbols by // specifying a width. Setting a precision controls the indentation of // instructions. -// The default character is a tab, which can be overriden by specifying +// The default character is a tab, which can be overridden by specifying // the ' ' space flag. func (insns Instructions) Format(f fmt.State, c rune) { if c != 's' && c != 'v' { @@ -320,65 +385,85 @@ func (insns Instructions) Format(f fmt.State, c rune) { symIndent = strings.Repeat(" ", symPadding) } - // Figure out how many digits we need to represent the highest - // offset. - highestOffset := 0 - for _, ins := range insns { - highestOffset += ins.OpCode.marshalledInstructions() - } + // Guess how many digits we need at most, by assuming that all instructions + // are double wide. + highestOffset := len(insns) * 2 offsetWidth := int(math.Ceil(math.Log10(float64(highestOffset)))) - offset := 0 - for _, ins := range insns { - if ins.Symbol != "" { - fmt.Fprintf(f, "%s%s:\n", symIndent, ins.Symbol) + iter := insns.Iterate() + for iter.Next() { + if iter.Ins.Symbol != "" { + fmt.Fprintf(f, "%s%s:\n", symIndent, iter.Ins.Symbol) } - fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, offset, ins) - offset += ins.OpCode.marshalledInstructions() + fmt.Fprintf(f, "%s%*d: %v\n", indent, offsetWidth, iter.Offset, iter.Ins) } - - return } // Marshal encodes a BPF program into the kernel format. 
func (insns Instructions) Marshal(w io.Writer, bo binary.ByteOrder) error { - absoluteOffsets, err := insns.marshalledOffsets() - if err != nil { - return err - } - - num := 0 for i, ins := range insns { - switch { - case ins.OpCode.JumpOp() == Call && ins.Constant == -1: - // Rewrite bpf to bpf call - offset, ok := absoluteOffsets[ins.Reference] - if !ok { - return errors.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference) - } - - ins.Constant = int64(offset - num - 1) - - case ins.OpCode.Class() == JumpClass && ins.Offset == -1: - // Rewrite jump to label - offset, ok := absoluteOffsets[ins.Reference] - if !ok { - return errors.Errorf("instruction %d: reference to missing symbol %s", i, ins.Reference) - } - - ins.Offset = int16(offset - num - 1) - } - - n, err := ins.Marshal(w, bo) + _, err := ins.Marshal(w, bo) if err != nil { - return errors.Wrapf(err, "instruction %d", i) + return fmt.Errorf("instruction %d: %w", i, err) } - - num += int(n / InstructionSize) } return nil } +// Tag calculates the kernel tag for a series of instructions. +// +// It mirrors bpf_prog_calc_tag in the kernel and so can be compared +// to ProgramInfo.Tag to figure out whether a loaded program matches +// certain instructions. +func (insns Instructions) Tag(bo binary.ByteOrder) (string, error) { + h := sha1.New() + for i, ins := range insns { + if ins.IsLoadFromMap() { + ins.Constant = 0 + } + _, err := ins.Marshal(h, bo) + if err != nil { + return "", fmt.Errorf("instruction %d: %w", i, err) + } + } + return hex.EncodeToString(h.Sum(nil)[:unix.BPF_TAG_SIZE]), nil +} + +// Iterate allows iterating a BPF program while keeping track of +// various offsets. +// +// Modifying the instruction slice will lead to undefined behaviour. +func (insns Instructions) Iterate() *InstructionIterator { + return &InstructionIterator{insns: insns} +} + +// InstructionIterator iterates over a BPF program. 
+type InstructionIterator struct { + insns Instructions + // The instruction in question. + Ins *Instruction + // The index of the instruction in the original instruction slice. + Index int + // The offset of the instruction in raw BPF instructions. This accounts + // for double-wide instructions. + Offset RawInstructionOffset +} + +// Next returns true as long as there are any instructions remaining. +func (iter *InstructionIterator) Next() bool { + if len(iter.insns) == 0 { + return false + } + + if iter.Ins != nil { + iter.Index++ + iter.Offset += RawInstructionOffset(iter.Ins.OpCode.rawInstructions()) + } + iter.Ins = &iter.insns[0] + iter.insns = iter.insns[1:] + return true +} + type bpfInstruction struct { OpCode OpCode Registers bpfRegisters @@ -388,16 +473,26 @@ type bpfInstruction struct { type bpfRegisters uint8 -func newBPFRegisters(dst, src Register) bpfRegisters { - return bpfRegisters((src << 4) | (dst & 0xF)) +func newBPFRegisters(dst, src Register, bo binary.ByteOrder) (bpfRegisters, error) { + switch bo { + case binary.LittleEndian: + return bpfRegisters((src << 4) | (dst & 0xF)), nil + case binary.BigEndian: + return bpfRegisters((dst << 4) | (src & 0xF)), nil + default: + return 0, fmt.Errorf("unrecognized ByteOrder %T", bo) + } } -func (r bpfRegisters) Dst() Register { - return Register(r & 0xF) -} - -func (r bpfRegisters) Src() Register { - return Register(r >> 4) +func (r bpfRegisters) Unmarshal(bo binary.ByteOrder) (dst, src Register, err error) { + switch bo { + case binary.LittleEndian: + return Register(r & 0xF), Register(r >> 4), nil + case binary.BigEndian: + return Register(r >> 4), Register(r & 0xf), nil + default: + return 0, 0, fmt.Errorf("unrecognized ByteOrder %T", bo) + } } type unreferencedSymbolError struct { diff --git a/vendor/github.com/cilium/ebpf/asm/jump.go b/vendor/github.com/cilium/ebpf/asm/jump.go index 33c9b56..7757179 100644 --- a/vendor/github.com/cilium/ebpf/asm/jump.go +++ 
b/vendor/github.com/cilium/ebpf/asm/jump.go @@ -95,7 +95,7 @@ func (op JumpOp) Label(label string) Instruction { if op == Call { return Instruction{ OpCode: OpCode(JumpClass).SetJumpOp(Call), - Src: R1, + Src: PseudoCall, Constant: -1, Reference: label, } diff --git a/vendor/github.com/cilium/ebpf/asm/load_store.go b/vendor/github.com/cilium/ebpf/asm/load_store.go index ab0e92f..85ed286 100644 --- a/vendor/github.com/cilium/ebpf/asm/load_store.go +++ b/vendor/github.com/cilium/ebpf/asm/load_store.go @@ -110,8 +110,23 @@ func LoadMapPtr(dst Register, fd int) Instruction { return Instruction{ OpCode: LoadImmOp(DWord), Dst: dst, - Src: R1, - Constant: int64(fd), + Src: PseudoMapFD, + Constant: int64(uint32(fd)), + } +} + +// LoadMapValue stores a pointer to the value at a certain offset of a map. +func LoadMapValue(dst Register, fd int, offset uint32) Instruction { + if fd < 0 { + return Instruction{OpCode: InvalidOpCode} + } + + fdAndOffset := (uint64(offset) << 32) | uint64(uint32(fd)) + return Instruction{ + OpCode: LoadImmOp(DWord), + Dst: dst, + Src: PseudoMapValue, + Constant: int64(fdAndOffset), } } diff --git a/vendor/github.com/cilium/ebpf/asm/opcode.go b/vendor/github.com/cilium/ebpf/asm/opcode.go index d796de3..6edc3cf 100644 --- a/vendor/github.com/cilium/ebpf/asm/opcode.go +++ b/vendor/github.com/cilium/ebpf/asm/opcode.go @@ -66,16 +66,16 @@ type OpCode uint8 // InvalidOpCode is returned by setters on OpCode const InvalidOpCode OpCode = 0xff -// marshalledInstructions returns the number of BPF instructions required +// rawInstructions returns the number of BPF instructions required // to encode this opcode. 
-func (op OpCode) marshalledInstructions() int { - if op == LoadImmOp(DWord) { +func (op OpCode) rawInstructions() int { + if op.IsDWordLoad() { return 2 } return 1 } -func (op OpCode) isDWordLoad() bool { +func (op OpCode) IsDWordLoad() bool { return op == LoadImmOp(DWord) } @@ -225,7 +225,7 @@ func (op OpCode) String() string { } default: - fmt.Fprintf(&f, "%#x", op) + fmt.Fprintf(&f, "OpCode(%#x)", uint8(op)) } return f.String() diff --git a/vendor/github.com/cilium/ebpf/asm/register.go b/vendor/github.com/cilium/ebpf/asm/register.go index 4f284fb..76cb44b 100644 --- a/vendor/github.com/cilium/ebpf/asm/register.go +++ b/vendor/github.com/cilium/ebpf/asm/register.go @@ -33,6 +33,13 @@ const ( RFP = R10 ) +// Pseudo registers used by 64bit loads and jumps +const ( + PseudoMapFD = R1 // BPF_PSEUDO_MAP_FD + PseudoMapValue = R2 // BPF_PSEUDO_MAP_VALUE + PseudoCall = R1 // BPF_PSEUDO_CALL +) + func (r Register) String() string { v := uint8(r) if v == 10 { diff --git a/vendor/github.com/cilium/ebpf/attachtype_string.go b/vendor/github.com/cilium/ebpf/attachtype_string.go new file mode 100644 index 0000000..de355ed --- /dev/null +++ b/vendor/github.com/cilium/ebpf/attachtype_string.go @@ -0,0 +1,65 @@ +// Code generated by "stringer -type AttachType -trimprefix Attach"; DO NOT EDIT. + +package ebpf + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[AttachNone-0] + _ = x[AttachCGroupInetIngress-0] + _ = x[AttachCGroupInetEgress-1] + _ = x[AttachCGroupInetSockCreate-2] + _ = x[AttachCGroupSockOps-3] + _ = x[AttachSkSKBStreamParser-4] + _ = x[AttachSkSKBStreamVerdict-5] + _ = x[AttachCGroupDevice-6] + _ = x[AttachSkMsgVerdict-7] + _ = x[AttachCGroupInet4Bind-8] + _ = x[AttachCGroupInet6Bind-9] + _ = x[AttachCGroupInet4Connect-10] + _ = x[AttachCGroupInet6Connect-11] + _ = x[AttachCGroupInet4PostBind-12] + _ = x[AttachCGroupInet6PostBind-13] + _ = x[AttachCGroupUDP4Sendmsg-14] + _ = x[AttachCGroupUDP6Sendmsg-15] + _ = x[AttachLircMode2-16] + _ = x[AttachFlowDissector-17] + _ = x[AttachCGroupSysctl-18] + _ = x[AttachCGroupUDP4Recvmsg-19] + _ = x[AttachCGroupUDP6Recvmsg-20] + _ = x[AttachCGroupGetsockopt-21] + _ = x[AttachCGroupSetsockopt-22] + _ = x[AttachTraceRawTp-23] + _ = x[AttachTraceFEntry-24] + _ = x[AttachTraceFExit-25] + _ = x[AttachModifyReturn-26] + _ = x[AttachLSMMac-27] + _ = x[AttachTraceIter-28] + _ = x[AttachCgroupInet4GetPeername-29] + _ = x[AttachCgroupInet6GetPeername-30] + _ = x[AttachCgroupInet4GetSockname-31] + _ = x[AttachCgroupInet6GetSockname-32] + _ = x[AttachXDPDevMap-33] + _ = x[AttachCgroupInetSockRelease-34] + _ = x[AttachXDPCPUMap-35] + _ = x[AttachSkLookup-36] + _ = x[AttachXDP-37] + _ = x[AttachSkSKBVerdict-38] + _ = x[AttachSkReuseportSelect-39] + _ = x[AttachSkReuseportSelectOrMigrate-40] + _ = x[AttachPerfEvent-41] +} + +const _AttachType_name = 
"NoneCGroupInetEgressCGroupInetSockCreateCGroupSockOpsSkSKBStreamParserSkSKBStreamVerdictCGroupDeviceSkMsgVerdictCGroupInet4BindCGroupInet6BindCGroupInet4ConnectCGroupInet6ConnectCGroupInet4PostBindCGroupInet6PostBindCGroupUDP4SendmsgCGroupUDP6SendmsgLircMode2FlowDissectorCGroupSysctlCGroupUDP4RecvmsgCGroupUDP6RecvmsgCGroupGetsockoptCGroupSetsockoptTraceRawTpTraceFEntryTraceFExitModifyReturnLSMMacTraceIterCgroupInet4GetPeernameCgroupInet6GetPeernameCgroupInet4GetSocknameCgroupInet6GetSocknameXDPDevMapCgroupInetSockReleaseXDPCPUMapSkLookupXDPSkSKBVerdictSkReuseportSelectSkReuseportSelectOrMigratePerfEvent" + +var _AttachType_index = [...]uint16{0, 4, 20, 40, 53, 70, 88, 100, 112, 127, 142, 160, 178, 197, 216, 233, 250, 259, 272, 284, 301, 318, 334, 350, 360, 371, 381, 393, 399, 408, 430, 452, 474, 496, 505, 526, 535, 543, 546, 558, 575, 601, 610} + +func (i AttachType) String() string { + if i >= AttachType(len(_AttachType_index)-1) { + return "AttachType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _AttachType_name[_AttachType_index[i]:_AttachType_index[i+1]] +} diff --git a/vendor/github.com/cilium/ebpf/collection.go b/vendor/github.com/cilium/ebpf/collection.go index 5ad1a5e..2ededc8 100644 --- a/vendor/github.com/cilium/ebpf/collection.go +++ b/vendor/github.com/cilium/ebpf/collection.go @@ -1,12 +1,24 @@ package ebpf import ( + "encoding/binary" + "errors" + "fmt" + "io" + "math" + "reflect" + "strings" + "github.com/cilium/ebpf/asm" - "github.com/pkg/errors" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/btf" ) // CollectionOptions control loading a collection into the kernel. +// +// Maps and Programs are passed to NewMapWithOptions and NewProgramsWithOptions. 
type CollectionOptions struct { + Maps MapOptions Programs ProgramOptions } @@ -14,6 +26,10 @@ type CollectionOptions struct { type CollectionSpec struct { Maps map[string]*MapSpec Programs map[string]*ProgramSpec + + // ByteOrder specifies whether the ELF was compiled for + // big-endian or little-endian architectures. + ByteOrder binary.ByteOrder } // Copy returns a recursive copy of the spec. @@ -23,8 +39,9 @@ func (cs *CollectionSpec) Copy() *CollectionSpec { } cpy := CollectionSpec{ - Maps: make(map[string]*MapSpec, len(cs.Maps)), - Programs: make(map[string]*ProgramSpec, len(cs.Programs)), + Maps: make(map[string]*MapSpec, len(cs.Maps)), + Programs: make(map[string]*ProgramSpec, len(cs.Programs)), + ByteOrder: cs.ByteOrder, } for name, spec := range cs.Maps { @@ -38,6 +55,208 @@ func (cs *CollectionSpec) Copy() *CollectionSpec { return &cpy } +// RewriteMaps replaces all references to specific maps. +// +// Use this function to use pre-existing maps instead of creating new ones +// when calling NewCollection. Any named maps are removed from CollectionSpec.Maps. +// +// Returns an error if a named map isn't used in at least one program. +func (cs *CollectionSpec) RewriteMaps(maps map[string]*Map) error { + for symbol, m := range maps { + // have we seen a program that uses this symbol / map + seen := false + fd := m.FD() + for progName, progSpec := range cs.Programs { + err := progSpec.Instructions.RewriteMapPtr(symbol, fd) + + switch { + case err == nil: + seen = true + + case asm.IsUnreferencedSymbol(err): + // Not all programs need to use the map + + default: + return fmt.Errorf("program %s: %w", progName, err) + } + } + + if !seen { + return fmt.Errorf("map %s not referenced by any programs", symbol) + } + + // Prevent NewCollection from creating rewritten maps + delete(cs.Maps, symbol) + } + + return nil +} + +// RewriteConstants replaces the value of multiple constants. 
+// +// The constant must be defined like so in the C program: +// +// volatile const type foobar; +// volatile const type foobar = default; +// +// Replacement values must be of the same length as the C sizeof(type). +// If necessary, they are marshalled according to the same rules as +// map values. +// +// From Linux 5.5 the verifier will use constants to eliminate dead code. +// +// Returns an error if a constant doesn't exist. +func (cs *CollectionSpec) RewriteConstants(consts map[string]interface{}) error { + rodata := cs.Maps[".rodata"] + if rodata == nil { + return errors.New("missing .rodata section") + } + + if rodata.BTF == nil { + return errors.New(".rodata section has no BTF") + } + + if n := len(rodata.Contents); n != 1 { + return fmt.Errorf("expected one key in .rodata, found %d", n) + } + + kv := rodata.Contents[0] + value, ok := kv.Value.([]byte) + if !ok { + return fmt.Errorf("first value in .rodata is %T not []byte", kv.Value) + } + + buf := make([]byte, len(value)) + copy(buf, value) + + err := patchValue(buf, rodata.BTF.Value, consts) + if err != nil { + return err + } + + rodata.Contents[0] = MapKV{kv.Key, buf} + return nil +} + +// Assign the contents of a CollectionSpec to a struct. +// +// This function is a shortcut to manually checking the presence +// of maps and programs in a CollectionSpec. Consider using bpf2go +// if this sounds useful. +// +// 'to' must be a pointer to a struct. A field of the +// struct is updated with values from Programs or Maps if it +// has an `ebpf` tag and its type is *ProgramSpec or *MapSpec. +// The tag's value specifies the name of the program or map as +// found in the CollectionSpec. +// +// struct { +// Foo *ebpf.ProgramSpec `ebpf:"xdp_foo"` +// Bar *ebpf.MapSpec `ebpf:"bar_map"` +// Ignored int +// } +// +// Returns an error if any of the eBPF objects can't be found, or +// if the same MapSpec or ProgramSpec is assigned multiple times. 
+func (cs *CollectionSpec) Assign(to interface{}) error { + // Assign() only supports assigning ProgramSpecs and MapSpecs, + // so doesn't load any resources into the kernel. + getValue := func(typ reflect.Type, name string) (interface{}, error) { + switch typ { + + case reflect.TypeOf((*ProgramSpec)(nil)): + if p := cs.Programs[name]; p != nil { + return p, nil + } + return nil, fmt.Errorf("missing program %q", name) + + case reflect.TypeOf((*MapSpec)(nil)): + if m := cs.Maps[name]; m != nil { + return m, nil + } + return nil, fmt.Errorf("missing map %q", name) + + default: + return nil, fmt.Errorf("unsupported type %s", typ) + } + } + + return assignValues(to, getValue) +} + +// LoadAndAssign loads Maps and Programs into the kernel and assigns them +// to a struct. +// +// This function is a shortcut to manually checking the presence +// of maps and programs in a CollectionSpec. Consider using bpf2go +// if this sounds useful. +// +// 'to' must be a pointer to a struct. A field of the struct is updated with +// a Program or Map if it has an `ebpf` tag and its type is *Program or *Map. +// The tag's value specifies the name of the program or map as found in the +// CollectionSpec. Before updating the struct, the requested objects and their +// dependent resources are loaded into the kernel and populated with values if +// specified. +// +// struct { +// Foo *ebpf.Program `ebpf:"xdp_foo"` +// Bar *ebpf.Map `ebpf:"bar_map"` +// Ignored int +// } +// +// opts may be nil. +// +// Returns an error if any of the fields can't be found, or +// if the same Map or Program is assigned multiple times. +func (cs *CollectionSpec) LoadAndAssign(to interface{}, opts *CollectionOptions) error { + loader := newCollectionLoader(cs, opts) + defer loader.cleanup() + + // Support assigning Programs and Maps, lazy-loading the required objects. 
+ assignedMaps := make(map[string]bool) + getValue := func(typ reflect.Type, name string) (interface{}, error) { + switch typ { + + case reflect.TypeOf((*Program)(nil)): + return loader.loadProgram(name) + + case reflect.TypeOf((*Map)(nil)): + assignedMaps[name] = true + return loader.loadMap(name) + + default: + return nil, fmt.Errorf("unsupported type %s", typ) + } + } + + // Load the Maps and Programs requested by the annotated struct. + if err := assignValues(to, getValue); err != nil { + return err + } + + // Populate the requested maps. Has a chance of lazy-loading other dependent maps. + if err := loader.populateMaps(); err != nil { + return err + } + + // Evaluate the loader's objects after all (lazy)loading has taken place. + for n, m := range loader.maps { + switch m.typ { + case ProgramArray: + // Require all lazy-loaded ProgramArrays to be assigned to the given object. + // Without any references, they will be closed on the first GC and all tail + // calls into them will miss. + if !assignedMaps[n] { + return fmt.Errorf("ProgramArray %s must be assigned to prevent missed tail calls", n) + } + } + } + + loader.finalize() + + return nil +} + // Collection is a collection of Programs and Maps associated // with their symbols type Collection struct { @@ -46,64 +265,238 @@ type Collection struct { } // NewCollection creates a Collection from a specification. -// -// Only maps referenced by at least one of the programs are initialized. func NewCollection(spec *CollectionSpec) (*Collection, error) { return NewCollectionWithOptions(spec, CollectionOptions{}) } // NewCollectionWithOptions creates a Collection from a specification. -// -// Only maps referenced by at least one of the programs are initialized. 
func NewCollectionWithOptions(spec *CollectionSpec, opts CollectionOptions) (*Collection, error) { - maps := make(map[string]*Map) - for mapName, mapSpec := range spec.Maps { - m, err := NewMap(mapSpec) - if err != nil { - return nil, errors.Wrapf(err, "map %s", mapName) + loader := newCollectionLoader(spec, &opts) + defer loader.cleanup() + + // Create maps first, as their fds need to be linked into programs. + for mapName := range spec.Maps { + if _, err := loader.loadMap(mapName); err != nil { + return nil, err } - maps[mapName] = m } - progs := make(map[string]*Program) - for progName, origProgSpec := range spec.Programs { - progSpec := origProgSpec.Copy() - - // Rewrite any reference to a valid map. - for i := range progSpec.Instructions { - var ( - ins = &progSpec.Instructions[i] - m = maps[ins.Reference] - ) - - if ins.Reference == "" || m == nil { - continue - } - - if ins.Src == asm.R1 { - // Don't overwrite maps already rewritten, users can - // rewrite programs in the spec themselves - continue - } - - if err := ins.RewriteMapPtr(m.FD()); err != nil { - return nil, errors.Wrapf(err, "progam %s: map %s", progName, ins.Reference) - } + for progName := range spec.Programs { + if _, err := loader.loadProgram(progName); err != nil { + return nil, err } - - prog, err := NewProgramWithOptions(progSpec, opts.Programs) - if err != nil { - return nil, errors.Wrapf(err, "program %s", progName) - } - progs[progName] = prog } + // Maps can contain Program and Map stubs, so populate them after + // all Maps and Programs have been successfully loaded. 
+ if err := loader.populateMaps(); err != nil { + return nil, err + } + + maps, progs := loader.maps, loader.programs + + loader.finalize() + return &Collection{ progs, maps, }, nil } +type handleCache struct { + btfHandles map[*btf.Spec]*btf.Handle + btfSpecs map[io.ReaderAt]*btf.Spec +} + +func newHandleCache() *handleCache { + return &handleCache{ + btfHandles: make(map[*btf.Spec]*btf.Handle), + btfSpecs: make(map[io.ReaderAt]*btf.Spec), + } +} + +func (hc handleCache) btfHandle(spec *btf.Spec) (*btf.Handle, error) { + if hc.btfHandles[spec] != nil { + return hc.btfHandles[spec], nil + } + + handle, err := btf.NewHandle(spec) + if err != nil { + return nil, err + } + + hc.btfHandles[spec] = handle + return handle, nil +} + +func (hc handleCache) btfSpec(rd io.ReaderAt) (*btf.Spec, error) { + if hc.btfSpecs[rd] != nil { + return hc.btfSpecs[rd], nil + } + + spec, err := btf.LoadSpecFromReader(rd) + if err != nil { + return nil, err + } + + hc.btfSpecs[rd] = spec + return spec, nil +} + +func (hc handleCache) close() { + for _, handle := range hc.btfHandles { + handle.Close() + } +} + +type collectionLoader struct { + coll *CollectionSpec + opts *CollectionOptions + maps map[string]*Map + programs map[string]*Program + handles *handleCache +} + +func newCollectionLoader(coll *CollectionSpec, opts *CollectionOptions) *collectionLoader { + if opts == nil { + opts = &CollectionOptions{} + } + + return &collectionLoader{ + coll, + opts, + make(map[string]*Map), + make(map[string]*Program), + newHandleCache(), + } +} + +// finalize should be called when all the collectionLoader's resources +// have been successfully loaded into the kernel and populated with values. +func (cl *collectionLoader) finalize() { + cl.maps, cl.programs = nil, nil +} + +// cleanup cleans up all resources left over in the collectionLoader. +// Call finalize() when Map and Program creation/population is successful +// to prevent them from getting closed. 
+func (cl *collectionLoader) cleanup() { + cl.handles.close() + for _, m := range cl.maps { + m.Close() + } + for _, p := range cl.programs { + p.Close() + } +} + +func (cl *collectionLoader) loadMap(mapName string) (*Map, error) { + if m := cl.maps[mapName]; m != nil { + return m, nil + } + + mapSpec := cl.coll.Maps[mapName] + if mapSpec == nil { + return nil, fmt.Errorf("missing map %s", mapName) + } + + m, err := newMapWithOptions(mapSpec, cl.opts.Maps, cl.handles) + if err != nil { + return nil, fmt.Errorf("map %s: %w", mapName, err) + } + + cl.maps[mapName] = m + return m, nil +} + +func (cl *collectionLoader) loadProgram(progName string) (*Program, error) { + if prog := cl.programs[progName]; prog != nil { + return prog, nil + } + + progSpec := cl.coll.Programs[progName] + if progSpec == nil { + return nil, fmt.Errorf("unknown program %s", progName) + } + + progSpec = progSpec.Copy() + + // Rewrite any reference to a valid map. + for i := range progSpec.Instructions { + ins := &progSpec.Instructions[i] + + if !ins.IsLoadFromMap() || ins.Reference == "" { + continue + } + + if uint32(ins.Constant) != math.MaxUint32 { + // Don't overwrite maps already rewritten, users can + // rewrite programs in the spec themselves + continue + } + + m, err := cl.loadMap(ins.Reference) + if err != nil { + return nil, fmt.Errorf("program %s: %w", progName, err) + } + + fd := m.FD() + if fd < 0 { + return nil, fmt.Errorf("map %s: %w", ins.Reference, internal.ErrClosedFd) + } + if err := ins.RewriteMapPtr(m.FD()); err != nil { + return nil, fmt.Errorf("program %s: map %s: %w", progName, ins.Reference, err) + } + } + + prog, err := newProgramWithOptions(progSpec, cl.opts.Programs, cl.handles) + if err != nil { + return nil, fmt.Errorf("program %s: %w", progName, err) + } + + cl.programs[progName] = prog + return prog, nil +} + +func (cl *collectionLoader) populateMaps() error { + for mapName, m := range cl.maps { + mapSpec, ok := cl.coll.Maps[mapName] + if !ok { + return 
fmt.Errorf("missing map spec %s", mapName) + } + + mapSpec = mapSpec.Copy() + + // Replace any object stubs with loaded objects. + for i, kv := range mapSpec.Contents { + switch v := kv.Value.(type) { + case programStub: + // loadProgram is idempotent and could return an existing Program. + prog, err := cl.loadProgram(string(v)) + if err != nil { + return fmt.Errorf("loading program %s, for map %s: %w", v, mapName, err) + } + mapSpec.Contents[i] = MapKV{kv.Key, prog} + + case mapStub: + // loadMap is idempotent and could return an existing Map. + innerMap, err := cl.loadMap(string(v)) + if err != nil { + return fmt.Errorf("loading inner map %s, for map %s: %w", v, mapName, err) + } + mapSpec.Contents[i] = MapKV{kv.Key, innerMap} + } + } + + // Populate and freeze the map if specified. + if err := m.finalize(mapSpec); err != nil { + return fmt.Errorf("populating map %s: %w", mapName, err) + } + } + + return nil +} + // LoadCollection parses an object file and converts it to a collection. func LoadCollection(file string) (*Collection, error) { spec, err := LoadCollectionSpec(file) @@ -146,3 +539,130 @@ func (coll *Collection) DetachProgram(name string) *Program { delete(coll.Programs, name) return p } + +// structField represents a struct field containing the ebpf struct tag. +type structField struct { + reflect.StructField + value reflect.Value +} + +// ebpfFields extracts field names tagged with 'ebpf' from a struct type. +// Keep track of visited types to avoid infinite recursion. 
+func ebpfFields(structVal reflect.Value, visited map[reflect.Type]bool) ([]structField, error) { + if visited == nil { + visited = make(map[reflect.Type]bool) + } + + structType := structVal.Type() + if structType.Kind() != reflect.Struct { + return nil, fmt.Errorf("%s is not a struct", structType) + } + + if visited[structType] { + return nil, fmt.Errorf("recursion on type %s", structType) + } + + fields := make([]structField, 0, structType.NumField()) + for i := 0; i < structType.NumField(); i++ { + field := structField{structType.Field(i), structVal.Field(i)} + + // If the field is tagged, gather it and move on. + name := field.Tag.Get("ebpf") + if name != "" { + fields = append(fields, field) + continue + } + + // If the field does not have an ebpf tag, but is a struct or a pointer + // to a struct, attempt to gather its fields as well. + var v reflect.Value + switch field.Type.Kind() { + case reflect.Ptr: + if field.Type.Elem().Kind() != reflect.Struct { + continue + } + + if field.value.IsNil() { + return nil, fmt.Errorf("nil pointer to %s", structType) + } + + // Obtain the destination type of the pointer. + v = field.value.Elem() + + case reflect.Struct: + // Reference the value's type directly. + v = field.value + + default: + continue + } + + inner, err := ebpfFields(v, visited) + if err != nil { + return nil, fmt.Errorf("field %s: %w", field.Name, err) + } + + fields = append(fields, inner...) + } + + return fields, nil +} + +// assignValues attempts to populate all fields of 'to' tagged with 'ebpf'. +// +// getValue is called for every tagged field of 'to' and must return the value +// to be assigned to the field with the given typ and name. 
+func assignValues(to interface{}, + getValue func(typ reflect.Type, name string) (interface{}, error)) error { + + toValue := reflect.ValueOf(to) + if toValue.Type().Kind() != reflect.Ptr { + return fmt.Errorf("%T is not a pointer to struct", to) + } + + if toValue.IsNil() { + return fmt.Errorf("nil pointer to %T", to) + } + + fields, err := ebpfFields(toValue.Elem(), nil) + if err != nil { + return err + } + + type elem struct { + // Either *Map or *Program + typ reflect.Type + name string + } + + assigned := make(map[elem]string) + for _, field := range fields { + // Get string value the field is tagged with. + tag := field.Tag.Get("ebpf") + if strings.Contains(tag, ",") { + return fmt.Errorf("field %s: ebpf tag contains a comma", field.Name) + } + + // Check if the eBPF object with the requested + // type and tag was already assigned elsewhere. + e := elem{field.Type, tag} + if af := assigned[e]; af != "" { + return fmt.Errorf("field %s: object %q was already assigned to %s", field.Name, tag, af) + } + + // Get the eBPF object referred to by the tag. + value, err := getValue(field.Type, tag) + if err != nil { + return fmt.Errorf("field %s: %w", field.Name, err) + } + + if !field.value.CanSet() { + return fmt.Errorf("field %s: can't set value", field.Name) + } + field.value.Set(reflect.ValueOf(value)) + + assigned[e] = field.Name + } + + return nil +} diff --git a/vendor/github.com/cilium/ebpf/doc.go b/vendor/github.com/cilium/ebpf/doc.go index d96e6b1..f7f34da 100644 --- a/vendor/github.com/cilium/ebpf/doc.go +++ b/vendor/github.com/cilium/ebpf/doc.go @@ -12,6 +12,5 @@ // eBPF code should be compiled ahead of time using clang, and shipped with // your application as any other resource. // -// This package doesn't include code required to attach eBPF to Linux -// subsystems, since this varies per subsystem. +// Use the link subpackage to attach a loaded program to a hook in the kernel. 
package ebpf diff --git a/vendor/github.com/cilium/ebpf/elf_reader.go b/vendor/github.com/cilium/ebpf/elf_reader.go index 3bdc084..42010f4 100644 --- a/vendor/github.com/cilium/ebpf/elf_reader.go +++ b/vendor/github.com/cilium/ebpf/elf_reader.go @@ -1,23 +1,31 @@ package ebpf import ( + "bufio" "bytes" "debug/elf" "encoding/binary" + "errors" "fmt" "io" + "math" "os" "strings" "github.com/cilium/ebpf/asm" - - "github.com/pkg/errors" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/btf" + "github.com/cilium/ebpf/internal/unix" ) +// elfCode is a convenience to reduce the amount of arguments that have to +// be passed around explicitly. You should treat its contents as immutable. type elfCode struct { - *elf.File - symbols []elf.Symbol - symbolsPerSection map[elf.SectionIndex]map[uint64]string + *internal.SafeELFFile + sections map[elf.SectionIndex]*elfSection + license string + version uint32 + btf *btf.Spec } // LoadCollectionSpec parses an ELF file into a CollectionSpec. @@ -29,91 +37,168 @@ func LoadCollectionSpec(file string) (*CollectionSpec, error) { defer f.Close() spec, err := LoadCollectionSpecFromReader(f) - return spec, errors.Wrapf(err, "file %s", file) + if err != nil { + return nil, fmt.Errorf("file %s: %w", file, err) + } + return spec, nil } // LoadCollectionSpecFromReader parses an ELF file into a CollectionSpec. 
-func LoadCollectionSpecFromReader(code io.ReaderAt) (*CollectionSpec, error) { - f, err := elf.NewFile(code) +func LoadCollectionSpecFromReader(rd io.ReaderAt) (*CollectionSpec, error) { + f, err := internal.NewSafeELFFile(rd) if err != nil { return nil, err } defer f.Close() - symbols, err := f.Symbols() - if err != nil { - return nil, errors.Wrap(err, "load symbols") - } + var ( + licenseSection *elf.Section + versionSection *elf.Section + sections = make(map[elf.SectionIndex]*elfSection) + relSections = make(map[elf.SectionIndex]*elf.Section) + ) - ec := &elfCode{f, symbols, symbolsPerSection(symbols)} + // This is the target of relocations generated by inline assembly. + sections[elf.SHN_UNDEF] = newElfSection(new(elf.Section), undefSection) + + // Collect all the sections we're interested in. This includes relocations + // which we parse later. + for i, sec := range f.Sections { + idx := elf.SectionIndex(i) - var licenseSection, versionSection *elf.Section - progSections := make(map[elf.SectionIndex]*elf.Section) - relSections := make(map[elf.SectionIndex]*elf.Section) - mapSections := make(map[elf.SectionIndex]*elf.Section) - for i, sec := range ec.Sections { switch { case strings.HasPrefix(sec.Name, "license"): licenseSection = sec case strings.HasPrefix(sec.Name, "version"): versionSection = sec case strings.HasPrefix(sec.Name, "maps"): - mapSections[elf.SectionIndex(i)] = sec + sections[idx] = newElfSection(sec, mapSection) + case sec.Name == ".maps": + sections[idx] = newElfSection(sec, btfMapSection) + case sec.Name == ".bss" || sec.Name == ".data" || strings.HasPrefix(sec.Name, ".rodata"): + sections[idx] = newElfSection(sec, dataSection) case sec.Type == elf.SHT_REL: - if int(sec.Info) >= len(ec.Sections) { - return nil, errors.Errorf("found relocation section %v for missing section %v", i, sec.Info) - } - // Store relocations under the section index of the target - idx := elf.SectionIndex(sec.Info) - if relSections[idx] != nil { - return nil, 
errors.Errorf("section %d has multiple relocation sections", idx) - } - relSections[idx] = sec + relSections[elf.SectionIndex(sec.Info)] = sec case sec.Type == elf.SHT_PROGBITS && (sec.Flags&elf.SHF_EXECINSTR) != 0 && sec.Size > 0: - progSections[elf.SectionIndex(i)] = sec + sections[idx] = newElfSection(sec, programSection) } } license, err := loadLicense(licenseSection) if err != nil { - return nil, errors.Wrap(err, "load license") + return nil, fmt.Errorf("load license: %w", err) } - version, err := loadVersion(versionSection, ec.ByteOrder) + version, err := loadVersion(versionSection, f.ByteOrder) if err != nil { - return nil, errors.Wrap(err, "load version") + return nil, fmt.Errorf("load version: %w", err) } - maps, err := ec.loadMaps(mapSections) + btfSpec, err := btf.LoadSpecFromReader(rd) + if err != nil && !errors.Is(err, btf.ErrNotFound) { + return nil, fmt.Errorf("load BTF: %w", err) + } + + // Assign symbols to all the sections we're interested in. + symbols, err := f.Symbols() if err != nil { - return nil, errors.Wrap(err, "load maps") + return nil, fmt.Errorf("load symbols: %v", err) } - progs, libs, err := ec.loadPrograms(progSections, relSections, license, version) - if err != nil { - return nil, errors.Wrap(err, "load programs") - } + for _, symbol := range symbols { + idx := symbol.Section + symType := elf.ST_TYPE(symbol.Info) - if len(libs) > 0 { - for name, prog := range progs { - prog.Instructions, err = link(prog.Instructions, libs...) - if err != nil { - return nil, errors.Wrapf(err, "program %s", name) - } + section := sections[idx] + if section == nil { + continue } + + // Older versions of LLVM don't tag symbols correctly, so keep + // all NOTYPE ones. 
+ keep := symType == elf.STT_NOTYPE + switch section.kind { + case mapSection, btfMapSection, dataSection: + keep = keep || symType == elf.STT_OBJECT + case programSection: + keep = keep || symType == elf.STT_FUNC + } + if !keep || symbol.Name == "" { + continue + } + + section.symbols[symbol.Value] = symbol } - return &CollectionSpec{maps, progs}, nil + ec := &elfCode{ + SafeELFFile: f, + sections: sections, + license: license, + version: version, + btf: btfSpec, + } + + // Go through relocation sections, and parse the ones for sections we're + // interested in. Make sure that relocations point at valid sections. + for idx, relSection := range relSections { + section := sections[idx] + if section == nil { + continue + } + + rels, err := ec.loadRelocations(relSection, symbols) + if err != nil { + return nil, fmt.Errorf("relocation for section %q: %w", section.Name, err) + } + + for _, rel := range rels { + target := sections[rel.Section] + if target == nil { + return nil, fmt.Errorf("section %q: reference to %q in section %s: %w", section.Name, rel.Name, rel.Section, ErrNotSupported) + } + + if target.Flags&elf.SHF_STRINGS > 0 { + return nil, fmt.Errorf("section %q: string is not stack allocated: %w", section.Name, ErrNotSupported) + } + + target.references++ + } + + section.relocations = rels + } + + // Collect all the various ways to define maps. + maps := make(map[string]*MapSpec) + if err := ec.loadMaps(maps); err != nil { + return nil, fmt.Errorf("load maps: %w", err) + } + + if err := ec.loadBTFMaps(maps); err != nil { + return nil, fmt.Errorf("load BTF maps: %w", err) + } + + if err := ec.loadDataSections(maps); err != nil { + return nil, fmt.Errorf("load data sections: %w", err) + } + + // Finally, collect programs and link them. 
+ progs, err := ec.loadPrograms() + if err != nil { + return nil, fmt.Errorf("load programs: %w", err) + } + + return &CollectionSpec{maps, progs, ec.ByteOrder}, nil } func loadLicense(sec *elf.Section) (string, error) { if sec == nil { - return "", errors.Errorf("missing license section") + return "", nil } + data, err := sec.Data() if err != nil { - return "", errors.Wrapf(err, "section %s", sec.Name) + return "", fmt.Errorf("section %s: %v", sec.Name, err) } return string(bytes.TrimRight(data, "\000")), nil } @@ -124,269 +209,869 @@ func loadVersion(sec *elf.Section, bo binary.ByteOrder) (uint32, error) { } var version uint32 - err := binary.Read(sec.Open(), bo, &version) - return version, errors.Wrapf(err, "section %s", sec.Name) + if err := binary.Read(sec.Open(), bo, &version); err != nil { + return 0, fmt.Errorf("section %s: %v", sec.Name, err) + } + return version, nil } -func (ec *elfCode) loadPrograms(progSections, relSections map[elf.SectionIndex]*elf.Section, license string, version uint32) (map[string]*ProgramSpec, []asm.Instructions, error) { +type elfSectionKind int + +const ( + undefSection elfSectionKind = iota + mapSection + btfMapSection + programSection + dataSection +) + +type elfSection struct { + *elf.Section + kind elfSectionKind + // Offset from the start of the section to a symbol + symbols map[uint64]elf.Symbol + // Offset from the start of the section to a relocation, which points at + // a symbol in another section. + relocations map[uint64]elf.Symbol + // The number of relocations pointing at this section. 
+ references int +} + +func newElfSection(section *elf.Section, kind elfSectionKind) *elfSection { + return &elfSection{ + section, + kind, + make(map[uint64]elf.Symbol), + make(map[uint64]elf.Symbol), + 0, + } +} + +func (ec *elfCode) loadPrograms() (map[string]*ProgramSpec, error) { var ( - progs = make(map[string]*ProgramSpec) - libs []asm.Instructions + progs []*ProgramSpec + libs []*ProgramSpec ) - for idx, prog := range progSections { - syms := ec.symbolsPerSection[idx] - if len(syms) == 0 { - return nil, nil, errors.Errorf("section %v: missing symbols", prog.Name) + + for _, sec := range ec.sections { + if sec.kind != programSection { + continue } - funcSym := syms[0] - if funcSym == "" { - return nil, nil, errors.Errorf("section %v: no label at start", prog.Name) + if len(sec.symbols) == 0 { + return nil, fmt.Errorf("section %v: missing symbols", sec.Name) } - rels, err := ec.loadRelocations(relSections[idx]) + funcSym, ok := sec.symbols[0] + if !ok { + return nil, fmt.Errorf("section %v: no label at start", sec.Name) + } + + insns, length, err := ec.loadInstructions(sec) if err != nil { - return nil, nil, errors.Wrapf(err, "program %s: can't load relocations", funcSym) + return nil, fmt.Errorf("program %s: %w", funcSym.Name, err) } - insns, err := ec.loadInstructions(prog, syms, rels) - if err != nil { - return nil, nil, errors.Wrapf(err, "program %s: can't unmarshal instructions", funcSym) + progType, attachType, progFlags, attachTo := getProgType(sec.Name) + + spec := &ProgramSpec{ + Name: funcSym.Name, + Type: progType, + Flags: progFlags, + AttachType: attachType, + AttachTo: attachTo, + License: ec.license, + KernelVersion: ec.version, + Instructions: insns, + ByteOrder: ec.ByteOrder, } - if progType, attachType := getProgType(prog.Name); progType == UnspecifiedProgram { + if ec.btf != nil { + spec.BTF, err = ec.btf.Program(sec.Name, length) + if err != nil && !errors.Is(err, btf.ErrNoExtendedInfo) { + return nil, fmt.Errorf("program %s: %w", 
funcSym.Name, err) + } + } + + if spec.Type == UnspecifiedProgram { // There is no single name we can use for "library" sections, // since they may contain multiple functions. We'll decode the // labels they contain later on, and then link sections that way. - libs = append(libs, insns) + libs = append(libs, spec) } else { - progs[funcSym] = &ProgramSpec{ - Name: funcSym, - Type: progType, - AttachType: attachType, - License: license, - KernelVersion: version, - Instructions: insns, - } + progs = append(progs, spec) } } - return progs, libs, nil + + res := make(map[string]*ProgramSpec, len(progs)) + for _, prog := range progs { + err := link(prog, libs) + if err != nil { + return nil, fmt.Errorf("program %s: %w", prog.Name, err) + } + res[prog.Name] = prog + } + + return res, nil } -func (ec *elfCode) loadInstructions(section *elf.Section, symbols, relocations map[uint64]string) (asm.Instructions, error) { +func (ec *elfCode) loadInstructions(section *elfSection) (asm.Instructions, uint64, error) { var ( - r = section.Open() + r = bufio.NewReader(section.Open()) insns asm.Instructions - ins asm.Instruction offset uint64 ) for { + var ins asm.Instruction n, err := ins.Unmarshal(r, ec.ByteOrder) if err == io.EOF { - return insns, nil + return insns, offset, nil } if err != nil { - return nil, errors.Wrapf(err, "offset %d", offset) + return nil, 0, fmt.Errorf("offset %d: %w", offset, err) } - ins.Symbol = symbols[offset] - ins.Reference = relocations[offset] + ins.Symbol = section.symbols[offset].Name + + if rel, ok := section.relocations[offset]; ok { + if err = ec.relocateInstruction(&ins, rel); err != nil { + return nil, 0, fmt.Errorf("offset %d: relocate instruction: %w", offset, err) + } + } insns = append(insns, ins) offset += n } } -func (ec *elfCode) loadMaps(mapSections map[elf.SectionIndex]*elf.Section) (map[string]*MapSpec, error) { +func (ec *elfCode) relocateInstruction(ins *asm.Instruction, rel elf.Symbol) error { var ( - maps = make(map[string]*MapSpec) 
- b = make([]byte, 1) + typ = elf.ST_TYPE(rel.Info) + bind = elf.ST_BIND(rel.Info) + name = rel.Name ) - for idx, sec := range mapSections { - syms := ec.symbolsPerSection[idx] - if len(syms) == 0 { - return nil, errors.Errorf("section %v: no symbols", sec.Name) + + target := ec.sections[rel.Section] + + switch target.kind { + case mapSection, btfMapSection: + if bind != elf.STB_GLOBAL { + return fmt.Errorf("possible erroneous static qualifier on map definition: found reference to %q", name) } - if sec.Size%uint64(len(syms)) != 0 { - return nil, errors.Errorf("section %v: map descriptors are not of equal size", sec.Name) + if typ != elf.STT_OBJECT && typ != elf.STT_NOTYPE { + // STT_NOTYPE is generated on clang < 8 which doesn't tag + // relocations appropriately. + return fmt.Errorf("map load: incorrect relocation type %v", typ) + } + + ins.Src = asm.PseudoMapFD + + // Mark the instruction as needing an update when creating the + // collection. + if err := ins.RewriteMapPtr(-1); err != nil { + return err + } + + case dataSection: + var offset uint32 + switch typ { + case elf.STT_SECTION: + if bind != elf.STB_LOCAL { + return fmt.Errorf("direct load: %s: unsupported relocation %s", name, bind) + } + + // This is really a reference to a static symbol, which clang doesn't + // emit a symbol table entry for. Instead it encodes the offset in + // the instruction itself. + offset = uint32(uint64(ins.Constant)) + + case elf.STT_OBJECT: + if bind != elf.STB_GLOBAL { + return fmt.Errorf("direct load: %s: unsupported relocation %s", name, bind) + } + + offset = uint32(rel.Value) + + default: + return fmt.Errorf("incorrect relocation type %v for direct map load", typ) + } + + // We rely on using the name of the data section as the reference. It + // would be nicer to keep the real name in case of an STT_OBJECT, but + // it's not clear how to encode that into Instruction. + name = target.Name + + // The kernel expects the offset in the second basic BPF instruction. 
+ ins.Constant = int64(uint64(offset) << 32) + ins.Src = asm.PseudoMapValue + + // Mark the instruction as needing an update when creating the + // collection. + if err := ins.RewriteMapPtr(-1); err != nil { + return err + } + + case programSection: + if ins.OpCode.JumpOp() != asm.Call { + return fmt.Errorf("not a call instruction: %s", ins) + } + + if ins.Src != asm.PseudoCall { + return fmt.Errorf("call: %s: incorrect source register", name) + } + + switch typ { + case elf.STT_NOTYPE, elf.STT_FUNC: + if bind != elf.STB_GLOBAL { + return fmt.Errorf("call: %s: unsupported binding: %s", name, bind) + } + + case elf.STT_SECTION: + if bind != elf.STB_LOCAL { + return fmt.Errorf("call: %s: unsupported binding: %s", name, bind) + } + + // The function we want to call is in the indicated section, + // at the offset encoded in the instruction itself. Reverse + // the calculation to find the real function we're looking for. + // A value of -1 references the first instruction in the section. + offset := int64(int32(ins.Constant)+1) * asm.InstructionSize + if offset < 0 { + return fmt.Errorf("call: %s: invalid offset %d", name, offset) + } + + sym, ok := target.symbols[uint64(offset)] + if !ok { + return fmt.Errorf("call: %s: no symbol at offset %d", name, offset) + } + + ins.Constant = -1 + name = sym.Name + + default: + return fmt.Errorf("call: %s: invalid symbol type %s", name, typ) + } + + case undefSection: + if bind != elf.STB_GLOBAL { + return fmt.Errorf("asm relocation: %s: unsupported binding: %s", name, bind) + } + + if typ != elf.STT_NOTYPE { + return fmt.Errorf("asm relocation: %s: unsupported type %s", name, typ) + } + + // There is nothing to do here but set ins.Reference. 
+ + default: + return fmt.Errorf("relocation to %q: %w", target.Name, ErrNotSupported) + } + + ins.Reference = name + return nil +} + +func (ec *elfCode) loadMaps(maps map[string]*MapSpec) error { + for _, sec := range ec.sections { + if sec.kind != mapSection { + continue + } + + nSym := len(sec.symbols) + if nSym == 0 { + return fmt.Errorf("section %v: no symbols", sec.Name) + } + + if sec.Size%uint64(nSym) != 0 { + return fmt.Errorf("section %v: map descriptors are not of equal size", sec.Name) } var ( - r = sec.Open() - size = sec.Size / uint64(len(syms)) + r = bufio.NewReader(sec.Open()) + size = sec.Size / uint64(nSym) ) - for i, offset := 0, uint64(0); i < len(syms); i, offset = i+1, offset+size { - mapSym := syms[offset] - if mapSym == "" { - fmt.Println(syms) - return nil, errors.Errorf("section %s: missing symbol for map at offset %d", sec.Name, offset) + for i, offset := 0, uint64(0); i < nSym; i, offset = i+1, offset+size { + mapSym, ok := sec.symbols[offset] + if !ok { + return fmt.Errorf("section %s: missing symbol for map at offset %d", sec.Name, offset) } - if maps[mapSym] != nil { - return nil, errors.Errorf("section %v: map %v already exists", sec.Name, mapSym) + mapName := mapSym.Name + if maps[mapName] != nil { + return fmt.Errorf("section %v: map %v already exists", sec.Name, mapSym) } lr := io.LimitReader(r, int64(size)) - var spec MapSpec + spec := MapSpec{ + Name: SanitizeName(mapName, -1), + } switch { case binary.Read(lr, ec.ByteOrder, &spec.Type) != nil: - return nil, errors.Errorf("map %v: missing type", mapSym) + return fmt.Errorf("map %s: missing type", mapName) case binary.Read(lr, ec.ByteOrder, &spec.KeySize) != nil: - return nil, errors.Errorf("map %v: missing key size", mapSym) + return fmt.Errorf("map %s: missing key size", mapName) case binary.Read(lr, ec.ByteOrder, &spec.ValueSize) != nil: - return nil, errors.Errorf("map %v: missing value size", mapSym) + return fmt.Errorf("map %s: missing value size", mapName) case 
binary.Read(lr, ec.ByteOrder, &spec.MaxEntries) != nil: - return nil, errors.Errorf("map %v: missing max entries", mapSym) + return fmt.Errorf("map %s: missing max entries", mapName) case binary.Read(lr, ec.ByteOrder, &spec.Flags) != nil: - return nil, errors.Errorf("map %v: missing flags", mapSym) + return fmt.Errorf("map %s: missing flags", mapName) } - for { - _, err := lr.Read(b) - if err == io.EOF { - break + extra, err := io.ReadAll(lr) + if err != nil { + return fmt.Errorf("map %s: reading map tail: %w", mapName, err) + } + if len(extra) > 0 { + spec.Extra = *bytes.NewReader(extra) + } + + if err := spec.clampPerfEventArraySize(); err != nil { + return fmt.Errorf("map %s: %w", mapName, err) + } + + maps[mapName] = &spec + } + } + + return nil +} + +// loadBTFMaps iterates over all ELF sections marked as BTF map sections +// (like .maps) and parses them into MapSpecs. Dump the .maps section and +// any relocations with `readelf -x .maps -r `. +func (ec *elfCode) loadBTFMaps(maps map[string]*MapSpec) error { + for _, sec := range ec.sections { + if sec.kind != btfMapSection { + continue + } + + if ec.btf == nil { + return fmt.Errorf("missing BTF") + } + + // Each section must appear as a DataSec in the ELF's BTF blob. + var ds *btf.Datasec + if err := ec.btf.FindType(sec.Name, &ds); err != nil { + return fmt.Errorf("cannot find section '%s' in BTF: %w", sec.Name, err) + } + + // Open a Reader to the ELF's raw section bytes so we can assert that all + // of them are zero on a per-map (per-Var) basis. For now, the section's + // sole purpose is to receive relocations, so all must be zero. + rs := sec.Open() + + for _, vs := range ds.Vars { + // BPF maps are declared as and assigned to global variables, + // so iterate over each Var in the DataSec and validate their types. 
+ v, ok := vs.Type.(*btf.Var) + if !ok { + return fmt.Errorf("section %v: unexpected type %s", sec.Name, vs.Type) + } + name := string(v.Name) + + // The BTF metadata for each Var contains the full length of the map + // declaration, so read the corresponding amount of bytes from the ELF. + // This way, we can pinpoint which map declaration contains unexpected + // (and therefore unsupported) data. + _, err := io.Copy(internal.DiscardZeroes{}, io.LimitReader(rs, int64(vs.Size))) + if err != nil { + return fmt.Errorf("section %v: map %s: initializing BTF map definitions: %w", sec.Name, name, internal.ErrNotSupported) + } + + if maps[name] != nil { + return fmt.Errorf("section %v: map %s already exists", sec.Name, name) + } + + // Each Var representing a BTF map definition contains a Struct. + mapStruct, ok := v.Type.(*btf.Struct) + if !ok { + return fmt.Errorf("expected struct, got %s", v.Type) + } + + mapSpec, err := mapSpecFromBTF(sec, &vs, mapStruct, ec.btf, name, false) + if err != nil { + return fmt.Errorf("map %v: %w", name, err) + } + + if err := mapSpec.clampPerfEventArraySize(); err != nil { + return fmt.Errorf("map %v: %w", name, err) + } + + maps[name] = mapSpec + } + + // Drain the ELF section reader to make sure all bytes are accounted for + // with BTF metadata. + i, err := io.Copy(io.Discard, rs) + if err != nil { + return fmt.Errorf("section %v: unexpected error reading remainder of ELF section: %w", sec.Name, err) + } + if i > 0 { + return fmt.Errorf("section %v: %d unexpected remaining bytes in ELF section, invalid BTF?", sec.Name, i) + } + } + + return nil +} + +// A programStub is a placeholder for a Program to be inserted at a certain map key. +// It needs to be resolved into a Program later on in the loader process. +type programStub string + +// A mapStub is a placeholder for a Map to be inserted at a certain map key. +// It needs to be resolved into a Map later on in the loader process. 
+type mapStub string + +// mapSpecFromBTF produces a MapSpec based on a btf.Struct def representing +// a BTF map definition. The name and spec arguments will be copied to the +// resulting MapSpec, and inner must be true on any resursive invocations. +func mapSpecFromBTF(es *elfSection, vs *btf.VarSecinfo, def *btf.Struct, spec *btf.Spec, name string, inner bool) (*MapSpec, error) { + var ( + key, value btf.Type + keySize, valueSize uint32 + mapType MapType + flags, maxEntries uint32 + pinType PinType + innerMapSpec *MapSpec + contents []MapKV + err error + ) + + for i, member := range def.Members { + switch member.Name { + case "type": + mt, err := uintFromBTF(member.Type) + if err != nil { + return nil, fmt.Errorf("can't get type: %w", err) + } + mapType = MapType(mt) + + case "map_flags": + flags, err = uintFromBTF(member.Type) + if err != nil { + return nil, fmt.Errorf("can't get BTF map flags: %w", err) + } + + case "max_entries": + maxEntries, err = uintFromBTF(member.Type) + if err != nil { + return nil, fmt.Errorf("can't get BTF map max entries: %w", err) + } + + case "key": + if keySize != 0 { + return nil, errors.New("both key and key_size given") + } + + pk, ok := member.Type.(*btf.Pointer) + if !ok { + return nil, fmt.Errorf("key type is not a pointer: %T", member.Type) + } + + key = pk.Target + + size, err := btf.Sizeof(pk.Target) + if err != nil { + return nil, fmt.Errorf("can't get size of BTF key: %w", err) + } + + keySize = uint32(size) + + case "value": + if valueSize != 0 { + return nil, errors.New("both value and value_size given") + } + + vk, ok := member.Type.(*btf.Pointer) + if !ok { + return nil, fmt.Errorf("value type is not a pointer: %T", member.Type) + } + + value = vk.Target + + size, err := btf.Sizeof(vk.Target) + if err != nil { + return nil, fmt.Errorf("can't get size of BTF value: %w", err) + } + + valueSize = uint32(size) + + case "key_size": + // Key needs to be nil and keySize needs to be 0 for key_size to be + // considered a 
valid member. + if key != nil || keySize != 0 { + return nil, errors.New("both key and key_size given") + } + + keySize, err = uintFromBTF(member.Type) + if err != nil { + return nil, fmt.Errorf("can't get BTF key size: %w", err) + } + + case "value_size": + // Value needs to be nil and valueSize needs to be 0 for value_size to be + // considered a valid member. + if value != nil || valueSize != 0 { + return nil, errors.New("both value and value_size given") + } + + valueSize, err = uintFromBTF(member.Type) + if err != nil { + return nil, fmt.Errorf("can't get BTF value size: %w", err) + } + + case "pinning": + if inner { + return nil, errors.New("inner maps can't be pinned") + } + + pinning, err := uintFromBTF(member.Type) + if err != nil { + return nil, fmt.Errorf("can't get pinning: %w", err) + } + + pinType = PinType(pinning) + + case "values": + // The 'values' field in BTF map definitions is used for declaring map + // value types that are references to other BPF objects, like other maps + // or programs. It is always expected to be an array of pointers. + if i != len(def.Members)-1 { + return nil, errors.New("'values' must be the last member in a BTF map definition") + } + + if valueSize != 0 && valueSize != 4 { + return nil, errors.New("value_size must be 0 or 4") + } + valueSize = 4 + + valueType, err := resolveBTFArrayMacro(member.Type) + if err != nil { + return nil, fmt.Errorf("can't resolve type of member 'values': %w", err) + } + + switch t := valueType.(type) { + case *btf.Struct: + // The values member pointing to an array of structs means we're expecting + // a map-in-map declaration. + if mapType != ArrayOfMaps && mapType != HashOfMaps { + return nil, errors.New("outer map needs to be an array or a hash of maps") } + if inner { + return nil, fmt.Errorf("nested inner maps are not supported") + } + + // This inner map spec is used as a map template, but it needs to be + // created as a traditional map before it can be used to do so. 
+ // libbpf names the inner map template '.inner', but we + // opted for _inner to simplify validation logic. (dots only supported + // on kernels 5.2 and up) + // Pass the BTF spec from the parent object, since both parent and + // child must be created from the same BTF blob (on kernels that support BTF). + innerMapSpec, err = mapSpecFromBTF(es, vs, t, spec, name+"_inner", true) if err != nil { - return nil, err + return nil, fmt.Errorf("can't parse BTF map definition of inner map: %w", err) } - if b[0] != 0 { - return nil, errors.Errorf("map %v: unknown and non-zero fields in definition", mapSym) + + case *btf.FuncProto: + // The values member contains an array of function pointers, meaning an + // autopopulated PROG_ARRAY. + if mapType != ProgramArray { + return nil, errors.New("map needs to be a program array") } + + default: + return nil, fmt.Errorf("unsupported value type %q in 'values' field", t) } - maps[mapSym] = &spec + contents, err = resolveBTFValuesContents(es, vs, member) + if err != nil { + return nil, fmt.Errorf("resolving values contents: %w", err) + } + + default: + return nil, fmt.Errorf("unrecognized field %s in BTF map definition", member.Name) } } - return maps, nil + + if key == nil { + key = &btf.Void{} + } + if value == nil { + value = &btf.Void{} + } + + return &MapSpec{ + Name: SanitizeName(name, -1), + Type: MapType(mapType), + KeySize: keySize, + ValueSize: valueSize, + MaxEntries: maxEntries, + Flags: flags, + BTF: &btf.Map{Spec: spec, Key: key, Value: value}, + Pinning: pinType, + InnerMap: innerMapSpec, + Contents: contents, + }, nil } -func getProgType(v string) (ProgramType, AttachType) { - types := map[string]ProgramType{ - // From https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/libbpf.c#n3568 - "socket": SocketFilter, - "seccomp": SocketFilter, - "kprobe/": Kprobe, - "kretprobe/": Kprobe, - "tracepoint/": TracePoint, - "xdp": XDP, - "perf_event": PerfEvent, - "sockops": SockOps, - "sk_skb": 
SkSKB, - "sk_msg": SkMsg, - "lirc_mode2": LircMode2, - "flow_dissector": FlowDissector, - - "cgroup_skb/": CGroupSKB, - "cgroup/dev": CGroupDevice, - "cgroup/skb": CGroupSKB, - "cgroup/sock": CGroupSock, - "cgroup/post_bind": CGroupSock, - "cgroup/bind": CGroupSockAddr, - "cgroup/connect": CGroupSockAddr, - "cgroup/sendmsg": CGroupSockAddr, - "cgroup/recvmsg": CGroupSockAddr, - "cgroup/sysctl": CGroupSysctl, - "cgroup/getsockopt": CGroupSockopt, - "cgroup/setsockopt": CGroupSockopt, - "classifier": SchedCLS, - "action": SchedACT, - } - attachTypes := map[string]AttachType{ - "cgroup_skb/ingress": AttachCGroupInetIngress, - "cgroup_skb/egress": AttachCGroupInetEgress, - "cgroup/sock": AttachCGroupInetSockCreate, - "cgroup/post_bind4": AttachCGroupInet4PostBind, - "cgroup/post_bind6": AttachCGroupInet6PostBind, - "cgroup/dev": AttachCGroupDevice, - "sockops": AttachCGroupSockOps, - "sk_skb/stream_parser": AttachSkSKBStreamParser, - "sk_skb/stream_verdict": AttachSkSKBStreamVerdict, - "sk_msg": AttachSkSKBStreamVerdict, - "lirc_mode2": AttachLircMode2, - "flow_dissector": AttachFlowDissector, - "cgroup/bind4": AttachCGroupInet4Bind, - "cgroup/bind6": AttachCGroupInet6Bind, - "cgroup/connect4": AttachCGroupInet4Connect, - "cgroup/connect6": AttachCGroupInet6Connect, - "cgroup/sendmsg4": AttachCGroupUDP4Sendmsg, - "cgroup/sendmsg6": AttachCGroupUDP6Sendmsg, - "cgroup/recvmsg4": AttachCGroupUDP4Recvmsg, - "cgroup/recvmsg6": AttachCGroupUDP6Recvmsg, - "cgroup/sysctl": AttachCGroupSysctl, - "cgroup/getsockopt": AttachCGroupGetsockopt, - "cgroup/setsockopt": AttachCGroupSetsockopt, - } - attachType := AttachNone - for k, t := range attachTypes { - if strings.HasPrefix(v, k) { - attachType = t - } +// uintFromBTF resolves the __uint macro, which is a pointer to a sized +// array, e.g. for int (*foo)[10], this function will return 10. 
+func uintFromBTF(typ btf.Type) (uint32, error) { + ptr, ok := typ.(*btf.Pointer) + if !ok { + return 0, fmt.Errorf("not a pointer: %v", typ) } - for k, t := range types { - if strings.HasPrefix(v, k) { - return t, attachType - } + arr, ok := ptr.Target.(*btf.Array) + if !ok { + return 0, fmt.Errorf("not a pointer to array: %v", typ) } - return UnspecifiedProgram, AttachNone + + return arr.Nelems, nil } -func (ec *elfCode) loadRelocations(sec *elf.Section) (map[uint64]string, error) { - rels := make(map[uint64]string) - if sec == nil { - return rels, nil +// resolveBTFArrayMacro resolves the __array macro, which declares an array +// of pointers to a given type. This function returns the target Type of +// the pointers in the array. +func resolveBTFArrayMacro(typ btf.Type) (btf.Type, error) { + arr, ok := typ.(*btf.Array) + if !ok { + return nil, fmt.Errorf("not an array: %v", typ) } + ptr, ok := arr.Type.(*btf.Pointer) + if !ok { + return nil, fmt.Errorf("not an array of pointers: %v", typ) + } + + return ptr.Target, nil +} + +// resolveBTFValuesContents resolves relocations into ELF sections belonging +// to btf.VarSecinfo's. This can be used on the 'values' member in BTF map +// definitions to extract static declarations of map contents. +func resolveBTFValuesContents(es *elfSection, vs *btf.VarSecinfo, member btf.Member) ([]MapKV, error) { + // The elements of a .values pointer array are not encoded in BTF. + // Instead, relocations are generated into each array index. + // However, it's possible to leave certain array indices empty, so all + // indices' offsets need to be checked for emitted relocations. + + // The offset of the 'values' member within the _struct_ (in bits) + // is the starting point of the array. Convert to bytes. Add VarSecinfo + // offset to get the absolute position in the ELF blob. 
+ start := (member.OffsetBits / 8) + vs.Offset + // 'values' is encoded in BTF as a zero (variable) length struct + // member, and its contents run until the end of the VarSecinfo. + // Add VarSecinfo offset to get the absolute position in the ELF blob. + end := vs.Size + vs.Offset + // The size of an address in this section. This determines the width of + // an index in the array. + align := uint32(es.SectionHeader.Addralign) + + // Check if variable-length section is aligned. + if (end-start)%align != 0 { + return nil, errors.New("unaligned static values section") + } + elems := (end - start) / align + + if elems == 0 { + return nil, nil + } + + contents := make([]MapKV, 0, elems) + + // k is the array index, off is its corresponding ELF section offset. + for k, off := uint32(0), start; k < elems; k, off = k+1, off+align { + r, ok := es.relocations[uint64(off)] + if !ok { + continue + } + + // Relocation exists for the current offset in the ELF section. + // Emit a value stub based on the type of relocation to be replaced by + // a real fd later in the pipeline before populating the map. + // Map keys are encoded in MapKV entries, so empty array indices are + // skipped here. + switch t := elf.ST_TYPE(r.Info); t { + case elf.STT_FUNC: + contents = append(contents, MapKV{uint32(k), programStub(r.Name)}) + case elf.STT_OBJECT: + contents = append(contents, MapKV{uint32(k), mapStub(r.Name)}) + default: + return nil, fmt.Errorf("unknown relocation type %v", t) + } + } + + return contents, nil +} + +func (ec *elfCode) loadDataSections(maps map[string]*MapSpec) error { + for _, sec := range ec.sections { + if sec.kind != dataSection { + continue + } + + if sec.references == 0 { + // Prune data sections which are not referenced by any + // instructions. 
+ continue + } + + if ec.btf == nil { + return errors.New("data sections require BTF, make sure all consts are marked as static") + } + + var datasec *btf.Datasec + if err := ec.btf.FindType(sec.Name, &datasec); err != nil { + return fmt.Errorf("data section %s: can't get BTF: %w", sec.Name, err) + } + + data, err := sec.Data() + if err != nil { + return fmt.Errorf("data section %s: can't get contents: %w", sec.Name, err) + } + + if uint64(len(data)) > math.MaxUint32 { + return fmt.Errorf("data section %s: contents exceed maximum size", sec.Name) + } + + mapSpec := &MapSpec{ + Name: SanitizeName(sec.Name, -1), + Type: Array, + KeySize: 4, + ValueSize: uint32(len(data)), + MaxEntries: 1, + Contents: []MapKV{{uint32(0), data}}, + BTF: &btf.Map{Spec: ec.btf, Key: &btf.Void{}, Value: datasec}, + } + + switch sec.Name { + case ".rodata": + mapSpec.Flags = unix.BPF_F_RDONLY_PROG + mapSpec.Freeze = true + case ".bss": + // The kernel already zero-initializes the map + mapSpec.Contents = nil + } + + maps[sec.Name] = mapSpec + } + return nil +} + +func getProgType(sectionName string) (ProgramType, AttachType, uint32, string) { + types := map[string]struct { + progType ProgramType + attachType AttachType + progFlags uint32 + }{ + // From https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/libbpf.c + "socket": {SocketFilter, AttachNone, 0}, + "sk_reuseport/migrate": {SkReuseport, AttachSkReuseportSelectOrMigrate, 0}, + "sk_reuseport": {SkReuseport, AttachSkReuseportSelect, 0}, + "seccomp": {SocketFilter, AttachNone, 0}, + "kprobe/": {Kprobe, AttachNone, 0}, + "uprobe/": {Kprobe, AttachNone, 0}, + "kretprobe/": {Kprobe, AttachNone, 0}, + "uretprobe/": {Kprobe, AttachNone, 0}, + "tracepoint/": {TracePoint, AttachNone, 0}, + "raw_tracepoint/": {RawTracepoint, AttachNone, 0}, + "raw_tp/": {RawTracepoint, AttachNone, 0}, + "tp_btf/": {Tracing, AttachTraceRawTp, 0}, + "xdp": {XDP, AttachNone, 0}, + "perf_event": {PerfEvent, AttachNone, 0}, + 
"lwt_in": {LWTIn, AttachNone, 0}, + "lwt_out": {LWTOut, AttachNone, 0}, + "lwt_xmit": {LWTXmit, AttachNone, 0}, + "lwt_seg6local": {LWTSeg6Local, AttachNone, 0}, + "sockops": {SockOps, AttachCGroupSockOps, 0}, + "sk_skb/stream_parser": {SkSKB, AttachSkSKBStreamParser, 0}, + "sk_skb/stream_verdict": {SkSKB, AttachSkSKBStreamParser, 0}, + "sk_msg": {SkMsg, AttachSkSKBStreamVerdict, 0}, + "lirc_mode2": {LircMode2, AttachLircMode2, 0}, + "flow_dissector": {FlowDissector, AttachFlowDissector, 0}, + "iter/": {Tracing, AttachTraceIter, 0}, + "fentry/": {Tracing, AttachTraceFEntry, 0}, + "fmod_ret/": {Tracing, AttachModifyReturn, 0}, + "fexit/": {Tracing, AttachTraceFExit, 0}, + "fentry.s/": {Tracing, AttachTraceFEntry, unix.BPF_F_SLEEPABLE}, + "fmod_ret.s/": {Tracing, AttachModifyReturn, unix.BPF_F_SLEEPABLE}, + "fexit.s/": {Tracing, AttachTraceFExit, unix.BPF_F_SLEEPABLE}, + "sk_lookup/": {SkLookup, AttachSkLookup, 0}, + "freplace/": {Extension, AttachNone, 0}, + "lsm/": {LSM, AttachLSMMac, 0}, + "lsm.s/": {LSM, AttachLSMMac, unix.BPF_F_SLEEPABLE}, + + "cgroup_skb/ingress": {CGroupSKB, AttachCGroupInetIngress, 0}, + "cgroup_skb/egress": {CGroupSKB, AttachCGroupInetEgress, 0}, + "cgroup/dev": {CGroupDevice, AttachCGroupDevice, 0}, + "cgroup/skb": {CGroupSKB, AttachNone, 0}, + "cgroup/sock": {CGroupSock, AttachCGroupInetSockCreate, 0}, + "cgroup/post_bind4": {CGroupSock, AttachCGroupInet4PostBind, 0}, + "cgroup/post_bind6": {CGroupSock, AttachCGroupInet6PostBind, 0}, + "cgroup/bind4": {CGroupSockAddr, AttachCGroupInet4Bind, 0}, + "cgroup/bind6": {CGroupSockAddr, AttachCGroupInet6Bind, 0}, + "cgroup/connect4": {CGroupSockAddr, AttachCGroupInet4Connect, 0}, + "cgroup/connect6": {CGroupSockAddr, AttachCGroupInet6Connect, 0}, + "cgroup/sendmsg4": {CGroupSockAddr, AttachCGroupUDP4Sendmsg, 0}, + "cgroup/sendmsg6": {CGroupSockAddr, AttachCGroupUDP6Sendmsg, 0}, + "cgroup/recvmsg4": {CGroupSockAddr, AttachCGroupUDP4Recvmsg, 0}, + "cgroup/recvmsg6": {CGroupSockAddr, 
AttachCGroupUDP6Recvmsg, 0}, + "cgroup/sysctl": {CGroupSysctl, AttachCGroupSysctl, 0}, + "cgroup/getsockopt": {CGroupSockopt, AttachCGroupGetsockopt, 0}, + "cgroup/setsockopt": {CGroupSockopt, AttachCGroupSetsockopt, 0}, + "classifier": {SchedCLS, AttachNone, 0}, + "action": {SchedACT, AttachNone, 0}, + + "cgroup/getsockname4": {CGroupSockAddr, AttachCgroupInet4GetSockname, 0}, + "cgroup/getsockname6": {CGroupSockAddr, AttachCgroupInet6GetSockname, 0}, + "cgroup/getpeername4": {CGroupSockAddr, AttachCgroupInet4GetPeername, 0}, + "cgroup/getpeername6": {CGroupSockAddr, AttachCgroupInet6GetPeername, 0}, + } + + for prefix, t := range types { + if !strings.HasPrefix(sectionName, prefix) { + continue + } + + if !strings.HasSuffix(prefix, "/") { + return t.progType, t.attachType, t.progFlags, "" + } + + return t.progType, t.attachType, t.progFlags, sectionName[len(prefix):] + } + + return UnspecifiedProgram, AttachNone, 0, "" +} + +func (ec *elfCode) loadRelocations(sec *elf.Section, symbols []elf.Symbol) (map[uint64]elf.Symbol, error) { + rels := make(map[uint64]elf.Symbol) + if sec.Entsize < 16 { - return nil, errors.New("rels are less than 16 bytes") + return nil, fmt.Errorf("section %s: relocations are less than 16 bytes", sec.Name) } - r := sec.Open() + r := bufio.NewReader(sec.Open()) for off := uint64(0); off < sec.Size; off += sec.Entsize { ent := io.LimitReader(r, int64(sec.Entsize)) var rel elf.Rel64 if binary.Read(ent, ec.ByteOrder, &rel) != nil { - return nil, errors.Errorf("can't parse relocation at offset %v", off) + return nil, fmt.Errorf("can't parse relocation at offset %v", off) } symNo := int(elf.R_SYM64(rel.Info) - 1) - if symNo >= len(ec.symbols) { - return nil, errors.Errorf("relocation at offset %d: symbol %v doesnt exist", off, symNo) + if symNo >= len(symbols) { + return nil, fmt.Errorf("offset %d: symbol %d doesn't exist", off, symNo) } - rels[rel.Off] = ec.symbols[symNo].Name + symbol := symbols[symNo] + rels[rel.Off] = symbol } + return rels, 
nil } - -func symbolsPerSection(symbols []elf.Symbol) map[elf.SectionIndex]map[uint64]string { - result := make(map[elf.SectionIndex]map[uint64]string) - for i, sym := range symbols { - switch elf.ST_TYPE(sym.Info) { - case elf.STT_NOTYPE: - // Older versions of LLVM doesn't tag - // symbols correctly. - break - case elf.STT_OBJECT: - break - case elf.STT_FUNC: - break - default: - continue - } - - if sym.Name == "" { - continue - } - - idx := sym.Section - if _, ok := result[idx]; !ok { - result[idx] = make(map[uint64]string) - } - result[idx][sym.Value] = symbols[i].Name - } - return result -} diff --git a/vendor/github.com/cilium/ebpf/elf_reader_fuzz.go b/vendor/github.com/cilium/ebpf/elf_reader_fuzz.go new file mode 100644 index 0000000..5f4e0a0 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/elf_reader_fuzz.go @@ -0,0 +1,22 @@ +//go:build gofuzz +// +build gofuzz + +// Use with https://github.com/dvyukov/go-fuzz + +package ebpf + +import "bytes" + +func FuzzLoadCollectionSpec(data []byte) int { + spec, err := LoadCollectionSpecFromReader(bytes.NewReader(data)) + if err != nil { + if spec != nil { + panic("spec is not nil") + } + return 0 + } + if spec == nil { + panic("spec is nil") + } + return 1 +} diff --git a/vendor/github.com/cilium/ebpf/feature.go b/vendor/github.com/cilium/ebpf/feature.go deleted file mode 100644 index 9104bc9..0000000 --- a/vendor/github.com/cilium/ebpf/feature.go +++ /dev/null @@ -1,19 +0,0 @@ -package ebpf - -import ( - "sync" -) - -type featureTest struct { - Fn func() bool - - once sync.Once - result bool -} - -func (ft *featureTest) Result() bool { - ft.once.Do(func() { - ft.result = ft.Fn() - }) - return ft.result -} diff --git a/vendor/github.com/cilium/ebpf/go.mod b/vendor/github.com/cilium/ebpf/go.mod index 687bdec..f5edf69 100644 --- a/vendor/github.com/cilium/ebpf/go.mod +++ b/vendor/github.com/cilium/ebpf/go.mod @@ -1,8 +1,9 @@ module github.com/cilium/ebpf -go 1.12 +go 1.16 require ( - github.com/pkg/errors v0.8.1 - 
golang.org/x/sys v0.0.0-20191022100944-742c48ecaeb7 + github.com/frankban/quicktest v1.11.3 + github.com/google/go-cmp v0.5.4 + golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34 ) diff --git a/vendor/github.com/cilium/ebpf/go.sum b/vendor/github.com/cilium/ebpf/go.sum new file mode 100644 index 0000000..1ef5a47 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/go.sum @@ -0,0 +1,13 @@ +github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY= +github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= +github.com/google/go-cmp v0.5.4 h1:L8R9j+yAqZuZjsqh/z+F1NCffTKKLShY6zXTItVIZ8M= +github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34 h1:GkvMjFtXUmahfDtashnc1mnrCtuBVcwse5QV2lUk/tI= +golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/vendor/github.com/cilium/ebpf/info.go b/vendor/github.com/cilium/ebpf/info.go new file mode 100644 index 0000000..65fa4d7 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/info.go @@ -0,0 +1,273 @@ +package ebpf + +import ( + "bufio" + "encoding/hex" + "errors" + "fmt" + "io" + "os" + "strings" + "syscall" + "time" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/btf" +) + +// MapInfo describes 
a map. +type MapInfo struct { + Type MapType + id MapID + KeySize uint32 + ValueSize uint32 + MaxEntries uint32 + Flags uint32 + // Name as supplied by user space at load time. + Name string +} + +func newMapInfoFromFd(fd *internal.FD) (*MapInfo, error) { + info, err := bpfGetMapInfoByFD(fd) + if errors.Is(err, syscall.EINVAL) { + return newMapInfoFromProc(fd) + } + if err != nil { + return nil, err + } + + return &MapInfo{ + MapType(info.map_type), + MapID(info.id), + info.key_size, + info.value_size, + info.max_entries, + info.map_flags, + // name is available from 4.15. + internal.CString(info.name[:]), + }, nil +} + +func newMapInfoFromProc(fd *internal.FD) (*MapInfo, error) { + var mi MapInfo + err := scanFdInfo(fd, map[string]interface{}{ + "map_type": &mi.Type, + "key_size": &mi.KeySize, + "value_size": &mi.ValueSize, + "max_entries": &mi.MaxEntries, + "map_flags": &mi.Flags, + }) + if err != nil { + return nil, err + } + return &mi, nil +} + +// ID returns the map ID. +// +// Available from 4.13. +// +// The bool return value indicates whether this optional field is available. +func (mi *MapInfo) ID() (MapID, bool) { + return mi.id, mi.id > 0 +} + +// programStats holds statistics of a program. +type programStats struct { + // Total accumulated runtime of the program ins ns. + runtime time.Duration + // Total number of times the program was called. + runCount uint64 +} + +// ProgramInfo describes a program. +type ProgramInfo struct { + Type ProgramType + id ProgramID + // Truncated hash of the BPF bytecode. + Tag string + // Name as supplied by user space at load time. + Name string + // BTF for the program. + btf btf.ID + // IDS map ids related to program. 
+ ids []MapID + + stats *programStats +} + +func newProgramInfoFromFd(fd *internal.FD) (*ProgramInfo, error) { + info, err := bpfGetProgInfoByFD(fd, nil) + if errors.Is(err, syscall.EINVAL) { + return newProgramInfoFromProc(fd) + } + if err != nil { + return nil, err + } + + var mapIDs []MapID + if info.nr_map_ids > 0 { + mapIDs = make([]MapID, info.nr_map_ids) + info, err = bpfGetProgInfoByFD(fd, mapIDs) + if err != nil { + return nil, err + } + } + + return &ProgramInfo{ + Type: ProgramType(info.prog_type), + id: ProgramID(info.id), + // tag is available if the kernel supports BPF_PROG_GET_INFO_BY_FD. + Tag: hex.EncodeToString(info.tag[:]), + // name is available from 4.15. + Name: internal.CString(info.name[:]), + btf: btf.ID(info.btf_id), + ids: mapIDs, + stats: &programStats{ + runtime: time.Duration(info.run_time_ns), + runCount: info.run_cnt, + }, + }, nil +} + +func newProgramInfoFromProc(fd *internal.FD) (*ProgramInfo, error) { + var info ProgramInfo + err := scanFdInfo(fd, map[string]interface{}{ + "prog_type": &info.Type, + "prog_tag": &info.Tag, + }) + if errors.Is(err, errMissingFields) { + return nil, &internal.UnsupportedFeatureError{ + Name: "reading program info from /proc/self/fdinfo", + MinimumVersion: internal.Version{4, 10, 0}, + } + } + if err != nil { + return nil, err + } + + return &info, nil +} + +// ID returns the program ID. +// +// Available from 4.13. +// +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) ID() (ProgramID, bool) { + return pi.id, pi.id > 0 +} + +// BTFID returns the BTF ID associated with the program. +// +// Available from 5.0. +// +// The bool return value indicates whether this optional field is available and +// populated. (The field may be available but not populated if the kernel +// supports the field but the program was loaded without BTF information.) 
+func (pi *ProgramInfo) BTFID() (btf.ID, bool) { + return pi.btf, pi.btf > 0 +} + +// RunCount returns the total number of times the program was called. +// +// Can return 0 if the collection of statistics is not enabled. See EnableStats(). +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) RunCount() (uint64, bool) { + if pi.stats != nil { + return pi.stats.runCount, true + } + return 0, false +} + +// Runtime returns the total accumulated runtime of the program. +// +// Can return 0 if the collection of statistics is not enabled. See EnableStats(). +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) Runtime() (time.Duration, bool) { + if pi.stats != nil { + return pi.stats.runtime, true + } + return time.Duration(0), false +} + +// MapIDs returns the maps related to the program. +// +// The bool return value indicates whether this optional field is available. +func (pi *ProgramInfo) MapIDs() ([]MapID, bool) { + return pi.ids, pi.ids != nil +} + +func scanFdInfo(fd *internal.FD, fields map[string]interface{}) error { + raw, err := fd.Value() + if err != nil { + return err + } + + fh, err := os.Open(fmt.Sprintf("/proc/self/fdinfo/%d", raw)) + if err != nil { + return err + } + defer fh.Close() + + if err := scanFdInfoReader(fh, fields); err != nil { + return fmt.Errorf("%s: %w", fh.Name(), err) + } + return nil +} + +var errMissingFields = errors.New("missing fields") + +func scanFdInfoReader(r io.Reader, fields map[string]interface{}) error { + var ( + scanner = bufio.NewScanner(r) + scanned int + ) + + for scanner.Scan() { + parts := strings.SplitN(scanner.Text(), "\t", 2) + if len(parts) != 2 { + continue + } + + name := strings.TrimSuffix(parts[0], ":") + field, ok := fields[string(name)] + if !ok { + continue + } + + if n, err := fmt.Sscanln(parts[1], field); err != nil || n != 1 { + return fmt.Errorf("can't parse field %s: %v", name, err) + } + + scanned++ + } 
+ + if err := scanner.Err(); err != nil { + return err + } + + if scanned != len(fields) { + return errMissingFields + } + + return nil +} + +// EnableStats starts the measuring of the runtime +// and run counts of eBPF programs. +// +// Collecting statistics can have an impact on the performance. +// +// Requires at least 5.8. +func EnableStats(which uint32) (io.Closer, error) { + attr := internal.BPFEnableStatsAttr{ + StatsType: which, + } + + fd, err := internal.BPFEnableStats(&attr) + if err != nil { + return nil, err + } + return fd, nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/align.go b/vendor/github.com/cilium/ebpf/internal/align.go new file mode 100644 index 0000000..8b4f265 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/align.go @@ -0,0 +1,6 @@ +package internal + +// Align returns 'n' updated to 'alignment' boundary. +func Align(n, alignment int) int { + return (int(n) + alignment - 1) / alignment * alignment +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/btf.go b/vendor/github.com/cilium/ebpf/internal/btf/btf.go new file mode 100644 index 0000000..2b5f6d2 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/btf.go @@ -0,0 +1,798 @@ +package btf + +import ( + "bytes" + "debug/elf" + "encoding/binary" + "errors" + "fmt" + "io" + "math" + "os" + "reflect" + "sync" + "unsafe" + + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" +) + +const btfMagic = 0xeB9F + +// Errors returned by BTF functions. +var ( + ErrNotSupported = internal.ErrNotSupported + ErrNotFound = errors.New("not found") + ErrNoExtendedInfo = errors.New("no extended info") +) + +// ID represents the unique ID of a BTF object. +type ID uint32 + +// Spec represents decoded BTF. 
+type Spec struct { + rawTypes []rawType + strings stringTable + types []Type + namedTypes map[string][]NamedType + funcInfos map[string]extInfo + lineInfos map[string]extInfo + coreRelos map[string]coreRelos + byteOrder binary.ByteOrder +} + +type btfHeader struct { + Magic uint16 + Version uint8 + Flags uint8 + HdrLen uint32 + + TypeOff uint32 + TypeLen uint32 + StringOff uint32 + StringLen uint32 +} + +// LoadSpecFromReader reads BTF sections from an ELF. +// +// Returns ErrNotFound if the reader contains no BTF. +func LoadSpecFromReader(rd io.ReaderAt) (*Spec, error) { + file, err := internal.NewSafeELFFile(rd) + if err != nil { + return nil, err + } + defer file.Close() + + symbols, err := file.Symbols() + if err != nil { + return nil, fmt.Errorf("can't read symbols: %v", err) + } + + variableOffsets := make(map[variable]uint32) + for _, symbol := range symbols { + if idx := symbol.Section; idx >= elf.SHN_LORESERVE && idx <= elf.SHN_HIRESERVE { + // Ignore things like SHN_ABS + continue + } + + if int(symbol.Section) >= len(file.Sections) { + return nil, fmt.Errorf("symbol %s: invalid section %d", symbol.Name, symbol.Section) + } + + secName := file.Sections[symbol.Section].Name + if symbol.Value > math.MaxUint32 { + return nil, fmt.Errorf("section %s: symbol %s: size exceeds maximum", secName, symbol.Name) + } + + variableOffsets[variable{secName, symbol.Name}] = uint32(symbol.Value) + } + + return loadSpecFromELF(file, variableOffsets) +} + +func loadSpecFromELF(file *internal.SafeELFFile, variableOffsets map[variable]uint32) (*Spec, error) { + var ( + btfSection *elf.Section + btfExtSection *elf.Section + sectionSizes = make(map[string]uint32) + ) + + for _, sec := range file.Sections { + switch sec.Name { + case ".BTF": + btfSection = sec + case ".BTF.ext": + btfExtSection = sec + default: + if sec.Type != elf.SHT_PROGBITS && sec.Type != elf.SHT_NOBITS { + break + } + + if sec.Size > math.MaxUint32 { + return nil, fmt.Errorf("section %s exceeds maximum 
size", sec.Name) + } + + sectionSizes[sec.Name] = uint32(sec.Size) + } + } + + if btfSection == nil { + return nil, fmt.Errorf("btf: %w", ErrNotFound) + } + + spec, err := loadRawSpec(btfSection.Open(), file.ByteOrder, sectionSizes, variableOffsets) + if err != nil { + return nil, err + } + + if btfExtSection == nil { + return spec, nil + } + + spec.funcInfos, spec.lineInfos, spec.coreRelos, err = parseExtInfos(btfExtSection.Open(), file.ByteOrder, spec.strings) + if err != nil { + return nil, fmt.Errorf("can't read ext info: %w", err) + } + + return spec, nil +} + +// LoadRawSpec reads a blob of BTF data that isn't wrapped in an ELF file. +// +// Prefer using LoadSpecFromReader, since this function only supports a subset +// of BTF. +func LoadRawSpec(btf io.Reader, bo binary.ByteOrder) (*Spec, error) { + // This will return an error if we encounter a Datasec, since we can't fix + // it up. + return loadRawSpec(btf, bo, nil, nil) +} + +func loadRawSpec(btf io.Reader, bo binary.ByteOrder, sectionSizes map[string]uint32, variableOffsets map[variable]uint32) (*Spec, error) { + rawTypes, rawStrings, err := parseBTF(btf, bo) + if err != nil { + return nil, err + } + + err = fixupDatasec(rawTypes, rawStrings, sectionSizes, variableOffsets) + if err != nil { + return nil, err + } + + types, typesByName, err := inflateRawTypes(rawTypes, rawStrings) + if err != nil { + return nil, err + } + + return &Spec{ + rawTypes: rawTypes, + namedTypes: typesByName, + types: types, + strings: rawStrings, + byteOrder: bo, + }, nil +} + +var kernelBTF struct { + sync.Mutex + *Spec +} + +// LoadKernelSpec returns the current kernel's BTF information. +// +// Requires a >= 5.5 kernel with CONFIG_DEBUG_INFO_BTF enabled. Returns +// ErrNotSupported if BTF is not enabled. 
+func LoadKernelSpec() (*Spec, error) { + kernelBTF.Lock() + defer kernelBTF.Unlock() + + if kernelBTF.Spec != nil { + return kernelBTF.Spec, nil + } + + var err error + kernelBTF.Spec, err = loadKernelSpec() + return kernelBTF.Spec, err +} + +func loadKernelSpec() (*Spec, error) { + release, err := unix.KernelRelease() + if err != nil { + return nil, fmt.Errorf("can't read kernel release number: %w", err) + } + + fh, err := os.Open("/sys/kernel/btf/vmlinux") + if err == nil { + defer fh.Close() + + return LoadRawSpec(fh, internal.NativeEndian) + } + + // use same list of locations as libbpf + // https://github.com/libbpf/libbpf/blob/9a3a42608dbe3731256a5682a125ac1e23bced8f/src/btf.c#L3114-L3122 + locations := []string{ + "/boot/vmlinux-%s", + "/lib/modules/%s/vmlinux-%[1]s", + "/lib/modules/%s/build/vmlinux", + "/usr/lib/modules/%s/kernel/vmlinux", + "/usr/lib/debug/boot/vmlinux-%s", + "/usr/lib/debug/boot/vmlinux-%s.debug", + "/usr/lib/debug/lib/modules/%s/vmlinux", + } + + for _, loc := range locations { + path := fmt.Sprintf(loc, release) + + fh, err := os.Open(path) + if err != nil { + continue + } + defer fh.Close() + + file, err := internal.NewSafeELFFile(fh) + if err != nil { + return nil, err + } + defer file.Close() + + return loadSpecFromELF(file, nil) + } + + return nil, fmt.Errorf("no BTF for kernel version %s: %w", release, internal.ErrNotSupported) +} + +func parseBTF(btf io.Reader, bo binary.ByteOrder) ([]rawType, stringTable, error) { + rawBTF, err := io.ReadAll(btf) + if err != nil { + return nil, nil, fmt.Errorf("can't read BTF: %v", err) + } + + rd := bytes.NewReader(rawBTF) + + var header btfHeader + if err := binary.Read(rd, bo, &header); err != nil { + return nil, nil, fmt.Errorf("can't read header: %v", err) + } + + if header.Magic != btfMagic { + return nil, nil, fmt.Errorf("incorrect magic value %v", header.Magic) + } + + if header.Version != 1 { + return nil, nil, fmt.Errorf("unexpected version %v", header.Version) + } + + if header.Flags 
!= 0 { + return nil, nil, fmt.Errorf("unsupported flags %v", header.Flags) + } + + remainder := int64(header.HdrLen) - int64(binary.Size(&header)) + if remainder < 0 { + return nil, nil, errors.New("header is too short") + } + + if _, err := io.CopyN(internal.DiscardZeroes{}, rd, remainder); err != nil { + return nil, nil, fmt.Errorf("header padding: %v", err) + } + + if _, err := rd.Seek(int64(header.HdrLen+header.StringOff), io.SeekStart); err != nil { + return nil, nil, fmt.Errorf("can't seek to start of string section: %v", err) + } + + rawStrings, err := readStringTable(io.LimitReader(rd, int64(header.StringLen))) + if err != nil { + return nil, nil, fmt.Errorf("can't read type names: %w", err) + } + + if _, err := rd.Seek(int64(header.HdrLen+header.TypeOff), io.SeekStart); err != nil { + return nil, nil, fmt.Errorf("can't seek to start of type section: %v", err) + } + + rawTypes, err := readTypes(io.LimitReader(rd, int64(header.TypeLen)), bo) + if err != nil { + return nil, nil, fmt.Errorf("can't read types: %w", err) + } + + return rawTypes, rawStrings, nil +} + +type variable struct { + section string + name string +} + +func fixupDatasec(rawTypes []rawType, rawStrings stringTable, sectionSizes map[string]uint32, variableOffsets map[variable]uint32) error { + for i, rawType := range rawTypes { + if rawType.Kind() != kindDatasec { + continue + } + + name, err := rawStrings.Lookup(rawType.NameOff) + if err != nil { + return err + } + + if name == ".kconfig" || name == ".ksyms" { + return fmt.Errorf("reference to %s: %w", name, ErrNotSupported) + } + + if rawTypes[i].SizeType != 0 { + continue + } + + size, ok := sectionSizes[name] + if !ok { + return fmt.Errorf("data section %s: missing size", name) + } + + rawTypes[i].SizeType = size + + secinfos := rawType.data.([]btfVarSecinfo) + for j, secInfo := range secinfos { + id := int(secInfo.Type - 1) + if id >= len(rawTypes) { + return fmt.Errorf("data section %s: invalid type id %d for variable %d", name, id, j) 
+ } + + varName, err := rawStrings.Lookup(rawTypes[id].NameOff) + if err != nil { + return fmt.Errorf("data section %s: can't get name for type %d: %w", name, id, err) + } + + offset, ok := variableOffsets[variable{name, varName}] + if !ok { + return fmt.Errorf("data section %s: missing offset for variable %s", name, varName) + } + + secinfos[j].Offset = offset + } + } + + return nil +} + +// Copy creates a copy of Spec. +func (s *Spec) Copy() *Spec { + types, _ := copyTypes(s.types, nil) + namedTypes := make(map[string][]NamedType) + for _, typ := range types { + if named, ok := typ.(NamedType); ok { + name := essentialName(named.TypeName()) + namedTypes[name] = append(namedTypes[name], named) + } + } + + // NB: Other parts of spec are not copied since they are immutable. + return &Spec{ + s.rawTypes, + s.strings, + types, + namedTypes, + s.funcInfos, + s.lineInfos, + s.coreRelos, + s.byteOrder, + } +} + +type marshalOpts struct { + ByteOrder binary.ByteOrder + StripFuncLinkage bool +} + +func (s *Spec) marshal(opts marshalOpts) ([]byte, error) { + var ( + buf bytes.Buffer + header = new(btfHeader) + headerLen = binary.Size(header) + ) + + // Reserve space for the header. We have to write it last since + // we don't know the size of the type section yet. + _, _ = buf.Write(make([]byte, headerLen)) + + // Write type section, just after the header. + for _, raw := range s.rawTypes { + switch { + case opts.StripFuncLinkage && raw.Kind() == kindFunc: + raw.SetLinkage(StaticFunc) + } + + if err := raw.Marshal(&buf, opts.ByteOrder); err != nil { + return nil, fmt.Errorf("can't marshal BTF: %w", err) + } + } + + typeLen := uint32(buf.Len() - headerLen) + + // Write string section after type section. + _, _ = buf.Write(s.strings) + + // Fill out the header, and write it out. 
+ header = &btfHeader{ + Magic: btfMagic, + Version: 1, + Flags: 0, + HdrLen: uint32(headerLen), + TypeOff: 0, + TypeLen: typeLen, + StringOff: typeLen, + StringLen: uint32(len(s.strings)), + } + + raw := buf.Bytes() + err := binary.Write(sliceWriter(raw[:headerLen]), opts.ByteOrder, header) + if err != nil { + return nil, fmt.Errorf("can't write header: %v", err) + } + + return raw, nil +} + +type sliceWriter []byte + +func (sw sliceWriter) Write(p []byte) (int, error) { + if len(p) != len(sw) { + return 0, errors.New("size doesn't match") + } + + return copy(sw, p), nil +} + +// Program finds the BTF for a specific section. +// +// Length is the number of bytes in the raw BPF instruction stream. +// +// Returns an error which may wrap ErrNoExtendedInfo if the Spec doesn't +// contain extended BTF info. +func (s *Spec) Program(name string, length uint64) (*Program, error) { + if length == 0 { + return nil, errors.New("length musn't be zero") + } + + if s.funcInfos == nil && s.lineInfos == nil && s.coreRelos == nil { + return nil, fmt.Errorf("BTF for section %s: %w", name, ErrNoExtendedInfo) + } + + funcInfos, funcOK := s.funcInfos[name] + lineInfos, lineOK := s.lineInfos[name] + relos, coreOK := s.coreRelos[name] + + if !funcOK && !lineOK && !coreOK { + return nil, fmt.Errorf("no extended BTF info for section %s", name) + } + + return &Program{s, length, funcInfos, lineInfos, relos}, nil +} + +// FindType searches for a type with a specific name. +// +// Called T a type that satisfies Type, typ must be a non-nil **T. +// On success, the address of the found type will be copied in typ. +// +// Returns an error wrapping ErrNotFound if no matching +// type exists in spec. 
+func (s *Spec) FindType(name string, typ interface{}) error { + typValue := reflect.ValueOf(typ) + if typValue.Kind() != reflect.Ptr { + return fmt.Errorf("%T is not a pointer", typ) + } + + typPtr := typValue.Elem() + if !typPtr.CanSet() { + return fmt.Errorf("%T cannot be set", typ) + } + + wanted := typPtr.Type() + if !wanted.AssignableTo(reflect.TypeOf((*Type)(nil)).Elem()) { + return fmt.Errorf("%T does not satisfy Type interface", typ) + } + + var candidate Type + for _, typ := range s.namedTypes[essentialName(name)] { + if reflect.TypeOf(typ) != wanted { + continue + } + + // Match against the full name, not just the essential one. + if typ.TypeName() != name { + continue + } + + if candidate != nil { + return fmt.Errorf("type %s: multiple candidates for %T", name, typ) + } + + candidate = typ + } + + if candidate == nil { + return fmt.Errorf("type %s: %w", name, ErrNotFound) + } + + typPtr.Set(reflect.ValueOf(candidate)) + + return nil +} + +// Handle is a reference to BTF loaded into the kernel. +type Handle struct { + spec *Spec + fd *internal.FD +} + +// NewHandle loads BTF into the kernel. +// +// Returns ErrNotSupported if BTF is not supported. 
+func NewHandle(spec *Spec) (*Handle, error) { + if err := haveBTF(); err != nil { + return nil, err + } + + if spec.byteOrder != internal.NativeEndian { + return nil, fmt.Errorf("can't load %s BTF on %s", spec.byteOrder, internal.NativeEndian) + } + + btf, err := spec.marshal(marshalOpts{ + ByteOrder: internal.NativeEndian, + StripFuncLinkage: haveFuncLinkage() != nil, + }) + if err != nil { + return nil, fmt.Errorf("can't marshal BTF: %w", err) + } + + if uint64(len(btf)) > math.MaxUint32 { + return nil, errors.New("BTF exceeds the maximum size") + } + + attr := &bpfLoadBTFAttr{ + btf: internal.NewSlicePointer(btf), + btfSize: uint32(len(btf)), + } + + fd, err := bpfLoadBTF(attr) + if err != nil { + logBuf := make([]byte, 64*1024) + attr.logBuf = internal.NewSlicePointer(logBuf) + attr.btfLogSize = uint32(len(logBuf)) + attr.btfLogLevel = 1 + _, logErr := bpfLoadBTF(attr) + return nil, internal.ErrorWithLog(err, logBuf, logErr) + } + + return &Handle{spec.Copy(), fd}, nil +} + +// NewHandleFromID returns the BTF handle for a given id. +// +// Returns ErrNotExist, if there is no BTF with the given id. +// +// Requires CAP_SYS_ADMIN. +func NewHandleFromID(id ID) (*Handle, error) { + fd, err := internal.BPFObjGetFDByID(internal.BPF_BTF_GET_FD_BY_ID, uint32(id)) + if err != nil { + return nil, fmt.Errorf("get BTF by id: %w", err) + } + + info, err := newInfoFromFd(fd) + if err != nil { + _ = fd.Close() + return nil, fmt.Errorf("get BTF spec for handle: %w", err) + } + + return &Handle{info.BTF, fd}, nil +} + +// Spec returns the Spec that defined the BTF loaded into the kernel. +func (h *Handle) Spec() *Spec { + return h.spec +} + +// Close destroys the handle. +// +// Subsequent calls to FD will return an invalid value. +func (h *Handle) Close() error { + return h.fd.Close() +} + +// FD returns the file descriptor for the handle. 
+func (h *Handle) FD() int { + value, err := h.fd.Value() + if err != nil { + return -1 + } + + return int(value) +} + +// Map is the BTF for a map. +type Map struct { + Spec *Spec + Key, Value Type +} + +// Program is the BTF information for a stream of instructions. +type Program struct { + spec *Spec + length uint64 + funcInfos, lineInfos extInfo + coreRelos coreRelos +} + +// Spec returns the BTF spec of this program. +func (p *Program) Spec() *Spec { + return p.spec +} + +// Append the information from other to the Program. +func (p *Program) Append(other *Program) error { + if other.spec != p.spec { + return fmt.Errorf("can't append program with different BTF specs") + } + + funcInfos, err := p.funcInfos.append(other.funcInfos, p.length) + if err != nil { + return fmt.Errorf("func infos: %w", err) + } + + lineInfos, err := p.lineInfos.append(other.lineInfos, p.length) + if err != nil { + return fmt.Errorf("line infos: %w", err) + } + + p.funcInfos = funcInfos + p.lineInfos = lineInfos + p.coreRelos = p.coreRelos.append(other.coreRelos, p.length) + p.length += other.length + return nil +} + +// FuncInfos returns the binary form of BTF function infos. +func (p *Program) FuncInfos() (recordSize uint32, bytes []byte, err error) { + bytes, err = p.funcInfos.MarshalBinary() + if err != nil { + return 0, nil, fmt.Errorf("func infos: %w", err) + } + + return p.funcInfos.recordSize, bytes, nil +} + +// LineInfos returns the binary form of BTF line infos. +func (p *Program) LineInfos() (recordSize uint32, bytes []byte, err error) { + bytes, err = p.lineInfos.MarshalBinary() + if err != nil { + return 0, nil, fmt.Errorf("line infos: %w", err) + } + + return p.lineInfos.recordSize, bytes, nil +} + +// Fixups returns the changes required to adjust the program to the target. +// +// Passing a nil target will relocate against the running kernel. 
+func (p *Program) Fixups(target *Spec) (COREFixups, error) { + if len(p.coreRelos) == 0 { + return nil, nil + } + + if target == nil { + var err error + target, err = LoadKernelSpec() + if err != nil { + return nil, err + } + } + + return coreRelocate(p.spec, target, p.coreRelos) +} + +type bpfLoadBTFAttr struct { + btf internal.Pointer + logBuf internal.Pointer + btfSize uint32 + btfLogSize uint32 + btfLogLevel uint32 +} + +func bpfLoadBTF(attr *bpfLoadBTFAttr) (*internal.FD, error) { + fd, err := internal.BPF(internal.BPF_BTF_LOAD, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + + return internal.NewFD(uint32(fd)), nil +} + +func marshalBTF(types interface{}, strings []byte, bo binary.ByteOrder) []byte { + const minHeaderLength = 24 + + typesLen := uint32(binary.Size(types)) + header := btfHeader{ + Magic: btfMagic, + Version: 1, + HdrLen: minHeaderLength, + TypeOff: 0, + TypeLen: typesLen, + StringOff: typesLen, + StringLen: uint32(len(strings)), + } + + buf := new(bytes.Buffer) + _ = binary.Write(buf, bo, &header) + _ = binary.Write(buf, bo, types) + buf.Write(strings) + + return buf.Bytes() +} + +var haveBTF = internal.FeatureTest("BTF", "5.1", func() error { + var ( + types struct { + Integer btfType + Var btfType + btfVar struct{ Linkage uint32 } + } + strings = []byte{0, 'a', 0} + ) + + // We use a BTF_KIND_VAR here, to make sure that + // the kernel understands BTF at least as well as we + // do. BTF_KIND_VAR was introduced ~5.1. + types.Integer.SetKind(kindPointer) + types.Var.NameOff = 1 + types.Var.SetKind(kindVar) + types.Var.SizeType = 1 + + btf := marshalBTF(&types, strings, internal.NativeEndian) + + fd, err := bpfLoadBTF(&bpfLoadBTFAttr{ + btf: internal.NewSlicePointer(btf), + btfSize: uint32(len(btf)), + }) + if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) { + // Treat both EINVAL and EPERM as not supported: loading the program + // might still succeed without BTF. 
+ return internal.ErrNotSupported + } + if err != nil { + return err + } + + fd.Close() + return nil +}) + +var haveFuncLinkage = internal.FeatureTest("BTF func linkage", "5.6", func() error { + if err := haveBTF(); err != nil { + return err + } + + var ( + types struct { + FuncProto btfType + Func btfType + } + strings = []byte{0, 'a', 0} + ) + + types.FuncProto.SetKind(kindFuncProto) + types.Func.SetKind(kindFunc) + types.Func.SizeType = 1 // aka FuncProto + types.Func.NameOff = 1 + types.Func.SetLinkage(GlobalFunc) + + btf := marshalBTF(&types, strings, internal.NativeEndian) + + fd, err := bpfLoadBTF(&bpfLoadBTFAttr{ + btf: internal.NewSlicePointer(btf), + btfSize: uint32(len(btf)), + }) + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + if err != nil { + return err + } + + fd.Close() + return nil +}) diff --git a/vendor/github.com/cilium/ebpf/internal/btf/btf_types.go b/vendor/github.com/cilium/ebpf/internal/btf/btf_types.go new file mode 100644 index 0000000..d98c73c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/btf_types.go @@ -0,0 +1,287 @@ +package btf + +import ( + "encoding/binary" + "fmt" + "io" +) + +//go:generate stringer -linecomment -output=btf_types_string.go -type=FuncLinkage,VarLinkage + +// btfKind describes a Type. +type btfKind uint8 + +// Equivalents of the BTF_KIND_* constants. +const ( + kindUnknown btfKind = iota + kindInt + kindPointer + kindArray + kindStruct + kindUnion + kindEnum + kindForward + kindTypedef + kindVolatile + kindConst + kindRestrict + // Added ~4.20 + kindFunc + kindFuncProto + // Added ~5.1 + kindVar + kindDatasec + // Added ~5.13 + kindFloat +) + +// FuncLinkage describes BTF function linkage metadata. +type FuncLinkage int + +// Equivalent of enum btf_func_linkage. +const ( + StaticFunc FuncLinkage = iota // static + GlobalFunc // global + ExternFunc // extern +) + +// VarLinkage describes BTF variable linkage metadata. 
+type VarLinkage int + +const ( + StaticVar VarLinkage = iota // static + GlobalVar // global + ExternVar // extern +) + +const ( + btfTypeKindShift = 24 + btfTypeKindLen = 5 + btfTypeVlenShift = 0 + btfTypeVlenMask = 16 + btfTypeKindFlagShift = 31 + btfTypeKindFlagMask = 1 +) + +// btfType is equivalent to struct btf_type in Documentation/bpf/btf.rst. +type btfType struct { + NameOff uint32 + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members), linkage + * bits 16-23: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused + * bit 31: kind_flag, currently used by + * struct, union and fwd + */ + Info uint32 + /* "size" is used by INT, ENUM, STRUCT and UNION. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC and FUNC_PROTO. + * "type" is a type_id referring to another type. + */ + SizeType uint32 +} + +func (k btfKind) String() string { + switch k { + case kindUnknown: + return "Unknown" + case kindInt: + return "Integer" + case kindPointer: + return "Pointer" + case kindArray: + return "Array" + case kindStruct: + return "Struct" + case kindUnion: + return "Union" + case kindEnum: + return "Enumeration" + case kindForward: + return "Forward" + case kindTypedef: + return "Typedef" + case kindVolatile: + return "Volatile" + case kindConst: + return "Const" + case kindRestrict: + return "Restrict" + case kindFunc: + return "Function" + case kindFuncProto: + return "Function Proto" + case kindVar: + return "Variable" + case kindDatasec: + return "Section" + case kindFloat: + return "Float" + default: + return fmt.Sprintf("Unknown (%d)", k) + } +} + +func mask(len uint32) uint32 { + return (1 << len) - 1 +} + +func (bt *btfType) info(len, shift uint32) uint32 { + return (bt.Info >> shift) & mask(len) +} + +func (bt *btfType) setInfo(value, len, shift uint32) { + bt.Info &^= mask(len) << shift + bt.Info |= (value & mask(len)) << shift +} + 
+func (bt *btfType) Kind() btfKind { + return btfKind(bt.info(btfTypeKindLen, btfTypeKindShift)) +} + +func (bt *btfType) SetKind(kind btfKind) { + bt.setInfo(uint32(kind), btfTypeKindLen, btfTypeKindShift) +} + +func (bt *btfType) Vlen() int { + return int(bt.info(btfTypeVlenMask, btfTypeVlenShift)) +} + +func (bt *btfType) SetVlen(vlen int) { + bt.setInfo(uint32(vlen), btfTypeVlenMask, btfTypeVlenShift) +} + +func (bt *btfType) KindFlag() bool { + return bt.info(btfTypeKindFlagMask, btfTypeKindFlagShift) == 1 +} + +func (bt *btfType) Linkage() FuncLinkage { + return FuncLinkage(bt.info(btfTypeVlenMask, btfTypeVlenShift)) +} + +func (bt *btfType) SetLinkage(linkage FuncLinkage) { + bt.setInfo(uint32(linkage), btfTypeVlenMask, btfTypeVlenShift) +} + +func (bt *btfType) Type() TypeID { + // TODO: Panic here if wrong kind? + return TypeID(bt.SizeType) +} + +func (bt *btfType) Size() uint32 { + // TODO: Panic here if wrong kind? + return bt.SizeType +} + +type rawType struct { + btfType + data interface{} +} + +func (rt *rawType) Marshal(w io.Writer, bo binary.ByteOrder) error { + if err := binary.Write(w, bo, &rt.btfType); err != nil { + return err + } + + if rt.data == nil { + return nil + } + + return binary.Write(w, bo, rt.data) +} + +type btfArray struct { + Type TypeID + IndexType TypeID + Nelems uint32 +} + +type btfMember struct { + NameOff uint32 + Type TypeID + Offset uint32 +} + +type btfVarSecinfo struct { + Type TypeID + Offset uint32 + Size uint32 +} + +type btfVariable struct { + Linkage uint32 +} + +type btfEnum struct { + NameOff uint32 + Val int32 +} + +type btfParam struct { + NameOff uint32 + Type TypeID +} + +func readTypes(r io.Reader, bo binary.ByteOrder) ([]rawType, error) { + var ( + header btfType + types []rawType + ) + + for id := TypeID(1); ; id++ { + if err := binary.Read(r, bo, &header); err == io.EOF { + return types, nil + } else if err != nil { + return nil, fmt.Errorf("can't read type info for id %v: %v", id, err) + } + + var data 
interface{} + switch header.Kind() { + case kindInt: + data = new(uint32) + case kindPointer: + case kindArray: + data = new(btfArray) + case kindStruct: + fallthrough + case kindUnion: + data = make([]btfMember, header.Vlen()) + case kindEnum: + data = make([]btfEnum, header.Vlen()) + case kindForward: + case kindTypedef: + case kindVolatile: + case kindConst: + case kindRestrict: + case kindFunc: + case kindFuncProto: + data = make([]btfParam, header.Vlen()) + case kindVar: + data = new(btfVariable) + case kindDatasec: + data = make([]btfVarSecinfo, header.Vlen()) + case kindFloat: + default: + return nil, fmt.Errorf("type id %v: unknown kind: %v", id, header.Kind()) + } + + if data == nil { + types = append(types, rawType{header, nil}) + continue + } + + if err := binary.Read(r, bo, data); err != nil { + return nil, fmt.Errorf("type id %d: kind %v: can't read %T: %v", id, header.Kind(), data, err) + } + + types = append(types, rawType{header, data}) + } +} + +func intEncoding(raw uint32) (IntEncoding, uint32, byte) { + return IntEncoding((raw & 0x0f000000) >> 24), (raw & 0x00ff0000) >> 16, byte(raw & 0x000000ff) +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/btf_types_string.go b/vendor/github.com/cilium/ebpf/internal/btf/btf_types_string.go new file mode 100644 index 0000000..0e0c17d --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/btf_types_string.go @@ -0,0 +1,44 @@ +// Code generated by "stringer -linecomment -output=btf_types_string.go -type=FuncLinkage,VarLinkage"; DO NOT EDIT. + +package btf + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[StaticFunc-0] + _ = x[GlobalFunc-1] + _ = x[ExternFunc-2] +} + +const _FuncLinkage_name = "staticglobalextern" + +var _FuncLinkage_index = [...]uint8{0, 6, 12, 18} + +func (i FuncLinkage) String() string { + if i < 0 || i >= FuncLinkage(len(_FuncLinkage_index)-1) { + return "FuncLinkage(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _FuncLinkage_name[_FuncLinkage_index[i]:_FuncLinkage_index[i+1]] +} +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. + var x [1]struct{} + _ = x[StaticVar-0] + _ = x[GlobalVar-1] + _ = x[ExternVar-2] +} + +const _VarLinkage_name = "staticglobalextern" + +var _VarLinkage_index = [...]uint8{0, 6, 12, 18} + +func (i VarLinkage) String() string { + if i < 0 || i >= VarLinkage(len(_VarLinkage_index)-1) { + return "VarLinkage(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _VarLinkage_name[_VarLinkage_index[i]:_VarLinkage_index[i+1]] +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/core.go b/vendor/github.com/cilium/ebpf/internal/btf/core.go new file mode 100644 index 0000000..d02df9d --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/core.go @@ -0,0 +1,888 @@ +package btf + +import ( + "errors" + "fmt" + "math" + "reflect" + "sort" + "strconv" + "strings" + + "github.com/cilium/ebpf/asm" +) + +// Code in this file is derived from libbpf, which is available under a BSD +// 2-Clause license. + +// COREFixup is the result of computing a CO-RE relocation for a target. 
+type COREFixup struct { + Kind COREKind + Local uint32 + Target uint32 + Poison bool +} + +func (f COREFixup) equal(other COREFixup) bool { + return f.Local == other.Local && f.Target == other.Target +} + +func (f COREFixup) String() string { + if f.Poison { + return fmt.Sprintf("%s=poison", f.Kind) + } + return fmt.Sprintf("%s=%d->%d", f.Kind, f.Local, f.Target) +} + +func (f COREFixup) apply(ins *asm.Instruction) error { + if f.Poison { + return errors.New("can't poison individual instruction") + } + + switch class := ins.OpCode.Class(); class { + case asm.LdXClass, asm.StClass, asm.StXClass: + if want := int16(f.Local); want != ins.Offset { + return fmt.Errorf("invalid offset %d, expected %d", ins.Offset, want) + } + + if f.Target > math.MaxInt16 { + return fmt.Errorf("offset %d exceeds MaxInt16", f.Target) + } + + ins.Offset = int16(f.Target) + + case asm.LdClass: + if !ins.IsConstantLoad(asm.DWord) { + return fmt.Errorf("not a dword-sized immediate load") + } + + if want := int64(f.Local); want != ins.Constant { + return fmt.Errorf("invalid immediate %d, expected %d", ins.Constant, want) + } + + ins.Constant = int64(f.Target) + + case asm.ALUClass: + if ins.OpCode.ALUOp() == asm.Swap { + return fmt.Errorf("relocation against swap") + } + + fallthrough + + case asm.ALU64Class: + if src := ins.OpCode.Source(); src != asm.ImmSource { + return fmt.Errorf("invalid source %s", src) + } + + if want := int64(f.Local); want != ins.Constant { + return fmt.Errorf("invalid immediate %d, expected %d", ins.Constant, want) + } + + if f.Target > math.MaxInt32 { + return fmt.Errorf("immediate %d exceeds MaxInt32", f.Target) + } + + ins.Constant = int64(f.Target) + + default: + return fmt.Errorf("invalid class %s", class) + } + + return nil +} + +func (f COREFixup) isNonExistant() bool { + return f.Kind.checksForExistence() && f.Target == 0 +} + +type COREFixups map[uint64]COREFixup + +// Apply a set of CO-RE relocations to a BPF program. 
+func (fs COREFixups) Apply(insns asm.Instructions) (asm.Instructions, error) { + if len(fs) == 0 { + cpy := make(asm.Instructions, len(insns)) + copy(cpy, insns) + return insns, nil + } + + cpy := make(asm.Instructions, 0, len(insns)) + iter := insns.Iterate() + for iter.Next() { + fixup, ok := fs[iter.Offset.Bytes()] + if !ok { + cpy = append(cpy, *iter.Ins) + continue + } + + ins := *iter.Ins + if fixup.Poison { + const badRelo = asm.BuiltinFunc(0xbad2310) + + cpy = append(cpy, badRelo.Call()) + if ins.OpCode.IsDWordLoad() { + // 64 bit constant loads occupy two raw bpf instructions, so + // we need to add another instruction as padding. + cpy = append(cpy, badRelo.Call()) + } + + continue + } + + if err := fixup.apply(&ins); err != nil { + return nil, fmt.Errorf("instruction %d, offset %d: %s: %w", iter.Index, iter.Offset.Bytes(), fixup.Kind, err) + } + + cpy = append(cpy, ins) + } + + return cpy, nil +} + +// COREKind is the type of CO-RE relocation +type COREKind uint32 + +const ( + reloFieldByteOffset COREKind = iota /* field byte offset */ + reloFieldByteSize /* field size in bytes */ + reloFieldExists /* field existence in target kernel */ + reloFieldSigned /* field signedness (0 - unsigned, 1 - signed) */ + reloFieldLShiftU64 /* bitfield-specific left bitshift */ + reloFieldRShiftU64 /* bitfield-specific right bitshift */ + reloTypeIDLocal /* type ID in local BPF object */ + reloTypeIDTarget /* type ID in target kernel */ + reloTypeExists /* type existence in target kernel */ + reloTypeSize /* type size in bytes */ + reloEnumvalExists /* enum value existence in target kernel */ + reloEnumvalValue /* enum value integer value */ +) + +func (k COREKind) String() string { + switch k { + case reloFieldByteOffset: + return "byte_off" + case reloFieldByteSize: + return "byte_sz" + case reloFieldExists: + return "field_exists" + case reloFieldSigned: + return "signed" + case reloFieldLShiftU64: + return "lshift_u64" + case reloFieldRShiftU64: + return 
"rshift_u64" + case reloTypeIDLocal: + return "local_type_id" + case reloTypeIDTarget: + return "target_type_id" + case reloTypeExists: + return "type_exists" + case reloTypeSize: + return "type_size" + case reloEnumvalExists: + return "enumval_exists" + case reloEnumvalValue: + return "enumval_value" + default: + return "unknown" + } +} + +func (k COREKind) checksForExistence() bool { + return k == reloEnumvalExists || k == reloTypeExists || k == reloFieldExists +} + +func coreRelocate(local, target *Spec, relos coreRelos) (COREFixups, error) { + if local.byteOrder != target.byteOrder { + return nil, fmt.Errorf("can't relocate %s against %s", local.byteOrder, target.byteOrder) + } + + var ids []TypeID + relosByID := make(map[TypeID]coreRelos) + result := make(COREFixups, len(relos)) + for _, relo := range relos { + if relo.kind == reloTypeIDLocal { + // Filtering out reloTypeIDLocal here makes our lives a lot easier + // down the line, since it doesn't have a target at all. + if len(relo.accessor) > 1 || relo.accessor[0] != 0 { + return nil, fmt.Errorf("%s: unexpected accessor %v", relo.kind, relo.accessor) + } + + result[uint64(relo.insnOff)] = COREFixup{ + relo.kind, + uint32(relo.typeID), + uint32(relo.typeID), + false, + } + continue + } + + relos, ok := relosByID[relo.typeID] + if !ok { + ids = append(ids, relo.typeID) + } + relosByID[relo.typeID] = append(relos, relo) + } + + // Ensure we work on relocations in a deterministic order. 
+ sort.Slice(ids, func(i, j int) bool { + return ids[i] < ids[j] + }) + + for _, id := range ids { + if int(id) >= len(local.types) { + return nil, fmt.Errorf("invalid type id %d", id) + } + + localType := local.types[id] + named, ok := localType.(NamedType) + if !ok || named.TypeName() == "" { + return nil, fmt.Errorf("relocate unnamed or anonymous type %s: %w", localType, ErrNotSupported) + } + + relos := relosByID[id] + targets := target.namedTypes[essentialName(named.TypeName())] + fixups, err := coreCalculateFixups(localType, targets, relos) + if err != nil { + return nil, fmt.Errorf("relocate %s: %w", localType, err) + } + + for i, relo := range relos { + result[uint64(relo.insnOff)] = fixups[i] + } + } + + return result, nil +} + +var errAmbiguousRelocation = errors.New("ambiguous relocation") +var errImpossibleRelocation = errors.New("impossible relocation") + +// coreCalculateFixups calculates the fixups for the given relocations using +// the "best" target. +// +// The best target is determined by scoring: the less poisoning we have to do +// the better the target is. +func coreCalculateFixups(local Type, targets []NamedType, relos coreRelos) ([]COREFixup, error) { + localID := local.ID() + local, err := copyType(local, skipQualifierAndTypedef) + if err != nil { + return nil, err + } + + bestScore := len(relos) + var bestFixups []COREFixup + for i := range targets { + targetID := targets[i].ID() + target, err := copyType(targets[i], skipQualifierAndTypedef) + if err != nil { + return nil, err + } + + score := 0 // lower is better + fixups := make([]COREFixup, 0, len(relos)) + for _, relo := range relos { + fixup, err := coreCalculateFixup(local, localID, target, targetID, relo) + if err != nil { + return nil, fmt.Errorf("target %s: %w", target, err) + } + if fixup.Poison || fixup.isNonExistant() { + score++ + } + fixups = append(fixups, fixup) + } + + if score > bestScore { + // We have a better target already, ignore this one. 
+ continue + } + + if score < bestScore { + // This is the best target yet, use it. + bestScore = score + bestFixups = fixups + continue + } + + // Some other target has the same score as the current one. Make sure + // the fixups agree with each other. + for i, fixup := range bestFixups { + if !fixup.equal(fixups[i]) { + return nil, fmt.Errorf("%s: multiple types match: %w", fixup.Kind, errAmbiguousRelocation) + } + } + } + + if bestFixups == nil { + // Nothing at all matched, probably because there are no suitable + // targets at all. Poison everything! + bestFixups = make([]COREFixup, len(relos)) + for i, relo := range relos { + bestFixups[i] = COREFixup{Kind: relo.kind, Poison: true} + } + } + + return bestFixups, nil +} + +// coreCalculateFixup calculates the fixup for a single local type, target type +// and relocation. +func coreCalculateFixup(local Type, localID TypeID, target Type, targetID TypeID, relo coreRelo) (COREFixup, error) { + fixup := func(local, target uint32) (COREFixup, error) { + return COREFixup{relo.kind, local, target, false}, nil + } + poison := func() (COREFixup, error) { + if relo.kind.checksForExistence() { + return fixup(1, 0) + } + return COREFixup{relo.kind, 0, 0, true}, nil + } + zero := COREFixup{} + + switch relo.kind { + case reloTypeIDTarget, reloTypeSize, reloTypeExists: + if len(relo.accessor) > 1 || relo.accessor[0] != 0 { + return zero, fmt.Errorf("%s: unexpected accessor %v", relo.kind, relo.accessor) + } + + err := coreAreTypesCompatible(local, target) + if errors.Is(err, errImpossibleRelocation) { + return poison() + } + if err != nil { + return zero, fmt.Errorf("relocation %s: %w", relo.kind, err) + } + + switch relo.kind { + case reloTypeExists: + return fixup(1, 1) + + case reloTypeIDTarget: + return fixup(uint32(localID), uint32(targetID)) + + case reloTypeSize: + localSize, err := Sizeof(local) + if err != nil { + return zero, err + } + + targetSize, err := Sizeof(target) + if err != nil { + return zero, err + } + + 
return fixup(uint32(localSize), uint32(targetSize)) + } + + case reloEnumvalValue, reloEnumvalExists: + localValue, targetValue, err := coreFindEnumValue(local, relo.accessor, target) + if errors.Is(err, errImpossibleRelocation) { + return poison() + } + if err != nil { + return zero, fmt.Errorf("relocation %s: %w", relo.kind, err) + } + + switch relo.kind { + case reloEnumvalExists: + return fixup(1, 1) + + case reloEnumvalValue: + return fixup(uint32(localValue.Value), uint32(targetValue.Value)) + } + + case reloFieldByteOffset, reloFieldByteSize, reloFieldExists: + if _, ok := target.(*Fwd); ok { + // We can't relocate fields using a forward declaration, so + // skip it. If a non-forward declaration is present in the BTF + // we'll find it in one of the other iterations. + return poison() + } + + localField, targetField, err := coreFindField(local, relo.accessor, target) + if errors.Is(err, errImpossibleRelocation) { + return poison() + } + if err != nil { + return zero, fmt.Errorf("target %s: %w", target, err) + } + + switch relo.kind { + case reloFieldExists: + return fixup(1, 1) + + case reloFieldByteOffset: + return fixup(localField.offset/8, targetField.offset/8) + + case reloFieldByteSize: + localSize, err := Sizeof(localField.Type) + if err != nil { + return zero, err + } + + targetSize, err := Sizeof(targetField.Type) + if err != nil { + return zero, err + } + + return fixup(uint32(localSize), uint32(targetSize)) + + } + } + + return zero, fmt.Errorf("relocation %s: %w", relo.kind, ErrNotSupported) +} + +/* coreAccessor contains a path through a struct. It contains at least one index. + * + * The interpretation depends on the kind of the relocation. The following is + * taken from struct bpf_core_relo in libbpf_internal.h: + * + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). 
It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field. + * - for type-based relocations, strings is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * + * Example to provide a better feel. + * + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample s = ...; + * int x = &s->a; // encoded as "0:0" (a is field #0) + * int y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + */ +type coreAccessor []int + +func parseCoreAccessor(accessor string) (coreAccessor, error) { + if accessor == "" { + return nil, fmt.Errorf("empty accessor") + } + + parts := strings.Split(accessor, ":") + result := make(coreAccessor, 0, len(parts)) + for _, part := range parts { + // 31 bits to avoid overflowing int on 32 bit platforms. 
+ index, err := strconv.ParseUint(part, 10, 31) + if err != nil { + return nil, fmt.Errorf("accessor index %q: %s", part, err) + } + + result = append(result, int(index)) + } + + return result, nil +} + +func (ca coreAccessor) String() string { + strs := make([]string, 0, len(ca)) + for _, i := range ca { + strs = append(strs, strconv.Itoa(i)) + } + return strings.Join(strs, ":") +} + +func (ca coreAccessor) enumValue(t Type) (*EnumValue, error) { + e, ok := t.(*Enum) + if !ok { + return nil, fmt.Errorf("not an enum: %s", t) + } + + if len(ca) > 1 { + return nil, fmt.Errorf("invalid accessor %s for enum", ca) + } + + i := ca[0] + if i >= len(e.Values) { + return nil, fmt.Errorf("invalid index %d for %s", i, e) + } + + return &e.Values[i], nil +} + +type coreField struct { + Type Type + offset uint32 +} + +func adjustOffset(base uint32, t Type, n int) (uint32, error) { + size, err := Sizeof(t) + if err != nil { + return 0, err + } + + return base + (uint32(n) * uint32(size) * 8), nil +} + +// coreFindField descends into the local type using the accessor and tries to +// find an equivalent field in target at each step. +// +// Returns the field and the offset of the field from the start of +// target in bits. +func coreFindField(local Type, localAcc coreAccessor, target Type) (_, _ coreField, _ error) { + // The first index is used to offset a pointer of the base type like + // when accessing an array. 
+ localOffset, err := adjustOffset(0, local, localAcc[0]) + if err != nil { + return coreField{}, coreField{}, err + } + + targetOffset, err := adjustOffset(0, target, localAcc[0]) + if err != nil { + return coreField{}, coreField{}, err + } + + if err := coreAreMembersCompatible(local, target); err != nil { + return coreField{}, coreField{}, fmt.Errorf("fields: %w", err) + } + + var localMaybeFlex, targetMaybeFlex bool + for _, acc := range localAcc[1:] { + switch localType := local.(type) { + case composite: + // For composite types acc is used to find the field in the local type, + // and then we try to find a field in target with the same name. + localMembers := localType.members() + if acc >= len(localMembers) { + return coreField{}, coreField{}, fmt.Errorf("invalid accessor %d for %s", acc, local) + } + + localMember := localMembers[acc] + if localMember.Name == "" { + _, ok := localMember.Type.(composite) + if !ok { + return coreField{}, coreField{}, fmt.Errorf("unnamed field with type %s: %s", localMember.Type, ErrNotSupported) + } + + // This is an anonymous struct or union, ignore it. + local = localMember.Type + localOffset += localMember.OffsetBits + localMaybeFlex = false + continue + } + + targetType, ok := target.(composite) + if !ok { + return coreField{}, coreField{}, fmt.Errorf("target not composite: %w", errImpossibleRelocation) + } + + targetMember, last, err := coreFindMember(targetType, localMember.Name) + if err != nil { + return coreField{}, coreField{}, err + } + + if targetMember.BitfieldSize > 0 { + return coreField{}, coreField{}, fmt.Errorf("field %q is a bitfield: %w", targetMember.Name, ErrNotSupported) + } + + local = localMember.Type + localMaybeFlex = acc == len(localMembers)-1 + localOffset += localMember.OffsetBits + target = targetMember.Type + targetMaybeFlex = last + targetOffset += targetMember.OffsetBits + + case *Array: + // For arrays, acc is the index in the target. 
+ targetType, ok := target.(*Array) + if !ok { + return coreField{}, coreField{}, fmt.Errorf("target not array: %w", errImpossibleRelocation) + } + + if localType.Nelems == 0 && !localMaybeFlex { + return coreField{}, coreField{}, fmt.Errorf("local type has invalid flexible array") + } + if targetType.Nelems == 0 && !targetMaybeFlex { + return coreField{}, coreField{}, fmt.Errorf("target type has invalid flexible array") + } + + if localType.Nelems > 0 && acc >= int(localType.Nelems) { + return coreField{}, coreField{}, fmt.Errorf("invalid access of %s at index %d", localType, acc) + } + if targetType.Nelems > 0 && acc >= int(targetType.Nelems) { + return coreField{}, coreField{}, fmt.Errorf("out of bounds access of target: %w", errImpossibleRelocation) + } + + local = localType.Type + localMaybeFlex = false + localOffset, err = adjustOffset(localOffset, local, acc) + if err != nil { + return coreField{}, coreField{}, err + } + + target = targetType.Type + targetMaybeFlex = false + targetOffset, err = adjustOffset(targetOffset, target, acc) + if err != nil { + return coreField{}, coreField{}, err + } + + default: + return coreField{}, coreField{}, fmt.Errorf("relocate field of %T: %w", localType, ErrNotSupported) + } + + if err := coreAreMembersCompatible(local, target); err != nil { + return coreField{}, coreField{}, err + } + } + + return coreField{local, localOffset}, coreField{target, targetOffset}, nil +} + +// coreFindMember finds a member in a composite type while handling anonymous +// structs and unions. +func coreFindMember(typ composite, name string) (Member, bool, error) { + if name == "" { + return Member{}, false, errors.New("can't search for anonymous member") + } + + type offsetTarget struct { + composite + offset uint32 + } + + targets := []offsetTarget{{typ, 0}} + visited := make(map[composite]bool) + + for i := 0; i < len(targets); i++ { + target := targets[i] + + // Only visit targets once to prevent infinite recursion. 
+ if visited[target] { + continue + } + if len(visited) >= maxTypeDepth { + // This check is different than libbpf, which restricts the entire + // path to BPF_CORE_SPEC_MAX_LEN items. + return Member{}, false, fmt.Errorf("type is nested too deep") + } + visited[target] = true + + members := target.members() + for j, member := range members { + if member.Name == name { + // NB: This is safe because member is a copy. + member.OffsetBits += target.offset + return member, j == len(members)-1, nil + } + + // The names don't match, but this member could be an anonymous struct + // or union. + if member.Name != "" { + continue + } + + comp, ok := member.Type.(composite) + if !ok { + return Member{}, false, fmt.Errorf("anonymous non-composite type %T not allowed", member.Type) + } + + targets = append(targets, offsetTarget{comp, target.offset + member.OffsetBits}) + } + } + + return Member{}, false, fmt.Errorf("no matching member: %w", errImpossibleRelocation) +} + +// coreFindEnumValue follows localAcc to find the equivalent enum value in target. +func coreFindEnumValue(local Type, localAcc coreAccessor, target Type) (localValue, targetValue *EnumValue, _ error) { + localValue, err := localAcc.enumValue(local) + if err != nil { + return nil, nil, err + } + + targetEnum, ok := target.(*Enum) + if !ok { + return nil, nil, errImpossibleRelocation + } + + localName := essentialName(localValue.Name) + for i, targetValue := range targetEnum.Values { + if essentialName(targetValue.Name) != localName { + continue + } + + return localValue, &targetEnum.Values[i], nil + } + + return nil, nil, errImpossibleRelocation +} + +/* The comment below is from bpf_core_types_are_compat in libbpf.c: + * + * Check local and target types for compatibility. This check is used for + * type-based CO-RE relocations and follow slightly different rules than + * field-based relocations. This function assumes that root types were already + * checked for name match. 
Beyond that initial root-level name check, names + * are completely ignored. Compatibility rules are as follows: + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * kind should match for local and target types (i.e., STRUCT is not + * compatible with UNION); + * - for ENUMs, the size is ignored; + * - for INT, size and signedness are ignored; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - CONST/VOLATILE/RESTRICT modifiers are ignored; + * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible; + * - FUNC_PROTOs are compatible if they have compatible signature: same + * number of input args and compatible return and argument types. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + * + * Returns errImpossibleRelocation if types are not compatible. + */ +func coreAreTypesCompatible(localType Type, targetType Type) error { + var ( + localTs, targetTs typeDeque + l, t = &localType, &targetType + depth = 0 + ) + + for ; l != nil && t != nil; l, t = localTs.shift(), targetTs.shift() { + if depth >= maxTypeDepth { + return errors.New("types are nested too deep") + } + + localType = *l + targetType = *t + + if reflect.TypeOf(localType) != reflect.TypeOf(targetType) { + return fmt.Errorf("type mismatch: %w", errImpossibleRelocation) + } + + switch lv := (localType).(type) { + case *Void, *Struct, *Union, *Enum, *Fwd: + // Nothing to do here + + case *Int: + tv := targetType.(*Int) + if lv.isBitfield() || tv.isBitfield() { + return fmt.Errorf("bitfield: %w", errImpossibleRelocation) + } + + case *Pointer, *Array: + depth++ + localType.walk(&localTs) + targetType.walk(&targetTs) + + case *FuncProto: + tv := targetType.(*FuncProto) + if len(lv.Params) != len(tv.Params) { + return fmt.Errorf("function param mismatch: %w", errImpossibleRelocation) + } + + depth++ + localType.walk(&localTs) + 
targetType.walk(&targetTs) + + default: + return fmt.Errorf("unsupported type %T", localType) + } + } + + if l != nil { + return fmt.Errorf("dangling local type %T", *l) + } + + if t != nil { + return fmt.Errorf("dangling target type %T", *t) + } + + return nil +} + +/* coreAreMembersCompatible checks two types for field-based relocation compatibility. + * + * The comment below is from bpf_core_fields_are_compat in libbpf.c: + * + * Check two types for compatibility for the purpose of field access + * relocation. const/volatile/restrict and typedefs are skipped to ensure we + * are relocating semantically compatible entities: + * - any two STRUCTs/UNIONs are compatible and can be mixed; + * - any two FWDs are compatible, if their names match (modulo flavor suffix); + * - any two PTRs are always compatible; + * - for ENUMs, names should be the same (ignoring flavor suffix) or at + * least one of enums should be anonymous; + * - for ENUMs, check sizes, names are ignored; + * - for INT, size and signedness are ignored; + * - any two FLOATs are always compatible; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * [ NB: coreAreMembersCompatible doesn't recurse, this check is done + * by coreFindField. ] + * - everything else shouldn't be ever a target of relocation. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + * + * Returns errImpossibleRelocation if the members are not compatible. 
+ */ +func coreAreMembersCompatible(localType Type, targetType Type) error { + doNamesMatch := func(a, b string) error { + if a == "" || b == "" { + // allow anonymous and named type to match + return nil + } + + if essentialName(a) == essentialName(b) { + return nil + } + + return fmt.Errorf("names don't match: %w", errImpossibleRelocation) + } + + _, lok := localType.(composite) + _, tok := targetType.(composite) + if lok && tok { + return nil + } + + if reflect.TypeOf(localType) != reflect.TypeOf(targetType) { + return fmt.Errorf("type mismatch: %w", errImpossibleRelocation) + } + + switch lv := localType.(type) { + case *Array, *Pointer, *Float: + return nil + + case *Enum: + tv := targetType.(*Enum) + return doNamesMatch(lv.Name, tv.Name) + + case *Fwd: + tv := targetType.(*Fwd) + return doNamesMatch(lv.Name, tv.Name) + + case *Int: + tv := targetType.(*Int) + if lv.isBitfield() || tv.isBitfield() { + return fmt.Errorf("bitfield: %w", errImpossibleRelocation) + } + return nil + + default: + return fmt.Errorf("type %s: %w", localType, ErrNotSupported) + } +} + +func skipQualifierAndTypedef(typ Type) (Type, error) { + result := typ + for depth := 0; depth <= maxTypeDepth; depth++ { + switch v := (result).(type) { + case qualifier: + result = v.qualify() + case *Typedef: + result = v.Type + default: + return result, nil + } + } + return nil, errors.New("exceeded type depth") +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/doc.go b/vendor/github.com/cilium/ebpf/internal/btf/doc.go new file mode 100644 index 0000000..ad2576c --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/doc.go @@ -0,0 +1,8 @@ +// Package btf handles data encoded according to the BPF Type Format. +// +// The canonical documentation lives in the Linux kernel repository and is +// available at https://www.kernel.org/doc/html/latest/bpf/btf.html +// +// The API is very much unstable. You should only use this via the main +// ebpf library. 
+package btf diff --git a/vendor/github.com/cilium/ebpf/internal/btf/ext_info.go b/vendor/github.com/cilium/ebpf/internal/btf/ext_info.go new file mode 100644 index 0000000..cdae2ec --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/ext_info.go @@ -0,0 +1,312 @@ +package btf + +import ( + "bufio" + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" +) + +type btfExtHeader struct { + Magic uint16 + Version uint8 + Flags uint8 + HdrLen uint32 + + FuncInfoOff uint32 + FuncInfoLen uint32 + LineInfoOff uint32 + LineInfoLen uint32 +} + +type btfExtCoreHeader struct { + CoreReloOff uint32 + CoreReloLen uint32 +} + +func parseExtInfos(r io.ReadSeeker, bo binary.ByteOrder, strings stringTable) (funcInfo, lineInfo map[string]extInfo, relos map[string]coreRelos, err error) { + var header btfExtHeader + var coreHeader btfExtCoreHeader + if err := binary.Read(r, bo, &header); err != nil { + return nil, nil, nil, fmt.Errorf("can't read header: %v", err) + } + + if header.Magic != btfMagic { + return nil, nil, nil, fmt.Errorf("incorrect magic value %v", header.Magic) + } + + if header.Version != 1 { + return nil, nil, nil, fmt.Errorf("unexpected version %v", header.Version) + } + + if header.Flags != 0 { + return nil, nil, nil, fmt.Errorf("unsupported flags %v", header.Flags) + } + + remainder := int64(header.HdrLen) - int64(binary.Size(&header)) + if remainder < 0 { + return nil, nil, nil, errors.New("header is too short") + } + + coreHdrSize := int64(binary.Size(&coreHeader)) + if remainder >= coreHdrSize { + if err := binary.Read(r, bo, &coreHeader); err != nil { + return nil, nil, nil, fmt.Errorf("can't read CO-RE relocation header: %v", err) + } + remainder -= coreHdrSize + } + + // Of course, the .BTF.ext header has different semantics than the + // .BTF ext header. We need to ignore non-null values. 
+ _, err = io.CopyN(io.Discard, r, remainder) + if err != nil { + return nil, nil, nil, fmt.Errorf("header padding: %v", err) + } + + if _, err := r.Seek(int64(header.HdrLen+header.FuncInfoOff), io.SeekStart); err != nil { + return nil, nil, nil, fmt.Errorf("can't seek to function info section: %v", err) + } + + buf := bufio.NewReader(io.LimitReader(r, int64(header.FuncInfoLen))) + funcInfo, err = parseExtInfo(buf, bo, strings) + if err != nil { + return nil, nil, nil, fmt.Errorf("function info: %w", err) + } + + if _, err := r.Seek(int64(header.HdrLen+header.LineInfoOff), io.SeekStart); err != nil { + return nil, nil, nil, fmt.Errorf("can't seek to line info section: %v", err) + } + + buf = bufio.NewReader(io.LimitReader(r, int64(header.LineInfoLen))) + lineInfo, err = parseExtInfo(buf, bo, strings) + if err != nil { + return nil, nil, nil, fmt.Errorf("line info: %w", err) + } + + if coreHeader.CoreReloOff > 0 && coreHeader.CoreReloLen > 0 { + if _, err := r.Seek(int64(header.HdrLen+coreHeader.CoreReloOff), io.SeekStart); err != nil { + return nil, nil, nil, fmt.Errorf("can't seek to CO-RE relocation section: %v", err) + } + + relos, err = parseExtInfoRelos(io.LimitReader(r, int64(coreHeader.CoreReloLen)), bo, strings) + if err != nil { + return nil, nil, nil, fmt.Errorf("CO-RE relocation info: %w", err) + } + } + + return funcInfo, lineInfo, relos, nil +} + +type btfExtInfoSec struct { + SecNameOff uint32 + NumInfo uint32 +} + +type extInfoRecord struct { + InsnOff uint64 + Opaque []byte +} + +type extInfo struct { + byteOrder binary.ByteOrder + recordSize uint32 + records []extInfoRecord +} + +func (ei extInfo) append(other extInfo, offset uint64) (extInfo, error) { + if other.byteOrder != ei.byteOrder { + return extInfo{}, fmt.Errorf("ext_info byte order mismatch, want %v (got %v)", ei.byteOrder, other.byteOrder) + } + + if other.recordSize != ei.recordSize { + return extInfo{}, fmt.Errorf("ext_info record size mismatch, want %d (got %d)", ei.recordSize, 
other.recordSize) + } + + records := make([]extInfoRecord, 0, len(ei.records)+len(other.records)) + records = append(records, ei.records...) + for _, info := range other.records { + records = append(records, extInfoRecord{ + InsnOff: info.InsnOff + offset, + Opaque: info.Opaque, + }) + } + return extInfo{ei.byteOrder, ei.recordSize, records}, nil +} + +func (ei extInfo) MarshalBinary() ([]byte, error) { + if ei.byteOrder != internal.NativeEndian { + return nil, fmt.Errorf("%s is not the native byte order", ei.byteOrder) + } + + if len(ei.records) == 0 { + return nil, nil + } + + buf := bytes.NewBuffer(make([]byte, 0, int(ei.recordSize)*len(ei.records))) + for _, info := range ei.records { + // The kernel expects offsets in number of raw bpf instructions, + // while the ELF tracks it in bytes. + insnOff := uint32(info.InsnOff / asm.InstructionSize) + if err := binary.Write(buf, internal.NativeEndian, insnOff); err != nil { + return nil, fmt.Errorf("can't write instruction offset: %v", err) + } + + buf.Write(info.Opaque) + } + + return buf.Bytes(), nil +} + +func parseExtInfo(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[string]extInfo, error) { + const maxRecordSize = 256 + + var recordSize uint32 + if err := binary.Read(r, bo, &recordSize); err != nil { + return nil, fmt.Errorf("can't read record size: %v", err) + } + + if recordSize < 4 { + // Need at least insnOff + return nil, errors.New("record size too short") + } + if recordSize > maxRecordSize { + return nil, fmt.Errorf("record size %v exceeds %v", recordSize, maxRecordSize) + } + + result := make(map[string]extInfo) + for { + secName, infoHeader, err := parseExtInfoHeader(r, bo, strings) + if errors.Is(err, io.EOF) { + return result, nil + } + + var records []extInfoRecord + for i := uint32(0); i < infoHeader.NumInfo; i++ { + var byteOff uint32 + if err := binary.Read(r, bo, &byteOff); err != nil { + return nil, fmt.Errorf("section %v: can't read extended info offset: %v", secName, err) + } + 
+ buf := make([]byte, int(recordSize-4)) + if _, err := io.ReadFull(r, buf); err != nil { + return nil, fmt.Errorf("section %v: can't read record: %v", secName, err) + } + + if byteOff%asm.InstructionSize != 0 { + return nil, fmt.Errorf("section %v: offset %v is not aligned with instruction size", secName, byteOff) + } + + records = append(records, extInfoRecord{uint64(byteOff), buf}) + } + + result[secName] = extInfo{ + bo, + recordSize, + records, + } + } +} + +// bpfCoreRelo matches `struct bpf_core_relo` from the kernel +type bpfCoreRelo struct { + InsnOff uint32 + TypeID TypeID + AccessStrOff uint32 + Kind COREKind +} + +type coreRelo struct { + insnOff uint32 + typeID TypeID + accessor coreAccessor + kind COREKind +} + +type coreRelos []coreRelo + +// append two slices of extInfoRelo to each other. The InsnOff of b are adjusted +// by offset. +func (r coreRelos) append(other coreRelos, offset uint64) coreRelos { + result := make([]coreRelo, 0, len(r)+len(other)) + result = append(result, r...) 
+ for _, relo := range other { + relo.insnOff += uint32(offset) + result = append(result, relo) + } + return result +} + +var extInfoReloSize = binary.Size(bpfCoreRelo{}) + +func parseExtInfoRelos(r io.Reader, bo binary.ByteOrder, strings stringTable) (map[string]coreRelos, error) { + var recordSize uint32 + if err := binary.Read(r, bo, &recordSize); err != nil { + return nil, fmt.Errorf("read record size: %v", err) + } + + if recordSize != uint32(extInfoReloSize) { + return nil, fmt.Errorf("expected record size %d, got %d", extInfoReloSize, recordSize) + } + + result := make(map[string]coreRelos) + for { + secName, infoHeader, err := parseExtInfoHeader(r, bo, strings) + if errors.Is(err, io.EOF) { + return result, nil + } + + var relos coreRelos + for i := uint32(0); i < infoHeader.NumInfo; i++ { + var relo bpfCoreRelo + if err := binary.Read(r, bo, &relo); err != nil { + return nil, fmt.Errorf("section %v: read record: %v", secName, err) + } + + if relo.InsnOff%asm.InstructionSize != 0 { + return nil, fmt.Errorf("section %v: offset %v is not aligned with instruction size", secName, relo.InsnOff) + } + + accessorStr, err := strings.Lookup(relo.AccessStrOff) + if err != nil { + return nil, err + } + + accessor, err := parseCoreAccessor(accessorStr) + if err != nil { + return nil, fmt.Errorf("accessor %q: %s", accessorStr, err) + } + + relos = append(relos, coreRelo{ + relo.InsnOff, + relo.TypeID, + accessor, + relo.Kind, + }) + } + + result[secName] = relos + } +} + +func parseExtInfoHeader(r io.Reader, bo binary.ByteOrder, strings stringTable) (string, *btfExtInfoSec, error) { + var infoHeader btfExtInfoSec + if err := binary.Read(r, bo, &infoHeader); err != nil { + return "", nil, fmt.Errorf("read ext info header: %w", err) + } + + secName, err := strings.Lookup(infoHeader.SecNameOff) + if err != nil { + return "", nil, fmt.Errorf("get section name: %w", err) + } + + if infoHeader.NumInfo == 0 { + return "", nil, fmt.Errorf("section %s has zero records", secName) 
+ } + + return secName, &infoHeader, nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/fuzz.go b/vendor/github.com/cilium/ebpf/internal/btf/fuzz.go new file mode 100644 index 0000000..220b285 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/fuzz.go @@ -0,0 +1,50 @@ +//go:build gofuzz +// +build gofuzz + +// Use with https://github.com/dvyukov/go-fuzz + +package btf + +import ( + "bytes" + "encoding/binary" + + "github.com/cilium/ebpf/internal" +) + +func FuzzSpec(data []byte) int { + if len(data) < binary.Size(btfHeader{}) { + return -1 + } + + spec, err := loadNakedSpec(bytes.NewReader(data), internal.NativeEndian, nil, nil) + if err != nil { + if spec != nil { + panic("spec is not nil") + } + return 0 + } + if spec == nil { + panic("spec is nil") + } + return 1 +} + +func FuzzExtInfo(data []byte) int { + if len(data) < binary.Size(btfExtHeader{}) { + return -1 + } + + table := stringTable("\x00foo\x00barfoo\x00") + info, err := parseExtInfo(bytes.NewReader(data), internal.NativeEndian, table) + if err != nil { + if info != nil { + panic("info is not nil") + } + return 0 + } + if info == nil { + panic("info is nil") + } + return 1 +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/info.go b/vendor/github.com/cilium/ebpf/internal/btf/info.go new file mode 100644 index 0000000..6a9b5d2 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/info.go @@ -0,0 +1,48 @@ +package btf + +import ( + "bytes" + + "github.com/cilium/ebpf/internal" +) + +// info describes a BTF object. +type info struct { + BTF *Spec + ID ID + // Name is an identifying name for the BTF, currently only used by the + // kernel. + Name string + // KernelBTF is true if the BTf originated with the kernel and not + // userspace. + KernelBTF bool +} + +func newInfoFromFd(fd *internal.FD) (*info, error) { + // We invoke the syscall once with a empty BTF and name buffers to get size + // information to allocate buffers. 
Then we invoke it a second time with + // buffers to receive the data. + bpfInfo, err := bpfGetBTFInfoByFD(fd, nil, nil) + if err != nil { + return nil, err + } + + btfBuffer := make([]byte, bpfInfo.btfSize) + nameBuffer := make([]byte, bpfInfo.nameLen) + bpfInfo, err = bpfGetBTFInfoByFD(fd, btfBuffer, nameBuffer) + if err != nil { + return nil, err + } + + spec, err := loadRawSpec(bytes.NewReader(btfBuffer), internal.NativeEndian, nil, nil) + if err != nil { + return nil, err + } + + return &info{ + BTF: spec, + ID: ID(bpfInfo.id), + Name: internal.CString(nameBuffer), + KernelBTF: bpfInfo.kernelBTF != 0, + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/strings.go b/vendor/github.com/cilium/ebpf/internal/btf/strings.go new file mode 100644 index 0000000..9876aa2 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/strings.go @@ -0,0 +1,54 @@ +package btf + +import ( + "bytes" + "errors" + "fmt" + "io" +) + +type stringTable []byte + +func readStringTable(r io.Reader) (stringTable, error) { + contents, err := io.ReadAll(r) + if err != nil { + return nil, fmt.Errorf("can't read string table: %v", err) + } + + if len(contents) < 1 { + return nil, errors.New("string table is empty") + } + + if contents[0] != '\x00' { + return nil, errors.New("first item in string table is non-empty") + } + + if contents[len(contents)-1] != '\x00' { + return nil, errors.New("string table isn't null terminated") + } + + return stringTable(contents), nil +} + +func (st stringTable) Lookup(offset uint32) (string, error) { + if int64(offset) > int64(^uint(0)>>1) { + return "", fmt.Errorf("offset %d overflows int", offset) + } + + pos := int(offset) + if pos >= len(st) { + return "", fmt.Errorf("offset %d is out of bounds", offset) + } + + if pos > 0 && st[pos-1] != '\x00' { + return "", fmt.Errorf("offset %d isn't start of a string", offset) + } + + str := st[pos:] + end := bytes.IndexByte(str, '\x00') + if end == -1 { + return "", fmt.Errorf("offset %d isn't 
null terminated", offset) + } + + return string(str[:end]), nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/syscalls.go b/vendor/github.com/cilium/ebpf/internal/btf/syscalls.go new file mode 100644 index 0000000..a4f80ab --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/syscalls.go @@ -0,0 +1,31 @@ +package btf + +import ( + "fmt" + "unsafe" + + "github.com/cilium/ebpf/internal" +) + +type bpfBTFInfo struct { + btf internal.Pointer + btfSize uint32 + id uint32 + name internal.Pointer + nameLen uint32 + kernelBTF uint32 +} + +func bpfGetBTFInfoByFD(fd *internal.FD, btf, name []byte) (*bpfBTFInfo, error) { + info := bpfBTFInfo{ + btf: internal.NewSlicePointer(btf), + btfSize: uint32(len(btf)), + name: internal.NewSlicePointer(name), + nameLen: uint32(len(name)), + } + if err := internal.BPFObjGetInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)); err != nil { + return nil, fmt.Errorf("can't get program info: %w", err) + } + + return &info, nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/btf/types.go b/vendor/github.com/cilium/ebpf/internal/btf/types.go new file mode 100644 index 0000000..5c8e7c6 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/btf/types.go @@ -0,0 +1,957 @@ +package btf + +import ( + "fmt" + "math" + "strings" +) + +const maxTypeDepth = 32 + +// TypeID identifies a type in a BTF section. +type TypeID uint32 + +// ID implements part of the Type interface. +func (tid TypeID) ID() TypeID { + return tid +} + +// Type represents a type described by BTF. +type Type interface { + ID() TypeID + + String() string + + // Make a copy of the type, without copying Type members. + copy() Type + + // Enumerate all nested Types. Repeated calls must visit nested + // types in the same order. + walk(*typeDeque) +} + +// NamedType is a type with a name. +type NamedType interface { + Type + + // Name of the type, empty for anonymous types. 
+ TypeName() string +} + +var ( + _ NamedType = (*Int)(nil) + _ NamedType = (*Struct)(nil) + _ NamedType = (*Union)(nil) + _ NamedType = (*Enum)(nil) + _ NamedType = (*Fwd)(nil) + _ NamedType = (*Func)(nil) + _ NamedType = (*Typedef)(nil) + _ NamedType = (*Var)(nil) + _ NamedType = (*Datasec)(nil) + _ NamedType = (*Float)(nil) +) + +// Void is the unit type of BTF. +type Void struct{} + +func (v *Void) ID() TypeID { return 0 } +func (v *Void) String() string { return "void#0" } +func (v *Void) size() uint32 { return 0 } +func (v *Void) copy() Type { return (*Void)(nil) } +func (v *Void) walk(*typeDeque) {} + +type IntEncoding byte + +const ( + Signed IntEncoding = 1 << iota + Char + Bool +) + +// Int is an integer of a given length. +type Int struct { + TypeID + Name string + + // The size of the integer in bytes. + Size uint32 + Encoding IntEncoding + // OffsetBits is the starting bit offset. Currently always 0. + // See https://www.kernel.org/doc/html/latest/bpf/btf.html#btf-kind-int + OffsetBits uint32 + Bits byte +} + +func (i *Int) String() string { + var s strings.Builder + + switch { + case i.Encoding&Char != 0: + s.WriteString("char") + case i.Encoding&Bool != 0: + s.WriteString("bool") + default: + if i.Encoding&Signed == 0 { + s.WriteRune('u') + } + s.WriteString("int") + fmt.Fprintf(&s, "%d", i.Size*8) + } + + fmt.Fprintf(&s, "#%d", i.TypeID) + + if i.Bits > 0 { + fmt.Fprintf(&s, "[bits=%d]", i.Bits) + } + + return s.String() +} + +func (i *Int) TypeName() string { return i.Name } +func (i *Int) size() uint32 { return i.Size } +func (i *Int) walk(*typeDeque) {} +func (i *Int) copy() Type { + cpy := *i + return &cpy +} + +func (i *Int) isBitfield() bool { + return i.OffsetBits > 0 +} + +// Pointer is a pointer to another type. 
+type Pointer struct { + TypeID + Target Type +} + +func (p *Pointer) String() string { + return fmt.Sprintf("pointer#%d[target=#%d]", p.TypeID, p.Target.ID()) +} + +func (p *Pointer) size() uint32 { return 8 } +func (p *Pointer) walk(tdq *typeDeque) { tdq.push(&p.Target) } +func (p *Pointer) copy() Type { + cpy := *p + return &cpy +} + +// Array is an array with a fixed number of elements. +type Array struct { + TypeID + Type Type + Nelems uint32 +} + +func (arr *Array) String() string { + return fmt.Sprintf("array#%d[type=#%d n=%d]", arr.TypeID, arr.Type.ID(), arr.Nelems) +} + +func (arr *Array) walk(tdq *typeDeque) { tdq.push(&arr.Type) } +func (arr *Array) copy() Type { + cpy := *arr + return &cpy +} + +// Struct is a compound type of consecutive members. +type Struct struct { + TypeID + Name string + // The size of the struct including padding, in bytes + Size uint32 + Members []Member +} + +func (s *Struct) String() string { + return fmt.Sprintf("struct#%d[%q]", s.TypeID, s.Name) +} + +func (s *Struct) TypeName() string { return s.Name } + +func (s *Struct) size() uint32 { return s.Size } + +func (s *Struct) walk(tdq *typeDeque) { + for i := range s.Members { + tdq.push(&s.Members[i].Type) + } +} + +func (s *Struct) copy() Type { + cpy := *s + cpy.Members = copyMembers(s.Members) + return &cpy +} + +func (s *Struct) members() []Member { + return s.Members +} + +// Union is a compound type where members occupy the same memory. +type Union struct { + TypeID + Name string + // The size of the union including padding, in bytes. 
+ Size uint32 + Members []Member +} + +func (u *Union) String() string { + return fmt.Sprintf("union#%d[%q]", u.TypeID, u.Name) +} + +func (u *Union) TypeName() string { return u.Name } + +func (u *Union) size() uint32 { return u.Size } + +func (u *Union) walk(tdq *typeDeque) { + for i := range u.Members { + tdq.push(&u.Members[i].Type) + } +} + +func (u *Union) copy() Type { + cpy := *u + cpy.Members = copyMembers(u.Members) + return &cpy +} + +func (u *Union) members() []Member { + return u.Members +} + +func copyMembers(orig []Member) []Member { + cpy := make([]Member, len(orig)) + copy(cpy, orig) + return cpy +} + +type composite interface { + members() []Member +} + +var ( + _ composite = (*Struct)(nil) + _ composite = (*Union)(nil) +) + +// Member is part of a Struct or Union. +// +// It is not a valid Type. +type Member struct { + Name string + Type Type + // OffsetBits is the bit offset of this member. + OffsetBits uint32 + BitfieldSize uint32 +} + +// Enum lists possible values. +type Enum struct { + TypeID + Name string + Values []EnumValue +} + +func (e *Enum) String() string { + return fmt.Sprintf("enum#%d[%q]", e.TypeID, e.Name) +} + +func (e *Enum) TypeName() string { return e.Name } + +// EnumValue is part of an Enum +// +// Is is not a valid Type +type EnumValue struct { + Name string + Value int32 +} + +func (e *Enum) size() uint32 { return 4 } +func (e *Enum) walk(*typeDeque) {} +func (e *Enum) copy() Type { + cpy := *e + cpy.Values = make([]EnumValue, len(e.Values)) + copy(cpy.Values, e.Values) + return &cpy +} + +// FwdKind is the type of forward declaration. +type FwdKind int + +// Valid types of forward declaration. +const ( + FwdStruct FwdKind = iota + FwdUnion +) + +func (fk FwdKind) String() string { + switch fk { + case FwdStruct: + return "struct" + case FwdUnion: + return "union" + default: + return fmt.Sprintf("%T(%d)", fk, int(fk)) + } +} + +// Fwd is a forward declaration of a Type. 
+type Fwd struct { + TypeID + Name string + Kind FwdKind +} + +func (f *Fwd) String() string { + return fmt.Sprintf("fwd#%d[%s %q]", f.TypeID, f.Kind, f.Name) +} + +func (f *Fwd) TypeName() string { return f.Name } + +func (f *Fwd) walk(*typeDeque) {} +func (f *Fwd) copy() Type { + cpy := *f + return &cpy +} + +// Typedef is an alias of a Type. +type Typedef struct { + TypeID + Name string + Type Type +} + +func (td *Typedef) String() string { + return fmt.Sprintf("typedef#%d[%q #%d]", td.TypeID, td.Name, td.Type.ID()) +} + +func (td *Typedef) TypeName() string { return td.Name } + +func (td *Typedef) walk(tdq *typeDeque) { tdq.push(&td.Type) } +func (td *Typedef) copy() Type { + cpy := *td + return &cpy +} + +// Volatile is a qualifier. +type Volatile struct { + TypeID + Type Type +} + +func (v *Volatile) String() string { + return fmt.Sprintf("volatile#%d[#%d]", v.TypeID, v.Type.ID()) +} + +func (v *Volatile) qualify() Type { return v.Type } +func (v *Volatile) walk(tdq *typeDeque) { tdq.push(&v.Type) } +func (v *Volatile) copy() Type { + cpy := *v + return &cpy +} + +// Const is a qualifier. +type Const struct { + TypeID + Type Type +} + +func (c *Const) String() string { + return fmt.Sprintf("const#%d[#%d]", c.TypeID, c.Type.ID()) +} + +func (c *Const) qualify() Type { return c.Type } +func (c *Const) walk(tdq *typeDeque) { tdq.push(&c.Type) } +func (c *Const) copy() Type { + cpy := *c + return &cpy +} + +// Restrict is a qualifier. +type Restrict struct { + TypeID + Type Type +} + +func (r *Restrict) String() string { + return fmt.Sprintf("restrict#%d[#%d]", r.TypeID, r.Type.ID()) +} + +func (r *Restrict) qualify() Type { return r.Type } +func (r *Restrict) walk(tdq *typeDeque) { tdq.push(&r.Type) } +func (r *Restrict) copy() Type { + cpy := *r + return &cpy +} + +// Func is a function definition. 
+type Func struct { + TypeID + Name string + Type Type + Linkage FuncLinkage +} + +func (f *Func) String() string { + return fmt.Sprintf("func#%d[%s %q proto=#%d]", f.TypeID, f.Linkage, f.Name, f.Type.ID()) +} + +func (f *Func) TypeName() string { return f.Name } + +func (f *Func) walk(tdq *typeDeque) { tdq.push(&f.Type) } +func (f *Func) copy() Type { + cpy := *f + return &cpy +} + +// FuncProto is a function declaration. +type FuncProto struct { + TypeID + Return Type + Params []FuncParam +} + +func (fp *FuncProto) String() string { + var s strings.Builder + fmt.Fprintf(&s, "proto#%d[", fp.TypeID) + for _, param := range fp.Params { + fmt.Fprintf(&s, "%q=#%d, ", param.Name, param.Type.ID()) + } + fmt.Fprintf(&s, "return=#%d]", fp.Return.ID()) + return s.String() +} + +func (fp *FuncProto) walk(tdq *typeDeque) { + tdq.push(&fp.Return) + for i := range fp.Params { + tdq.push(&fp.Params[i].Type) + } +} + +func (fp *FuncProto) copy() Type { + cpy := *fp + cpy.Params = make([]FuncParam, len(fp.Params)) + copy(cpy.Params, fp.Params) + return &cpy +} + +type FuncParam struct { + Name string + Type Type +} + +// Var is a global variable. +type Var struct { + TypeID + Name string + Type Type + Linkage VarLinkage +} + +func (v *Var) String() string { + return fmt.Sprintf("var#%d[%s %q]", v.TypeID, v.Linkage, v.Name) +} + +func (v *Var) TypeName() string { return v.Name } + +func (v *Var) walk(tdq *typeDeque) { tdq.push(&v.Type) } +func (v *Var) copy() Type { + cpy := *v + return &cpy +} + +// Datasec is a global program section containing data. 
+type Datasec struct { + TypeID + Name string + Size uint32 + Vars []VarSecinfo +} + +func (ds *Datasec) String() string { + return fmt.Sprintf("section#%d[%q]", ds.TypeID, ds.Name) +} + +func (ds *Datasec) TypeName() string { return ds.Name } + +func (ds *Datasec) size() uint32 { return ds.Size } + +func (ds *Datasec) walk(tdq *typeDeque) { + for i := range ds.Vars { + tdq.push(&ds.Vars[i].Type) + } +} + +func (ds *Datasec) copy() Type { + cpy := *ds + cpy.Vars = make([]VarSecinfo, len(ds.Vars)) + copy(cpy.Vars, ds.Vars) + return &cpy +} + +// VarSecinfo describes variable in a Datasec. +// +// It is not a valid Type. +type VarSecinfo struct { + Type Type + Offset uint32 + Size uint32 +} + +// Float is a float of a given length. +type Float struct { + TypeID + Name string + + // The size of the float in bytes. + Size uint32 +} + +func (f *Float) String() string { + return fmt.Sprintf("float%d#%d[%q]", f.Size*8, f.TypeID, f.Name) +} + +func (f *Float) TypeName() string { return f.Name } +func (f *Float) size() uint32 { return f.Size } +func (f *Float) walk(*typeDeque) {} +func (f *Float) copy() Type { + cpy := *f + return &cpy +} + +type sizer interface { + size() uint32 +} + +var ( + _ sizer = (*Int)(nil) + _ sizer = (*Pointer)(nil) + _ sizer = (*Struct)(nil) + _ sizer = (*Union)(nil) + _ sizer = (*Enum)(nil) + _ sizer = (*Datasec)(nil) +) + +type qualifier interface { + qualify() Type +} + +var ( + _ qualifier = (*Const)(nil) + _ qualifier = (*Restrict)(nil) + _ qualifier = (*Volatile)(nil) +) + +// Sizeof returns the size of a type in bytes. +// +// Returns an error if the size can't be computed. +func Sizeof(typ Type) (int, error) { + var ( + n = int64(1) + elem int64 + ) + + for i := 0; i < maxTypeDepth; i++ { + switch v := typ.(type) { + case *Array: + if n > 0 && int64(v.Nelems) > math.MaxInt64/n { + return 0, fmt.Errorf("type %s: overflow", typ) + } + + // Arrays may be of zero length, which allows + // n to be zero as well. 
+ n *= int64(v.Nelems) + typ = v.Type + continue + + case sizer: + elem = int64(v.size()) + + case *Typedef: + typ = v.Type + continue + + case qualifier: + typ = v.qualify() + continue + + default: + return 0, fmt.Errorf("unsized type %T", typ) + } + + if n > 0 && elem > math.MaxInt64/n { + return 0, fmt.Errorf("type %s: overflow", typ) + } + + size := n * elem + if int64(int(size)) != size { + return 0, fmt.Errorf("type %s: overflow", typ) + } + + return int(size), nil + } + + return 0, fmt.Errorf("type %s: exceeded type depth", typ) +} + +// copy a Type recursively. +// +// typ may form a cycle. +// +// Returns any errors from transform verbatim. +func copyType(typ Type, transform func(Type) (Type, error)) (Type, error) { + copies := make(copier) + return typ, copies.copy(&typ, transform) +} + +// copy a slice of Types recursively. +// +// Types may form a cycle. +// +// Returns any errors from transform verbatim. +func copyTypes(types []Type, transform func(Type) (Type, error)) ([]Type, error) { + result := make([]Type, len(types)) + copy(result, types) + + copies := make(copier) + for i := range result { + if err := copies.copy(&result[i], transform); err != nil { + return nil, err + } + } + + return result, nil +} + +type copier map[Type]Type + +func (c copier) copy(typ *Type, transform func(Type) (Type, error)) error { + var work typeDeque + for t := typ; t != nil; t = work.pop() { + // *t is the identity of the type. + if cpy := c[*t]; cpy != nil { + *t = cpy + continue + } + + var cpy Type + if transform != nil { + tf, err := transform(*t) + if err != nil { + return fmt.Errorf("copy %s: %w", *t, err) + } + cpy = tf.copy() + } else { + cpy = (*t).copy() + } + + c[*t] = cpy + *t = cpy + + // Mark any nested types for copying. + cpy.walk(&work) + } + + return nil +} + +// typeDeque keeps track of pointers to types which still +// need to be visited. 
+type typeDeque struct { + types []*Type + read, write uint64 + mask uint64 +} + +func (dq *typeDeque) empty() bool { + return dq.read == dq.write +} + +// push adds a type to the stack. +func (dq *typeDeque) push(t *Type) { + if dq.write-dq.read < uint64(len(dq.types)) { + dq.types[dq.write&dq.mask] = t + dq.write++ + return + } + + new := len(dq.types) * 2 + if new == 0 { + new = 8 + } + + types := make([]*Type, new) + pivot := dq.read & dq.mask + n := copy(types, dq.types[pivot:]) + n += copy(types[n:], dq.types[:pivot]) + types[n] = t + + dq.types = types + dq.mask = uint64(new) - 1 + dq.read, dq.write = 0, uint64(n+1) +} + +// shift returns the first element or null. +func (dq *typeDeque) shift() *Type { + if dq.empty() { + return nil + } + + index := dq.read & dq.mask + t := dq.types[index] + dq.types[index] = nil + dq.read++ + return t +} + +// pop returns the last element or null. +func (dq *typeDeque) pop() *Type { + if dq.empty() { + return nil + } + + dq.write-- + index := dq.write & dq.mask + t := dq.types[index] + dq.types[index] = nil + return t +} + +// all returns all elements. +// +// The deque is empty after calling this method. +func (dq *typeDeque) all() []*Type { + length := dq.write - dq.read + types := make([]*Type, 0, length) + for t := dq.shift(); t != nil; t = dq.shift() { + types = append(types, t) + } + return types +} + +// inflateRawTypes takes a list of raw btf types linked via type IDs, and turns +// it into a graph of Types connected via pointers. +// +// Returns a map of named types (so, where NameOff is non-zero) and a slice of types +// indexed by TypeID. Since BTF ignores compilation units, multiple types may share +// the same name. A Type may form a cyclic graph by pointing at itself. 
+func inflateRawTypes(rawTypes []rawType, rawStrings stringTable) (types []Type, namedTypes map[string][]NamedType, err error) { + type fixupDef struct { + id TypeID + expectedKind btfKind + typ *Type + } + + var fixups []fixupDef + fixup := func(id TypeID, expectedKind btfKind, typ *Type) { + fixups = append(fixups, fixupDef{id, expectedKind, typ}) + } + + convertMembers := func(raw []btfMember, kindFlag bool) ([]Member, error) { + // NB: The fixup below relies on pre-allocating this array to + // work, since otherwise append might re-allocate members. + members := make([]Member, 0, len(raw)) + for i, btfMember := range raw { + name, err := rawStrings.Lookup(btfMember.NameOff) + if err != nil { + return nil, fmt.Errorf("can't get name for member %d: %w", i, err) + } + m := Member{ + Name: name, + OffsetBits: btfMember.Offset, + } + if kindFlag { + m.BitfieldSize = btfMember.Offset >> 24 + m.OffsetBits &= 0xffffff + } + members = append(members, m) + } + for i := range members { + fixup(raw[i].Type, kindUnknown, &members[i].Type) + } + return members, nil + } + + types = make([]Type, 0, len(rawTypes)) + types = append(types, (*Void)(nil)) + namedTypes = make(map[string][]NamedType) + + for i, raw := range rawTypes { + var ( + // Void is defined to always be type ID 0, and is thus + // omitted from BTF. + id = TypeID(i + 1) + typ Type + ) + + name, err := rawStrings.Lookup(raw.NameOff) + if err != nil { + return nil, nil, fmt.Errorf("get name for type id %d: %w", id, err) + } + + switch raw.Kind() { + case kindInt: + encoding, offset, bits := intEncoding(*raw.data.(*uint32)) + typ = &Int{id, name, raw.Size(), encoding, offset, bits} + + case kindPointer: + ptr := &Pointer{id, nil} + fixup(raw.Type(), kindUnknown, &ptr.Target) + typ = ptr + + case kindArray: + btfArr := raw.data.(*btfArray) + + // IndexType is unused according to btf.rst. + // Don't make it available right now. 
+ arr := &Array{id, nil, btfArr.Nelems} + fixup(btfArr.Type, kindUnknown, &arr.Type) + typ = arr + + case kindStruct: + members, err := convertMembers(raw.data.([]btfMember), raw.KindFlag()) + if err != nil { + return nil, nil, fmt.Errorf("struct %s (id %d): %w", name, id, err) + } + typ = &Struct{id, name, raw.Size(), members} + + case kindUnion: + members, err := convertMembers(raw.data.([]btfMember), raw.KindFlag()) + if err != nil { + return nil, nil, fmt.Errorf("union %s (id %d): %w", name, id, err) + } + typ = &Union{id, name, raw.Size(), members} + + case kindEnum: + rawvals := raw.data.([]btfEnum) + vals := make([]EnumValue, 0, len(rawvals)) + for i, btfVal := range rawvals { + name, err := rawStrings.Lookup(btfVal.NameOff) + if err != nil { + return nil, nil, fmt.Errorf("get name for enum value %d: %s", i, err) + } + vals = append(vals, EnumValue{ + Name: name, + Value: btfVal.Val, + }) + } + typ = &Enum{id, name, vals} + + case kindForward: + if raw.KindFlag() { + typ = &Fwd{id, name, FwdUnion} + } else { + typ = &Fwd{id, name, FwdStruct} + } + + case kindTypedef: + typedef := &Typedef{id, name, nil} + fixup(raw.Type(), kindUnknown, &typedef.Type) + typ = typedef + + case kindVolatile: + volatile := &Volatile{id, nil} + fixup(raw.Type(), kindUnknown, &volatile.Type) + typ = volatile + + case kindConst: + cnst := &Const{id, nil} + fixup(raw.Type(), kindUnknown, &cnst.Type) + typ = cnst + + case kindRestrict: + restrict := &Restrict{id, nil} + fixup(raw.Type(), kindUnknown, &restrict.Type) + typ = restrict + + case kindFunc: + fn := &Func{id, name, nil, raw.Linkage()} + fixup(raw.Type(), kindFuncProto, &fn.Type) + typ = fn + + case kindFuncProto: + rawparams := raw.data.([]btfParam) + params := make([]FuncParam, 0, len(rawparams)) + for i, param := range rawparams { + name, err := rawStrings.Lookup(param.NameOff) + if err != nil { + return nil, nil, fmt.Errorf("get name for func proto parameter %d: %s", i, err) + } + params = append(params, FuncParam{ + 
Name: name, + }) + } + for i := range params { + fixup(rawparams[i].Type, kindUnknown, ¶ms[i].Type) + } + + fp := &FuncProto{id, nil, params} + fixup(raw.Type(), kindUnknown, &fp.Return) + typ = fp + + case kindVar: + variable := raw.data.(*btfVariable) + v := &Var{id, name, nil, VarLinkage(variable.Linkage)} + fixup(raw.Type(), kindUnknown, &v.Type) + typ = v + + case kindDatasec: + btfVars := raw.data.([]btfVarSecinfo) + vars := make([]VarSecinfo, 0, len(btfVars)) + for _, btfVar := range btfVars { + vars = append(vars, VarSecinfo{ + Offset: btfVar.Offset, + Size: btfVar.Size, + }) + } + for i := range vars { + fixup(btfVars[i].Type, kindVar, &vars[i].Type) + } + typ = &Datasec{id, name, raw.SizeType, vars} + + case kindFloat: + typ = &Float{id, name, raw.Size()} + + default: + return nil, nil, fmt.Errorf("type id %d: unknown kind: %v", id, raw.Kind()) + } + + types = append(types, typ) + + if named, ok := typ.(NamedType); ok { + if name := essentialName(named.TypeName()); name != "" { + namedTypes[name] = append(namedTypes[name], named) + } + } + } + + for _, fixup := range fixups { + i := int(fixup.id) + if i >= len(types) { + return nil, nil, fmt.Errorf("reference to invalid type id: %d", fixup.id) + } + + // Default void (id 0) to unknown + rawKind := kindUnknown + if i > 0 { + rawKind = rawTypes[i-1].Kind() + } + + if expected := fixup.expectedKind; expected != kindUnknown && rawKind != expected { + return nil, nil, fmt.Errorf("expected type id %d to have kind %s, found %s", fixup.id, expected, rawKind) + } + + *fixup.typ = types[i] + } + + return types, namedTypes, nil +} + +// essentialName returns name without a ___ suffix. 
+func essentialName(name string) string { + lastIdx := strings.LastIndex(name, "___") + if lastIdx > 0 { + return name[:lastIdx] + } + return name +} diff --git a/vendor/github.com/cilium/ebpf/internal/cpu.go b/vendor/github.com/cilium/ebpf/internal/cpu.go index ce3cab7..3affa1e 100644 --- a/vendor/github.com/cilium/ebpf/internal/cpu.go +++ b/vendor/github.com/cilium/ebpf/internal/cpu.go @@ -3,9 +3,8 @@ package internal import ( "fmt" "os" + "strings" "sync" - - "github.com/pkg/errors" ) var sysCPU struct { @@ -18,45 +17,44 @@ var sysCPU struct { // Logical CPU numbers must be of the form 0-n func PossibleCPUs() (int, error) { sysCPU.once.Do(func() { - sysCPU.num, sysCPU.err = parseCPUs("/sys/devices/system/cpu/possible") + sysCPU.num, sysCPU.err = parseCPUsFromFile("/sys/devices/system/cpu/possible") }) return sysCPU.num, sysCPU.err } -var onlineCPU struct { - once sync.Once - err error - num int -} - -// OnlineCPUs returns the number of currently online CPUs -// Logical CPU numbers must be of the form 0-n -func OnlineCPUs() (int, error) { - onlineCPU.once.Do(func() { - onlineCPU.num, onlineCPU.err = parseCPUs("/sys/devices/system/cpu/online") - }) - - return onlineCPU.num, onlineCPU.err -} - -// parseCPUs parses the number of cpus from sysfs, -// in the format of "/sys/devices/system/cpu/{possible,online,..}. -// Logical CPU numbers must be of the form 0-n -func parseCPUs(path string) (int, error) { - file, err := os.Open(path) +func parseCPUsFromFile(path string) (int, error) { + spec, err := os.ReadFile(path) if err != nil { return 0, err } - defer file.Close() + + n, err := parseCPUs(string(spec)) + if err != nil { + return 0, fmt.Errorf("can't parse %s: %v", path, err) + } + + return n, nil +} + +// parseCPUs parses the number of cpus from a string produced +// by bitmap_list_string() in the Linux kernel. +// Multiple ranges are rejected, since they can't be unified +// into a single number. 
+// This is the format of /sys/devices/system/cpu/possible, it +// is not suitable for /sys/devices/system/cpu/online, etc. +func parseCPUs(spec string) (int, error) { + if strings.Trim(spec, "\n") == "0" { + return 1, nil + } var low, high int - n, _ := fmt.Fscanf(file, "%d-%d", &low, &high) - if n < 1 || low != 0 { - return 0, errors.Wrapf(err, "%s has unknown format", path) + n, err := fmt.Sscanf(spec, "%d-%d\n", &low, &high) + if n != 2 || err != nil { + return 0, fmt.Errorf("invalid format: %s", spec) } - if n == 1 { - high = low + if low != 0 { + return 0, fmt.Errorf("CPU spec doesn't start at zero: %s", spec) } // cpus is 0 indexed diff --git a/vendor/github.com/cilium/ebpf/internal/elf.go b/vendor/github.com/cilium/ebpf/internal/elf.go new file mode 100644 index 0000000..54a4313 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/elf.go @@ -0,0 +1,68 @@ +package internal + +import ( + "debug/elf" + "fmt" + "io" +) + +type SafeELFFile struct { + *elf.File +} + +// NewSafeELFFile reads an ELF safely. +// +// Any panic during parsing is turned into an error. This is necessary since +// there are a bunch of unfixed bugs in debug/elf. +// +// https://github.com/golang/go/issues?q=is%3Aissue+is%3Aopen+debug%2Felf+in%3Atitle +func NewSafeELFFile(r io.ReaderAt) (safe *SafeELFFile, err error) { + defer func() { + r := recover() + if r == nil { + return + } + + safe = nil + err = fmt.Errorf("reading ELF file panicked: %s", r) + }() + + file, err := elf.NewFile(r) + if err != nil { + return nil, err + } + + return &SafeELFFile{file}, nil +} + +// Symbols is the safe version of elf.File.Symbols. +func (se *SafeELFFile) Symbols() (syms []elf.Symbol, err error) { + defer func() { + r := recover() + if r == nil { + return + } + + syms = nil + err = fmt.Errorf("reading ELF symbols panicked: %s", r) + }() + + syms, err = se.File.Symbols() + return +} + +// DynamicSymbols is the safe version of elf.File.DynamicSymbols. 
+func (se *SafeELFFile) DynamicSymbols() (syms []elf.Symbol, err error) { + defer func() { + r := recover() + if r == nil { + return + } + + syms = nil + err = fmt.Errorf("reading ELF dynamic symbols panicked: %s", r) + }() + + syms, err = se.File.DynamicSymbols() + return +} diff --git a/vendor/github.com/cilium/ebpf/internal/endian.go b/vendor/github.com/cilium/ebpf/internal/endian.go index ac8a94e..6ae99fc 100644 --- a/vendor/github.com/cilium/ebpf/internal/endian.go +++ b/vendor/github.com/cilium/ebpf/internal/endian.go @@ -9,11 +9,16 @@ import ( // depending on the host's endianness. var NativeEndian binary.ByteOrder +// Clang is set to either "el" or "eb" depending on the host's endianness. +var ClangEndian string + func init() { if isBigEndian() { NativeEndian = binary.BigEndian + ClangEndian = "eb" } else { NativeEndian = binary.LittleEndian + ClangEndian = "el" } } diff --git a/vendor/github.com/cilium/ebpf/internal/errors.go b/vendor/github.com/cilium/ebpf/internal/errors.go new file mode 100644 index 0000000..877bd72 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/errors.go @@ -0,0 +1,51 @@ +package internal + +import ( + "bytes" + "errors" + "fmt" + "strings" + + "github.com/cilium/ebpf/internal/unix" +) + +// ErrorWithLog returns an error that includes logs from the +// kernel verifier. +// +// logErr should be the error returned by the syscall that generated +// the log. It is used to check for truncation of the output. +func ErrorWithLog(err error, log []byte, logErr error) error { + logStr := strings.Trim(CString(log), "\t\r\n ") + if errors.Is(logErr, unix.ENOSPC) { + logStr += " (truncated...)" + } + + return &VerifierError{err, logStr} +} + +// VerifierError includes information from the eBPF verifier. 
+type VerifierError struct { + cause error + log string +} + +func (le *VerifierError) Unwrap() error { + return le.cause +} + +func (le *VerifierError) Error() string { + if le.log == "" { + return le.cause.Error() + } + + return fmt.Sprintf("%s: %s", le.cause, le.log) +} + +// CString turns a NUL / zero terminated byte buffer into a string. +func CString(in []byte) string { + inLen := bytes.IndexByte(in, 0) + if inLen == -1 { + return "" + } + return string(in[:inLen]) +} diff --git a/vendor/github.com/cilium/ebpf/internal/fd.go b/vendor/github.com/cilium/ebpf/internal/fd.go new file mode 100644 index 0000000..af04955 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/fd.go @@ -0,0 +1,69 @@ +package internal + +import ( + "errors" + "fmt" + "os" + "runtime" + "strconv" + + "github.com/cilium/ebpf/internal/unix" +) + +var ErrClosedFd = errors.New("use of closed file descriptor") + +type FD struct { + raw int64 +} + +func NewFD(value uint32) *FD { + fd := &FD{int64(value)} + runtime.SetFinalizer(fd, (*FD).Close) + return fd +} + +func (fd *FD) String() string { + return strconv.FormatInt(fd.raw, 10) +} + +func (fd *FD) Value() (uint32, error) { + if fd.raw < 0 { + return 0, ErrClosedFd + } + + return uint32(fd.raw), nil +} + +func (fd *FD) Close() error { + if fd.raw < 0 { + return nil + } + + value := int(fd.raw) + fd.raw = -1 + + fd.Forget() + return unix.Close(value) +} + +func (fd *FD) Forget() { + runtime.SetFinalizer(fd, nil) +} + +func (fd *FD) Dup() (*FD, error) { + if fd.raw < 0 { + return nil, ErrClosedFd + } + + dup, err := unix.FcntlInt(uintptr(fd.raw), unix.F_DUPFD_CLOEXEC, 0) + if err != nil { + return nil, fmt.Errorf("can't dup fd: %v", err) + } + + return NewFD(uint32(dup)), nil +} + +func (fd *FD) File(name string) *os.File { + fd.Forget() + return os.NewFile(uintptr(fd.raw), name) +} diff --git a/vendor/github.com/cilium/ebpf/internal/feature.go b/vendor/github.com/cilium/ebpf/internal/feature.go new file mode 100644 index 0000000..c94a2e1 
--- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/feature.go @@ -0,0 +1,100 @@ +package internal + +import ( + "errors" + "fmt" + "sync" +) + +// ErrNotSupported indicates that a feature is not supported by the current kernel. +var ErrNotSupported = errors.New("not supported") + +// UnsupportedFeatureError is returned by FeatureTest() functions. +type UnsupportedFeatureError struct { + // The minimum Linux mainline version required for this feature. + // Used for the error string, and for sanity checking during testing. + MinimumVersion Version + + // The name of the feature that isn't supported. + Name string +} + +func (ufe *UnsupportedFeatureError) Error() string { + if ufe.MinimumVersion.Unspecified() { + return fmt.Sprintf("%s not supported", ufe.Name) + } + return fmt.Sprintf("%s not supported (requires >= %s)", ufe.Name, ufe.MinimumVersion) +} + +// Is indicates that UnsupportedFeatureError is ErrNotSupported. +func (ufe *UnsupportedFeatureError) Is(target error) bool { + return target == ErrNotSupported +} + +type featureTest struct { + sync.RWMutex + successful bool + result error +} + +// FeatureTestFn is used to determine whether the kernel supports +// a certain feature. +// +// The return values have the following semantics: +// +// err == ErrNotSupported: the feature is not available +// err == nil: the feature is available +// err != nil: the test couldn't be executed +type FeatureTestFn func() error + +// FeatureTest wraps a function so that it is run at most once. +// +// name should identify the tested feature, while version must be in the +// form Major.Minor[.Patch]. +// +// Returns an error wrapping ErrNotSupported if the feature is not supported. 
+func FeatureTest(name, version string, fn FeatureTestFn) func() error { + v, err := NewVersion(version) + if err != nil { + return func() error { return err } + } + + ft := new(featureTest) + return func() error { + ft.RLock() + if ft.successful { + defer ft.RUnlock() + return ft.result + } + ft.RUnlock() + ft.Lock() + defer ft.Unlock() + // check one more time on the off + // chance that two go routines + // were able to call into the write + // lock + if ft.successful { + return ft.result + } + err := fn() + switch { + case errors.Is(err, ErrNotSupported): + ft.result = &UnsupportedFeatureError{ + MinimumVersion: v, + Name: name, + } + fallthrough + + case err == nil: + ft.successful = true + + default: + // We couldn't execute the feature test to a point + // where it could make a determination. + // Don't cache the result, just return it. + return fmt.Errorf("detect support for %s: %w", name, err) + } + + return ft.result + } +} diff --git a/vendor/github.com/cilium/ebpf/internal/io.go b/vendor/github.com/cilium/ebpf/internal/io.go new file mode 100644 index 0000000..fa74027 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/io.go @@ -0,0 +1,16 @@ +package internal + +import "errors" + +// DiscardZeroes makes sure that all written bytes are zero +// before discarding them. 
+type DiscardZeroes struct{} + +func (DiscardZeroes) Write(p []byte) (int, error) { + for _, b := range p { + if b != 0 { + return 0, errors.New("encountered non-zero byte") + } + } + return len(p), nil +} diff --git a/vendor/github.com/cilium/ebpf/internal/pinning.go b/vendor/github.com/cilium/ebpf/internal/pinning.go new file mode 100644 index 0000000..5329b43 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/pinning.go @@ -0,0 +1,44 @@ +package internal + +import ( + "errors" + "fmt" + "os" + + "github.com/cilium/ebpf/internal/unix" +) + +func Pin(currentPath, newPath string, fd *FD) error { + if newPath == "" { + return errors.New("given pinning path cannot be empty") + } + if currentPath == newPath { + return nil + } + if currentPath == "" { + return BPFObjPin(newPath, fd) + } + var err error + // Renameat2 is used instead of os.Rename to disallow the new path replacing + // an existing path. + if err = unix.Renameat2(unix.AT_FDCWD, currentPath, unix.AT_FDCWD, newPath, unix.RENAME_NOREPLACE); err == nil { + // Object is now moved to the new pinning path. + return nil + } + if !os.IsNotExist(err) { + return fmt.Errorf("unable to move pinned object to new path %v: %w", newPath, err) + } + // Internal state not in sync with the file system so let's fix it. + return BPFObjPin(newPath, fd) +} + +func Unpin(pinnedPath string) error { + if pinnedPath == "" { + return nil + } + err := os.Remove(pinnedPath) + if err == nil || os.IsNotExist(err) { + return nil + } + return err +} diff --git a/vendor/github.com/cilium/ebpf/internal/ptr.go b/vendor/github.com/cilium/ebpf/internal/ptr.go new file mode 100644 index 0000000..f295de7 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/ptr.go @@ -0,0 +1,31 @@ +package internal + +import ( + "unsafe" + + "github.com/cilium/ebpf/internal/unix" +) + +// NewPointer creates a 64-bit pointer from an unsafe Pointer. 
+func NewPointer(ptr unsafe.Pointer) Pointer { + return Pointer{ptr: ptr} +} + +// NewSlicePointer creates a 64-bit pointer from a byte slice. +func NewSlicePointer(buf []byte) Pointer { + if len(buf) == 0 { + return Pointer{} + } + + return Pointer{ptr: unsafe.Pointer(&buf[0])} +} + +// NewStringPointer creates a 64-bit pointer from a string. +func NewStringPointer(str string) Pointer { + p, err := unix.BytePtrFromString(str) + if err != nil { + return Pointer{} + } + + return Pointer{ptr: unsafe.Pointer(p)} +} diff --git a/vendor/github.com/cilium/ebpf/ptr_32_be.go b/vendor/github.com/cilium/ebpf/internal/ptr_32_be.go similarity index 50% rename from vendor/github.com/cilium/ebpf/ptr_32_be.go rename to vendor/github.com/cilium/ebpf/internal/ptr_32_be.go index 7757744..8c114dd 100644 --- a/vendor/github.com/cilium/ebpf/ptr_32_be.go +++ b/vendor/github.com/cilium/ebpf/internal/ptr_32_be.go @@ -1,14 +1,15 @@ +//go:build armbe || mips || mips64p32 // +build armbe mips mips64p32 -package ebpf +package internal import ( "unsafe" ) -// ptr wraps an unsafe.Pointer to be 64bit to +// Pointer wraps an unsafe.Pointer to be 64bit to // conform to the syscall specification. -type syscallPtr struct { +type Pointer struct { pad uint32 ptr unsafe.Pointer } diff --git a/vendor/github.com/cilium/ebpf/internal/ptr_32_le.go b/vendor/github.com/cilium/ebpf/internal/ptr_32_le.go new file mode 100644 index 0000000..e65a61e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/ptr_32_le.go @@ -0,0 +1,15 @@ +//go:build 386 || amd64p32 || arm || mipsle || mips64p32le +// +build 386 amd64p32 arm mipsle mips64p32le + +package internal + +import ( + "unsafe" +) + +// Pointer wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. 
+type Pointer struct { + ptr unsafe.Pointer + pad uint32 +} diff --git a/vendor/github.com/cilium/ebpf/internal/ptr_64.go b/vendor/github.com/cilium/ebpf/internal/ptr_64.go new file mode 100644 index 0000000..71a3afe --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/ptr_64.go @@ -0,0 +1,14 @@ +//go:build !386 && !amd64p32 && !arm && !mipsle && !mips64p32le && !armbe && !mips && !mips64p32 +// +build !386,!amd64p32,!arm,!mipsle,!mips64p32le,!armbe,!mips,!mips64p32 + +package internal + +import ( + "unsafe" +) + +// Pointer wraps an unsafe.Pointer to be 64bit to +// conform to the syscall specification. +type Pointer struct { + ptr unsafe.Pointer +} diff --git a/vendor/github.com/cilium/ebpf/internal/syscall.go b/vendor/github.com/cilium/ebpf/internal/syscall.go new file mode 100644 index 0000000..b75037b --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/syscall.go @@ -0,0 +1,304 @@ +package internal + +import ( + "errors" + "fmt" + "path/filepath" + "runtime" + "syscall" + "unsafe" + + "github.com/cilium/ebpf/internal/unix" +) + +//go:generate stringer -output syscall_string.go -type=BPFCmd + +// BPFCmd identifies a subcommand of the bpf syscall. +type BPFCmd int + +// Well known BPF commands. +const ( + BPF_MAP_CREATE BPFCmd = iota + BPF_MAP_LOOKUP_ELEM + BPF_MAP_UPDATE_ELEM + BPF_MAP_DELETE_ELEM + BPF_MAP_GET_NEXT_KEY + BPF_PROG_LOAD + BPF_OBJ_PIN + BPF_OBJ_GET + BPF_PROG_ATTACH + BPF_PROG_DETACH + BPF_PROG_TEST_RUN + BPF_PROG_GET_NEXT_ID + BPF_MAP_GET_NEXT_ID + BPF_PROG_GET_FD_BY_ID + BPF_MAP_GET_FD_BY_ID + BPF_OBJ_GET_INFO_BY_FD + BPF_PROG_QUERY + BPF_RAW_TRACEPOINT_OPEN + BPF_BTF_LOAD + BPF_BTF_GET_FD_BY_ID + BPF_TASK_FD_QUERY + BPF_MAP_LOOKUP_AND_DELETE_ELEM + BPF_MAP_FREEZE + BPF_BTF_GET_NEXT_ID + BPF_MAP_LOOKUP_BATCH + BPF_MAP_LOOKUP_AND_DELETE_BATCH + BPF_MAP_UPDATE_BATCH + BPF_MAP_DELETE_BATCH + BPF_LINK_CREATE + BPF_LINK_UPDATE + BPF_LINK_GET_FD_BY_ID + BPF_LINK_GET_NEXT_ID + BPF_ENABLE_STATS + BPF_ITER_CREATE +) + +// BPF wraps SYS_BPF. 
+// +// Any pointers contained in attr must use the Pointer type from this package. +func BPF(cmd BPFCmd, attr unsafe.Pointer, size uintptr) (uintptr, error) { + r1, _, errNo := unix.Syscall(unix.SYS_BPF, uintptr(cmd), uintptr(attr), size) + runtime.KeepAlive(attr) + + var err error + if errNo != 0 { + err = wrappedErrno{errNo} + } + + return r1, err +} + +type BPFProgLoadAttr struct { + ProgType uint32 + InsCount uint32 + Instructions Pointer + License Pointer + LogLevel uint32 + LogSize uint32 + LogBuf Pointer + KernelVersion uint32 // since 4.1 2541517c32be + ProgFlags uint32 // since 4.11 e07b98d9bffe + ProgName BPFObjName // since 4.15 067cae47771c + ProgIfIndex uint32 // since 4.15 1f6f4cb7ba21 + ExpectedAttachType uint32 // since 4.17 5e43f899b03a + ProgBTFFd uint32 + FuncInfoRecSize uint32 + FuncInfo Pointer + FuncInfoCnt uint32 + LineInfoRecSize uint32 + LineInfo Pointer + LineInfoCnt uint32 + AttachBTFID uint32 + AttachProgFd uint32 +} + +// BPFProgLoad wraps BPF_PROG_LOAD. +func BPFProgLoad(attr *BPFProgLoadAttr) (*FD, error) { + for { + fd, err := BPF(BPF_PROG_LOAD, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + // As of ~4.20 the verifier can be interrupted by a signal, + // and returns EAGAIN in that case. 
+ if errors.Is(err, unix.EAGAIN) { + continue + } + + if err != nil { + return nil, err + } + + return NewFD(uint32(fd)), nil + } +} + +type BPFProgAttachAttr struct { + TargetFd uint32 + AttachBpfFd uint32 + AttachType uint32 + AttachFlags uint32 + ReplaceBpfFd uint32 +} + +func BPFProgAttach(attr *BPFProgAttachAttr) error { + _, err := BPF(BPF_PROG_ATTACH, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type BPFProgDetachAttr struct { + TargetFd uint32 + AttachBpfFd uint32 + AttachType uint32 +} + +func BPFProgDetach(attr *BPFProgDetachAttr) error { + _, err := BPF(BPF_PROG_DETACH, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +type BPFEnableStatsAttr struct { + StatsType uint32 +} + +func BPFEnableStats(attr *BPFEnableStatsAttr) (*FD, error) { + ptr, err := BPF(BPF_ENABLE_STATS, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, fmt.Errorf("enable stats: %w", err) + } + return NewFD(uint32(ptr)), nil + +} + +type bpfObjAttr struct { + fileName Pointer + fd uint32 + fileFlags uint32 +} + +const bpfFSType = 0xcafe4a11 + +// BPFObjPin wraps BPF_OBJ_PIN. +func BPFObjPin(fileName string, fd *FD) error { + dirName := filepath.Dir(fileName) + var statfs unix.Statfs_t + if err := unix.Statfs(dirName, &statfs); err != nil { + return err + } + if uint64(statfs.Type) != bpfFSType { + return fmt.Errorf("%s is not on a bpf filesystem", fileName) + } + + value, err := fd.Value() + if err != nil { + return err + } + + attr := bpfObjAttr{ + fileName: NewStringPointer(fileName), + fd: value, + } + _, err = BPF(BPF_OBJ_PIN, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + if err != nil { + return fmt.Errorf("pin object %s: %w", fileName, err) + } + return nil +} + +// BPFObjGet wraps BPF_OBJ_GET. 
+func BPFObjGet(fileName string, flags uint32) (*FD, error) { + attr := bpfObjAttr{ + fileName: NewStringPointer(fileName), + fileFlags: flags, + } + ptr, err := BPF(BPF_OBJ_GET, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + if err != nil { + return nil, fmt.Errorf("get object %s: %w", fileName, err) + } + return NewFD(uint32(ptr)), nil +} + +type bpfObjGetInfoByFDAttr struct { + fd uint32 + infoLen uint32 + info Pointer +} + +// BPFObjGetInfoByFD wraps BPF_OBJ_GET_INFO_BY_FD. +// +// Available from 4.13. +func BPFObjGetInfoByFD(fd *FD, info unsafe.Pointer, size uintptr) error { + value, err := fd.Value() + if err != nil { + return err + } + + attr := bpfObjGetInfoByFDAttr{ + fd: value, + infoLen: uint32(size), + info: NewPointer(info), + } + _, err = BPF(BPF_OBJ_GET_INFO_BY_FD, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + if err != nil { + return fmt.Errorf("fd %v: %w", fd, err) + } + return nil +} + +type bpfGetFDByIDAttr struct { + id uint32 + next uint32 +} + +// BPFObjGetInfoByFD wraps BPF_*_GET_FD_BY_ID. +// +// Available from 4.13. +func BPFObjGetFDByID(cmd BPFCmd, id uint32) (*FD, error) { + attr := bpfGetFDByIDAttr{ + id: id, + } + ptr, err := BPF(cmd, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return NewFD(uint32(ptr)), err +} + +// BPFObjName is a null-terminated string made up of +// 'A-Za-z0-9_' characters. +type BPFObjName [unix.BPF_OBJ_NAME_LEN]byte + +// NewBPFObjName truncates the result if it is too long. 
+func NewBPFObjName(name string) BPFObjName { + var result BPFObjName + copy(result[:unix.BPF_OBJ_NAME_LEN-1], name) + return result +} + +type BPFMapCreateAttr struct { + MapType uint32 + KeySize uint32 + ValueSize uint32 + MaxEntries uint32 + Flags uint32 + InnerMapFd uint32 // since 4.12 56f668dfe00d + NumaNode uint32 // since 4.14 96eabe7a40aa + MapName BPFObjName // since 4.15 ad5b177bd73f + MapIfIndex uint32 + BTFFd uint32 + BTFKeyTypeID uint32 + BTFValueTypeID uint32 +} + +func BPFMapCreate(attr *BPFMapCreateAttr) (*FD, error) { + fd, err := BPF(BPF_MAP_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + + return NewFD(uint32(fd)), nil +} + +// wrappedErrno wraps syscall.Errno to prevent direct comparisons with +// syscall.E* or unix.E* constants. +// +// You should never export an error of this type. +type wrappedErrno struct { + syscall.Errno +} + +func (we wrappedErrno) Unwrap() error { + return we.Errno +} + +type syscallError struct { + error + errno syscall.Errno +} + +func SyscallError(err error, errno syscall.Errno) error { + return &syscallError{err, errno} +} + +func (se *syscallError) Is(target error) bool { + return target == se.error +} + +func (se *syscallError) Unwrap() error { + return se.errno +} diff --git a/vendor/github.com/cilium/ebpf/internal/syscall_string.go b/vendor/github.com/cilium/ebpf/internal/syscall_string.go new file mode 100644 index 0000000..85df047 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/syscall_string.go @@ -0,0 +1,56 @@ +// Code generated by "stringer -output syscall_string.go -type=BPFCmd"; DO NOT EDIT. + +package internal + +import "strconv" + +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[BPF_MAP_CREATE-0] + _ = x[BPF_MAP_LOOKUP_ELEM-1] + _ = x[BPF_MAP_UPDATE_ELEM-2] + _ = x[BPF_MAP_DELETE_ELEM-3] + _ = x[BPF_MAP_GET_NEXT_KEY-4] + _ = x[BPF_PROG_LOAD-5] + _ = x[BPF_OBJ_PIN-6] + _ = x[BPF_OBJ_GET-7] + _ = x[BPF_PROG_ATTACH-8] + _ = x[BPF_PROG_DETACH-9] + _ = x[BPF_PROG_TEST_RUN-10] + _ = x[BPF_PROG_GET_NEXT_ID-11] + _ = x[BPF_MAP_GET_NEXT_ID-12] + _ = x[BPF_PROG_GET_FD_BY_ID-13] + _ = x[BPF_MAP_GET_FD_BY_ID-14] + _ = x[BPF_OBJ_GET_INFO_BY_FD-15] + _ = x[BPF_PROG_QUERY-16] + _ = x[BPF_RAW_TRACEPOINT_OPEN-17] + _ = x[BPF_BTF_LOAD-18] + _ = x[BPF_BTF_GET_FD_BY_ID-19] + _ = x[BPF_TASK_FD_QUERY-20] + _ = x[BPF_MAP_LOOKUP_AND_DELETE_ELEM-21] + _ = x[BPF_MAP_FREEZE-22] + _ = x[BPF_BTF_GET_NEXT_ID-23] + _ = x[BPF_MAP_LOOKUP_BATCH-24] + _ = x[BPF_MAP_LOOKUP_AND_DELETE_BATCH-25] + _ = x[BPF_MAP_UPDATE_BATCH-26] + _ = x[BPF_MAP_DELETE_BATCH-27] + _ = x[BPF_LINK_CREATE-28] + _ = x[BPF_LINK_UPDATE-29] + _ = x[BPF_LINK_GET_FD_BY_ID-30] + _ = x[BPF_LINK_GET_NEXT_ID-31] + _ = x[BPF_ENABLE_STATS-32] + _ = x[BPF_ITER_CREATE-33] +} + +const _BPFCmd_name = "BPF_MAP_CREATEBPF_MAP_LOOKUP_ELEMBPF_MAP_UPDATE_ELEMBPF_MAP_DELETE_ELEMBPF_MAP_GET_NEXT_KEYBPF_PROG_LOADBPF_OBJ_PINBPF_OBJ_GETBPF_PROG_ATTACHBPF_PROG_DETACHBPF_PROG_TEST_RUNBPF_PROG_GET_NEXT_IDBPF_MAP_GET_NEXT_IDBPF_PROG_GET_FD_BY_IDBPF_MAP_GET_FD_BY_IDBPF_OBJ_GET_INFO_BY_FDBPF_PROG_QUERYBPF_RAW_TRACEPOINT_OPENBPF_BTF_LOADBPF_BTF_GET_FD_BY_IDBPF_TASK_FD_QUERYBPF_MAP_LOOKUP_AND_DELETE_ELEMBPF_MAP_FREEZEBPF_BTF_GET_NEXT_IDBPF_MAP_LOOKUP_BATCHBPF_MAP_LOOKUP_AND_DELETE_BATCHBPF_MAP_UPDATE_BATCHBPF_MAP_DELETE_BATCHBPF_LINK_CREATEBPF_LINK_UPDATEBPF_LINK_GET_FD_BY_IDBPF_LINK_GET_NEXT_IDBPF_ENABLE_STATSBPF_ITER_CREATE" + +var _BPFCmd_index = [...]uint16{0, 14, 33, 52, 71, 91, 104, 115, 126, 141, 156, 173, 193, 212, 233, 253, 275, 289, 312, 324, 344, 361, 391, 405, 424, 444, 475, 495, 515, 530, 545, 566, 586, 602, 617} + +func (i BPFCmd) String() string { + if i < 0 || i >= 
BPFCmd(len(_BPFCmd_index)-1) { + return "BPFCmd(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _BPFCmd_name[_BPFCmd_index[i]:_BPFCmd_index[i+1]] +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go b/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go index 49c6be5..9aa70fa 100644 --- a/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go +++ b/vendor/github.com/cilium/ebpf/internal/unix/types_linux.go @@ -1,21 +1,45 @@ +//go:build linux // +build linux package unix import ( + "bytes" "syscall" linux "golang.org/x/sys/unix" ) const ( - ENOENT = linux.ENOENT - EAGAIN = linux.EAGAIN - ENOSPC = linux.ENOSPC - EINVAL = linux.EINVAL - EPOLLIN = linux.EPOLLIN + ENOENT = linux.ENOENT + EEXIST = linux.EEXIST + EAGAIN = linux.EAGAIN + ENOSPC = linux.ENOSPC + EINVAL = linux.EINVAL + EPOLLIN = linux.EPOLLIN + EINTR = linux.EINTR + EPERM = linux.EPERM + ESRCH = linux.ESRCH + ENODEV = linux.ENODEV + EBADF = linux.EBADF + E2BIG = linux.E2BIG + // ENOTSUPP is not the same as ENOTSUP or EOPNOTSUP + ENOTSUPP = syscall.Errno(0x20c) + + BPF_F_NO_PREALLOC = linux.BPF_F_NO_PREALLOC + BPF_F_NUMA_NODE = linux.BPF_F_NUMA_NODE + BPF_F_RDONLY = linux.BPF_F_RDONLY + BPF_F_WRONLY = linux.BPF_F_WRONLY + BPF_F_RDONLY_PROG = linux.BPF_F_RDONLY_PROG + BPF_F_WRONLY_PROG = linux.BPF_F_WRONLY_PROG + BPF_F_SLEEPABLE = linux.BPF_F_SLEEPABLE + BPF_F_MMAPABLE = linux.BPF_F_MMAPABLE + BPF_F_INNER_MAP = linux.BPF_F_INNER_MAP BPF_OBJ_NAME_LEN = linux.BPF_OBJ_NAME_LEN BPF_TAG_SIZE = linux.BPF_TAG_SIZE + BPF_RINGBUF_BUSY_BIT = linux.BPF_RINGBUF_BUSY_BIT + BPF_RINGBUF_DISCARD_BIT = linux.BPF_RINGBUF_DISCARD_BIT + BPF_RINGBUF_HDR_SZ = linux.BPF_RINGBUF_HDR_SZ SYS_BPF = linux.SYS_BPF F_DUPFD_CLOEXEC = linux.F_DUPFD_CLOEXEC EPOLL_CTL_ADD = linux.EPOLL_CTL_ADD @@ -25,11 +49,23 @@ const ( PROT_READ = linux.PROT_READ PROT_WRITE = linux.PROT_WRITE MAP_SHARED = linux.MAP_SHARED + PERF_ATTR_SIZE_VER1 = linux.PERF_ATTR_SIZE_VER1 PERF_TYPE_SOFTWARE = linux.PERF_TYPE_SOFTWARE + 
PERF_TYPE_TRACEPOINT = linux.PERF_TYPE_TRACEPOINT PERF_COUNT_SW_BPF_OUTPUT = linux.PERF_COUNT_SW_BPF_OUTPUT + PERF_EVENT_IOC_DISABLE = linux.PERF_EVENT_IOC_DISABLE + PERF_EVENT_IOC_ENABLE = linux.PERF_EVENT_IOC_ENABLE + PERF_EVENT_IOC_SET_BPF = linux.PERF_EVENT_IOC_SET_BPF PerfBitWatermark = linux.PerfBitWatermark PERF_SAMPLE_RAW = linux.PERF_SAMPLE_RAW PERF_FLAG_FD_CLOEXEC = linux.PERF_FLAG_FD_CLOEXEC + RLIM_INFINITY = linux.RLIM_INFINITY + RLIMIT_MEMLOCK = linux.RLIMIT_MEMLOCK + BPF_STATS_RUN_TIME = linux.BPF_STATS_RUN_TIME + PERF_RECORD_LOST = linux.PERF_RECORD_LOST + PERF_RECORD_SAMPLE = linux.PERF_RECORD_SAMPLE + AT_FDCWD = linux.AT_FDCWD + RENAME_NOREPLACE = linux.RENAME_NOREPLACE ) // Statfs_t is a wrapper @@ -38,11 +74,6 @@ type Statfs_t = linux.Statfs_t // Rlimit is a wrapper type Rlimit = linux.Rlimit -// Setrlimit is a wrapper -func Setrlimit(resource int, rlim *Rlimit) (err error) { - return linux.Setrlimit(resource, rlim) -} - // Syscall is a wrapper func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) { return linux.Syscall(trap, a1, a2, a3) @@ -53,6 +84,11 @@ func FcntlInt(fd uintptr, cmd, arg int) (int, error) { return linux.FcntlInt(fd, cmd, arg) } +// IoctlSetInt is a wrapper +func IoctlSetInt(fd int, req uint, value int) error { + return linux.IoctlSetInt(fd, req, value) +} + // Statfs is a wrapper func Statfs(path string, buf *Statfs_t) (err error) { return linux.Statfs(path, buf) @@ -116,3 +152,57 @@ type PerfEventAttr = linux.PerfEventAttr func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) { return linux.PerfEventOpen(attr, pid, cpu, groupFd, flags) } + +// Utsname is a wrapper +type Utsname = linux.Utsname + +// Uname is a wrapper +func Uname(buf *Utsname) (err error) { + return linux.Uname(buf) +} + +// Getpid is a wrapper +func Getpid() int { + return linux.Getpid() +} + +// Gettid is a wrapper +func Gettid() int { + return linux.Gettid() +} + +// Tgkill is a wrapper 
+func Tgkill(tgid int, tid int, sig syscall.Signal) (err error) { + return linux.Tgkill(tgid, tid, sig) +} + +// BytePtrFromString is a wrapper +func BytePtrFromString(s string) (*byte, error) { + return linux.BytePtrFromString(s) +} + +// ByteSliceToString is a wrapper +func ByteSliceToString(s []byte) string { + return linux.ByteSliceToString(s) +} + +// Renameat2 is a wrapper +func Renameat2(olddirfd int, oldpath string, newdirfd int, newpath string, flags uint) error { + return linux.Renameat2(olddirfd, oldpath, newdirfd, newpath, flags) +} + +func KernelRelease() (string, error) { + var uname Utsname + err := Uname(&uname) + if err != nil { + return "", err + } + + end := bytes.IndexByte(uname.Release[:], 0) + release := string(uname.Release[:end]) + return release, nil +} + +func Prlimit(pid, resource int, new, old *Rlimit) error { + return linux.Prlimit(pid, resource, new, old) +} diff --git a/vendor/github.com/cilium/ebpf/internal/unix/types_other.go b/vendor/github.com/cilium/ebpf/internal/unix/types_other.go index a327f2a..4f50d89 100644 --- a/vendor/github.com/cilium/ebpf/internal/unix/types_other.go +++ b/vendor/github.com/cilium/ebpf/internal/unix/types_other.go @@ -1,3 +1,4 @@ +//go:build !linux // +build !linux package unix @@ -11,12 +12,34 @@ import ( var errNonLinux = fmt.Errorf("unsupported platform %s/%s", runtime.GOOS, runtime.GOARCH) const ( - ENOENT = syscall.ENOENT - EAGAIN = syscall.EAGAIN - ENOSPC = syscall.ENOSPC - EINVAL = syscall.EINVAL + ENOENT = syscall.ENOENT + EEXIST = syscall.EEXIST + EAGAIN = syscall.EAGAIN + ENOSPC = syscall.ENOSPC + EINVAL = syscall.EINVAL + EINTR = syscall.EINTR + EPERM = syscall.EPERM + ESRCH = syscall.ESRCH + ENODEV = syscall.ENODEV + EBADF = syscall.Errno(0) + E2BIG = syscall.Errno(0) + // ENOTSUPP is not the same as ENOTSUP or EOPNOTSUP + ENOTSUPP = syscall.Errno(0x20c) + + BPF_F_NO_PREALLOC = 0 + BPF_F_NUMA_NODE = 0 + BPF_F_RDONLY = 0 + BPF_F_WRONLY = 0 + BPF_F_RDONLY_PROG = 0 + BPF_F_WRONLY_PROG = 0 + 
BPF_F_SLEEPABLE = 0 + BPF_F_MMAPABLE = 0 + BPF_F_INNER_MAP = 0 BPF_OBJ_NAME_LEN = 0x10 BPF_TAG_SIZE = 0x8 + BPF_RINGBUF_BUSY_BIT = 0 + BPF_RINGBUF_DISCARD_BIT = 0 + BPF_RINGBUF_HDR_SZ = 0 SYS_BPF = 321 F_DUPFD_CLOEXEC = 0x406 EPOLLIN = 0x1 @@ -27,11 +50,23 @@ const ( PROT_READ = 0x1 PROT_WRITE = 0x2 MAP_SHARED = 0x1 + PERF_ATTR_SIZE_VER1 = 0 PERF_TYPE_SOFTWARE = 0x1 + PERF_TYPE_TRACEPOINT = 0 PERF_COUNT_SW_BPF_OUTPUT = 0xa + PERF_EVENT_IOC_DISABLE = 0 + PERF_EVENT_IOC_ENABLE = 0 + PERF_EVENT_IOC_SET_BPF = 0 PerfBitWatermark = 0x4000 PERF_SAMPLE_RAW = 0x400 PERF_FLAG_FD_CLOEXEC = 0x8 + RLIM_INFINITY = 0x7fffffffffffffff + RLIMIT_MEMLOCK = 8 + BPF_STATS_RUN_TIME = 0 + PERF_RECORD_LOST = 2 + PERF_RECORD_SAMPLE = 9 + AT_FDCWD = -0x2 + RENAME_NOREPLACE = 0x1 ) // Statfs_t is a wrapper @@ -56,11 +91,6 @@ type Rlimit struct { Max uint64 } -// Setrlimit is a wrapper -func Setrlimit(resource int, rlim *Rlimit) (err error) { - return errNonLinux -} - // Syscall is a wrapper func Syscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno) { return 0, 0, syscall.Errno(1) @@ -71,6 +101,11 @@ func FcntlInt(fd uintptr, cmd, arg int) (int, error) { return -1, errNonLinux } +// IoctlSetInt is a wrapper +func IoctlSetInt(fd int, req uint, value int) error { + return errNonLinux +} + // Statfs is a wrapper func Statfs(path string, buf *Statfs_t) error { return errNonLinux @@ -181,3 +216,52 @@ type PerfEventAttr struct { func PerfEventOpen(attr *PerfEventAttr, pid int, cpu int, groupFd int, flags int) (fd int, err error) { return 0, errNonLinux } + +// Utsname is a wrapper +type Utsname struct { + Release [65]byte + Version [65]byte +} + +// Uname is a wrapper +func Uname(buf *Utsname) (err error) { + return errNonLinux +} + +// Getpid is a wrapper +func Getpid() int { + return -1 +} + +// Gettid is a wrapper +func Gettid() int { + return -1 +} + +// Tgkill is a wrapper +func Tgkill(tgid int, tid int, sig syscall.Signal) (err error) { + return errNonLinux +} + +// 
BytePtrFromString is a wrapper +func BytePtrFromString(s string) (*byte, error) { + return nil, errNonLinux +} + +// ByteSliceToString is a wrapper +func ByteSliceToString(s []byte) string { + return "" +} + +// Renameat2 is a wrapper +func Renameat2(olddirfd int, oldpath string, newdirfd int, newpath string, flags uint) error { + return errNonLinux +} + +func KernelRelease() (string, error) { + return "", errNonLinux +} + +func Prlimit(pid, resource int, new, old *Rlimit) error { + return errNonLinux +} diff --git a/vendor/github.com/cilium/ebpf/internal/version.go b/vendor/github.com/cilium/ebpf/internal/version.go new file mode 100644 index 0000000..4915e58 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/internal/version.go @@ -0,0 +1,163 @@ +package internal + +import ( + "fmt" + "os" + "regexp" + "sync" + + "github.com/cilium/ebpf/internal/unix" +) + +const ( + // Version constant used in ELF binaries indicating that the loader needs to + // substitute the eBPF program's version with the value of the kernel's + // KERNEL_VERSION compile-time macro. Used for compatibility with BCC, gobpf + // and RedSift. + MagicKernelVersion = 0xFFFFFFFE +) + +var ( + // Match between one and three decimals separated by dots, with the last + // segment (patch level) being optional on some kernels. + // The x.y.z string must appear at the start of a string or right after + // whitespace to prevent sequences like 'x.y.z-a.b.c' from matching 'a.b.c'. + rgxKernelVersion = regexp.MustCompile(`(?:\A|\s)\d{1,3}\.\d{1,3}(?:\.\d{1,3})?`) + + kernelVersion = struct { + once sync.Once + version Version + err error + }{} +) + +// A Version in the form Major.Minor.Patch. +type Version [3]uint16 + +// NewVersion creates a version from a string like "Major.Minor.Patch". +// +// Patch is optional. 
+func NewVersion(ver string) (Version, error) { + var major, minor, patch uint16 + n, _ := fmt.Sscanf(ver, "%d.%d.%d", &major, &minor, &patch) + if n < 2 { + return Version{}, fmt.Errorf("invalid version: %s", ver) + } + return Version{major, minor, patch}, nil +} + +func (v Version) String() string { + if v[2] == 0 { + return fmt.Sprintf("v%d.%d", v[0], v[1]) + } + return fmt.Sprintf("v%d.%d.%d", v[0], v[1], v[2]) +} + +// Less returns true if the version is less than another version. +func (v Version) Less(other Version) bool { + for i, a := range v { + if a == other[i] { + continue + } + return a < other[i] + } + return false +} + +// Unspecified returns true if the version is all zero. +func (v Version) Unspecified() bool { + return v[0] == 0 && v[1] == 0 && v[2] == 0 +} + +// Kernel implements the kernel's KERNEL_VERSION macro from linux/version.h. +// It represents the kernel version and patch level as a single value. +func (v Version) Kernel() uint32 { + + // Kernels 4.4 and 4.9 have their SUBLEVEL clamped to 255 to avoid + // overflowing into PATCHLEVEL. + // See kernel commit 9b82f13e7ef3 ("kbuild: clamp SUBLEVEL to 255"). + s := v[2] + if s > 255 { + s = 255 + } + + // Truncate members to uint8 to prevent them from spilling over into + // each other when overflowing 8 bits. + return uint32(uint8(v[0]))<<16 | uint32(uint8(v[1]))<<8 | uint32(uint8(s)) +} + +// KernelVersion returns the version of the currently running kernel. +func KernelVersion() (Version, error) { + kernelVersion.once.Do(func() { + kernelVersion.version, kernelVersion.err = detectKernelVersion() + }) + + if kernelVersion.err != nil { + return Version{}, kernelVersion.err + } + return kernelVersion.version, nil +} + +// detectKernelVersion returns the version of the running kernel. It scans the +// following sources in order: /proc/version_signature, uname -v, uname -r. +// In each of those locations, the last-appearing x.y(.z) value is selected +// for parsing. 
The first location that yields a usable version number is +// returned. +func detectKernelVersion() (Version, error) { + + // Try reading /proc/version_signature for Ubuntu compatibility. + // Example format: Ubuntu 4.15.0-91.92-generic 4.15.18 + // This method exists in the kernel itself, see d18acd15c + // ("perf tools: Fix kernel version error in ubuntu"). + if pvs, err := os.ReadFile("/proc/version_signature"); err == nil { + // If /proc/version_signature exists, failing to parse it is an error. + // It only exists on Ubuntu, where the real patch level is not obtainable + // through any other method. + v, err := findKernelVersion(string(pvs)) + if err != nil { + return Version{}, err + } + return v, nil + } + + var uname unix.Utsname + if err := unix.Uname(&uname); err != nil { + return Version{}, fmt.Errorf("calling uname: %w", err) + } + + // Debian puts the version including the patch level in uname.Version. + // It is not an error if there's no version number in uname.Version, + // as most distributions don't use it. Parsing can continue on uname.Release. + // Example format: #1 SMP Debian 4.19.37-5+deb10u2 (2019-08-08) + if v, err := findKernelVersion(unix.ByteSliceToString(uname.Version[:])); err == nil { + return v, nil + } + + // Most other distributions have the full kernel version including patch + // level in uname.Release. + // Example format: 4.19.0-5-amd64, 5.5.10-arch1-1 + v, err := findKernelVersion(unix.ByteSliceToString(uname.Release[:])) + if err != nil { + return Version{}, err + } + + return v, nil +} + +// findKernelVersion matches s against rgxKernelVersion and parses the result +// into a Version. If s contains multiple matches, the last entry is selected. +func findKernelVersion(s string) (Version, error) { + m := rgxKernelVersion.FindAllString(s, -1) + if m == nil { + return Version{}, fmt.Errorf("no kernel version in string: %s", s) + } + // Pick the last match of the string in case there are multiple. 
+ s = m[len(m)-1] + + v, err := NewVersion(s) + if err != nil { + return Version{}, fmt.Errorf("parsing version string %s: %w", s, err) + } + + return v, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/cgroup.go b/vendor/github.com/cilium/ebpf/link/cgroup.go new file mode 100644 index 0000000..5540bb0 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/cgroup.go @@ -0,0 +1,171 @@ +package link + +import ( + "errors" + "fmt" + "os" + + "github.com/cilium/ebpf" +) + +type cgroupAttachFlags uint32 + +// cgroup attach flags +const ( + flagAllowOverride cgroupAttachFlags = 1 << iota + flagAllowMulti + flagReplace +) + +type CgroupOptions struct { + // Path to a cgroupv2 folder. + Path string + // One of the AttachCgroup* constants + Attach ebpf.AttachType + // Program must be of type CGroup*, and the attach type must match Attach. + Program *ebpf.Program +} + +// AttachCgroup links a BPF program to a cgroup. +func AttachCgroup(opts CgroupOptions) (Link, error) { + cgroup, err := os.Open(opts.Path) + if err != nil { + return nil, fmt.Errorf("can't open cgroup: %s", err) + } + + clone, err := opts.Program.Clone() + if err != nil { + cgroup.Close() + return nil, err + } + + var cg Link + cg, err = newLinkCgroup(cgroup, opts.Attach, clone) + if errors.Is(err, ErrNotSupported) { + cg, err = newProgAttachCgroup(cgroup, opts.Attach, clone, flagAllowMulti) + } + if errors.Is(err, ErrNotSupported) { + cg, err = newProgAttachCgroup(cgroup, opts.Attach, clone, flagAllowOverride) + } + if err != nil { + cgroup.Close() + clone.Close() + return nil, err + } + + return cg, nil +} + +// LoadPinnedCgroup loads a pinned cgroup from a bpffs. 
+func LoadPinnedCgroup(fileName string, opts *ebpf.LoadPinOptions) (Link, error) { + link, err := LoadPinnedRawLink(fileName, CgroupType, opts) + if err != nil { + return nil, err + } + + return &linkCgroup{*link}, nil +} + +type progAttachCgroup struct { + cgroup *os.File + current *ebpf.Program + attachType ebpf.AttachType + flags cgroupAttachFlags +} + +var _ Link = (*progAttachCgroup)(nil) + +func (cg *progAttachCgroup) isLink() {} + +func newProgAttachCgroup(cgroup *os.File, attach ebpf.AttachType, prog *ebpf.Program, flags cgroupAttachFlags) (*progAttachCgroup, error) { + if flags&flagAllowMulti > 0 { + if err := haveProgAttachReplace(); err != nil { + return nil, fmt.Errorf("can't support multiple programs: %w", err) + } + } + + err := RawAttachProgram(RawAttachProgramOptions{ + Target: int(cgroup.Fd()), + Program: prog, + Flags: uint32(flags), + Attach: attach, + }) + if err != nil { + return nil, fmt.Errorf("cgroup: %w", err) + } + + return &progAttachCgroup{cgroup, prog, attach, flags}, nil +} + +func (cg *progAttachCgroup) Close() error { + defer cg.cgroup.Close() + defer cg.current.Close() + + err := RawDetachProgram(RawDetachProgramOptions{ + Target: int(cg.cgroup.Fd()), + Program: cg.current, + Attach: cg.attachType, + }) + if err != nil { + return fmt.Errorf("close cgroup: %s", err) + } + return nil +} + +func (cg *progAttachCgroup) Update(prog *ebpf.Program) error { + new, err := prog.Clone() + if err != nil { + return err + } + + args := RawAttachProgramOptions{ + Target: int(cg.cgroup.Fd()), + Program: prog, + Attach: cg.attachType, + Flags: uint32(cg.flags), + } + + if cg.flags&flagAllowMulti > 0 { + // Atomically replacing multiple programs requires at least + // 5.5 (commit 7dd68b3279f17921 "bpf: Support replacing cgroup-bpf + // program in MULTI mode") + args.Flags |= uint32(flagReplace) + args.Replace = cg.current + } + + if err := RawAttachProgram(args); err != nil { + new.Close() + return fmt.Errorf("can't update cgroup: %s", err) + } + + 
cg.current.Close() + cg.current = new + return nil +} + +func (cg *progAttachCgroup) Pin(string) error { + return fmt.Errorf("can't pin cgroup: %w", ErrNotSupported) +} + +func (cg *progAttachCgroup) Unpin() error { + return fmt.Errorf("can't pin cgroup: %w", ErrNotSupported) +} + +type linkCgroup struct { + RawLink +} + +var _ Link = (*linkCgroup)(nil) + +func newLinkCgroup(cgroup *os.File, attach ebpf.AttachType, prog *ebpf.Program) (*linkCgroup, error) { + link, err := AttachRawLink(RawLinkOptions{ + Target: int(cgroup.Fd()), + Program: prog, + Attach: attach, + }) + if err != nil { + return nil, err + } + + return &linkCgroup{*link}, err +} diff --git a/vendor/github.com/cilium/ebpf/link/doc.go b/vendor/github.com/cilium/ebpf/link/doc.go new file mode 100644 index 0000000..2bde35e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/doc.go @@ -0,0 +1,2 @@ +// Package link allows attaching eBPF programs to various kernel hooks. +package link diff --git a/vendor/github.com/cilium/ebpf/link/freplace.go b/vendor/github.com/cilium/ebpf/link/freplace.go new file mode 100644 index 0000000..a698e1a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/freplace.go @@ -0,0 +1,88 @@ +package link + +import ( + "fmt" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal/btf" +) + +type FreplaceLink struct { + RawLink +} + +// AttachFreplace attaches the given eBPF program to the function it replaces. +// +// The program and name can either be provided at link time, or can be provided +// at program load time. If they were provided at load time, they should be nil +// and empty respectively here, as they will be ignored by the kernel. 
+// Examples: +// +// AttachFreplace(dispatcher, "function", replacement) +// AttachFreplace(nil, "", replacement) +func AttachFreplace(targetProg *ebpf.Program, name string, prog *ebpf.Program) (*FreplaceLink, error) { + if (name == "") != (targetProg == nil) { + return nil, fmt.Errorf("must provide both or neither of name and targetProg: %w", errInvalidInput) + } + if prog == nil { + return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput) + } + if prog.Type() != ebpf.Extension { + return nil, fmt.Errorf("eBPF program type %s is not an Extension: %w", prog.Type(), errInvalidInput) + } + + var ( + target int + typeID btf.TypeID + ) + if targetProg != nil { + info, err := targetProg.Info() + if err != nil { + return nil, err + } + btfID, ok := info.BTFID() + if !ok { + return nil, fmt.Errorf("could not get BTF ID for program %s: %w", info.Name, errInvalidInput) + } + btfHandle, err := btf.NewHandleFromID(btfID) + if err != nil { + return nil, err + } + defer btfHandle.Close() + + var function *btf.Func + if err := btfHandle.Spec().FindType(name, &function); err != nil { + return nil, err + } + + target = targetProg.FD() + typeID = function.ID() + } + + link, err := AttachRawLink(RawLinkOptions{ + Target: target, + Program: prog, + Attach: ebpf.AttachNone, + BTF: typeID, + }) + if err != nil { + return nil, err + } + + return &FreplaceLink{*link}, nil +} + +// Update implements the Link interface. +func (f *FreplaceLink) Update(new *ebpf.Program) error { + return fmt.Errorf("freplace update: %w", ErrNotSupported) +} + +// LoadPinnedFreplace loads a pinned iterator from a bpffs. 
+func LoadPinnedFreplace(fileName string, opts *ebpf.LoadPinOptions) (*FreplaceLink, error) { + link, err := LoadPinnedRawLink(fileName, TracingType, opts) + if err != nil { + return nil, err + } + + return &FreplaceLink{*link}, err +} diff --git a/vendor/github.com/cilium/ebpf/link/iter.go b/vendor/github.com/cilium/ebpf/link/iter.go new file mode 100644 index 0000000..654d34e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/iter.go @@ -0,0 +1,100 @@ +package link + +import ( + "fmt" + "io" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal" +) + +type IterOptions struct { + // Program must be of type Tracing with attach type + // AttachTraceIter. The kind of iterator to attach to is + // determined at load time via the AttachTo field. + // + // AttachTo requires the kernel to include BTF of itself, + // and it to be compiled with a recent pahole (>= 1.16). + Program *ebpf.Program + + // Map specifies the target map for bpf_map_elem and sockmap iterators. + // It may be nil. + Map *ebpf.Map +} + +// AttachIter attaches a BPF seq_file iterator. +func AttachIter(opts IterOptions) (*Iter, error) { + if err := haveBPFLink(); err != nil { + return nil, err + } + + progFd := opts.Program.FD() + if progFd < 0 { + return nil, fmt.Errorf("invalid program: %s", internal.ErrClosedFd) + } + + var info bpfIterLinkInfoMap + if opts.Map != nil { + mapFd := opts.Map.FD() + if mapFd < 0 { + return nil, fmt.Errorf("invalid map: %w", internal.ErrClosedFd) + } + info.map_fd = uint32(mapFd) + } + + attr := bpfLinkCreateIterAttr{ + prog_fd: uint32(progFd), + attach_type: ebpf.AttachTraceIter, + iter_info: internal.NewPointer(unsafe.Pointer(&info)), + iter_info_len: uint32(unsafe.Sizeof(info)), + } + + fd, err := bpfLinkCreateIter(&attr) + if err != nil { + return nil, fmt.Errorf("can't link iterator: %w", err) + } + + return &Iter{RawLink{fd, ""}}, err +} + +// LoadPinnedIter loads a pinned iterator from a bpffs. 
+func LoadPinnedIter(fileName string, opts *ebpf.LoadPinOptions) (*Iter, error) { + link, err := LoadPinnedRawLink(fileName, IterType, opts) + if err != nil { + return nil, err + } + + return &Iter{*link}, err +} + +// Iter represents an attached bpf_iter. +type Iter struct { + RawLink +} + +// Open creates a new instance of the iterator. +// +// Reading from the returned reader triggers the BPF program. +func (it *Iter) Open() (io.ReadCloser, error) { + linkFd, err := it.fd.Value() + if err != nil { + return nil, err + } + + attr := &bpfIterCreateAttr{ + linkFd: linkFd, + } + + fd, err := bpfIterCreate(attr) + if err != nil { + return nil, fmt.Errorf("can't create iterator: %w", err) + } + + return fd.File("bpf_iter"), nil +} + +// union bpf_iter_link_info.map +type bpfIterLinkInfoMap struct { + map_fd uint32 +} diff --git a/vendor/github.com/cilium/ebpf/link/kprobe.go b/vendor/github.com/cilium/ebpf/link/kprobe.go new file mode 100644 index 0000000..b6577b5 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/kprobe.go @@ -0,0 +1,444 @@ +package link + +import ( + "bytes" + "crypto/rand" + "errors" + "fmt" + "os" + "path/filepath" + "runtime" + "sync" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" +) + +var ( + kprobeEventsPath = filepath.Join(tracefsPath, "kprobe_events") + + kprobeRetprobeBit = struct { + once sync.Once + value uint64 + err error + }{} +) + +type probeType uint8 + +const ( + kprobeType probeType = iota + uprobeType +) + +func (pt probeType) String() string { + if pt == kprobeType { + return "kprobe" + } + return "uprobe" +} + +func (pt probeType) EventsPath() string { + if pt == kprobeType { + return kprobeEventsPath + } + return uprobeEventsPath +} + +func (pt probeType) PerfEventType(ret bool) perfEventType { + if pt == kprobeType { + if ret { + return kretprobeEvent + } + return kprobeEvent + } + if ret { + return uretprobeEvent + } + return uprobeEvent +} + +func (pt 
probeType) RetprobeBit() (uint64, error) { + if pt == kprobeType { + return kretprobeBit() + } + return uretprobeBit() +} + +// Kprobe attaches the given eBPF program to a perf event that fires when the +// given kernel symbol starts executing. See /proc/kallsyms for available +// symbols. For example, printk(): +// +// kp, err := Kprobe("printk", prog) +// +// Losing the reference to the resulting Link (kp) will close the Kprobe +// and prevent further execution of prog. The Link must be Closed during +// program shutdown to avoid leaking system resources. +func Kprobe(symbol string, prog *ebpf.Program) (Link, error) { + k, err := kprobe(symbol, prog, false) + if err != nil { + return nil, err + } + + err = k.attach(prog) + if err != nil { + k.Close() + return nil, err + } + + return k, nil +} + +// Kretprobe attaches the given eBPF program to a perf event that fires right +// before the given kernel symbol exits, with the function stack left intact. +// See /proc/kallsyms for available symbols. For example, printk(): +// +// kp, err := Kretprobe("printk", prog) +// +// Losing the reference to the resulting Link (kp) will close the Kretprobe +// and prevent further execution of prog. The Link must be Closed during +// program shutdown to avoid leaking system resources. +func Kretprobe(symbol string, prog *ebpf.Program) (Link, error) { + k, err := kprobe(symbol, prog, true) + if err != nil { + return nil, err + } + + err = k.attach(prog) + if err != nil { + k.Close() + return nil, err + } + + return k, nil +} + +// kprobe opens a perf event on the given symbol and attaches prog to it. +// If ret is true, create a kretprobe. 
+func kprobe(symbol string, prog *ebpf.Program, ret bool) (*perfEvent, error) { + if symbol == "" { + return nil, fmt.Errorf("symbol name cannot be empty: %w", errInvalidInput) + } + if prog == nil { + return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput) + } + if !rgxTraceEvent.MatchString(symbol) { + return nil, fmt.Errorf("symbol '%s' must be alphanumeric or underscore: %w", symbol, errInvalidInput) + } + if prog.Type() != ebpf.Kprobe { + return nil, fmt.Errorf("eBPF program type %s is not a Kprobe: %w", prog.Type(), errInvalidInput) + } + + // Use kprobe PMU if the kernel has it available. + tp, err := pmuKprobe(platformPrefix(symbol), ret) + if errors.Is(err, os.ErrNotExist) { + tp, err = pmuKprobe(symbol, ret) + } + if err == nil { + return tp, nil + } + if err != nil && !errors.Is(err, ErrNotSupported) { + return nil, fmt.Errorf("creating perf_kprobe PMU: %w", err) + } + + // Use tracefs if kprobe PMU is missing. + tp, err = tracefsKprobe(platformPrefix(symbol), ret) + if errors.Is(err, os.ErrNotExist) { + tp, err = tracefsKprobe(symbol, ret) + } + if err != nil { + return nil, fmt.Errorf("creating trace event '%s' in tracefs: %w", symbol, err) + } + + return tp, nil +} + +// pmuKprobe opens a perf event based on the kprobe PMU. +// Returns os.ErrNotExist if the given symbol does not exist in the kernel. +func pmuKprobe(symbol string, ret bool) (*perfEvent, error) { + return pmuProbe(kprobeType, symbol, "", 0, perfAllThreads, ret) +} + +// pmuProbe opens a perf event based on a Performance Monitoring Unit. +// +// Requires at least a 4.17 kernel. 
+// e12f03d7031a "perf/core: Implement the 'perf_kprobe' PMU" +// 33ea4b24277b "perf/core: Implement the 'perf_uprobe' PMU" +// +// Returns ErrNotSupported if the kernel doesn't support perf_[k,u]probe PMU +func pmuProbe(typ probeType, symbol, path string, offset uint64, pid int, ret bool) (*perfEvent, error) { + // Getting the PMU type will fail if the kernel doesn't support + // the perf_[k,u]probe PMU. + et, err := getPMUEventType(typ) + if err != nil { + return nil, err + } + + var config uint64 + if ret { + bit, err := typ.RetprobeBit() + if err != nil { + return nil, err + } + config |= 1 << bit + } + + var ( + attr unix.PerfEventAttr + sp unsafe.Pointer + ) + switch typ { + case kprobeType: + // Create a pointer to a NUL-terminated string for the kernel. + sp, err = unsafeStringPtr(symbol) + if err != nil { + return nil, err + } + + attr = unix.PerfEventAttr{ + Type: uint32(et), // PMU event type read from sysfs + Ext1: uint64(uintptr(sp)), // Kernel symbol to trace + Config: config, // Retprobe flag + } + case uprobeType: + sp, err = unsafeStringPtr(path) + if err != nil { + return nil, err + } + + attr = unix.PerfEventAttr{ + // The minimum size required for PMU uprobes is PERF_ATTR_SIZE_VER1, + // since it added the config2 (Ext2) field. The Size field controls the + // size of the internal buffer the kernel allocates for reading the + // perf_event_attr argument from userspace. + Size: unix.PERF_ATTR_SIZE_VER1, + Type: uint32(et), // PMU event type read from sysfs + Ext1: uint64(uintptr(sp)), // Uprobe path + Ext2: offset, // Uprobe offset + Config: config, // Retprobe flag + } + } + + fd, err := unix.PerfEventOpen(&attr, pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC) + + // Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL + // when trying to create a kretprobe for a missing symbol. Make sure ENOENT + // is returned to the caller. 
+ if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) { + return nil, fmt.Errorf("symbol '%s' not found: %w", symbol, os.ErrNotExist) + } + // Since at least commit cb9a19fe4aa51, ENOTSUPP is returned + // when attempting to set a uprobe on a trap instruction. + if errors.Is(err, unix.ENOTSUPP) { + return nil, fmt.Errorf("failed setting uprobe on offset %#x (possible trap insn): %w", offset, err) + } + if err != nil { + return nil, fmt.Errorf("opening perf event: %w", err) + } + + // Ensure the string pointer is not collected before PerfEventOpen returns. + runtime.KeepAlive(sp) + + // Kernel has perf_[k,u]probe PMU available, initialize perf event. + return &perfEvent{ + fd: internal.NewFD(uint32(fd)), + pmuID: et, + name: symbol, + typ: typ.PerfEventType(ret), + }, nil +} + +// tracefsKprobe creates a Kprobe tracefs entry. +func tracefsKprobe(symbol string, ret bool) (*perfEvent, error) { + return tracefsProbe(kprobeType, symbol, "", 0, perfAllThreads, ret) +} + +// tracefsProbe creates a trace event by writing an entry to /[k,u]probe_events. +// A new trace event group name is generated on every call to support creating +// multiple trace events for the same kernel or userspace symbol. +// Path and offset are only set in the case of uprobe(s) and are used to set +// the executable/library path on the filesystem and the offset where the probe is inserted. +// A perf event is then opened on the newly-created trace event and returned to the caller. +func tracefsProbe(typ probeType, symbol, path string, offset uint64, pid int, ret bool) (*perfEvent, error) { + // Generate a random string for each trace event we attempt to create. + // This value is used as the 'group' token in tracefs to allow creating + // multiple kprobe trace events with the same name. 
+ group, err := randomGroup("ebpf") + if err != nil { + return nil, fmt.Errorf("randomizing group name: %w", err) + } + + // Before attempting to create a trace event through tracefs, + // check if an event with the same group and name already exists. + // Kernels 4.x and earlier don't return os.ErrExist on writing a duplicate + // entry, so we need to rely on reads for detecting uniqueness. + _, err = getTraceEventID(group, symbol) + if err == nil { + return nil, fmt.Errorf("trace event already exists: %s/%s", group, symbol) + } + if err != nil && !errors.Is(err, os.ErrNotExist) { + return nil, fmt.Errorf("checking trace event %s/%s: %w", group, symbol, err) + } + + // Create the [k,u]probe trace event using tracefs. + if err := createTraceFSProbeEvent(typ, group, symbol, path, offset, ret); err != nil { + return nil, fmt.Errorf("creating probe entry on tracefs: %w", err) + } + + // Get the newly-created trace event's id. + tid, err := getTraceEventID(group, symbol) + if err != nil { + return nil, fmt.Errorf("getting trace event id: %w", err) + } + + // Kprobes are ephemeral tracepoints and share the same perf event type. + fd, err := openTracepointPerfEvent(tid, pid) + if err != nil { + return nil, err + } + + return &perfEvent{ + fd: fd, + group: group, + name: symbol, + tracefsID: tid, + typ: typ.PerfEventType(ret), + }, nil +} + +// createTraceFSProbeEvent creates a new ephemeral trace event by writing to +// /[k,u]probe_events. Returns os.ErrNotExist if symbol is not a valid +// kernel symbol, or if it is not traceable with kprobes. Returns os.ErrExist +// if a probe with the same group and symbol already exists. +func createTraceFSProbeEvent(typ probeType, group, symbol, path string, offset uint64, ret bool) error { + // Open the kprobe_events file in tracefs. 
+ f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666) + if err != nil { + return fmt.Errorf("error opening '%s': %w", typ.EventsPath(), err) + } + defer f.Close() + + var pe string + switch typ { + case kprobeType: + // The kprobe_events syntax is as follows (see Documentation/trace/kprobetrace.txt): + // p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe + // r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe + // -:[GRP/]EVENT : Clear a probe + // + // Some examples: + // r:ebpf_1234/r_my_kretprobe nf_conntrack_destroy + // p:ebpf_5678/p_my_kprobe __x64_sys_execve + // + // Leaving the kretprobe's MAXACTIVE set to 0 (or absent) will make the + // kernel default to NR_CPUS. This is desired in most eBPF cases since + // subsampling or rate limiting logic can be more accurately implemented in + // the eBPF program itself. + // See Documentation/kprobes.txt for more details. + pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(ret), group, symbol, symbol) + case uprobeType: + // The uprobe_events syntax is as follows: + // p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a probe + // r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return probe + // -:[GRP/]EVENT : Clear a probe + // + // Some examples: + // r:ebpf_1234/readline /bin/bash:0x12345 + // p:ebpf_5678/main_mySymbol /bin/mybin:0x12345 + // + // See Documentation/trace/uprobetracer.txt for more details. + pathOffset := uprobePathOffset(path, offset) + pe = fmt.Sprintf("%s:%s/%s %s", probePrefix(ret), group, symbol, pathOffset) + } + _, err = f.WriteString(pe) + // Since commit 97c753e62e6c, ENOENT is correctly returned instead of EINVAL + // when trying to create a kretprobe for a missing symbol. Make sure ENOENT + // is returned to the caller. 
+ if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.EINVAL) { + return fmt.Errorf("symbol %s not found: %w", symbol, os.ErrNotExist) + } + if err != nil { + return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err) + } + + return nil +} + +// closeTraceFSProbeEvent removes the [k,u]probe with the given type, group and symbol +// from /[k,u]probe_events. +func closeTraceFSProbeEvent(typ probeType, group, symbol string) error { + f, err := os.OpenFile(typ.EventsPath(), os.O_APPEND|os.O_WRONLY, 0666) + if err != nil { + return fmt.Errorf("error opening %s: %w", typ.EventsPath(), err) + } + defer f.Close() + + // See [k,u]probe_events syntax above. The probe type does not need to be specified + // for removals. + pe := fmt.Sprintf("-:%s/%s", group, symbol) + if _, err = f.WriteString(pe); err != nil { + return fmt.Errorf("writing '%s' to '%s': %w", pe, typ.EventsPath(), err) + } + + return nil +} + +// randomGroup generates a pseudorandom string for use as a tracefs group name. +// Returns an error when the output string would exceed 63 characters (kernel +// limitation), when rand.Read() fails or when prefix contains characters not +// allowed by rgxTraceEvent. +func randomGroup(prefix string) (string, error) { + if !rgxTraceEvent.MatchString(prefix) { + return "", fmt.Errorf("prefix '%s' must be alphanumeric or underscore: %w", prefix, errInvalidInput) + } + + b := make([]byte, 8) + if _, err := rand.Read(b); err != nil { + return "", fmt.Errorf("reading random bytes: %w", err) + } + + group := fmt.Sprintf("%s_%x", prefix, b) + if len(group) > 63 { + return "", fmt.Errorf("group name '%s' cannot be longer than 63 characters: %w", group, errInvalidInput) + } + + return group, nil +} + +func probePrefix(ret bool) string { + if ret { + return "r" + } + return "p" +} + +// determineRetprobeBit reads a Performance Monitoring Unit's retprobe bit +// from /sys/bus/event_source/devices//format/retprobe. 
+func determineRetprobeBit(typ probeType) (uint64, error) { + p := filepath.Join("/sys/bus/event_source/devices/", typ.String(), "/format/retprobe") + + data, err := os.ReadFile(p) + if err != nil { + return 0, err + } + + var rp uint64 + n, err := fmt.Sscanf(string(bytes.TrimSpace(data)), "config:%d", &rp) + if err != nil { + return 0, fmt.Errorf("parse retprobe bit: %w", err) + } + if n != 1 { + return 0, fmt.Errorf("parse retprobe bit: expected 1 item, got %d", n) + } + + return rp, nil +} + +func kretprobeBit() (uint64, error) { + kprobeRetprobeBit.once.Do(func() { + kprobeRetprobeBit.value, kprobeRetprobeBit.err = determineRetprobeBit(kprobeType) + }) + return kprobeRetprobeBit.value, kprobeRetprobeBit.err +} diff --git a/vendor/github.com/cilium/ebpf/link/link.go b/vendor/github.com/cilium/ebpf/link/link.go new file mode 100644 index 0000000..4926584 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/link.go @@ -0,0 +1,233 @@ +package link + +import ( + "fmt" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/btf" +) + +var ErrNotSupported = internal.ErrNotSupported + +// Link represents a Program attached to a BPF hook. +type Link interface { + // Replace the current program with a new program. + // + // Passing a nil program is an error. May return an error wrapping ErrNotSupported. + Update(*ebpf.Program) error + + // Persist a link by pinning it into a bpffs. + // + // May return an error wrapping ErrNotSupported. + Pin(string) error + + // Undo a previous call to Pin. + // + // May return an error wrapping ErrNotSupported. + Unpin() error + + // Close frees resources. + // + // The link will be broken unless it has been successfully pinned. + // A link may continue past the lifetime of the process if Close is + // not called. + Close() error + + // Prevent external users from implementing this interface. + isLink() +} + +// ID uniquely identifies a BPF link. 
+type ID uint32 + +// RawLinkOptions control the creation of a raw link. +type RawLinkOptions struct { + // File descriptor to attach to. This differs for each attach type. + Target int + // Program to attach. + Program *ebpf.Program + // Attach must match the attach type of Program. + Attach ebpf.AttachType + // BTF is the BTF of the attachment target. + BTF btf.TypeID +} + +// RawLinkInfo contains metadata on a link. +type RawLinkInfo struct { + Type Type + ID ID + Program ebpf.ProgramID +} + +// RawLink is the low-level API to bpf_link. +// +// You should consider using the higher level interfaces in this +// package instead. +type RawLink struct { + fd *internal.FD + pinnedPath string +} + +// AttachRawLink creates a raw link. +func AttachRawLink(opts RawLinkOptions) (*RawLink, error) { + if err := haveBPFLink(); err != nil { + return nil, err + } + + if opts.Target < 0 { + return nil, fmt.Errorf("invalid target: %s", internal.ErrClosedFd) + } + + progFd := opts.Program.FD() + if progFd < 0 { + return nil, fmt.Errorf("invalid program: %s", internal.ErrClosedFd) + } + + attr := bpfLinkCreateAttr{ + targetFd: uint32(opts.Target), + progFd: uint32(progFd), + attachType: opts.Attach, + targetBTFID: uint32(opts.BTF), + } + fd, err := bpfLinkCreate(&attr) + if err != nil { + return nil, fmt.Errorf("can't create link: %s", err) + } + + return &RawLink{fd, ""}, nil +} + +// LoadPinnedRawLink loads a persisted link from a bpffs. +// +// Returns an error if the pinned link type doesn't match linkType. Pass +// UnspecifiedType to disable this behaviour. 
+func LoadPinnedRawLink(fileName string, linkType Type, opts *ebpf.LoadPinOptions) (*RawLink, error) { + fd, err := internal.BPFObjGet(fileName, opts.Marshal()) + if err != nil { + return nil, fmt.Errorf("load pinned link: %w", err) + } + + link := &RawLink{fd, fileName} + if linkType == UnspecifiedType { + return link, nil + } + + info, err := link.Info() + if err != nil { + link.Close() + return nil, fmt.Errorf("get pinned link info: %s", err) + } + + if info.Type != linkType { + link.Close() + return nil, fmt.Errorf("link type %v doesn't match %v", info.Type, linkType) + } + + return link, nil +} + +func (l *RawLink) isLink() {} + +// FD returns the raw file descriptor. +func (l *RawLink) FD() int { + fd, err := l.fd.Value() + if err != nil { + return -1 + } + return int(fd) +} + +// Close breaks the link. +// +// Use Pin if you want to make the link persistent. +func (l *RawLink) Close() error { + return l.fd.Close() +} + +// Pin persists a link past the lifetime of the process. +// +// Calling Close on a pinned Link will not break the link +// until the pin is removed. +func (l *RawLink) Pin(fileName string) error { + if err := internal.Pin(l.pinnedPath, fileName, l.fd); err != nil { + return err + } + l.pinnedPath = fileName + return nil +} + +// Unpin implements the Link interface. +func (l *RawLink) Unpin() error { + if err := internal.Unpin(l.pinnedPath); err != nil { + return err + } + l.pinnedPath = "" + return nil +} + +// Update implements the Link interface. +func (l *RawLink) Update(new *ebpf.Program) error { + return l.UpdateArgs(RawLinkUpdateOptions{ + New: new, + }) +} + +// RawLinkUpdateOptions control the behaviour of RawLink.UpdateArgs. +type RawLinkUpdateOptions struct { + New *ebpf.Program + Old *ebpf.Program + Flags uint32 +} + +// UpdateArgs updates a link based on args. 
+func (l *RawLink) UpdateArgs(opts RawLinkUpdateOptions) error { + newFd := opts.New.FD() + if newFd < 0 { + return fmt.Errorf("invalid program: %s", internal.ErrClosedFd) + } + + var oldFd int + if opts.Old != nil { + oldFd = opts.Old.FD() + if oldFd < 0 { + return fmt.Errorf("invalid replacement program: %s", internal.ErrClosedFd) + } + } + + linkFd, err := l.fd.Value() + if err != nil { + return fmt.Errorf("can't update link: %s", err) + } + + attr := bpfLinkUpdateAttr{ + linkFd: linkFd, + newProgFd: uint32(newFd), + oldProgFd: uint32(oldFd), + flags: opts.Flags, + } + return bpfLinkUpdate(&attr) +} + +// struct bpf_link_info +type bpfLinkInfo struct { + typ uint32 + id uint32 + prog_id uint32 +} + +// Info returns metadata about the link. +func (l *RawLink) Info() (*RawLinkInfo, error) { + var info bpfLinkInfo + err := internal.BPFObjGetInfoByFD(l.fd, unsafe.Pointer(&info), unsafe.Sizeof(info)) + if err != nil { + return nil, fmt.Errorf("link info: %s", err) + } + + return &RawLinkInfo{ + Type(info.typ), + ID(info.id), + ebpf.ProgramID(info.prog_id), + }, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/netns.go b/vendor/github.com/cilium/ebpf/link/netns.go new file mode 100644 index 0000000..37e5b84 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/netns.go @@ -0,0 +1,60 @@ +package link + +import ( + "fmt" + + "github.com/cilium/ebpf" +) + +// NetNsInfo contains metadata about a network namespace link. +type NetNsInfo struct { + RawLinkInfo +} + +// NetNsLink is a program attached to a network namespace. +type NetNsLink struct { + *RawLink +} + +// AttachNetNs attaches a program to a network namespace. 
+func AttachNetNs(ns int, prog *ebpf.Program) (*NetNsLink, error) { + var attach ebpf.AttachType + switch t := prog.Type(); t { + case ebpf.FlowDissector: + attach = ebpf.AttachFlowDissector + case ebpf.SkLookup: + attach = ebpf.AttachSkLookup + default: + return nil, fmt.Errorf("can't attach %v to network namespace", t) + } + + link, err := AttachRawLink(RawLinkOptions{ + Target: ns, + Program: prog, + Attach: attach, + }) + if err != nil { + return nil, err + } + + return &NetNsLink{link}, nil +} + +// LoadPinnedNetNs loads a network namespace link from bpffs. +func LoadPinnedNetNs(fileName string, opts *ebpf.LoadPinOptions) (*NetNsLink, error) { + link, err := LoadPinnedRawLink(fileName, NetNsType, opts) + if err != nil { + return nil, err + } + + return &NetNsLink{link}, nil +} + +// Info returns information about the link. +func (nns *NetNsLink) Info() (*NetNsInfo, error) { + info, err := nns.RawLink.Info() + if err != nil { + return nil, err + } + return &NetNsInfo{*info}, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/perf_event.go b/vendor/github.com/cilium/ebpf/link/perf_event.go new file mode 100644 index 0000000..7e0443a --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/perf_event.go @@ -0,0 +1,272 @@ +package link + +import ( + "bytes" + "errors" + "fmt" + "os" + "path/filepath" + "regexp" + "runtime" + "strconv" + "strings" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" +) + +// Getting the terminology right is usually the hardest part. For posterity and +// for staying sane during implementation: +// +// - trace event: Representation of a kernel runtime hook. Filesystem entries +// under /events. Can be tracepoints (static), kprobes or uprobes. +// Can be instantiated into perf events (see below). +// - tracepoint: A predetermined hook point in the kernel. Exposed as trace +// events in (sub)directories under /events. Cannot be closed or +// removed, they are static. 
+// - k(ret)probe: Ephemeral trace events based on entry or exit points of +// exported kernel symbols. kprobe-based (tracefs) trace events can be +// created system-wide by writing to the /kprobe_events file, or +// they can be scoped to the current process by creating PMU perf events. +// - u(ret)probe: Ephemeral trace events based on user provides ELF binaries +// and offsets. uprobe-based (tracefs) trace events can be +// created system-wide by writing to the /uprobe_events file, or +// they can be scoped to the current process by creating PMU perf events. +// - perf event: An object instantiated based on an existing trace event or +// kernel symbol. Referred to by fd in userspace. +// Exactly one eBPF program can be attached to a perf event. Multiple perf +// events can be created from a single trace event. Closing a perf event +// stops any further invocations of the attached eBPF program. + +var ( + tracefsPath = "/sys/kernel/debug/tracing" + + // Trace event groups, names and kernel symbols must adhere to this set + // of characters. Non-empty, first character must not be a number, all + // characters must be alphanumeric or underscore. + rgxTraceEvent = regexp.MustCompile("^[a-zA-Z_][0-9a-zA-Z_]*$") + + errInvalidInput = errors.New("invalid input") +) + +const ( + perfAllThreads = -1 +) + +type perfEventType uint8 + +const ( + tracepointEvent perfEventType = iota + kprobeEvent + kretprobeEvent + uprobeEvent + uretprobeEvent +) + +// A perfEvent represents a perf event kernel object. Exactly one eBPF program +// can be attached to it. It is created based on a tracefs trace event or a +// Performance Monitoring Unit (PMU). +type perfEvent struct { + + // Group and name of the tracepoint/kprobe/uprobe. + group string + name string + + // PMU event ID read from sysfs. Valid IDs are non-zero. + pmuID uint64 + // ID of the trace event read from tracefs. Valid IDs are non-zero. 
+ tracefsID uint64 + + // The event type determines the types of programs that can be attached. + typ perfEventType + + fd *internal.FD +} + +func (pe *perfEvent) isLink() {} + +func (pe *perfEvent) Pin(string) error { + return fmt.Errorf("pin perf event: %w", ErrNotSupported) +} + +func (pe *perfEvent) Unpin() error { + return fmt.Errorf("unpin perf event: %w", ErrNotSupported) +} + +// Since 4.15 (e87c6bc3852b "bpf: permit multiple bpf attachments for a single perf event"), +// calling PERF_EVENT_IOC_SET_BPF appends the given program to a prog_array +// owned by the perf event, which means multiple programs can be attached +// simultaneously. +// +// Before 4.15, calling PERF_EVENT_IOC_SET_BPF more than once on a perf event +// returns EEXIST. +// +// Detaching a program from a perf event is currently not possible, so a +// program replacement mechanism cannot be implemented for perf events. +func (pe *perfEvent) Update(prog *ebpf.Program) error { + return fmt.Errorf("can't replace eBPF program in perf event: %w", ErrNotSupported) +} + +func (pe *perfEvent) Close() error { + if pe.fd == nil { + return nil + } + + pfd, err := pe.fd.Value() + if err != nil { + return fmt.Errorf("getting perf event fd: %w", err) + } + + err = unix.IoctlSetInt(int(pfd), unix.PERF_EVENT_IOC_DISABLE, 0) + if err != nil { + return fmt.Errorf("disabling perf event: %w", err) + } + + err = pe.fd.Close() + if err != nil { + return fmt.Errorf("closing perf event fd: %w", err) + } + + switch pe.typ { + case kprobeEvent, kretprobeEvent: + // Clean up kprobe tracefs entry. + if pe.tracefsID != 0 { + return closeTraceFSProbeEvent(kprobeType, pe.group, pe.name) + } + case uprobeEvent, uretprobeEvent: + // Clean up uprobe tracefs entry. + if pe.tracefsID != 0 { + return closeTraceFSProbeEvent(uprobeType, pe.group, pe.name) + } + case tracepointEvent: + // Tracepoint trace events don't hold any extra resources. 
+ return nil + } + + return nil +} + +// attach the given eBPF prog to the perf event stored in pe. +// pe must contain a valid perf event fd. +// prog's type must match the program type stored in pe. +func (pe *perfEvent) attach(prog *ebpf.Program) error { + if prog == nil { + return errors.New("cannot attach a nil program") + } + if pe.fd == nil { + return errors.New("cannot attach to nil perf event") + } + if prog.FD() < 0 { + return fmt.Errorf("invalid program: %w", internal.ErrClosedFd) + } + switch pe.typ { + case kprobeEvent, kretprobeEvent, uprobeEvent, uretprobeEvent: + if t := prog.Type(); t != ebpf.Kprobe { + return fmt.Errorf("invalid program type (expected %s): %s", ebpf.Kprobe, t) + } + case tracepointEvent: + if t := prog.Type(); t != ebpf.TracePoint { + return fmt.Errorf("invalid program type (expected %s): %s", ebpf.TracePoint, t) + } + default: + return fmt.Errorf("unknown perf event type: %d", pe.typ) + } + + // The ioctl below will fail when the fd is invalid. + kfd, _ := pe.fd.Value() + + // Assign the eBPF program to the perf event. + err := unix.IoctlSetInt(int(kfd), unix.PERF_EVENT_IOC_SET_BPF, prog.FD()) + if err != nil { + return fmt.Errorf("setting perf event bpf program: %w", err) + } + + // PERF_EVENT_IOC_ENABLE and _DISABLE ignore their given values. + if err := unix.IoctlSetInt(int(kfd), unix.PERF_EVENT_IOC_ENABLE, 0); err != nil { + return fmt.Errorf("enable perf event: %s", err) + } + + // Close the perf event when its reference is lost to avoid leaking system resources. + runtime.SetFinalizer(pe, (*perfEvent).Close) + return nil +} + +// unsafeStringPtr returns an unsafe.Pointer to a NUL-terminated copy of str. +func unsafeStringPtr(str string) (unsafe.Pointer, error) { + p, err := unix.BytePtrFromString(str) + if err != nil { + return nil, err + } + return unsafe.Pointer(p), nil +} + +// getTraceEventID reads a trace event's ID from tracefs given its group and name. 
+// group and name must be alphanumeric or underscore, as required by the kernel. +func getTraceEventID(group, name string) (uint64, error) { + tid, err := uint64FromFile(tracefsPath, "events", group, name, "id") + if errors.Is(err, os.ErrNotExist) { + return 0, fmt.Errorf("trace event %s/%s: %w", group, name, os.ErrNotExist) + } + if err != nil { + return 0, fmt.Errorf("reading trace event ID of %s/%s: %w", group, name, err) + } + + return tid, nil +} + +// getPMUEventType reads a Performance Monitoring Unit's type (numeric identifier) +// from /sys/bus/event_source/devices//type. +// +// Returns ErrNotSupported if the pmu type is not supported. +func getPMUEventType(typ probeType) (uint64, error) { + et, err := uint64FromFile("/sys/bus/event_source/devices", typ.String(), "type") + if errors.Is(err, os.ErrNotExist) { + return 0, fmt.Errorf("pmu type %s: %w", typ, ErrNotSupported) + } + if err != nil { + return 0, fmt.Errorf("reading pmu type %s: %w", typ, err) + } + + return et, nil +} + +// openTracepointPerfEvent opens a tracepoint-type perf event. System-wide +// [k,u]probes created by writing to /[k,u]probe_events are tracepoints +// behind the scenes, and can be attached to using these perf events. +func openTracepointPerfEvent(tid uint64, pid int) (*internal.FD, error) { + attr := unix.PerfEventAttr{ + Type: unix.PERF_TYPE_TRACEPOINT, + Config: tid, + Sample_type: unix.PERF_SAMPLE_RAW, + Sample: 1, + Wakeup: 1, + } + + fd, err := unix.PerfEventOpen(&attr, pid, 0, -1, unix.PERF_FLAG_FD_CLOEXEC) + if err != nil { + return nil, fmt.Errorf("opening tracepoint perf event: %w", err) + } + + return internal.NewFD(uint32(fd)), nil +} + +// uint64FromFile reads a uint64 from a file. All elements of path are sanitized +// and joined onto base. Returns error if base no longer prefixes the path after +// joining all components. +func uint64FromFile(base string, path ...string) (uint64, error) { + l := filepath.Join(path...) 
+ p := filepath.Join(base, l) + if !strings.HasPrefix(p, base) { + return 0, fmt.Errorf("path '%s' attempts to escape base path '%s': %w", l, base, errInvalidInput) + } + + data, err := os.ReadFile(p) + if err != nil { + return 0, fmt.Errorf("reading file %s: %w", p, err) + } + + et := bytes.TrimSpace(data) + return strconv.ParseUint(string(et), 10, 64) +} diff --git a/vendor/github.com/cilium/ebpf/link/platform.go b/vendor/github.com/cilium/ebpf/link/platform.go new file mode 100644 index 0000000..eb6f7b7 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/platform.go @@ -0,0 +1,25 @@ +package link + +import ( + "fmt" + "runtime" +) + +func platformPrefix(symbol string) string { + + prefix := runtime.GOARCH + + // per https://github.com/golang/go/blob/master/src/go/build/syslist.go + switch prefix { + case "386": + prefix = "ia32" + case "amd64", "amd64p32": + prefix = "x64" + case "arm64", "arm64be": + prefix = "arm64" + default: + return symbol + } + + return fmt.Sprintf("__%s_%s", prefix, symbol) +} diff --git a/vendor/github.com/cilium/ebpf/link/program.go b/vendor/github.com/cilium/ebpf/link/program.go new file mode 100644 index 0000000..b90c457 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/program.go @@ -0,0 +1,76 @@ +package link + +import ( + "fmt" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal" +) + +type RawAttachProgramOptions struct { + // File descriptor to attach to. This differs for each attach type. + Target int + // Program to attach. + Program *ebpf.Program + // Program to replace (cgroups). + Replace *ebpf.Program + // Attach must match the attach type of Program (and Replace). + Attach ebpf.AttachType + // Flags control the attach behaviour. This differs for each attach type. + Flags uint32 +} + +// RawAttachProgram is a low level wrapper around BPF_PROG_ATTACH. +// +// You should use one of the higher level abstractions available in this +// package if possible. 
+func RawAttachProgram(opts RawAttachProgramOptions) error { + if err := haveProgAttach(); err != nil { + return err + } + + var replaceFd uint32 + if opts.Replace != nil { + replaceFd = uint32(opts.Replace.FD()) + } + + attr := internal.BPFProgAttachAttr{ + TargetFd: uint32(opts.Target), + AttachBpfFd: uint32(opts.Program.FD()), + ReplaceBpfFd: replaceFd, + AttachType: uint32(opts.Attach), + AttachFlags: uint32(opts.Flags), + } + + if err := internal.BPFProgAttach(&attr); err != nil { + return fmt.Errorf("can't attach program: %w", err) + } + return nil +} + +type RawDetachProgramOptions struct { + Target int + Program *ebpf.Program + Attach ebpf.AttachType +} + +// RawDetachProgram is a low level wrapper around BPF_PROG_DETACH. +// +// You should use one of the higher level abstractions available in this +// package if possible. +func RawDetachProgram(opts RawDetachProgramOptions) error { + if err := haveProgAttach(); err != nil { + return err + } + + attr := internal.BPFProgDetachAttr{ + TargetFd: uint32(opts.Target), + AttachBpfFd: uint32(opts.Program.FD()), + AttachType: uint32(opts.Attach), + } + if err := internal.BPFProgDetach(&attr); err != nil { + return fmt.Errorf("can't detach program: %w", err) + } + + return nil +} diff --git a/vendor/github.com/cilium/ebpf/link/raw_tracepoint.go b/vendor/github.com/cilium/ebpf/link/raw_tracepoint.go new file mode 100644 index 0000000..f4beb1e --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/raw_tracepoint.go @@ -0,0 +1,61 @@ +package link + +import ( + "fmt" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal" +) + +type RawTracepointOptions struct { + // Tracepoint name. + Name string + // Program must be of type RawTracepoint* + Program *ebpf.Program +} + +// AttachRawTracepoint links a BPF program to a raw_tracepoint. +// +// Requires at least Linux 4.17. 
+func AttachRawTracepoint(opts RawTracepointOptions) (Link, error) { + if t := opts.Program.Type(); t != ebpf.RawTracepoint && t != ebpf.RawTracepointWritable { + return nil, fmt.Errorf("invalid program type %s, expected RawTracepoint(Writable)", t) + } + if opts.Program.FD() < 0 { + return nil, fmt.Errorf("invalid program: %w", internal.ErrClosedFd) + } + + fd, err := bpfRawTracepointOpen(&bpfRawTracepointOpenAttr{ + name: internal.NewStringPointer(opts.Name), + fd: uint32(opts.Program.FD()), + }) + if err != nil { + return nil, err + } + + return &progAttachRawTracepoint{fd: fd}, nil +} + +type progAttachRawTracepoint struct { + fd *internal.FD +} + +var _ Link = (*progAttachRawTracepoint)(nil) + +func (rt *progAttachRawTracepoint) isLink() {} + +func (rt *progAttachRawTracepoint) Close() error { + return rt.fd.Close() +} + +func (rt *progAttachRawTracepoint) Update(_ *ebpf.Program) error { + return fmt.Errorf("can't update raw_tracepoint: %w", ErrNotSupported) +} + +func (rt *progAttachRawTracepoint) Pin(_ string) error { + return fmt.Errorf("can't pin raw_tracepoint: %w", ErrNotSupported) +} + +func (rt *progAttachRawTracepoint) Unpin() error { + return fmt.Errorf("unpin raw_tracepoint: %w", ErrNotSupported) +} diff --git a/vendor/github.com/cilium/ebpf/link/syscalls.go b/vendor/github.com/cilium/ebpf/link/syscalls.go new file mode 100644 index 0000000..a614994 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/syscalls.go @@ -0,0 +1,191 @@ +package link + +import ( + "errors" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/unix" +) + +// Type is the kind of link. +type Type uint32 + +// Valid link types. +// +// Equivalent to enum bpf_link_type. 
+const ( + UnspecifiedType Type = iota + RawTracepointType + TracingType + CgroupType + IterType + NetNsType + XDPType +) + +var haveProgAttach = internal.FeatureTest("BPF_PROG_ATTACH", "4.10", func() error { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Type: ebpf.CGroupSKB, + AttachType: ebpf.AttachCGroupInetIngress, + License: "MIT", + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + }) + if err != nil { + return internal.ErrNotSupported + } + + // BPF_PROG_ATTACH was introduced at the same time as CGgroupSKB, + // so being able to load the program is enough to infer that we + // have the syscall. + prog.Close() + return nil +}) + +var haveProgAttachReplace = internal.FeatureTest("BPF_PROG_ATTACH atomic replacement", "5.5", func() error { + if err := haveProgAttach(); err != nil { + return err + } + + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Type: ebpf.CGroupSKB, + AttachType: ebpf.AttachCGroupInetIngress, + License: "MIT", + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + }) + if err != nil { + return internal.ErrNotSupported + } + defer prog.Close() + + // We know that we have BPF_PROG_ATTACH since we can load CGroupSKB programs. + // If passing BPF_F_REPLACE gives us EINVAL we know that the feature isn't + // present. + attr := internal.BPFProgAttachAttr{ + // We rely on this being checked after attachFlags. 
+ TargetFd: ^uint32(0), + AttachBpfFd: uint32(prog.FD()), + AttachType: uint32(ebpf.AttachCGroupInetIngress), + AttachFlags: uint32(flagReplace), + } + + err = internal.BPFProgAttach(&attr) + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + if errors.Is(err, unix.EBADF) { + return nil + } + return err +}) + +type bpfLinkCreateAttr struct { + progFd uint32 + targetFd uint32 + attachType ebpf.AttachType + flags uint32 + targetBTFID uint32 +} + +func bpfLinkCreate(attr *bpfLinkCreateAttr) (*internal.FD, error) { + ptr, err := internal.BPF(internal.BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return internal.NewFD(uint32(ptr)), nil +} + +type bpfLinkCreateIterAttr struct { + prog_fd uint32 + target_fd uint32 + attach_type ebpf.AttachType + flags uint32 + iter_info internal.Pointer + iter_info_len uint32 +} + +func bpfLinkCreateIter(attr *bpfLinkCreateIterAttr) (*internal.FD, error) { + ptr, err := internal.BPF(internal.BPF_LINK_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err != nil { + return nil, err + } + return internal.NewFD(uint32(ptr)), nil +} + +type bpfLinkUpdateAttr struct { + linkFd uint32 + newProgFd uint32 + flags uint32 + oldProgFd uint32 +} + +func bpfLinkUpdate(attr *bpfLinkUpdateAttr) error { + _, err := internal.BPF(internal.BPF_LINK_UPDATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + return err +} + +var haveBPFLink = internal.FeatureTest("bpf_link", "5.7", func() error { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Type: ebpf.CGroupSKB, + AttachType: ebpf.AttachCGroupInetIngress, + License: "MIT", + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + }) + if err != nil { + return internal.ErrNotSupported + } + defer prog.Close() + + attr := bpfLinkCreateAttr{ + // This is a hopefully invalid file descriptor, which triggers EBADF. 
+ targetFd: ^uint32(0), + progFd: uint32(prog.FD()), + attachType: ebpf.AttachCGroupInetIngress, + } + _, err = bpfLinkCreate(&attr) + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported + } + if errors.Is(err, unix.EBADF) { + return nil + } + return err +}) + +type bpfIterCreateAttr struct { + linkFd uint32 + flags uint32 +} + +func bpfIterCreate(attr *bpfIterCreateAttr) (*internal.FD, error) { + ptr, err := internal.BPF(internal.BPF_ITER_CREATE, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err == nil { + return internal.NewFD(uint32(ptr)), nil + } + return nil, err +} + +type bpfRawTracepointOpenAttr struct { + name internal.Pointer + fd uint32 + _ uint32 +} + +func bpfRawTracepointOpen(attr *bpfRawTracepointOpenAttr) (*internal.FD, error) { + ptr, err := internal.BPF(internal.BPF_RAW_TRACEPOINT_OPEN, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) + if err == nil { + return internal.NewFD(uint32(ptr)), nil + } + return nil, err +} diff --git a/vendor/github.com/cilium/ebpf/link/tracepoint.go b/vendor/github.com/cilium/ebpf/link/tracepoint.go new file mode 100644 index 0000000..7423df8 --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/tracepoint.go @@ -0,0 +1,60 @@ +package link + +import ( + "fmt" + + "github.com/cilium/ebpf" +) + +// Tracepoint attaches the given eBPF program to the tracepoint with the given +// group and name. See /sys/kernel/debug/tracing/events to find available +// tracepoints. The top-level directory is the group, the event's subdirectory +// is the name. Example: +// +// tp, err := Tracepoint("syscalls", "sys_enter_fork", prog) +// +// Losing the reference to the resulting Link (tp) will close the Tracepoint +// and prevent further execution of prog. The Link must be Closed during +// program shutdown to avoid leaking system resources. +// +// Note that attaching eBPF programs to syscalls (sys_enter_*/sys_exit_*) is +// only possible as of kernel 4.14 (commit cf5f5ce). 
+func Tracepoint(group, name string, prog *ebpf.Program) (Link, error) { + if group == "" || name == "" { + return nil, fmt.Errorf("group and name cannot be empty: %w", errInvalidInput) + } + if prog == nil { + return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput) + } + if !rgxTraceEvent.MatchString(group) || !rgxTraceEvent.MatchString(name) { + return nil, fmt.Errorf("group and name '%s/%s' must be alphanumeric or underscore: %w", group, name, errInvalidInput) + } + if prog.Type() != ebpf.TracePoint { + return nil, fmt.Errorf("eBPF program type %s is not a Tracepoint: %w", prog.Type(), errInvalidInput) + } + + tid, err := getTraceEventID(group, name) + if err != nil { + return nil, err + } + + fd, err := openTracepointPerfEvent(tid, perfAllThreads) + if err != nil { + return nil, err + } + + pe := &perfEvent{ + fd: fd, + tracefsID: tid, + group: group, + name: name, + typ: tracepointEvent, + } + + if err := pe.attach(prog); err != nil { + pe.Close() + return nil, err + } + + return pe, nil +} diff --git a/vendor/github.com/cilium/ebpf/link/uprobe.go b/vendor/github.com/cilium/ebpf/link/uprobe.go new file mode 100644 index 0000000..59170ce --- /dev/null +++ b/vendor/github.com/cilium/ebpf/link/uprobe.go @@ -0,0 +1,288 @@ +package link + +import ( + "debug/elf" + "errors" + "fmt" + "os" + "path/filepath" + "regexp" + "sync" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/internal" +) + +var ( + uprobeEventsPath = filepath.Join(tracefsPath, "uprobe_events") + + // rgxUprobeSymbol is used to strip invalid characters from the uprobe symbol + // as they are not allowed to be used as the EVENT token in tracefs. + rgxUprobeSymbol = regexp.MustCompile("[^a-zA-Z0-9]+") + + uprobeRetprobeBit = struct { + once sync.Once + value uint64 + err error + }{} + + // ErrNoSymbol indicates that the given symbol was not found + // in the ELF symbols table. + ErrNoSymbol = errors.New("not found") +) + +// Executable defines an executable program on the filesystem. 
+type Executable struct { + // Path of the executable on the filesystem. + path string + // Parsed ELF symbols and dynamic symbols offsets. + offsets map[string]uint64 +} + +// UprobeOptions defines additional parameters that will be used +// when loading Uprobes. +type UprobeOptions struct { + // Symbol offset. Must be provided in case of external symbols (shared libs). + // If set, overrides the offset eventually parsed from the executable. + Offset uint64 + // Only set the uprobe on the given process ID. Useful when tracing + // shared library calls or programs that have many running instances. + PID int +} + +// To open a new Executable, use: +// +// OpenExecutable("/bin/bash") +// +// The returned value can then be used to open Uprobe(s). +func OpenExecutable(path string) (*Executable, error) { + if path == "" { + return nil, fmt.Errorf("path cannot be empty") + } + + f, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("open file '%s': %w", path, err) + } + defer f.Close() + + se, err := internal.NewSafeELFFile(f) + if err != nil { + return nil, fmt.Errorf("parse ELF file: %w", err) + } + + if se.Type != elf.ET_EXEC && se.Type != elf.ET_DYN { + // ELF is not an executable or a shared object. + return nil, errors.New("the given file is not an executable or a shared object") + } + + ex := Executable{ + path: path, + offsets: make(map[string]uint64), + } + + if err := ex.load(se); err != nil { + return nil, err + } + + return &ex, nil +} + +func (ex *Executable) load(f *internal.SafeELFFile) error { + syms, err := f.Symbols() + if err != nil && !errors.Is(err, elf.ErrNoSymbols) { + return err + } + + dynsyms, err := f.DynamicSymbols() + if err != nil && !errors.Is(err, elf.ErrNoSymbols) { + return err + } + + syms = append(syms, dynsyms...) + + for _, s := range syms { + if elf.ST_TYPE(s.Info) != elf.STT_FUNC { + // Symbol not associated with a function or other executable code. + continue + } + + off := s.Value + + // Loop over ELF segments. 
+ for _, prog := range f.Progs { + // Skip uninteresting segments. + if prog.Type != elf.PT_LOAD || (prog.Flags&elf.PF_X) == 0 { + continue + } + + if prog.Vaddr <= s.Value && s.Value < (prog.Vaddr+prog.Memsz) { + // If the symbol value is contained in the segment, calculate + // the symbol offset. + // + // fn symbol offset = fn symbol VA - .text VA + .text offset + // + // stackoverflow.com/a/40249502 + off = s.Value - prog.Vaddr + prog.Off + break + } + } + + ex.offsets[s.Name] = off + } + + return nil +} + +func (ex *Executable) offset(symbol string) (uint64, error) { + if off, ok := ex.offsets[symbol]; ok { + // Symbols with location 0 from section undef are shared library calls and + // are relocated before the binary is executed. Dynamic linking is not + // implemented by the library, so mark this as unsupported for now. + // + // Since only offset values are stored and not elf.Symbol, if the value is 0, + // assume it's an external symbol. + if off == 0 { + return 0, fmt.Errorf("cannot resolve %s library call '%s', "+ + "consider providing the offset via options: %w", ex.path, symbol, ErrNotSupported) + } + return off, nil + } + return 0, fmt.Errorf("symbol %s: %w", symbol, ErrNoSymbol) +} + +// Uprobe attaches the given eBPF program to a perf event that fires when the +// given symbol starts executing in the given Executable. +// For example, /bin/bash::main(): +// +// ex, _ = OpenExecutable("/bin/bash") +// ex.Uprobe("main", prog, nil) +// +// When using symbols which belongs to shared libraries, +// an offset must be provided via options: +// +// up, err := ex.Uprobe("main", prog, &UprobeOptions{Offset: 0x123}) +// +// Losing the reference to the resulting Link (up) will close the Uprobe +// and prevent further execution of prog. The Link must be Closed during +// program shutdown to avoid leaking system resources. +// +// Functions provided by shared libraries can currently not be traced and +// will result in an ErrNotSupported. 
+func (ex *Executable) Uprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions) (Link, error) { + u, err := ex.uprobe(symbol, prog, opts, false) + if err != nil { + return nil, err + } + + err = u.attach(prog) + if err != nil { + u.Close() + return nil, err + } + + return u, nil +} + +// Uretprobe attaches the given eBPF program to a perf event that fires right +// before the given symbol exits. For example, /bin/bash::main(): +// +// ex, _ = OpenExecutable("/bin/bash") +// ex.Uretprobe("main", prog, nil) +// +// When using symbols which belongs to shared libraries, +// an offset must be provided via options: +// +// up, err := ex.Uretprobe("main", prog, &UprobeOptions{Offset: 0x123}) +// +// Losing the reference to the resulting Link (up) will close the Uprobe +// and prevent further execution of prog. The Link must be Closed during +// program shutdown to avoid leaking system resources. +// +// Functions provided by shared libraries can currently not be traced and +// will result in an ErrNotSupported. +func (ex *Executable) Uretprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions) (Link, error) { + u, err := ex.uprobe(symbol, prog, opts, true) + if err != nil { + return nil, err + } + + err = u.attach(prog) + if err != nil { + u.Close() + return nil, err + } + + return u, nil +} + +// uprobe opens a perf event for the given binary/symbol and attaches prog to it. +// If ret is true, create a uretprobe. 
+func (ex *Executable) uprobe(symbol string, prog *ebpf.Program, opts *UprobeOptions, ret bool) (*perfEvent, error) { + if prog == nil { + return nil, fmt.Errorf("prog cannot be nil: %w", errInvalidInput) + } + if prog.Type() != ebpf.Kprobe { + return nil, fmt.Errorf("eBPF program type %s is not Kprobe: %w", prog.Type(), errInvalidInput) + } + + var offset uint64 + if opts != nil && opts.Offset != 0 { + offset = opts.Offset + } else { + off, err := ex.offset(symbol) + if err != nil { + return nil, err + } + offset = off + } + + pid := perfAllThreads + if opts != nil && opts.PID != 0 { + pid = opts.PID + } + + // Use uprobe PMU if the kernel has it available. + tp, err := pmuUprobe(symbol, ex.path, offset, pid, ret) + if err == nil { + return tp, nil + } + if err != nil && !errors.Is(err, ErrNotSupported) { + return nil, fmt.Errorf("creating perf_uprobe PMU: %w", err) + } + + // Use tracefs if uprobe PMU is missing. + tp, err = tracefsUprobe(uprobeSanitizedSymbol(symbol), ex.path, offset, pid, ret) + if err != nil { + return nil, fmt.Errorf("creating trace event '%s:%s' in tracefs: %w", ex.path, symbol, err) + } + + return tp, nil +} + +// pmuUprobe opens a perf event based on the uprobe PMU. +func pmuUprobe(symbol, path string, offset uint64, pid int, ret bool) (*perfEvent, error) { + return pmuProbe(uprobeType, symbol, path, offset, pid, ret) +} + +// tracefsUprobe creates a Uprobe tracefs entry. +func tracefsUprobe(symbol, path string, offset uint64, pid int, ret bool) (*perfEvent, error) { + return tracefsProbe(uprobeType, symbol, path, offset, pid, ret) +} + +// uprobeSanitizedSymbol replaces every invalid characted for the tracefs api with an underscore. +func uprobeSanitizedSymbol(symbol string) string { + return rgxUprobeSymbol.ReplaceAllString(symbol, "_") +} + +// uprobePathOffset creates the PATH:OFFSET token for the tracefs api. 
+func uprobePathOffset(path string, offset uint64) string { + return fmt.Sprintf("%s:%#x", path, offset) +} + +func uretprobeBit() (uint64, error) { + uprobeRetprobeBit.once.Do(func() { + uprobeRetprobeBit.value, uprobeRetprobeBit.err = determineRetprobeBit(uprobeType) + }) + return uprobeRetprobeBit.value, uprobeRetprobeBit.err +} diff --git a/vendor/github.com/cilium/ebpf/linker.go b/vendor/github.com/cilium/ebpf/linker.go index da556c2..f3b1629 100644 --- a/vendor/github.com/cilium/ebpf/linker.go +++ b/vendor/github.com/cilium/ebpf/linker.go @@ -1,31 +1,59 @@ package ebpf import ( + "fmt" + "github.com/cilium/ebpf/asm" ) // link resolves bpf-to-bpf calls. // -// Each section may contain multiple functions / labels, and is only linked -// if the program being edited references one of these functions. +// Each library may contain multiple functions / labels, and is only linked +// if prog references one of these functions. // -// Sections must not require linking themselves. -func link(insns asm.Instructions, sections ...asm.Instructions) (asm.Instructions, error) { - for _, section := range sections { - var err error - insns, err = linkSection(insns, section) - if err != nil { - return nil, err +// Libraries also linked. +func link(prog *ProgramSpec, libs []*ProgramSpec) error { + var ( + linked = make(map[*ProgramSpec]bool) + pending = []asm.Instructions{prog.Instructions} + insns asm.Instructions + ) + for len(pending) > 0 { + insns, pending = pending[0], pending[1:] + for _, lib := range libs { + if linked[lib] { + continue + } + + needed, err := needSection(insns, lib.Instructions) + if err != nil { + return fmt.Errorf("linking %s: %w", lib.Name, err) + } + + if !needed { + continue + } + + linked[lib] = true + prog.Instructions = append(prog.Instructions, lib.Instructions...) 
+ pending = append(pending, lib.Instructions) + + if prog.BTF != nil && lib.BTF != nil { + if err := prog.BTF.Append(lib.BTF); err != nil { + return fmt.Errorf("linking BTF of %s: %w", lib.Name, err) + } + } } } - return insns, nil + + return nil } -func linkSection(insns, section asm.Instructions) (asm.Instructions, error) { +func needSection(insns, section asm.Instructions) (bool, error) { // A map of symbols to the libraries which contain them. symbols, err := section.SymbolOffsets() if err != nil { - return nil, err + return false, err } for _, ins := range insns { @@ -33,7 +61,7 @@ func linkSection(insns, section asm.Instructions) (asm.Instructions, error) { continue } - if ins.OpCode.JumpOp() != asm.Call || ins.Src != asm.R1 { + if ins.OpCode.JumpOp() != asm.Call || ins.Src != asm.PseudoCall { continue } @@ -48,11 +76,84 @@ func linkSection(insns, section asm.Instructions) (asm.Instructions, error) { } // At this point we know that at least one function in the - // library is called from insns. Merge the two sections. - // The rewrite of ins.Constant happens in asm.Instruction.Marshal. - return append(insns, section...), nil + // library is called from insns, so we have to link it. + return true, nil } - // None of the functions in the section are called. Do nothing. - return insns, nil + // None of the functions in the section are called. 
+ return false, nil +} + +func fixupJumpsAndCalls(insns asm.Instructions) error { + symbolOffsets := make(map[string]asm.RawInstructionOffset) + iter := insns.Iterate() + for iter.Next() { + ins := iter.Ins + + if ins.Symbol == "" { + continue + } + + if _, ok := symbolOffsets[ins.Symbol]; ok { + return fmt.Errorf("duplicate symbol %s", ins.Symbol) + } + + symbolOffsets[ins.Symbol] = iter.Offset + } + + iter = insns.Iterate() + for iter.Next() { + i := iter.Index + offset := iter.Offset + ins := iter.Ins + + if ins.Reference == "" { + continue + } + + switch { + case ins.IsFunctionCall() && ins.Constant == -1: + // Rewrite bpf to bpf call + callOffset, ok := symbolOffsets[ins.Reference] + if !ok { + return fmt.Errorf("call at %d: reference to missing symbol %q", i, ins.Reference) + } + + ins.Constant = int64(callOffset - offset - 1) + + case ins.OpCode.Class() == asm.JumpClass && ins.Offset == -1: + // Rewrite jump to label + jumpOffset, ok := symbolOffsets[ins.Reference] + if !ok { + return fmt.Errorf("jump at %d: reference to missing symbol %q", i, ins.Reference) + } + + ins.Offset = int16(jumpOffset - offset - 1) + + case ins.IsLoadFromMap() && ins.MapPtr() == -1: + return fmt.Errorf("map %s: %w", ins.Reference, errUnsatisfiedReference) + } + } + + // fixupBPFCalls replaces bpf_probe_read_{kernel,user}[_str] with bpf_probe_read[_str] on older kernels + // https://github.com/libbpf/libbpf/blob/master/src/libbpf.c#L6009 + iter = insns.Iterate() + for iter.Next() { + ins := iter.Ins + if !ins.IsBuiltinCall() { + continue + } + switch asm.BuiltinFunc(ins.Constant) { + case asm.FnProbeReadKernel, asm.FnProbeReadUser: + if err := haveProbeReadKernel(); err != nil { + ins.Constant = int64(asm.FnProbeRead) + } + case asm.FnProbeReadKernelStr, asm.FnProbeReadUserStr: + if err := haveProbeReadKernel(); err != nil { + ins.Constant = int64(asm.FnProbeReadStr) + } + } + } + + return nil } diff --git a/vendor/github.com/cilium/ebpf/map.go 
b/vendor/github.com/cilium/ebpf/map.go index 028a913..cca387e 100644 --- a/vendor/github.com/cilium/ebpf/map.go +++ b/vendor/github.com/cilium/ebpf/map.go @@ -1,15 +1,39 @@ package ebpf import ( + "bytes" + "errors" "fmt" - "unsafe" + "io" + "path/filepath" + "reflect" + "strings" "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/btf" "github.com/cilium/ebpf/internal/unix" - - "github.com/pkg/errors" ) +// Errors returned by Map and MapIterator methods. +var ( + ErrKeyNotExist = errors.New("key does not exist") + ErrKeyExist = errors.New("key already exists") + ErrIterationAborted = errors.New("iteration aborted") + ErrMapIncompatible = errors.New("map's spec is incompatible with pinned map") +) + +// MapOptions control loading a map into the kernel. +type MapOptions struct { + // The base path to pin maps in if requested via PinByName. + // Existing maps will be re-used if they are compatible, otherwise an + // error is returned. + PinPath string + LoadPinOptions LoadPinOptions +} + +// MapID represents the unique ID of an eBPF map +type MapID uint32 + // MapSpec defines a Map. type MapSpec struct { // Name is passed to the kernel as a debug aid. Must only contain @@ -19,9 +43,36 @@ type MapSpec struct { KeySize uint32 ValueSize uint32 MaxEntries uint32 - Flags uint32 + + // Flags is passed to the kernel and specifies additional map + // creation attributes. + Flags uint32 + + // Automatically pin and load a map from MapOptions.PinPath. + // Generates an error if an existing pinned map is incompatible with the MapSpec. + Pinning PinType + + // Specify numa node during map creation + // (effective only if unix.BPF_F_NUMA_NODE flag is set, + // which can be imported from golang.org/x/sys/unix) + NumaNode uint32 + + // The initial contents of the map. May be nil. + Contents []MapKV + + // Whether to freeze a map after setting its initial contents. 
+ Freeze bool + // InnerMap is used as a template for ArrayOfMaps and HashOfMaps InnerMap *MapSpec + + // Extra trailing bytes found in the ELF map definition when using structs + // larger than libbpf's bpf_map_def. Must be empty before instantiating + // the MapSpec into a Map. + Extra bytes.Reader + + // The BTF associated with this map. + BTF *btf.Map } func (ms *MapSpec) String() string { @@ -29,16 +80,66 @@ func (ms *MapSpec) String() string { } // Copy returns a copy of the spec. +// +// MapSpec.Contents is a shallow copy. func (ms *MapSpec) Copy() *MapSpec { if ms == nil { return nil } cpy := *ms + + cpy.Contents = make([]MapKV, len(ms.Contents)) + copy(cpy.Contents, ms.Contents) + cpy.InnerMap = ms.InnerMap.Copy() + return &cpy } +func (ms *MapSpec) clampPerfEventArraySize() error { + if ms.Type != PerfEventArray { + return nil + } + + n, err := internal.PossibleCPUs() + if err != nil { + return fmt.Errorf("perf event array: %w", err) + } + + if n := uint32(n); ms.MaxEntries > n { + ms.MaxEntries = n + } + + return nil +} + +// MapKV is used to initialize the contents of a Map. +type MapKV struct { + Key interface{} + Value interface{} +} + +func (ms *MapSpec) checkCompatibility(m *Map) error { + switch { + case m.typ != ms.Type: + return fmt.Errorf("expected type %v, got %v: %w", ms.Type, m.typ, ErrMapIncompatible) + + case m.keySize != ms.KeySize: + return fmt.Errorf("expected key size %v, got %v: %w", ms.KeySize, m.keySize, ErrMapIncompatible) + + case m.valueSize != ms.ValueSize: + return fmt.Errorf("expected value size %v, got %v: %w", ms.ValueSize, m.valueSize, ErrMapIncompatible) + + case m.maxEntries != ms.MaxEntries: + return fmt.Errorf("expected max entries %v, got %v: %w", ms.MaxEntries, m.maxEntries, ErrMapIncompatible) + + case m.flags != ms.Flags: + return fmt.Errorf("expected flags %v, got %v: %w", ms.Flags, m.flags, ErrMapIncompatible) + } + return nil +} + // Map represents a Map file descriptor. 
// // It is not safe to close a map which is used by other goroutines. @@ -49,8 +150,14 @@ func (ms *MapSpec) Copy() *MapSpec { // Implement encoding.BinaryMarshaler or encoding.BinaryUnmarshaler // if you require custom encoding. type Map struct { - fd *bpfFD - abi MapABI + name string + fd *internal.FD + typ MapType + keySize uint32 + valueSize uint32 + maxEntries uint32 + flags uint32 + pinnedPath string // Per CPU maps return values larger than the size in the spec fullValueSize int } @@ -62,110 +169,270 @@ func NewMapFromFD(fd int) (*Map, error) { if fd < 0 { return nil, errors.New("invalid fd") } - bpfFd := newBPFFD(uint32(fd)) - abi, err := newMapABIFromFd(bpfFd) + return newMapFromFD(internal.NewFD(uint32(fd))) +} + +func newMapFromFD(fd *internal.FD) (*Map, error) { + info, err := newMapInfoFromFd(fd) if err != nil { - bpfFd.forget() - return nil, err + fd.Close() + return nil, fmt.Errorf("get map info: %s", err) } - return newMap(bpfFd, abi) + + return newMap(fd, info.Name, info.Type, info.KeySize, info.ValueSize, info.MaxEntries, info.Flags) } // NewMap creates a new Map. // +// It's equivalent to calling NewMapWithOptions with default options. +func NewMap(spec *MapSpec) (*Map, error) { + return NewMapWithOptions(spec, MapOptions{}) +} + +// NewMapWithOptions creates a new Map. +// // Creating a map for the first time will perform feature detection // by creating small, temporary maps. -func NewMap(spec *MapSpec) (*Map, error) { - if spec.Type != ArrayOfMaps && spec.Type != HashOfMaps { - return createMap(spec, nil) +// +// The caller is responsible for ensuring the process' rlimit is set +// sufficiently high for locking memory during map creation. This can be done +// by calling rlimit.RemoveMemlock() prior to calling NewMapWithOptions. +// +// May return an error wrapping ErrMapIncompatible. 
+func NewMapWithOptions(spec *MapSpec, opts MapOptions) (*Map, error) { + handles := newHandleCache() + defer handles.close() + + m, err := newMapWithOptions(spec, opts, handles) + if err != nil { + return nil, fmt.Errorf("creating map: %w", err) } - if spec.InnerMap == nil { - return nil, errors.Errorf("%s requires InnerMap", spec.Type) + err = m.finalize(spec) + if err != nil { + return nil, fmt.Errorf("populating map: %w", err) } - template, err := createMap(spec.InnerMap, nil) + return m, nil +} + +func newMapWithOptions(spec *MapSpec, opts MapOptions, handles *handleCache) (_ *Map, err error) { + closeOnError := func(c io.Closer) { + if err != nil { + c.Close() + } + } + + switch spec.Pinning { + case PinByName: + if spec.Name == "" { + return nil, fmt.Errorf("pin by name: missing Name") + } + + if opts.PinPath == "" { + return nil, fmt.Errorf("pin by name: missing MapOptions.PinPath") + } + + path := filepath.Join(opts.PinPath, spec.Name) + m, err := LoadPinnedMap(path, &opts.LoadPinOptions) + if errors.Is(err, unix.ENOENT) { + break + } + if err != nil { + return nil, fmt.Errorf("load pinned map: %w", err) + } + defer closeOnError(m) + + if err := spec.checkCompatibility(m); err != nil { + return nil, fmt.Errorf("use pinned map %s: %w", spec.Name, err) + } + + return m, nil + + case PinNone: + // Nothing to do here + + default: + return nil, fmt.Errorf("pin type %d: %w", int(spec.Pinning), ErrNotSupported) + } + + var innerFd *internal.FD + if spec.Type == ArrayOfMaps || spec.Type == HashOfMaps { + if spec.InnerMap == nil { + return nil, fmt.Errorf("%s requires InnerMap", spec.Type) + } + + if spec.InnerMap.Pinning != PinNone { + return nil, errors.New("inner maps cannot be pinned") + } + + template, err := spec.InnerMap.createMap(nil, opts, handles) + if err != nil { + return nil, fmt.Errorf("inner map: %w", err) + } + defer template.Close() + + // Intentionally skip populating and freezing (finalizing) + // the inner map template since it will be removed 
shortly. + + innerFd = template.fd + } + + m, err := spec.createMap(innerFd, opts, handles) if err != nil { return nil, err } - defer template.Close() + defer closeOnError(m) - return createMap(spec, template.fd) + if spec.Pinning == PinByName { + path := filepath.Join(opts.PinPath, spec.Name) + if err := m.Pin(path); err != nil { + return nil, fmt.Errorf("pin map: %s", err) + } + } + + return m, nil } -func createMap(spec *MapSpec, inner *bpfFD) (*Map, error) { +// createMap validates the spec's properties and creates the map in the kernel +// using the given opts. It does not populate or freeze the map. +func (spec *MapSpec) createMap(inner *internal.FD, opts MapOptions, handles *handleCache) (_ *Map, err error) { + closeOnError := func(closer io.Closer) { + if err != nil { + closer.Close() + } + } + spec = spec.Copy() + // Kernels 4.13 through 5.4 used a struct bpf_map_def that contained + // additional 'inner_map_idx' and later 'numa_node' fields. + // In order to support loading these definitions, tolerate the presence of + // extra bytes, but require them to be zeroes. 
+ if _, err := io.Copy(internal.DiscardZeroes{}, &spec.Extra); err != nil { + return nil, errors.New("extra contains unhandled non-zero bytes, drain before creating map") + } + switch spec.Type { - case ArrayOfMaps: - fallthrough - case HashOfMaps: + case ArrayOfMaps, HashOfMaps: + if err := haveNestedMaps(); err != nil { + return nil, err + } + if spec.ValueSize != 0 && spec.ValueSize != 4 { - return nil, errors.Errorf("ValueSize must be zero or four for map of map") + return nil, errors.New("ValueSize must be zero or four for map of map") } spec.ValueSize = 4 case PerfEventArray: - if spec.KeySize != 0 { - return nil, errors.Errorf("KeySize must be zero for perf event array") + if spec.KeySize != 0 && spec.KeySize != 4 { + return nil, errors.New("KeySize must be zero or four for perf event array") } - if spec.ValueSize != 0 { - return nil, errors.Errorf("ValueSize must be zero for perf event array") + spec.KeySize = 4 + + if spec.ValueSize != 0 && spec.ValueSize != 4 { + return nil, errors.New("ValueSize must be zero or four for perf event array") } + spec.ValueSize = 4 + if spec.MaxEntries == 0 { - n, err := internal.OnlineCPUs() + n, err := internal.PossibleCPUs() if err != nil { - return nil, errors.Wrap(err, "perf event array") + return nil, fmt.Errorf("perf event array: %w", err) } spec.MaxEntries = uint32(n) } - - spec.KeySize = 4 - spec.ValueSize = 4 } - attr := bpfMapCreateAttr{ - mapType: spec.Type, - keySize: spec.KeySize, - valueSize: spec.ValueSize, - maxEntries: spec.MaxEntries, - flags: spec.Flags, + if spec.Flags&(unix.BPF_F_RDONLY_PROG|unix.BPF_F_WRONLY_PROG) > 0 || spec.Freeze { + if err := haveMapMutabilityModifiers(); err != nil { + return nil, fmt.Errorf("map create: %w", err) + } + } + if spec.Flags&unix.BPF_F_MMAPABLE > 0 { + if err := haveMmapableMaps(); err != nil { + return nil, fmt.Errorf("map create: %w", err) + } + } + if spec.Flags&unix.BPF_F_INNER_MAP > 0 { + if err := haveInnerMaps(); err != nil { + return nil, fmt.Errorf("map 
create: %w", err) + } + } + + attr := internal.BPFMapCreateAttr{ + MapType: uint32(spec.Type), + KeySize: spec.KeySize, + ValueSize: spec.ValueSize, + MaxEntries: spec.MaxEntries, + Flags: spec.Flags, + NumaNode: spec.NumaNode, } if inner != nil { var err error - attr.innerMapFd, err = inner.value() + attr.InnerMapFd, err = inner.Value() if err != nil { - return nil, errors.Wrap(err, "map create") + return nil, fmt.Errorf("map create: %w", err) } } - name, err := newBPFObjName(spec.Name) + if haveObjName() == nil { + attr.MapName = internal.NewBPFObjName(spec.Name) + } + + var btfDisabled bool + if spec.BTF != nil { + handle, err := handles.btfHandle(spec.BTF.Spec) + btfDisabled = errors.Is(err, btf.ErrNotSupported) + if err != nil && !btfDisabled { + return nil, fmt.Errorf("load BTF: %w", err) + } + + if handle != nil { + attr.BTFFd = uint32(handle.FD()) + attr.BTFKeyTypeID = uint32(spec.BTF.Key.ID()) + attr.BTFValueTypeID = uint32(spec.BTF.Value.ID()) + } + } + + fd, err := internal.BPFMapCreate(&attr) if err != nil { - return nil, errors.Wrap(err, "map create") + if errors.Is(err, unix.EPERM) { + return nil, fmt.Errorf("map create: %w (MEMLOCK bay be too low, consider rlimit.RemoveMemlock)", err) + } + if btfDisabled { + return nil, fmt.Errorf("map create without BTF: %w", err) + } + return nil, fmt.Errorf("map create: %w", err) } + defer closeOnError(fd) - if haveObjName.Result() { - attr.mapName = name - } - - fd, err := bpfMapCreate(&attr) + m, err := newMap(fd, spec.Name, spec.Type, spec.KeySize, spec.ValueSize, spec.MaxEntries, spec.Flags) if err != nil { - return nil, errors.Wrap(err, "map create") + return nil, fmt.Errorf("map create: %w", err) } - return newMap(fd, newMapABIFromSpec(spec)) + return m, nil } -func newMap(fd *bpfFD, abi *MapABI) (*Map, error) { +// newMap allocates and returns a new Map structure. +// Sets the fullValueSize on per-CPU maps. 
+func newMap(fd *internal.FD, name string, typ MapType, keySize, valueSize, maxEntries, flags uint32) (*Map, error) { m := &Map{ + name, fd, - *abi, - int(abi.ValueSize), + typ, + keySize, + valueSize, + maxEntries, + flags, + "", + int(valueSize), } - if !abi.Type.hasPerCPUValue() { + if !typ.hasPerCPUValue() { return m, nil } @@ -174,17 +441,45 @@ func newMap(fd *bpfFD, abi *MapABI) (*Map, error) { return nil, err } - m.fullValueSize = align(int(abi.ValueSize), 8) * possibleCPUs + m.fullValueSize = internal.Align(int(valueSize), 8) * possibleCPUs return m, nil } func (m *Map) String() string { - return fmt.Sprintf("%s#%d", m.abi.Type, m.fd) + if m.name != "" { + return fmt.Sprintf("%s(%s)#%v", m.typ, m.name, m.fd) + } + return fmt.Sprintf("%s#%v", m.typ, m.fd) } -// ABI gets the ABI of the Map -func (m *Map) ABI() MapABI { - return m.abi +// Type returns the underlying type of the map. +func (m *Map) Type() MapType { + return m.typ +} + +// KeySize returns the size of the map key in bytes. +func (m *Map) KeySize() uint32 { + return m.keySize +} + +// ValueSize returns the size of the map value in bytes. +func (m *Map) ValueSize() uint32 { + return m.valueSize +} + +// MaxEntries returns the maximum number of elements the map can hold. +func (m *Map) MaxEntries() uint32 { + return m.maxEntries +} + +// Flags returns the flags of the map. +func (m *Map) Flags() uint32 { + return m.flags +} + +// Info returns metadata about the map. +func (m *Map) Info() (*MapInfo, error) { + return newMapInfoFromFd(m.fd) } // Lookup retrieves a value from a Map. @@ -192,54 +487,32 @@ func (m *Map) ABI() MapABI { // Calls Close() on valueOut if it is of type **Map or **Program, // and *valueOut is not nil. // -// Returns an error if the key doesn't exist, see IsNotExist. +// Returns an error if the key doesn't exist, see ErrKeyNotExist. 
func (m *Map) Lookup(key, valueOut interface{}) error { valuePtr, valueBytes := makeBuffer(valueOut, m.fullValueSize) - if err := m.lookup(key, valuePtr); err != nil { return err } - if valueBytes == nil { - return nil + return m.unmarshalValue(valueOut, valueBytes) +} + +// LookupAndDelete retrieves and deletes a value from a Map. +// +// Returns ErrKeyNotExist if the key doesn't exist. +func (m *Map) LookupAndDelete(key, valueOut interface{}) error { + valuePtr, valueBytes := makeBuffer(valueOut, m.fullValueSize) + + keyPtr, err := m.marshalKey(key) + if err != nil { + return fmt.Errorf("can't marshal key: %w", err) } - if m.abi.Type.hasPerCPUValue() { - return unmarshalPerCPUValue(valueOut, int(m.abi.ValueSize), valueBytes) + if err := bpfMapLookupAndDelete(m.fd, keyPtr, valuePtr); err != nil { + return fmt.Errorf("lookup and delete failed: %w", err) } - switch value := valueOut.(type) { - case **Map: - m, err := unmarshalMap(valueBytes) - if err != nil { - return err - } - - (*value).Close() - *value = m - return nil - case *Map: - return errors.Errorf("can't unmarshal into %T, need %T", value, (**Map)(nil)) - case Map: - return errors.Errorf("can't unmarshal into %T, need %T", value, (**Map)(nil)) - - case **Program: - p, err := unmarshalProgram(valueBytes) - if err != nil { - return err - } - - (*value).Close() - *value = p - return nil - case *Program: - return errors.Errorf("can't unmarshal into %T, need %T", value, (**Program)(nil)) - case Program: - return errors.Errorf("can't unmarshal into %T, need %T", value, (**Program)(nil)) - - default: - return unmarshalBytes(valueOut, valueBytes) - } + return m.unmarshalValue(valueOut, valueBytes) } // LookupBytes gets a value from Map. @@ -247,24 +520,26 @@ func (m *Map) Lookup(key, valueOut interface{}) error { // Returns a nil value if a key doesn't exist. 
func (m *Map) LookupBytes(key interface{}) ([]byte, error) { valueBytes := make([]byte, m.fullValueSize) - valuePtr := newPtr(unsafe.Pointer(&valueBytes[0])) + valuePtr := internal.NewSlicePointer(valueBytes) err := m.lookup(key, valuePtr) - if IsNotExist(err) { + if errors.Is(err, ErrKeyNotExist) { return nil, nil } return valueBytes, err } -func (m *Map) lookup(key interface{}, valueOut syscallPtr) error { - keyPtr, err := marshalPtr(key, int(m.abi.KeySize)) +func (m *Map) lookup(key interface{}, valueOut internal.Pointer) error { + keyPtr, err := m.marshalKey(key) if err != nil { - return errors.WithMessage(err, "can't marshal key") + return fmt.Errorf("can't marshal key: %w", err) } - err = bpfMapLookupElem(m.fd, keyPtr, valueOut) - return errors.WithMessage(err, "lookup failed") + if err = bpfMapLookupElem(m.fd, keyPtr, valueOut); err != nil { + return fmt.Errorf("lookup failed: %w", err) + } + return nil } // MapUpdateFlags controls the behaviour of the Map.Update call. @@ -290,53 +565,54 @@ func (m *Map) Put(key, value interface{}) error { // Update changes the value of a key. func (m *Map) Update(key, value interface{}, flags MapUpdateFlags) error { - keyPtr, err := marshalPtr(key, int(m.abi.KeySize)) + keyPtr, err := m.marshalKey(key) if err != nil { - return errors.WithMessage(err, "can't marshal key") + return fmt.Errorf("can't marshal key: %w", err) } - var valuePtr syscallPtr - if m.abi.Type.hasPerCPUValue() { - valuePtr, err = marshalPerCPUValue(value, int(m.abi.ValueSize)) - } else { - valuePtr, err = marshalPtr(value, int(m.abi.ValueSize)) - } + valuePtr, err := m.marshalValue(value) if err != nil { - return errors.WithMessage(err, "can't marshal value") + return fmt.Errorf("can't marshal value: %w", err) } - return bpfMapUpdateElem(m.fd, keyPtr, valuePtr, uint64(flags)) + if err = bpfMapUpdateElem(m.fd, keyPtr, valuePtr, uint64(flags)); err != nil { + return fmt.Errorf("update failed: %w", err) + } + + return nil } // Delete removes a value. 
// -// Returns an error if the key does not exist, see IsNotExist. +// Returns ErrKeyNotExist if the key does not exist. func (m *Map) Delete(key interface{}) error { - keyPtr, err := marshalPtr(key, int(m.abi.KeySize)) + keyPtr, err := m.marshalKey(key) if err != nil { - return errors.WithMessage(err, "can't marshal key") + return fmt.Errorf("can't marshal key: %w", err) } - err = bpfMapDeleteElem(m.fd, keyPtr) - return errors.WithMessage(err, "can't delete key") + if err = bpfMapDeleteElem(m.fd, keyPtr); err != nil { + return fmt.Errorf("delete failed: %w", err) + } + return nil } // NextKey finds the key following an initial key. // // See NextKeyBytes for details. +// +// Returns ErrKeyNotExist if there is no next key. func (m *Map) NextKey(key, nextKeyOut interface{}) error { - nextKeyPtr, nextKeyBytes := makeBuffer(nextKeyOut, int(m.abi.KeySize)) + nextKeyPtr, nextKeyBytes := makeBuffer(nextKeyOut, int(m.keySize)) if err := m.nextKey(key, nextKeyPtr); err != nil { return err } - if nextKeyBytes == nil { - return nil + if err := m.unmarshalKey(nextKeyOut, nextKeyBytes); err != nil { + return fmt.Errorf("can't unmarshal next key: %w", err) } - - err := unmarshalBytes(nextKeyOut, nextKeyBytes) - return errors.WithMessage(err, "can't unmarshal next key") + return nil } // NextKeyBytes returns the key following an initial key as a byte slice. @@ -344,33 +620,189 @@ func (m *Map) NextKey(key, nextKeyOut interface{}) error { // Passing nil will return the first key. // // Use Iterate if you want to traverse all entries in the map. +// +// Returns nil if there are no more keys. 
func (m *Map) NextKeyBytes(key interface{}) ([]byte, error) { - nextKey := make([]byte, m.abi.KeySize) - nextKeyPtr := newPtr(unsafe.Pointer(&nextKey[0])) + nextKey := make([]byte, m.keySize) + nextKeyPtr := internal.NewSlicePointer(nextKey) err := m.nextKey(key, nextKeyPtr) - if IsNotExist(err) { + if errors.Is(err, ErrKeyNotExist) { return nil, nil } return nextKey, err } -func (m *Map) nextKey(key interface{}, nextKeyOut syscallPtr) error { +func (m *Map) nextKey(key interface{}, nextKeyOut internal.Pointer) error { var ( - keyPtr syscallPtr + keyPtr internal.Pointer err error ) if key != nil { - keyPtr, err = marshalPtr(key, int(m.abi.KeySize)) + keyPtr, err = m.marshalKey(key) if err != nil { - return errors.WithMessage(err, "can't marshal key") + return fmt.Errorf("can't marshal key: %w", err) } } - err = bpfMapGetNextKey(m.fd, keyPtr, nextKeyOut) - return errors.WithMessage(err, "can't get next key") + if err = bpfMapGetNextKey(m.fd, keyPtr, nextKeyOut); err != nil { + return fmt.Errorf("next key failed: %w", err) + } + return nil +} + +// BatchLookup looks up many elements in a map at once. +// +// "keysOut" and "valuesOut" must be of type slice, a pointer +// to a slice or buffer will not work. +// "prevKey" is the key to start the batch lookup from, it will +// *not* be included in the results. Use nil to start at the first key. +// +// ErrKeyNotExist is returned when the batch lookup has reached +// the end of all possible results, even when partial results +// are returned. It should be used to evaluate when lookup is "done". +func (m *Map) BatchLookup(prevKey, nextKeyOut, keysOut, valuesOut interface{}, opts *BatchOptions) (int, error) { + return m.batchLookup(internal.BPF_MAP_LOOKUP_BATCH, prevKey, nextKeyOut, keysOut, valuesOut, opts) +} + +// BatchLookupAndDelete looks up many elements in a map at once, +// +// It then deletes all those elements. +// "keysOut" and "valuesOut" must be of type slice, a pointer +// to a slice or buffer will not work. 
+// "prevKey" is the key to start the batch lookup from, it will +// *not* be included in the results. Use nil to start at the first key. +// +// ErrKeyNotExist is returned when the batch lookup has reached +// the end of all possible results, even when partial results +// are returned. It should be used to evaluate when lookup is "done". +func (m *Map) BatchLookupAndDelete(prevKey, nextKeyOut, keysOut, valuesOut interface{}, opts *BatchOptions) (int, error) { + return m.batchLookup(internal.BPF_MAP_LOOKUP_AND_DELETE_BATCH, prevKey, nextKeyOut, keysOut, valuesOut, opts) +} + +func (m *Map) batchLookup(cmd internal.BPFCmd, startKey, nextKeyOut, keysOut, valuesOut interface{}, opts *BatchOptions) (int, error) { + if err := haveBatchAPI(); err != nil { + return 0, err + } + if m.typ.hasPerCPUValue() { + return 0, ErrNotSupported + } + keysValue := reflect.ValueOf(keysOut) + if keysValue.Kind() != reflect.Slice { + return 0, fmt.Errorf("keys must be a slice") + } + valuesValue := reflect.ValueOf(valuesOut) + if valuesValue.Kind() != reflect.Slice { + return 0, fmt.Errorf("valuesOut must be a slice") + } + count := keysValue.Len() + if count != valuesValue.Len() { + return 0, fmt.Errorf("keysOut and valuesOut must be the same length") + } + keyBuf := make([]byte, count*int(m.keySize)) + keyPtr := internal.NewSlicePointer(keyBuf) + valueBuf := make([]byte, count*int(m.fullValueSize)) + valuePtr := internal.NewSlicePointer(valueBuf) + + var ( + startPtr internal.Pointer + err error + retErr error + ) + if startKey != nil { + startPtr, err = marshalPtr(startKey, int(m.keySize)) + if err != nil { + return 0, err + } + } + nextPtr, nextBuf := makeBuffer(nextKeyOut, int(m.keySize)) + + ct, err := bpfMapBatch(cmd, m.fd, startPtr, nextPtr, keyPtr, valuePtr, uint32(count), opts) + if err != nil { + if !errors.Is(err, ErrKeyNotExist) { + return 0, err + } + retErr = ErrKeyNotExist + } + + err = m.unmarshalKey(nextKeyOut, nextBuf) + if err != nil { + return 0, err + } + err = 
unmarshalBytes(keysOut, keyBuf) + if err != nil { + return 0, err + } + err = unmarshalBytes(valuesOut, valueBuf) + if err != nil { + retErr = err + } + return int(ct), retErr +} + +// BatchUpdate updates the map with multiple keys and values +// simultaneously. +// "keys" and "values" must be of type slice, a pointer +// to a slice or buffer will not work. +func (m *Map) BatchUpdate(keys, values interface{}, opts *BatchOptions) (int, error) { + if err := haveBatchAPI(); err != nil { + return 0, err + } + if m.typ.hasPerCPUValue() { + return 0, ErrNotSupported + } + keysValue := reflect.ValueOf(keys) + if keysValue.Kind() != reflect.Slice { + return 0, fmt.Errorf("keys must be a slice") + } + valuesValue := reflect.ValueOf(values) + if valuesValue.Kind() != reflect.Slice { + return 0, fmt.Errorf("values must be a slice") + } + var ( + count = keysValue.Len() + valuePtr internal.Pointer + err error + ) + if count != valuesValue.Len() { + return 0, fmt.Errorf("keys and values must be the same length") + } + keyPtr, err := marshalPtr(keys, count*int(m.keySize)) + if err != nil { + return 0, err + } + valuePtr, err = marshalPtr(values, count*int(m.valueSize)) + if err != nil { + return 0, err + } + var nilPtr internal.Pointer + ct, err := bpfMapBatch(internal.BPF_MAP_UPDATE_BATCH, m.fd, nilPtr, nilPtr, keyPtr, valuePtr, uint32(count), opts) + return int(ct), err +} + +// BatchDelete batch deletes entries in the map by keys. +// "keys" must be of type slice, a pointer to a slice or buffer will not work. 
+func (m *Map) BatchDelete(keys interface{}, opts *BatchOptions) (int, error) { + if err := haveBatchAPI(); err != nil { + return 0, err + } + if m.typ.hasPerCPUValue() { + return 0, ErrNotSupported + } + keysValue := reflect.ValueOf(keys) + if keysValue.Kind() != reflect.Slice { + return 0, fmt.Errorf("keys must be a slice") + } + count := keysValue.Len() + keyPtr, err := marshalPtr(keys, count*int(m.keySize)) + if err != nil { + return 0, fmt.Errorf("cannot marshal keys: %v", err) + } + var nilPtr internal.Pointer + ct, err := bpfMapBatch(internal.BPF_MAP_DELETE_BATCH, m.fd, nilPtr, nilPtr, keyPtr, nilPtr, uint32(count), opts) + return int(ct), err } // Iterate traverses a map. @@ -391,14 +823,14 @@ func (m *Map) Close() error { return nil } - return m.fd.close() + return m.fd.Close() } // FD gets the file descriptor of the Map. // // Calling this function is invalid after Close has been called. func (m *Map) FD() int { - fd, err := m.fd.value() + fd, err := m.fd.Value() if err != nil { // Best effort: -1 is the number most likely to be an // invalid file descriptor. @@ -412,6 +844,7 @@ func (m *Map) FD() int { // // Closing the duplicate does not affect the original, and vice versa. // Changes made to the map are reflected by both instances however. +// If the original map was pinned, the cloned map will not be pinned by default. // // Cloning a nil Map returns nil. func (m *Map) Clone() (*Map, error) { @@ -419,72 +852,236 @@ func (m *Map) Clone() (*Map, error) { return nil, nil } - dup, err := m.fd.dup() + dup, err := m.fd.Dup() if err != nil { - return nil, errors.Wrap(err, "can't clone map") + return nil, fmt.Errorf("can't clone map: %w", err) } - return newMap(dup, &m.abi) + return &Map{ + m.name, + dup, + m.typ, + m.keySize, + m.valueSize, + m.maxEntries, + m.flags, + "", + m.fullValueSize, + }, nil } -// Pin persists the map past the lifetime of the process that created it. 
+// Pin persists the map on the BPF virtual file system past the lifetime of +// the process that created it . // -// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional +// Calling Pin on a previously pinned map will overwrite the path, except when +// the new path already exists. Re-pinning across filesystems is not supported. +// You can Clone a map to pin it to a different path. +// +// This requires bpffs to be mounted above fileName. See https://docs.cilium.io/en/k8s-doc/admin/#admin-mount-bpffs func (m *Map) Pin(fileName string) error { - return bpfPinObject(fileName, m.fd) + if err := internal.Pin(m.pinnedPath, fileName, m.fd); err != nil { + return err + } + m.pinnedPath = fileName + return nil } -// LoadPinnedMap load a Map from a BPF file. +// Unpin removes the persisted state for the map from the BPF virtual filesystem. // -// Requires at least Linux 4.13, and is not compatible with -// nested maps. Use LoadPinnedMapExplicit in these situations. -func LoadPinnedMap(fileName string) (*Map, error) { - fd, err := bpfGetObject(fileName) - if err != nil { - return nil, err +// Failed calls to Unpin will not alter the state returned by IsPinned. +// +// Unpinning an unpinned Map returns nil. +func (m *Map) Unpin() error { + if err := internal.Unpin(m.pinnedPath); err != nil { + return err } - abi, err := newMapABIFromFd(fd) - if err != nil { - _ = fd.close() - return nil, err - } - return newMap(fd, abi) + m.pinnedPath = "" + return nil } -// LoadPinnedMapExplicit loads a map with explicit parameters. -func LoadPinnedMapExplicit(fileName string, abi *MapABI) (*Map, error) { - fd, err := bpfGetObject(fileName) +// IsPinned returns true if the map has a non-empty pinned path. +func (m *Map) IsPinned() bool { + return m.pinnedPath != "" +} + +// Freeze prevents a map to be modified from user space. +// +// It makes no changes to kernel-side restrictions. 
+func (m *Map) Freeze() error { + if err := haveMapMutabilityModifiers(); err != nil { + return fmt.Errorf("can't freeze map: %w", err) + } + + if err := bpfMapFreeze(m.fd); err != nil { + return fmt.Errorf("can't freeze map: %w", err) + } + return nil +} + +// finalize populates the Map according to the Contents specified +// in spec and freezes the Map if requested by spec. +func (m *Map) finalize(spec *MapSpec) error { + for _, kv := range spec.Contents { + if err := m.Put(kv.Key, kv.Value); err != nil { + return fmt.Errorf("putting value: key %v: %w", kv.Key, err) + } + } + + if spec.Freeze { + if err := m.Freeze(); err != nil { + return fmt.Errorf("freezing map: %w", err) + } + } + + return nil +} + +func (m *Map) marshalKey(data interface{}) (internal.Pointer, error) { + if data == nil { + if m.keySize == 0 { + // Queues have a key length of zero, so passing nil here is valid. + return internal.NewPointer(nil), nil + } + return internal.Pointer{}, errors.New("can't use nil as key of map") + } + + return marshalPtr(data, int(m.keySize)) +} + +func (m *Map) unmarshalKey(data interface{}, buf []byte) error { + if buf == nil { + // This is from a makeBuffer call, nothing do do here. 
+ return nil + } + + return unmarshalBytes(data, buf) +} + +func (m *Map) marshalValue(data interface{}) (internal.Pointer, error) { + if m.typ.hasPerCPUValue() { + return marshalPerCPUValue(data, int(m.valueSize)) + } + + var ( + buf []byte + err error + ) + + switch value := data.(type) { + case *Map: + if !m.typ.canStoreMap() { + return internal.Pointer{}, fmt.Errorf("can't store map in %s", m.typ) + } + buf, err = marshalMap(value, int(m.valueSize)) + + case *Program: + if !m.typ.canStoreProgram() { + return internal.Pointer{}, fmt.Errorf("can't store program in %s", m.typ) + } + buf, err = marshalProgram(value, int(m.valueSize)) + + default: + return marshalPtr(data, int(m.valueSize)) + } + + if err != nil { + return internal.Pointer{}, err + } + + return internal.NewSlicePointer(buf), nil +} + +func (m *Map) unmarshalValue(value interface{}, buf []byte) error { + if buf == nil { + // This is from a makeBuffer call, nothing do do here. + return nil + } + + if m.typ.hasPerCPUValue() { + return unmarshalPerCPUValue(value, int(m.valueSize), buf) + } + + switch value := value.(type) { + case **Map: + if !m.typ.canStoreMap() { + return fmt.Errorf("can't read a map from %s", m.typ) + } + + other, err := unmarshalMap(buf) + if err != nil { + return err + } + + // The caller might close the map externally, so ignore errors. + _ = (*value).Close() + + *value = other + return nil + + case *Map: + if !m.typ.canStoreMap() { + return fmt.Errorf("can't read a map from %s", m.typ) + } + return errors.New("require pointer to *Map") + + case **Program: + if !m.typ.canStoreProgram() { + return fmt.Errorf("can't read a program from %s", m.typ) + } + + other, err := unmarshalProgram(buf) + if err != nil { + return err + } + + // The caller might close the program externally, so ignore errors. 
+ _ = (*value).Close() + + *value = other + return nil + + case *Program: + if !m.typ.canStoreProgram() { + return fmt.Errorf("can't read a program from %s", m.typ) + } + return errors.New("require pointer to *Program") + } + + return unmarshalBytes(value, buf) +} + +// LoadPinnedMap loads a Map from a BPF file. +func LoadPinnedMap(fileName string, opts *LoadPinOptions) (*Map, error) { + fd, err := internal.BPFObjGet(fileName, opts.Marshal()) if err != nil { return nil, err } - return newMap(fd, abi) + + m, err := newMapFromFD(fd) + if err == nil { + m.pinnedPath = fileName + } + + return m, err } +// unmarshalMap creates a map from a map ID encoded in host endianness. func unmarshalMap(buf []byte) (*Map, error) { if len(buf) != 4 { return nil, errors.New("map id requires 4 byte value") } - // Looking up an entry in a nested map or prog array returns an id, - // not an fd. id := internal.NativeEndian.Uint32(buf) - fd, err := bpfGetMapFDByID(id) - if err != nil { - return nil, err - } - - abi, err := newMapABIFromFd(fd) - if err != nil { - _ = fd.close() - return nil, err - } - - return newMap(fd, abi) + return NewMapFromID(MapID(id)) } -// MarshalBinary implements BinaryMarshaler. -func (m *Map) MarshalBinary() ([]byte, error) { - fd, err := m.fd.value() +// marshalMap marshals the fd of a map into a buffer in host endianness. 
+func marshalMap(m *Map, length int) ([]byte, error) { + if length != 4 { + return nil, fmt.Errorf("can't marshal map to %d bytes", length) + } + + fd, err := m.fd.Value() if err != nil { return nil, err } @@ -494,6 +1091,60 @@ func (m *Map) MarshalBinary() ([]byte, error) { return buf, nil } +func patchValue(value []byte, typ btf.Type, replacements map[string]interface{}) error { + replaced := make(map[string]bool) + replace := func(name string, offset, size int, replacement interface{}) error { + if offset+size > len(value) { + return fmt.Errorf("%s: offset %d(+%d) is out of bounds", name, offset, size) + } + + buf, err := marshalBytes(replacement, size) + if err != nil { + return fmt.Errorf("marshal %s: %w", name, err) + } + + copy(value[offset:offset+size], buf) + replaced[name] = true + return nil + } + + switch parent := typ.(type) { + case *btf.Datasec: + for _, secinfo := range parent.Vars { + name := string(secinfo.Type.(*btf.Var).Name) + replacement, ok := replacements[name] + if !ok { + continue + } + + err := replace(name, int(secinfo.Offset), int(secinfo.Size), replacement) + if err != nil { + return err + } + } + + default: + return fmt.Errorf("patching %T is not supported", typ) + } + + if len(replaced) == len(replacements) { + return nil + } + + var missing []string + for name := range replacements { + if !replaced[name] { + missing = append(missing, name) + } + } + + if len(missing) == 1 { + return fmt.Errorf("unknown field: %s", missing[0]) + } + + return fmt.Errorf("unknown fields: %s", strings.Join(missing, ",")) +} + // MapIterator iterates a Map. // // See Map.Iterate. 
@@ -509,13 +1160,11 @@ type MapIterator struct { func newMapIterator(target *Map) *MapIterator { return &MapIterator{ target: target, - maxEntries: target.abi.MaxEntries, - prevBytes: make([]byte, int(target.abi.KeySize)), + maxEntries: target.maxEntries, + prevBytes: make([]byte, target.keySize), } } -var errIterationAborted = errors.New("iteration aborted") - // Next decodes the next key and value. // // Iterating a hash map from which keys are being deleted is not @@ -531,7 +1180,9 @@ func (mi *MapIterator) Next(keyOut, valueOut interface{}) bool { return false } - for ; mi.count < mi.maxEntries; mi.count++ { + // For array-like maps NextKeyBytes returns nil only on after maxEntries + // iterations. + for mi.count <= mi.maxEntries { var nextBytes []byte nextBytes, mi.err = mi.target.NextKeyBytes(mi.prevKey) if mi.err != nil { @@ -550,8 +1201,9 @@ func (mi *MapIterator) Next(keyOut, valueOut interface{}) bool { copy(mi.prevBytes, nextBytes) mi.prevKey = mi.prevBytes + mi.count++ mi.err = mi.target.Lookup(nextBytes, valueOut) - if IsNotExist(mi.err) { + if errors.Is(mi.err, ErrKeyNotExist) { // Even though the key should be valid, we couldn't look up // its value. If we're iterating a hash map this is probably // because a concurrent delete removed the value before we @@ -566,30 +1218,50 @@ func (mi *MapIterator) Next(keyOut, valueOut interface{}) bool { return false } - mi.err = unmarshalBytes(keyOut, nextBytes) + mi.err = mi.target.unmarshalKey(keyOut, nextBytes) return mi.err == nil } - mi.err = errIterationAborted + mi.err = fmt.Errorf("%w", ErrIterationAborted) return false } // Err returns any encountered error. // // The method must be called after Next returns nil. +// +// Returns ErrIterationAborted if it wasn't possible to do a full iteration. func (mi *MapIterator) Err() error { return mi.err } -// IsNotExist returns true if the error indicates that a -// key doesn't exist. 
-func IsNotExist(err error) bool { - return errors.Cause(err) == unix.ENOENT +// MapGetNextID returns the ID of the next eBPF map. +// +// Returns ErrNotExist, if there is no next eBPF map. +func MapGetNextID(startID MapID) (MapID, error) { + id, err := objGetNextID(internal.BPF_MAP_GET_NEXT_ID, uint32(startID)) + return MapID(id), err } -// IsIterationAborted returns true if the iteration was aborted. +// NewMapFromID returns the map for a given id. // -// This occurs when keys are deleted from a hash map during iteration. -func IsIterationAborted(err error) bool { - return errors.Cause(err) == errIterationAborted +// Returns ErrNotExist, if there is no eBPF map with the given id. +func NewMapFromID(id MapID) (*Map, error) { + fd, err := internal.BPFObjGetFDByID(internal.BPF_MAP_GET_FD_BY_ID, uint32(id)) + if err != nil { + return nil, err + } + + return newMapFromFD(fd) +} + +// ID returns the systemwide unique ID of the map. +// +// Deprecated: use MapInfo.ID() instead. +func (m *Map) ID() (MapID, error) { + info, err := bpfGetMapInfoByFD(m.fd) + if err != nil { + return MapID(0), err + } + return MapID(info.id), nil } diff --git a/vendor/github.com/cilium/ebpf/marshalers.go b/vendor/github.com/cilium/ebpf/marshalers.go index 44ba273..e461d67 100644 --- a/vendor/github.com/cilium/ebpf/marshalers.go +++ b/vendor/github.com/cilium/ebpf/marshalers.go @@ -4,29 +4,46 @@ import ( "bytes" "encoding" "encoding/binary" + "errors" + "fmt" "reflect" "runtime" + "sync" "unsafe" "github.com/cilium/ebpf/internal" - - "github.com/pkg/errors" ) -func marshalPtr(data interface{}, length int) (syscallPtr, error) { +// marshalPtr converts an arbitrary value into a pointer suitable +// to be passed to the kernel. +// +// As an optimization, it returns the original value if it is an +// unsafe.Pointer. 
+func marshalPtr(data interface{}, length int) (internal.Pointer, error) { if ptr, ok := data.(unsafe.Pointer); ok { - return newPtr(ptr), nil + return internal.NewPointer(ptr), nil } buf, err := marshalBytes(data, length) if err != nil { - return syscallPtr{}, err + return internal.Pointer{}, err } - return newPtr(unsafe.Pointer(&buf[0])), nil + return internal.NewSlicePointer(buf), nil } +// marshalBytes converts an arbitrary value into a byte buffer. +// +// Prefer using Map.marshalKey and Map.marshalValue if possible, since +// those have special cases that allow more types to be encoded. +// +// Returns an error if the given value isn't representable in exactly +// length bytes. func marshalBytes(data interface{}, length int) (buf []byte, err error) { + if data == nil { + return nil, errors.New("can't marshal a nil value") + } + switch value := data.(type) { case encoding.BinaryMarshaler: buf, err = value.MarshalBinary() @@ -36,10 +53,14 @@ func marshalBytes(data interface{}, length int) (buf []byte, err error) { buf = value case unsafe.Pointer: err = errors.New("can't marshal from unsafe.Pointer") + case Map, *Map, Program, *Program: + err = fmt.Errorf("can't marshal %T", value) default: var wr bytes.Buffer err = binary.Write(&wr, internal.NativeEndian, value) - err = errors.Wrapf(err, "encoding %T", value) + if err != nil { + err = fmt.Errorf("encoding %T: %v", value, err) + } buf = wr.Bytes() } if err != nil { @@ -47,33 +68,49 @@ func marshalBytes(data interface{}, length int) (buf []byte, err error) { } if len(buf) != length { - return nil, errors.Errorf("%T doesn't marshal to %d bytes", data, length) + return nil, fmt.Errorf("%T doesn't marshal to %d bytes", data, length) } return buf, nil } -func makeBuffer(dst interface{}, length int) (syscallPtr, []byte) { +func makeBuffer(dst interface{}, length int) (internal.Pointer, []byte) { if ptr, ok := dst.(unsafe.Pointer); ok { - return newPtr(ptr), nil + return internal.NewPointer(ptr), nil } buf := 
make([]byte, length) - return newPtr(unsafe.Pointer(&buf[0])), buf + return internal.NewSlicePointer(buf), buf } +var bytesReaderPool = sync.Pool{ + New: func() interface{} { + return new(bytes.Reader) + }, +} + +// unmarshalBytes converts a byte buffer into an arbitrary value. +// +// Prefer using Map.unmarshalKey and Map.unmarshalValue if possible, since +// those have special cases that allow more types to be encoded. +// +// The common int32 and int64 types are directly handled to avoid +// unnecessary heap allocations as happening in the default case. func unmarshalBytes(data interface{}, buf []byte) error { switch value := data.(type) { case unsafe.Pointer: - sh := &reflect.SliceHeader{ - Data: uintptr(value), - Len: len(buf), - Cap: len(buf), - } + var dst []byte + // Use unsafe.Slice when we drop support for pre1.17 (https://github.com/golang/go/issues/19367) + // We could opt for removing unsafe.Pointer support in the lib as well + sh := (*reflect.SliceHeader)(unsafe.Pointer(&dst)) + sh.Data = uintptr(value) + sh.Len = len(buf) + sh.Cap = len(buf) - dst := *(*[]byte)(unsafe.Pointer(sh)) copy(dst, buf) runtime.KeepAlive(value) return nil + case Map, *Map, Program, *Program: + return fmt.Errorf("can't unmarshal into %T", value) case encoding.BinaryUnmarshaler: return value.UnmarshalBinary(buf) case *string: @@ -82,14 +119,42 @@ func unmarshalBytes(data interface{}, buf []byte) error { case *[]byte: *value = buf return nil + case *int32: + if len(buf) < 4 { + return errors.New("int32 requires 4 bytes") + } + *value = int32(internal.NativeEndian.Uint32(buf)) + return nil + case *uint32: + if len(buf) < 4 { + return errors.New("uint32 requires 4 bytes") + } + *value = internal.NativeEndian.Uint32(buf) + return nil + case *int64: + if len(buf) < 8 { + return errors.New("int64 requires 8 bytes") + } + *value = int64(internal.NativeEndian.Uint64(buf)) + return nil + case *uint64: + if len(buf) < 8 { + return errors.New("uint64 requires 8 bytes") + } + *value = 
internal.NativeEndian.Uint64(buf) + return nil case string: return errors.New("require pointer to string") case []byte: return errors.New("require pointer to []byte") default: - rd := bytes.NewReader(buf) - err := binary.Read(rd, internal.NativeEndian, value) - return errors.Wrapf(err, "decoding %T", value) + rd := bytesReaderPool.Get().(*bytes.Reader) + rd.Reset(buf) + defer bytesReaderPool.Put(rd) + if err := binary.Read(rd, internal.NativeEndian, value); err != nil { + return fmt.Errorf("decoding %T: %v", value, err) + } + return nil } } @@ -99,38 +164,38 @@ func unmarshalBytes(data interface{}, buf []byte) error { // Values are initialized to zero if the slice has less elements than CPUs. // // slice must have a type like []elementType. -func marshalPerCPUValue(slice interface{}, elemLength int) (syscallPtr, error) { +func marshalPerCPUValue(slice interface{}, elemLength int) (internal.Pointer, error) { sliceType := reflect.TypeOf(slice) if sliceType.Kind() != reflect.Slice { - return syscallPtr{}, errors.New("per-CPU value requires slice") + return internal.Pointer{}, errors.New("per-CPU value requires slice") } possibleCPUs, err := internal.PossibleCPUs() if err != nil { - return syscallPtr{}, err + return internal.Pointer{}, err } sliceValue := reflect.ValueOf(slice) sliceLen := sliceValue.Len() if sliceLen > possibleCPUs { - return syscallPtr{}, errors.Errorf("per-CPU value exceeds number of CPUs") + return internal.Pointer{}, fmt.Errorf("per-CPU value exceeds number of CPUs") } - alignedElemLength := align(elemLength, 8) + alignedElemLength := internal.Align(elemLength, 8) buf := make([]byte, alignedElemLength*possibleCPUs) for i := 0; i < sliceLen; i++ { elem := sliceValue.Index(i).Interface() elemBytes, err := marshalBytes(elem, elemLength) if err != nil { - return syscallPtr{}, err + return internal.Pointer{}, err } offset := i * alignedElemLength copy(buf[offset:offset+elemLength], elemBytes) } - return newPtr(unsafe.Pointer(&buf[0])), nil + return 
internal.NewSlicePointer(buf), nil } // unmarshalPerCPUValue decodes a buffer into a slice containing one value per @@ -140,7 +205,7 @@ func marshalPerCPUValue(slice interface{}, elemLength int) (syscallPtr, error) { func unmarshalPerCPUValue(slicePtr interface{}, elemLength int, buf []byte) error { slicePtrType := reflect.TypeOf(slicePtr) if slicePtrType.Kind() != reflect.Ptr || slicePtrType.Elem().Kind() != reflect.Slice { - return errors.Errorf("per-cpu value requires pointer to slice") + return fmt.Errorf("per-cpu value requires pointer to slice") } possibleCPUs, err := internal.PossibleCPUs() @@ -159,7 +224,7 @@ func unmarshalPerCPUValue(slicePtr interface{}, elemLength int, buf []byte) erro step := len(buf) / possibleCPUs if step < elemLength { - return errors.Errorf("per-cpu element length is larger than available data") + return fmt.Errorf("per-cpu element length is larger than available data") } for i := 0; i < possibleCPUs; i++ { var elem interface{} @@ -177,7 +242,7 @@ func unmarshalPerCPUValue(slicePtr interface{}, elemLength int, buf []byte) erro err := unmarshalBytes(elem, elemBytes) if err != nil { - return errors.Wrapf(err, "cpu %d", i) + return fmt.Errorf("cpu %d: %w", i, err) } buf = buf[step:] @@ -186,7 +251,3 @@ func unmarshalPerCPUValue(slicePtr interface{}, elemLength int, buf []byte) erro reflect.ValueOf(slicePtr).Elem().Set(slice) return nil } - -func align(n, alignment int) int { - return (int(n) + alignment - 1) / alignment * alignment -} diff --git a/vendor/github.com/cilium/ebpf/prog.go b/vendor/github.com/cilium/ebpf/prog.go index 03b24fb..3549a3f 100644 --- a/vendor/github.com/cilium/ebpf/prog.go +++ b/vendor/github.com/cilium/ebpf/prog.go @@ -2,23 +2,28 @@ package ebpf import ( "bytes" + "encoding/binary" + "errors" "fmt" + "io" "math" "path/filepath" "strings" "time" - "unsafe" "github.com/cilium/ebpf/asm" "github.com/cilium/ebpf/internal" + "github.com/cilium/ebpf/internal/btf" "github.com/cilium/ebpf/internal/unix" - - 
"github.com/pkg/errors" ) -var ( - errNotSupported = errors.New("ebpf: not supported by kernel") -) +// ErrNotSupported is returned whenever the kernel doesn't support a feature. +var ErrNotSupported = internal.ErrNotSupported + +var errUnsatisfiedReference = errors.New("unsatisfied reference") + +// ProgramID represents the unique ID of an eBPF program. +type ProgramID uint32 const ( // Number of bytes to pad the output buffer for BPF_PROG_TEST_RUN. @@ -39,18 +44,53 @@ type ProgramOptions struct { // Controls the output buffer size for the verifier. Defaults to // DefaultVerifierLogSize. LogSize int + // An ELF containing the target BTF for this program. It is used both to + // find the correct function to trace and to apply CO-RE relocations. + // This is useful in environments where the kernel BTF is not available + // (containers) or where it is in a non-standard location. Defaults to + // use the kernel BTF from a well-known location. + TargetBTF io.ReaderAt } -// ProgramSpec defines a Program +// ProgramSpec defines a Program. type ProgramSpec struct { // Name is passed to the kernel as a debug aid. Must only contain // alpha numeric and '_' characters. - Name string - Type ProgramType - AttachType AttachType - Instructions asm.Instructions - License string + Name string + + // Type determines at which hook in the kernel a program will run. + Type ProgramType + AttachType AttachType + // Name of a kernel data structure or function to attach to. Its + // interpretation depends on Type and AttachType. + AttachTo string + // The program to attach to. Must be provided manually. + AttachTarget *Program + Instructions asm.Instructions + + // Flags is passed to the kernel and specifies additional program + // load attributes. + Flags uint32 + + // License of the program. Some helpers are only available if + // the license is deemed compatible with the GPL. 
+ // + // See https://www.kernel.org/doc/html/latest/process/license-rules.html#id1 + License string + + // Version used by Kprobe programs. + // + // Deprecated on kernels 5.0 and later. Leave empty to let the library + // detect this value automatically. KernelVersion uint32 + + // The BTF associated with this program. Changing Instructions + // will most likely invalidate the contained data, and may + // result in errors when attempting to load it into the kernel. + BTF *btf.Program + + // The byte order this program was compiled for, may be nil. + ByteOrder binary.ByteOrder } // Copy returns a copy of the spec. @@ -65,6 +105,13 @@ func (ps *ProgramSpec) Copy() *ProgramSpec { return &cpy } +// Tag calculates the kernel tag for a series of instructions. +// +// Use asm.Instructions.Tag if you need to calculate for non-native endianness. +func (ps *ProgramSpec) Tag() (string, error) { + return ps.Instructions.Tag(internal.NativeEndian) +} + // Program represents BPF program loaded into the kernel. // // It is not safe to close a Program which is used by other goroutines. @@ -73,9 +120,10 @@ type Program struct { // otherwise it is empty. VerifierLog string - fd *bpfFD - name string - abi ProgramABI + fd *internal.FD + name string + pinnedPath string + typ ProgramType } // NewProgram creates a new Program. @@ -91,11 +139,148 @@ func NewProgram(spec *ProgramSpec) (*Program, error) { // Loading a program for the first time will perform // feature detection by loading small, temporary programs. 
func NewProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, error) { - attr, err := convertProgramSpec(spec, haveObjName.Result()) + handles := newHandleCache() + defer handles.close() + + prog, err := newProgramWithOptions(spec, opts, handles) + if errors.Is(err, errUnsatisfiedReference) { + return nil, fmt.Errorf("cannot load program without loading its whole collection: %w", err) + } + return prog, err +} + +func newProgramWithOptions(spec *ProgramSpec, opts ProgramOptions, handles *handleCache) (*Program, error) { + if len(spec.Instructions) == 0 { + return nil, errors.New("instructions cannot be empty") + } + + if spec.ByteOrder != nil && spec.ByteOrder != internal.NativeEndian { + return nil, fmt.Errorf("can't load %s program on %s", spec.ByteOrder, internal.NativeEndian) + } + + // Kernels before 5.0 (6c4fc209fcf9 "bpf: remove useless version check for prog load") + // require the version field to be set to the value of the KERNEL_VERSION + // macro for kprobe-type programs. + // Overwrite Kprobe program version if set to zero or the magic version constant. 
+ kv := spec.KernelVersion + if spec.Type == Kprobe && (kv == 0 || kv == internal.MagicKernelVersion) { + v, err := internal.KernelVersion() + if err != nil { + return nil, fmt.Errorf("detecting kernel version: %w", err) + } + kv = v.Kernel() + } + + attr := &internal.BPFProgLoadAttr{ + ProgType: uint32(spec.Type), + ProgFlags: spec.Flags, + ExpectedAttachType: uint32(spec.AttachType), + License: internal.NewStringPointer(spec.License), + KernelVersion: kv, + } + + if haveObjName() == nil { + attr.ProgName = internal.NewBPFObjName(spec.Name) + } + + var err error + var targetBTF *btf.Spec + if opts.TargetBTF != nil { + targetBTF, err = handles.btfSpec(opts.TargetBTF) + if err != nil { + return nil, fmt.Errorf("load target BTF: %w", err) + } + } + + var btfDisabled bool + var core btf.COREFixups + if spec.BTF != nil { + core, err = spec.BTF.Fixups(targetBTF) + if err != nil { + return nil, fmt.Errorf("CO-RE relocations: %w", err) + } + + handle, err := handles.btfHandle(spec.BTF.Spec()) + btfDisabled = errors.Is(err, btf.ErrNotSupported) + if err != nil && !btfDisabled { + return nil, fmt.Errorf("load BTF: %w", err) + } + + if handle != nil { + attr.ProgBTFFd = uint32(handle.FD()) + + recSize, bytes, err := spec.BTF.LineInfos() + if err != nil { + return nil, fmt.Errorf("get BTF line infos: %w", err) + } + attr.LineInfoRecSize = recSize + attr.LineInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize)) + attr.LineInfo = internal.NewSlicePointer(bytes) + + recSize, bytes, err = spec.BTF.FuncInfos() + if err != nil { + return nil, fmt.Errorf("get BTF function infos: %w", err) + } + attr.FuncInfoRecSize = recSize + attr.FuncInfoCnt = uint32(uint64(len(bytes)) / uint64(recSize)) + attr.FuncInfo = internal.NewSlicePointer(bytes) + } + } + + insns, err := core.Apply(spec.Instructions) + if err != nil { + return nil, fmt.Errorf("CO-RE fixup: %w", err) + } + + if err := fixupJumpsAndCalls(insns); err != nil { + return nil, err + } + + buf := bytes.NewBuffer(make([]byte, 0, 
len(spec.Instructions)*asm.InstructionSize)) + err = insns.Marshal(buf, internal.NativeEndian) if err != nil { return nil, err } + bytecode := buf.Bytes() + attr.Instructions = internal.NewSlicePointer(bytecode) + attr.InsCount = uint32(len(bytecode) / asm.InstructionSize) + + if spec.AttachTo != "" { + if spec.AttachTarget != nil { + info, err := spec.AttachTarget.Info() + if err != nil { + return nil, fmt.Errorf("load target BTF: %w", err) + } + + btfID, ok := info.BTFID() + if !ok { + return nil, fmt.Errorf("load target BTF: no BTF info available") + } + btfHandle, err := btf.NewHandleFromID(btfID) + if err != nil { + return nil, fmt.Errorf("load target BTF: %w", err) + } + defer btfHandle.Close() + + targetBTF = btfHandle.Spec() + if err != nil { + return nil, fmt.Errorf("load target BTF: %w", err) + } + } + + target, err := resolveBTFType(targetBTF, spec.AttachTo, spec.Type, spec.AttachType) + if err != nil { + return nil, err + } + if target != nil { + attr.AttachBTFID = uint32(target.ID()) + } + if spec.AttachTarget != nil { + attr.AttachProgFd = uint32(spec.AttachTarget.FD()) + } + } + logSize := DefaultVerifierLogSize if opts.LogSize > 0 { logSize = opts.LogSize @@ -104,126 +289,102 @@ func NewProgramWithOptions(spec *ProgramSpec, opts ProgramOptions) (*Program, er var logBuf []byte if opts.LogLevel > 0 { logBuf = make([]byte, logSize) - attr.logLevel = opts.LogLevel - attr.logSize = uint32(len(logBuf)) - attr.logBuf = newPtr(unsafe.Pointer(&logBuf[0])) + attr.LogLevel = opts.LogLevel + attr.LogSize = uint32(len(logBuf)) + attr.LogBuf = internal.NewSlicePointer(logBuf) } - fd, err := bpfProgLoad(attr) + fd, err := internal.BPFProgLoad(attr) if err == nil { - prog := newProgram(fd, spec.Name, &ProgramABI{spec.Type}) - prog.VerifierLog = convertCString(logBuf) - return prog, nil + return &Program{internal.CString(logBuf), fd, spec.Name, "", spec.Type}, nil } - truncated := errors.Cause(err) == unix.ENOSPC - if opts.LogLevel == 0 { + logErr := err + if 
opts.LogLevel == 0 && opts.LogSize >= 0 { // Re-run with the verifier enabled to get better error messages. logBuf = make([]byte, logSize) - attr.logLevel = 1 - attr.logSize = uint32(len(logBuf)) - attr.logBuf = newPtr(unsafe.Pointer(&logBuf[0])) + attr.LogLevel = 1 + attr.LogSize = uint32(len(logBuf)) + attr.LogBuf = internal.NewSlicePointer(logBuf) - _, nerr := bpfProgLoad(attr) - truncated = errors.Cause(nerr) == unix.ENOSPC + fd, logErr = internal.BPFProgLoad(attr) + if logErr == nil { + fd.Close() + } } - logs := convertCString(logBuf) - if truncated { - logs += "\n(truncated...)" + if errors.Is(logErr, unix.EPERM) && logBuf[0] == 0 { + // EPERM due to RLIMIT_MEMLOCK happens before the verifier, so we can + // check that the log is empty to reduce false positives. + return nil, fmt.Errorf("load program: %w (MEMLOCK bay be too low, consider rlimit.RemoveMemlock)", logErr) } - return nil, &loadError{err, logs} + err = internal.ErrorWithLog(err, logBuf, logErr) + if btfDisabled { + return nil, fmt.Errorf("load program without BTF: %w", err) + } + return nil, fmt.Errorf("load program: %w", err) } // NewProgramFromFD creates a program from a raw fd. // // You should not use fd after calling this function. +// +// Requires at least Linux 4.10. func NewProgramFromFD(fd int) (*Program, error) { if fd < 0 { return nil, errors.New("invalid fd") } - bpfFd := newBPFFD(uint32(fd)) - info, err := bpfGetProgInfoByFD(bpfFd) - if err != nil { - bpfFd.forget() - return nil, err - } - - var name string - if bpfName := convertCString(info.name[:]); bpfName != "" { - name = bpfName - } else { - name = convertCString(info.tag[:]) - } - - return newProgram(bpfFd, name, newProgramABIFromInfo(info)), nil + return newProgramFromFD(internal.NewFD(uint32(fd))) } -func newProgram(fd *bpfFD, name string, abi *ProgramABI) *Program { - return &Program{ - name: name, - fd: fd, - abi: *abi, +// NewProgramFromID returns the program for a given id. 
+// +// Returns ErrNotExist, if there is no eBPF program with the given id. +func NewProgramFromID(id ProgramID) (*Program, error) { + fd, err := internal.BPFObjGetFDByID(internal.BPF_PROG_GET_FD_BY_ID, uint32(id)) + if err != nil { + return nil, fmt.Errorf("get program by id: %w", err) } + + return newProgramFromFD(fd) } -func convertProgramSpec(spec *ProgramSpec, includeName bool) (*bpfProgLoadAttr, error) { - if len(spec.Instructions) == 0 { - return nil, errors.New("Instructions cannot be empty") - } - - if len(spec.License) == 0 { - return nil, errors.New("License cannot be empty") - } - - buf := bytes.NewBuffer(make([]byte, 0, len(spec.Instructions)*asm.InstructionSize)) - err := spec.Instructions.Marshal(buf, internal.NativeEndian) +func newProgramFromFD(fd *internal.FD) (*Program, error) { + info, err := newProgramInfoFromFd(fd) if err != nil { - return nil, err + fd.Close() + return nil, fmt.Errorf("discover program type: %w", err) } - bytecode := buf.Bytes() - insCount := uint32(len(bytecode) / asm.InstructionSize) - lic := []byte(spec.License) - attr := &bpfProgLoadAttr{ - progType: spec.Type, - expectedAttachType: spec.AttachType, - insCount: insCount, - instructions: newPtr(unsafe.Pointer(&bytecode[0])), - license: newPtr(unsafe.Pointer(&lic[0])), - } - - name, err := newBPFObjName(spec.Name) - if err != nil { - return nil, err - } - - if includeName { - attr.progName = name - } - - return attr, nil + return &Program{"", fd, "", "", info.Type}, nil } func (p *Program) String() string { if p.name != "" { - return fmt.Sprintf("%s(%s)#%s", p.abi.Type, p.name, p.fd) + return fmt.Sprintf("%s(%s)#%v", p.typ, p.name, p.fd) } - return fmt.Sprintf("%s#%s", p.abi.Type, p.fd) + return fmt.Sprintf("%s(%v)", p.typ, p.fd) } -// ABI gets the ABI of the Program -func (p *Program) ABI() ProgramABI { - return p.abi +// Type returns the underlying type of the program. +func (p *Program) Type() ProgramType { + return p.typ +} + +// Info returns metadata about the program. 
+// +// Requires at least 4.10. +func (p *Program) Info() (*ProgramInfo, error) { + return newProgramInfoFromFd(p.fd) } // FD gets the file descriptor of the Program. // // It is invalid to call this function after Close has been called. func (p *Program) FD() int { - fd, err := p.fd.value() + fd, err := p.fd.Value() if err != nil { // Best effort: -1 is the number most likely to be an // invalid file descriptor. @@ -243,19 +404,45 @@ func (p *Program) Clone() (*Program, error) { return nil, nil } - dup, err := p.fd.dup() + dup, err := p.fd.Dup() if err != nil { - return nil, errors.Wrap(err, "can't clone program") + return nil, fmt.Errorf("can't clone program: %w", err) } - return newProgram(dup, p.name, &p.abi), nil + return &Program{p.VerifierLog, dup, p.name, "", p.typ}, nil } -// Pin persists the Program past the lifetime of the process that created it +// Pin persists the Program on the BPF virtual file system past the lifetime of +// the process that created it // -// This requires bpffs to be mounted above fileName. See http://cilium.readthedocs.io/en/doc-1.0/kubernetes/install/#mounting-the-bpf-fs-optional +// Calling Pin on a previously pinned program will overwrite the path, except when +// the new path already exists. Re-pinning across filesystems is not supported. +// +// This requires bpffs to be mounted above fileName. See https://docs.cilium.io/en/k8s-doc/admin/#admin-mount-bpffs func (p *Program) Pin(fileName string) error { - return errors.Wrap(bpfPinObject(fileName, p.fd), "can't pin program") + if err := internal.Pin(p.pinnedPath, fileName, p.fd); err != nil { + return err + } + p.pinnedPath = fileName + return nil +} + +// Unpin removes the persisted state for the Program from the BPF virtual filesystem. +// +// Failed calls to Unpin will not alter the state returned by IsPinned. +// +// Unpinning an unpinned Program returns nil. 
+func (p *Program) Unpin() error { + if err := internal.Unpin(p.pinnedPath); err != nil { + return err + } + p.pinnedPath = "" + return nil +} + +// IsPinned returns true if the Program has a non-empty pinned path. +func (p *Program) IsPinned() bool { + return p.pinnedPath != "" } // Close unloads the program from the kernel. @@ -264,7 +451,7 @@ func (p *Program) Close() error { return nil } - return p.fd.close() + return p.fd.Close() } // Test runs the Program in the kernel with the given input and returns the @@ -275,57 +462,69 @@ func (p *Program) Close() error { // // This function requires at least Linux 4.12. func (p *Program) Test(in []byte) (uint32, []byte, error) { - ret, out, _, err := p.testRun(in, 1) - return ret, out, err + ret, out, _, err := p.testRun(in, 1, nil) + if err != nil { + return ret, nil, fmt.Errorf("can't test program: %w", err) + } + return ret, out, nil } // Benchmark runs the Program with the given input for a number of times // and returns the time taken per iteration. // -// The returned value is the return value of the last execution of -// the program. +// Returns the result of the last execution of the program and the time per +// run or an error. reset is called whenever the benchmark syscall is +// interrupted, and should be set to testing.B.ResetTimer or similar. +// +// Note: profiling a call to this function will skew it's results, see +// https://github.com/cilium/ebpf/issues/24 // // This function requires at least Linux 4.12. 
-func (p *Program) Benchmark(in []byte, repeat int) (uint32, time.Duration, error) { - ret, _, total, err := p.testRun(in, repeat) - return ret, total, err +func (p *Program) Benchmark(in []byte, repeat int, reset func()) (uint32, time.Duration, error) { + ret, _, total, err := p.testRun(in, repeat, reset) + if err != nil { + return ret, total, fmt.Errorf("can't benchmark program: %w", err) + } + return ret, total, nil } -var noProgTestRun = featureTest{ - Fn: func() bool { - prog, err := NewProgram(&ProgramSpec{ - Type: SocketFilter, - Instructions: asm.Instructions{ - asm.LoadImm(asm.R0, 0, asm.DWord), - asm.Return(), - }, - License: "MIT", - }) - if err != nil { - // This may be because we lack sufficient permissions, etc. - return false - } - defer prog.Close() +var haveProgTestRun = internal.FeatureTest("BPF_PROG_TEST_RUN", "4.12", func() error { + prog, err := NewProgram(&ProgramSpec{ + Type: SocketFilter, + Instructions: asm.Instructions{ + asm.LoadImm(asm.R0, 0, asm.DWord), + asm.Return(), + }, + License: "MIT", + }) + if err != nil { + // This may be because we lack sufficient permissions, etc. + return err + } + defer prog.Close() - fd, err := prog.fd.value() - if err != nil { - return false - } + // Programs require at least 14 bytes input + in := make([]byte, 14) + attr := bpfProgTestRunAttr{ + fd: uint32(prog.FD()), + dataSizeIn: uint32(len(in)), + dataIn: internal.NewSlicePointer(in), + } - // Programs require at least 14 bytes input - in := make([]byte, 14) - attr := bpfProgTestRunAttr{ - fd: fd, - dataSizeIn: uint32(len(in)), - dataIn: newPtr(unsafe.Pointer(&in[0])), - } + err = bpfProgTestRun(&attr) + if errors.Is(err, unix.EINVAL) { + // Check for EINVAL specifically, rather than err != nil since we + // otherwise misdetect due to insufficient permissions. + return internal.ErrNotSupported + } + if errors.Is(err, unix.EINTR) { + // We know that PROG_TEST_RUN is supported if we get EINTR. 
+ return nil + } + return err +}) - _, err = bpfCall(_ProgTestRun, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) - return errors.Cause(err) == unix.EINVAL - }, -} - -func (p *Program) testRun(in []byte, repeat int) (uint32, []byte, time.Duration, error) { +func (p *Program) testRun(in []byte, repeat int, reset func()) (uint32, []byte, time.Duration, error) { if uint(repeat) > math.MaxUint32 { return 0, nil, 0, fmt.Errorf("repeat is too high") } @@ -338,8 +537,8 @@ func (p *Program) testRun(in []byte, repeat int) (uint32, []byte, time.Duration, return 0, nil, 0, fmt.Errorf("input is too long") } - if noProgTestRun.Result() { - return 0, nil, 0, errNotSupported + if err := haveProgTestRun(); err != nil { + return 0, nil, 0, err } // Older kernels ignore the dataSizeOut argument when copying to user space. @@ -349,7 +548,7 @@ func (p *Program) testRun(in []byte, repeat int) (uint32, []byte, time.Duration, // See https://patchwork.ozlabs.org/cover/1006822/ out := make([]byte, len(in)+outputPad) - fd, err := p.fd.value() + fd, err := p.fd.Value() if err != nil { return 0, nil, 0, err } @@ -358,14 +557,25 @@ func (p *Program) testRun(in []byte, repeat int) (uint32, []byte, time.Duration, fd: fd, dataSizeIn: uint32(len(in)), dataSizeOut: uint32(len(out)), - dataIn: newPtr(unsafe.Pointer(&in[0])), - dataOut: newPtr(unsafe.Pointer(&out[0])), + dataIn: internal.NewSlicePointer(in), + dataOut: internal.NewSlicePointer(out), repeat: uint32(repeat), } - _, err = bpfCall(_ProgTestRun, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) - if err != nil { - return 0, nil, 0, errors.Wrap(err, "can't run test") + for { + err = bpfProgTestRun(&attr) + if err == nil { + break + } + + if errors.Is(err, unix.EINTR) { + if reset != nil { + reset() + } + continue + } + + return 0, nil, 0, fmt.Errorf("can't run test: %w", err) } if int(attr.dataSizeOut) > cap(out) { @@ -387,23 +597,15 @@ func unmarshalProgram(buf []byte) (*Program, error) { // Looking up an entry in a nested map or prog array 
returns an id, // not an fd. id := internal.NativeEndian.Uint32(buf) - fd, err := bpfGetProgramFDByID(id) - if err != nil { - return nil, err - } - - abi, err := newProgramABIFromFd(fd) - if err != nil { - _ = fd.close() - return nil, err - } - - return newProgram(fd, "", abi), nil + return NewProgramFromID(ProgramID(id)) } -// MarshalBinary implements BinaryMarshaler. -func (p *Program) MarshalBinary() ([]byte, error) { - value, err := p.fd.value() +func marshalProgram(p *Program, length int) ([]byte, error) { + if length != 4 { + return nil, fmt.Errorf("can't marshal program to %d bytes", length) + } + + value, err := p.fd.Value() if err != nil { return nil, err } @@ -413,84 +615,80 @@ func (p *Program) MarshalBinary() ([]byte, error) { return buf, nil } -// Attach a Program to a container object fd +// Attach a Program. +// +// Deprecated: use link.RawAttachProgram instead. func (p *Program) Attach(fd int, typ AttachType, flags AttachFlags) error { if fd < 0 { return errors.New("invalid fd") } - pfd, err := p.fd.value() + pfd, err := p.fd.Value() if err != nil { return err } - attr := bpfProgAlterAttr{ - targetFd: uint32(fd), - attachBpfFd: pfd, - attachType: uint32(typ), - attachFlags: uint32(flags), + attr := internal.BPFProgAttachAttr{ + TargetFd: uint32(fd), + AttachBpfFd: pfd, + AttachType: uint32(typ), + AttachFlags: uint32(flags), } - return bpfProgAlter(_ProgAttach, &attr) + return internal.BPFProgAttach(&attr) } -// Detach a Program from a container object fd +// Detach a Program. +// +// Deprecated: use link.RawDetachProgram instead. 
func (p *Program) Detach(fd int, typ AttachType, flags AttachFlags) error { if fd < 0 { return errors.New("invalid fd") } - pfd, err := p.fd.value() + if flags != 0 { + return errors.New("flags must be zero") + } + + pfd, err := p.fd.Value() if err != nil { return err } - attr := bpfProgAlterAttr{ - targetFd: uint32(fd), - attachBpfFd: pfd, - attachType: uint32(typ), - attachFlags: uint32(flags), + attr := internal.BPFProgDetachAttr{ + TargetFd: uint32(fd), + AttachBpfFd: pfd, + AttachType: uint32(typ), } - return bpfProgAlter(_ProgDetach, &attr) + return internal.BPFProgDetach(&attr) } // LoadPinnedProgram loads a Program from a BPF file. // -// Requires at least Linux 4.13, use LoadPinnedProgramExplicit on -// earlier versions. -func LoadPinnedProgram(fileName string) (*Program, error) { - fd, err := bpfGetObject(fileName) +// Requires at least Linux 4.11. +func LoadPinnedProgram(fileName string, opts *LoadPinOptions) (*Program, error) { + fd, err := internal.BPFObjGet(fileName, opts.Marshal()) if err != nil { return nil, err } - abi, err := newProgramABIFromFd(fd) + info, err := newProgramInfoFromFd(fd) if err != nil { - _ = fd.close() - return nil, err + _ = fd.Close() + return nil, fmt.Errorf("info for %s: %w", fileName, err) } - return newProgram(fd, filepath.Base(fileName), abi), nil + return &Program{"", fd, filepath.Base(fileName), fileName, info.Type}, nil } -// LoadPinnedProgramExplicit loads a program with explicit parameters. -func LoadPinnedProgramExplicit(fileName string, abi *ProgramABI) (*Program, error) { - fd, err := bpfGetObject(fileName) - if err != nil { - return nil, err - } - - return newProgram(fd, filepath.Base(fileName), abi), nil -} - -// SanitizeName replaces all invalid characters in name. +// SanitizeName replaces all invalid characters in name with replacement. +// Passing a negative value for replacement will delete characters instead +// of replacing them. 
Use this to automatically generate valid names for maps +// and programs at runtime. // -// Use this to automatically generate valid names for maps and -// programs at run time. -// -// Passing a negative value for replacement will delete characters -// instead of replacing them. +// The set of allowed characters depends on the running kernel version. +// Dots are only allowed as of kernel 5.2. func SanitizeName(name string, replacement rune) string { return strings.Map(func(char rune) rune { if invalidBPFObjNameChar(char) { @@ -500,24 +698,64 @@ func SanitizeName(name string, replacement rune) string { }, name) } -type loadError struct { - cause error - verifierLog string +// ProgramGetNextID returns the ID of the next eBPF program. +// +// Returns ErrNotExist, if there is no next eBPF program. +func ProgramGetNextID(startID ProgramID) (ProgramID, error) { + id, err := objGetNextID(internal.BPF_PROG_GET_NEXT_ID, uint32(startID)) + return ProgramID(id), err } -func (le *loadError) Error() string { - if le.verifierLog == "" { - return fmt.Sprintf("failed to load program: %s", le.cause) +// ID returns the systemwide unique ID of the program. +// +// Deprecated: use ProgramInfo.ID() instead. +func (p *Program) ID() (ProgramID, error) { + info, err := bpfGetProgInfoByFD(p.fd, nil) + if err != nil { + return ProgramID(0), err } - return fmt.Sprintf("failed to load program: %s: %s", le.cause, le.verifierLog) + return ProgramID(info.id), nil } -func (le *loadError) Cause() error { - return le.cause -} +func resolveBTFType(spec *btf.Spec, name string, progType ProgramType, attachType AttachType) (btf.Type, error) { + type match struct { + p ProgramType + a AttachType + } -// IsNotSupported returns true if an error occurred because -// the kernel does not have support for a specific feature. 
-func IsNotSupported(err error) bool { - return errors.Cause(err) == errNotSupported + var typeName, featureName string + switch (match{progType, attachType}) { + case match{LSM, AttachLSMMac}: + typeName = "bpf_lsm_" + name + featureName = name + " LSM hook" + case match{Tracing, AttachTraceIter}: + typeName = "bpf_iter_" + name + featureName = name + " iterator" + case match{Extension, AttachNone}: + typeName = name + featureName = fmt.Sprintf("freplace %s", name) + default: + return nil, nil + } + + if spec == nil { + var err error + spec, err = btf.LoadKernelSpec() + if err != nil { + return nil, fmt.Errorf("load kernel spec: %w", err) + } + } + + var target *btf.Func + err := spec.FindType(typeName, &target) + if errors.Is(err, btf.ErrNotFound) { + return nil, &internal.UnsupportedFeatureError{ + Name: featureName, + } + } + if err != nil { + return nil, fmt.Errorf("resolve BTF for %s: %w", featureName, err) + } + + return target, nil } diff --git a/vendor/github.com/cilium/ebpf/ptr_32_le.go b/vendor/github.com/cilium/ebpf/ptr_32_le.go deleted file mode 100644 index 14b805e..0000000 --- a/vendor/github.com/cilium/ebpf/ptr_32_le.go +++ /dev/null @@ -1,14 +0,0 @@ -// +build 386 amd64p32 arm mipsle mips64p32le - -package ebpf - -import ( - "unsafe" -) - -// ptr wraps an unsafe.Pointer to be 64bit to -// conform to the syscall specification. -type syscallPtr struct { - ptr unsafe.Pointer - pad uint32 -} diff --git a/vendor/github.com/cilium/ebpf/ptr_64.go b/vendor/github.com/cilium/ebpf/ptr_64.go deleted file mode 100644 index c897d72..0000000 --- a/vendor/github.com/cilium/ebpf/ptr_64.go +++ /dev/null @@ -1,14 +0,0 @@ -// +build !386,!amd64p32,!arm,!mipsle,!mips64p32le -// +build !armbe,!mips,!mips64p32 - -package ebpf - -import ( - "unsafe" -) - -// ptr wraps an unsafe.Pointer to be 64bit to -// conform to the syscall specification. 
-type syscallPtr struct { - ptr unsafe.Pointer -} diff --git a/vendor/github.com/cilium/ebpf/readme.md b/vendor/github.com/cilium/ebpf/readme.md deleted file mode 100644 index 26ab2b9..0000000 --- a/vendor/github.com/cilium/ebpf/readme.md +++ /dev/null @@ -1,20 +0,0 @@ -eBPF -------- -[![](https://godoc.org/github.com/cilium/ebpf?status.svg)](https://godoc.org/github.com/cilium/ebpf) - -eBPF is a pure Go library that provides utilities for loading, compiling, and debugging eBPF programs. It has minimal external dependencies and is intended to be used in long running processes. - -[ebpf/asm](https://godoc.org/github.com/cilium/ebpf/asm) contains a basic assembler. - -The library is maintained by [Cloudflare](https://www.cloudflare.com) and [Cilium](https://www.cilium.io). Feel free to [join](https://cilium.herokuapp.com/) the [libbpf-go](https://cilium.slack.com/messages/libbpf-go) channel on Slack. - -## Current status - -The package is production ready, but **the API is explicitly unstable -right now**. Expect to update your code if you want to follow along. - -## Useful resources - -* [Cilium eBPF documentation](https://cilium.readthedocs.io/en/latest/bpf/#bpf-guide) (recommended) -* [Linux documentation on BPF](http://elixir.free-electrons.com/linux/latest/source/Documentation/networking/filter.txt) -* [eBPF features by Linux version](https://github.com/iovisor/bcc/blob/master/docs/kernel-versions.md) diff --git a/vendor/github.com/cilium/ebpf/run-tests.sh b/vendor/github.com/cilium/ebpf/run-tests.sh new file mode 100644 index 0000000..a079edc --- /dev/null +++ b/vendor/github.com/cilium/ebpf/run-tests.sh @@ -0,0 +1,125 @@ +#!/bin/bash +# Test the current package under a different kernel. +# Requires virtme and qemu to be installed. 
+# Examples: +# Run all tests on a 5.4 kernel +# $ ./run-tests.sh 5.4 +# Run a subset of tests: +# $ ./run-tests.sh 5.4 ./link + +set -euo pipefail + +script="$(realpath "$0")" +readonly script + +# This script is a bit like a Matryoshka doll since it keeps re-executing itself +# in various different contexts: +# +# 1. invoked by the user like run-tests.sh 5.4 +# 2. invoked by go test like run-tests.sh --exec-vm +# 3. invoked by init in the vm like run-tests.sh --exec-test +# +# This allows us to use all available CPU on the host machine to compile our +# code, and then only use the VM to execute the test. This is because the VM +# is usually slower at compiling than the host. +if [[ "${1:-}" = "--exec-vm" ]]; then + shift + + input="$1" + shift + + # Use sudo if /dev/kvm isn't accessible by the current user. + sudo="" + if [[ ! -r /dev/kvm || ! -w /dev/kvm ]]; then + sudo="sudo" + fi + readonly sudo + + testdir="$(dirname "$1")" + output="$(mktemp -d)" + printf -v cmd "%q " "$@" + + if [[ "$(stat -c '%t:%T' -L /proc/$$/fd/0)" == "1:3" ]]; then + # stdin is /dev/null, which doesn't play well with qemu. Use a fifo as a + # blocking substitute. + mkfifo "${output}/fake-stdin" + # Open for reading and writing to avoid blocking. + exec 0<> "${output}/fake-stdin" + rm "${output}/fake-stdin" + fi + + if ! $sudo virtme-run --kimg "${input}/bzImage" --memory 768M --pwd \ + --rwdir="${testdir}=${testdir}" \ + --rodir=/run/input="${input}" \ + --rwdir=/run/output="${output}" \ + --script-sh "PATH=\"$PATH\" \"$script\" --exec-test $cmd" \ + --kopt possible_cpus=2; then # need at least two CPUs for some tests + exit 23 + fi + + if [[ ! -e "${output}/success" ]]; then + exit 42 + fi + + $sudo rm -r "$output" + exit 0 +elif [[ "${1:-}" = "--exec-test" ]]; then + shift + + mount -t bpf bpf /sys/fs/bpf + mount -t tracefs tracefs /sys/kernel/debug/tracing + + if [[ -d "/run/input/bpf" ]]; then + export KERNEL_SELFTESTS="/run/input/bpf" + fi + + dmesg -C + if ! 
"$@"; then + dmesg + exit 1 # this return code is "swallowed" by qemu + fi + touch "/run/output/success" + exit 0 +fi + +readonly kernel_version="${1:-}" +if [[ -z "${kernel_version}" ]]; then + echo "Expecting kernel version as first argument" + exit 1 +fi +shift + +readonly kernel="linux-${kernel_version}.bz" +readonly selftests="linux-${kernel_version}-selftests-bpf.bz" +readonly input="$(mktemp -d)" +readonly tmp_dir="${TMPDIR:-/tmp}" +readonly branch="${BRANCH:-master}" + +fetch() { + echo Fetching "${1}" + wget -nv -N -P "${tmp_dir}" "https://github.com/cilium/ci-kernels/raw/${branch}/${1}" +} + +fetch "${kernel}" +cp "${tmp_dir}/${kernel}" "${input}/bzImage" + +if fetch "${selftests}"; then + mkdir "${input}/bpf" + tar --strip-components=4 -xjf "${tmp_dir}/${selftests}" -C "${input}/bpf" +else + echo "No selftests found, disabling" +fi + +args=(-short -coverpkg=./... -coverprofile=coverage.out -count 1 ./...) +if (( $# > 0 )); then + args=("$@") +fi + +export GOFLAGS=-mod=readonly +export CGO_ENABLED=0 + +echo Testing on "${kernel_version}" +go test -exec "$script --exec-vm $input" "${args[@]}" +echo "Test successful on ${kernel_version}" + +rm -r "${input}" diff --git a/vendor/github.com/cilium/ebpf/syscalls.go b/vendor/github.com/cilium/ebpf/syscalls.go index 68abd3b..f8cb5f0 100644 --- a/vendor/github.com/cilium/ebpf/syscalls.go +++ b/vendor/github.com/cilium/ebpf/syscalls.go @@ -2,94 +2,35 @@ package ebpf import ( "bytes" - "path/filepath" - "runtime" - "strconv" - "strings" + "errors" + "fmt" + "os" "unsafe" + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/internal" "github.com/cilium/ebpf/internal/unix" - - "github.com/pkg/errors" ) -var errClosedFd = errors.New("use of closed file descriptor") - -type bpfFD struct { - raw int64 -} - -func newBPFFD(value uint32) *bpfFD { - fd := &bpfFD{int64(value)} - runtime.SetFinalizer(fd, (*bpfFD).close) - return fd -} - -func (fd *bpfFD) String() string { - return strconv.FormatInt(fd.raw, 10) -} - -func 
(fd *bpfFD) value() (uint32, error) { - if fd.raw < 0 { - return 0, errClosedFd - } - - return uint32(fd.raw), nil -} - -func (fd *bpfFD) close() error { - if fd.raw < 0 { - return nil - } - - value := int(fd.raw) - fd.raw = -1 - - fd.forget() - return unix.Close(value) -} - -func (fd *bpfFD) forget() { - runtime.SetFinalizer(fd, nil) -} - -func (fd *bpfFD) dup() (*bpfFD, error) { - if fd.raw < 0 { - return nil, errClosedFd - } - - dup, err := unix.FcntlInt(uintptr(fd.raw), unix.F_DUPFD_CLOEXEC, 0) - if err != nil { - return nil, errors.Wrap(err, "can't dup fd") - } - - return newBPFFD(uint32(dup)), nil -} - -// bpfObjName is a null-terminated string made up of -// 'A-Za-z0-9_' characters. -type bpfObjName [unix.BPF_OBJ_NAME_LEN]byte - -// newBPFObjName truncates the result if it is too long. -func newBPFObjName(name string) (bpfObjName, error) { - idx := strings.IndexFunc(name, invalidBPFObjNameChar) - if idx != -1 { - return bpfObjName{}, errors.Errorf("invalid character '%c' in name '%s'", name[idx], name) - } - - var result bpfObjName - copy(result[:unix.BPF_OBJ_NAME_LEN-1], name) - return result, nil -} +// ErrNotExist is returned when loading a non-existing map or program. +// +// Deprecated: use os.ErrNotExist instead. +var ErrNotExist = os.ErrNotExist +// invalidBPFObjNameChar returns true if char may not appear in +// a BPF object name. 
func invalidBPFObjNameChar(char rune) bool { + dotAllowed := objNameAllowsDot() == nil + switch { case char >= 'A' && char <= 'Z': - fallthrough + return false case char >= 'a' && char <= 'z': - fallthrough + return false case char >= '0' && char <= '9': - fallthrough + return false + case dotAllowed && char == '.': + return false case char == '_': return false default: @@ -97,69 +38,77 @@ func invalidBPFObjNameChar(char rune) bool { } } -type bpfMapCreateAttr struct { - mapType MapType - keySize uint32 - valueSize uint32 - maxEntries uint32 - flags uint32 - innerMapFd uint32 // since 4.12 56f668dfe00d - numaNode uint32 // since 4.14 96eabe7a40aa - mapName bpfObjName // since 4.15 ad5b177bd73f -} - type bpfMapOpAttr struct { mapFd uint32 padding uint32 - key syscallPtr - value syscallPtr + key internal.Pointer + value internal.Pointer flags uint64 } +type bpfBatchMapOpAttr struct { + inBatch internal.Pointer + outBatch internal.Pointer + keys internal.Pointer + values internal.Pointer + count uint32 + mapFd uint32 + elemFlags uint64 + flags uint64 +} + type bpfMapInfo struct { - mapType uint32 - id uint32 - keySize uint32 - valueSize uint32 - maxEntries uint32 - flags uint32 - mapName bpfObjName // since 4.15 ad5b177bd73f -} - -type bpfPinObjAttr struct { - fileName syscallPtr - fd uint32 - padding uint32 -} - -type bpfProgLoadAttr struct { - progType ProgramType - insCount uint32 - instructions syscallPtr - license syscallPtr - logLevel uint32 - logSize uint32 - logBuf syscallPtr - kernelVersion uint32 // since 4.1 2541517c32be - progFlags uint32 // since 4.11 e07b98d9bffe - progName bpfObjName // since 4.15 067cae47771c - progIfIndex uint32 // since 4.15 1f6f4cb7ba21 - expectedAttachType AttachType // since 4.17 5e43f899b03a + map_type uint32 // since 4.12 1e2709769086 + id uint32 + key_size uint32 + value_size uint32 + max_entries uint32 + map_flags uint32 + name internal.BPFObjName // since 4.15 ad5b177bd73f + ifindex uint32 // since 4.16 52775b33bb50 + 
btf_vmlinux_value_type_id uint32 // since 5.6 85d33df357b6 + netns_dev uint64 // since 4.16 52775b33bb50 + netns_ino uint64 + btf_id uint32 // since 4.18 78958fca7ead + btf_key_type_id uint32 // since 4.18 9b2cf328b2ec + btf_value_type_id uint32 } type bpfProgInfo struct { - progType uint32 - id uint32 - tag [unix.BPF_TAG_SIZE]byte - jitedLen uint32 - xlatedLen uint32 - jited syscallPtr - xlated syscallPtr - loadTime uint64 // since 4.15 cb4d2b3f03d8 - createdByUID uint32 - nrMapIDs uint32 - mapIds syscallPtr - name bpfObjName + prog_type uint32 + id uint32 + tag [unix.BPF_TAG_SIZE]byte + jited_prog_len uint32 + xlated_prog_len uint32 + jited_prog_insns internal.Pointer + xlated_prog_insns internal.Pointer + load_time uint64 // since 4.15 cb4d2b3f03d8 + created_by_uid uint32 + nr_map_ids uint32 // since 4.15 cb4d2b3f03d8 + map_ids internal.Pointer + name internal.BPFObjName // since 4.15 067cae47771c + ifindex uint32 + gpl_compatible uint32 + netns_dev uint64 + netns_ino uint64 + nr_jited_ksyms uint32 + nr_jited_func_lens uint32 + jited_ksyms internal.Pointer + jited_func_lens internal.Pointer + btf_id uint32 + func_info_rec_size uint32 + func_info internal.Pointer + nr_func_info uint32 + nr_line_info uint32 + line_info internal.Pointer + jited_line_info internal.Pointer + nr_jited_line_info uint32 + line_info_rec_size uint32 + jited_line_info_rec_size uint32 + nr_prog_tags uint32 + prog_tags internal.Pointer + run_time_ns uint64 + run_cnt uint64 } type bpfProgTestRunAttr struct { @@ -167,67 +116,96 @@ type bpfProgTestRunAttr struct { retval uint32 dataSizeIn uint32 dataSizeOut uint32 - dataIn syscallPtr - dataOut syscallPtr + dataIn internal.Pointer + dataOut internal.Pointer repeat uint32 duration uint32 } -type bpfProgAlterAttr struct { - targetFd uint32 - attachBpfFd uint32 - attachType uint32 - attachFlags uint32 +type bpfMapFreezeAttr struct { + mapFd uint32 } -type bpfObjGetInfoByFDAttr struct { - fd uint32 - infoLen uint32 - info syscallPtr // May be either 
bpfMapInfo or bpfProgInfo +type bpfObjGetNextIDAttr struct { + startID uint32 + nextID uint32 + openFlags uint32 } -type bpfGetFDByIDAttr struct { - id uint32 - next uint32 -} - -func newPtr(ptr unsafe.Pointer) syscallPtr { - return syscallPtr{ptr: ptr} -} - -func bpfProgLoad(attr *bpfProgLoadAttr) (*bpfFD, error) { - for { - fd, err := bpfCall(_ProgLoad, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) - // As of ~4.20 the verifier can be interrupted by a signal, - // and returns EAGAIN in that case. - if err == unix.EAGAIN { - continue - } - - if err != nil { - return nil, err - } - - return newBPFFD(uint32(fd)), nil - } -} - -func bpfProgAlter(cmd int, attr *bpfProgAlterAttr) error { - _, err := bpfCall(cmd, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) +func bpfProgTestRun(attr *bpfProgTestRunAttr) error { + _, err := internal.BPF(internal.BPF_PROG_TEST_RUN, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) return err } -func bpfMapCreate(attr *bpfMapCreateAttr) (*bpfFD, error) { - fd, err := bpfCall(_MapCreate, unsafe.Pointer(attr), unsafe.Sizeof(*attr)) - if err != nil { - return nil, err +var haveNestedMaps = internal.FeatureTest("nested maps", "4.12", func() error { + _, err := internal.BPFMapCreate(&internal.BPFMapCreateAttr{ + MapType: uint32(ArrayOfMaps), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + // Invalid file descriptor. + InnerMapFd: ^uint32(0), + }) + if errors.Is(err, unix.EINVAL) { + return internal.ErrNotSupported } + if errors.Is(err, unix.EBADF) { + return nil + } + return err +}) - return newBPFFD(uint32(fd)), nil -} +var haveMapMutabilityModifiers = internal.FeatureTest("read- and write-only maps", "5.2", func() error { + // This checks BPF_F_RDONLY_PROG and BPF_F_WRONLY_PROG. Since + // BPF_MAP_FREEZE appeared in 5.2 as well we don't do a separate check. 
+ m, err := internal.BPFMapCreate(&internal.BPFMapCreateAttr{ + MapType: uint32(Array), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + Flags: unix.BPF_F_RDONLY_PROG, + }) + if err != nil { + return internal.ErrNotSupported + } + _ = m.Close() + return nil +}) -func bpfMapLookupElem(m *bpfFD, key, valueOut syscallPtr) error { - fd, err := m.value() +var haveMmapableMaps = internal.FeatureTest("mmapable maps", "5.5", func() error { + // This checks BPF_F_MMAPABLE, which appeared in 5.5 for array maps. + m, err := internal.BPFMapCreate(&internal.BPFMapCreateAttr{ + MapType: uint32(Array), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + Flags: unix.BPF_F_MMAPABLE, + }) + if err != nil { + return internal.ErrNotSupported + } + _ = m.Close() + return nil +}) + +var haveInnerMaps = internal.FeatureTest("inner maps", "5.10", func() error { + // This checks BPF_F_INNER_MAP, which appeared in 5.10. + m, err := internal.BPFMapCreate(&internal.BPFMapCreateAttr{ + MapType: uint32(Array), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + Flags: unix.BPF_F_INNER_MAP, + }) + if err != nil { + return internal.ErrNotSupported + } + _ = m.Close() + return nil +}) + +func bpfMapLookupElem(m *internal.FD, key, valueOut internal.Pointer) error { + fd, err := m.Value() if err != nil { return err } @@ -237,12 +215,27 @@ func bpfMapLookupElem(m *bpfFD, key, valueOut syscallPtr) error { key: key, value: valueOut, } - _, err = bpfCall(_MapLookupElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) - return err + _, err = internal.BPF(internal.BPF_MAP_LOOKUP_ELEM, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return wrapMapError(err) } -func bpfMapUpdateElem(m *bpfFD, key, valueOut syscallPtr, flags uint64) error { - fd, err := m.value() +func bpfMapLookupAndDelete(m *internal.FD, key, valueOut internal.Pointer) error { + fd, err := m.Value() + if err != nil { + return err + } + + attr := bpfMapOpAttr{ + mapFd: fd, + key: key, + value: valueOut, + } + _, err = 
internal.BPF(internal.BPF_MAP_LOOKUP_AND_DELETE_ELEM, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return wrapMapError(err) +} + +func bpfMapUpdateElem(m *internal.FD, key, valueOut internal.Pointer, flags uint64) error { + fd, err := m.Value() if err != nil { return err } @@ -253,12 +246,12 @@ func bpfMapUpdateElem(m *bpfFD, key, valueOut syscallPtr, flags uint64) error { value: valueOut, flags: flags, } - _, err = bpfCall(_MapUpdateElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) - return err + _, err = internal.BPF(internal.BPF_MAP_UPDATE_ELEM, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return wrapMapError(err) } -func bpfMapDeleteElem(m *bpfFD, key syscallPtr) error { - fd, err := m.value() +func bpfMapDeleteElem(m *internal.FD, key internal.Pointer) error { + fd, err := m.Value() if err != nil { return err } @@ -267,12 +260,12 @@ func bpfMapDeleteElem(m *bpfFD, key syscallPtr) error { mapFd: fd, key: key, } - _, err = bpfCall(_MapDeleteElem, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) - return err + _, err = internal.BPF(internal.BPF_MAP_DELETE_ELEM, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return wrapMapError(err) } -func bpfMapGetNextKey(m *bpfFD, key, nextKeyOut syscallPtr) error { - fd, err := m.value() +func bpfMapGetNextKey(m *internal.FD, key, nextKeyOut internal.Pointer) error { + fd, err := m.Value() if err != nil { return err } @@ -282,139 +275,190 @@ func bpfMapGetNextKey(m *bpfFD, key, nextKeyOut syscallPtr) error { key: key, value: nextKeyOut, } - _, err = bpfCall(_MapGetNextKey, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + _, err = internal.BPF(internal.BPF_MAP_GET_NEXT_KEY, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return wrapMapError(err) +} + +func objGetNextID(cmd internal.BPFCmd, start uint32) (uint32, error) { + attr := bpfObjGetNextIDAttr{ + startID: start, + } + _, err := internal.BPF(cmd, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return attr.nextID, err +} + +func bpfMapBatch(cmd internal.BPFCmd, m *internal.FD, 
inBatch, outBatch, keys, values internal.Pointer, count uint32, opts *BatchOptions) (uint32, error) { + fd, err := m.Value() + if err != nil { + return 0, err + } + + attr := bpfBatchMapOpAttr{ + inBatch: inBatch, + outBatch: outBatch, + keys: keys, + values: values, + count: count, + mapFd: fd, + } + if opts != nil { + attr.elemFlags = opts.ElemFlags + attr.flags = opts.Flags + } + _, err = internal.BPF(cmd, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + // always return count even on an error, as things like update might partially be fulfilled. + return attr.count, wrapMapError(err) +} + +func wrapMapError(err error) error { + if err == nil { + return nil + } + + if errors.Is(err, unix.ENOENT) { + return internal.SyscallError(ErrKeyNotExist, unix.ENOENT) + } + + if errors.Is(err, unix.EEXIST) { + return internal.SyscallError(ErrKeyExist, unix.EEXIST) + } + + if errors.Is(err, unix.ENOTSUPP) { + return internal.SyscallError(ErrNotSupported, unix.ENOTSUPP) + } + + if errors.Is(err, unix.E2BIG) { + return fmt.Errorf("key too big for map: %w", err) + } + return err } -const bpfFSType = 0xcafe4a11 - -func bpfPinObject(fileName string, fd *bpfFD) error { - dirName := filepath.Dir(fileName) - var statfs unix.Statfs_t - if err := unix.Statfs(dirName, &statfs); err != nil { - return err - } - if uint64(statfs.Type) != bpfFSType { - return errors.Errorf("%s is not on a bpf filesystem", fileName) - } - - value, err := fd.value() +func bpfMapFreeze(m *internal.FD) error { + fd, err := m.Value() if err != nil { return err } - _, err = bpfCall(_ObjPin, unsafe.Pointer(&bpfPinObjAttr{ - fileName: newPtr(unsafe.Pointer(&[]byte(fileName)[0])), - fd: value, - }), 16) - return errors.Wrapf(err, "pin object %s", fileName) + attr := bpfMapFreezeAttr{ + mapFd: fd, + } + _, err = internal.BPF(internal.BPF_MAP_FREEZE, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + return err } -func bpfGetObject(fileName string) (*bpfFD, error) { - ptr, err := bpfCall(_ObjGet, 
unsafe.Pointer(&bpfPinObjAttr{ - fileName: newPtr(unsafe.Pointer(&[]byte(fileName)[0])), - }), 16) - if err != nil { - return nil, errors.Wrapf(err, "get object %s", fileName) - } - return newBPFFD(uint32(ptr)), nil -} - -func bpfGetObjectInfoByFD(fd *bpfFD, info unsafe.Pointer, size uintptr) error { - value, err := fd.value() - if err != nil { - return err - } - - // available from 4.13 - attr := bpfObjGetInfoByFDAttr{ - fd: value, - infoLen: uint32(size), - info: newPtr(info), - } - _, err = bpfCall(_ObjGetInfoByFD, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) - return errors.Wrapf(err, "fd %d", value) -} - -func bpfGetProgInfoByFD(fd *bpfFD) (*bpfProgInfo, error) { +func bpfGetProgInfoByFD(fd *internal.FD, ids []MapID) (*bpfProgInfo, error) { var info bpfProgInfo - err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)) - return &info, errors.Wrap(err, "can't get program info") + if len(ids) > 0 { + info.nr_map_ids = uint32(len(ids)) + info.map_ids = internal.NewPointer(unsafe.Pointer(&ids[0])) + } + + if err := internal.BPFObjGetInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)); err != nil { + return nil, fmt.Errorf("can't get program info: %w", err) + } + return &info, nil } -func bpfGetMapInfoByFD(fd *bpfFD) (*bpfMapInfo, error) { +func bpfGetMapInfoByFD(fd *internal.FD) (*bpfMapInfo, error) { var info bpfMapInfo - err := bpfGetObjectInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)) - return &info, errors.Wrap(err, "can't get map info:") -} - -var haveObjName = featureTest{ - Fn: func() bool { - name, err := newBPFObjName("feature_test") - if err != nil { - // This really is a fatal error, but it should be caught - // by the unit tests not working. 
- return false - } - - attr := bpfMapCreateAttr{ - mapType: Array, - keySize: 4, - valueSize: 4, - maxEntries: 1, - mapName: name, - } - - fd, err := bpfMapCreate(&attr) - if err != nil { - return false - } - - _ = fd.close() - return true - }, -} - -func bpfGetMapFDByID(id uint32) (*bpfFD, error) { - // available from 4.13 - attr := bpfGetFDByIDAttr{ - id: id, - } - ptr, err := bpfCall(_MapGetFDByID, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + err := internal.BPFObjGetInfoByFD(fd, unsafe.Pointer(&info), unsafe.Sizeof(info)) if err != nil { - return nil, errors.Wrapf(err, "can't get fd for map id %d", id) + return nil, fmt.Errorf("can't get map info: %w", err) } - return newBPFFD(uint32(ptr)), nil + return &info, nil } -func bpfGetProgramFDByID(id uint32) (*bpfFD, error) { - // available from 4.13 - attr := bpfGetFDByIDAttr{ - id: id, +var haveObjName = internal.FeatureTest("object names", "4.15", func() error { + attr := internal.BPFMapCreateAttr{ + MapType: uint32(Array), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + MapName: internal.NewBPFObjName("feature_test"), } - ptr, err := bpfCall(_ProgGetFDByID, unsafe.Pointer(&attr), unsafe.Sizeof(attr)) + + fd, err := internal.BPFMapCreate(&attr) if err != nil { - return nil, errors.Wrapf(err, "can't get fd for program id %d", id) - } - return newBPFFD(uint32(ptr)), nil -} - -func bpfCall(cmd int, attr unsafe.Pointer, size uintptr) (uintptr, error) { - r1, _, errNo := unix.Syscall(unix.SYS_BPF, uintptr(cmd), uintptr(attr), size) - runtime.KeepAlive(attr) - - var err error - if errNo != 0 { - err = errNo + return internal.ErrNotSupported } - return r1, err -} + _ = fd.Close() + return nil +}) -func convertCString(in []byte) string { - inLen := bytes.IndexByte(in, 0) - if inLen == -1 { - return "" +var objNameAllowsDot = internal.FeatureTest("dot in object names", "5.2", func() error { + if err := haveObjName(); err != nil { + return err } - return string(in[:inLen]) -} + + attr := internal.BPFMapCreateAttr{ + 
MapType: uint32(Array), + KeySize: 4, + ValueSize: 4, + MaxEntries: 1, + MapName: internal.NewBPFObjName(".test"), + } + + fd, err := internal.BPFMapCreate(&attr) + if err != nil { + return internal.ErrNotSupported + } + + _ = fd.Close() + return nil +}) + +var haveBatchAPI = internal.FeatureTest("map batch api", "5.6", func() error { + var maxEntries uint32 = 2 + attr := internal.BPFMapCreateAttr{ + MapType: uint32(Hash), + KeySize: 4, + ValueSize: 4, + MaxEntries: maxEntries, + } + + fd, err := internal.BPFMapCreate(&attr) + if err != nil { + return internal.ErrNotSupported + } + defer fd.Close() + keys := []uint32{1, 2} + values := []uint32{3, 4} + kp, _ := marshalPtr(keys, 8) + vp, _ := marshalPtr(values, 8) + nilPtr := internal.NewPointer(nil) + _, err = bpfMapBatch(internal.BPF_MAP_UPDATE_BATCH, fd, nilPtr, nilPtr, kp, vp, maxEntries, nil) + if err != nil { + return internal.ErrNotSupported + } + return nil +}) + +var haveProbeReadKernel = internal.FeatureTest("bpf_probe_read_kernel", "5.5", func() error { + insns := asm.Instructions{ + asm.Mov.Reg(asm.R1, asm.R10), + asm.Add.Imm(asm.R1, -8), + asm.Mov.Imm(asm.R2, 8), + asm.Mov.Imm(asm.R3, 0), + asm.FnProbeReadKernel.Call(), + asm.Return(), + } + buf := bytes.NewBuffer(make([]byte, 0, len(insns)*asm.InstructionSize)) + if err := insns.Marshal(buf, internal.NativeEndian); err != nil { + return err + } + bytecode := buf.Bytes() + + fd, err := internal.BPFProgLoad(&internal.BPFProgLoadAttr{ + ProgType: uint32(Kprobe), + License: internal.NewStringPointer("GPL"), + Instructions: internal.NewSlicePointer(bytecode), + InsCount: uint32(len(bytecode) / asm.InstructionSize), + }) + if err != nil { + return internal.ErrNotSupported + } + _ = fd.Close() + return nil +}) diff --git a/vendor/github.com/cilium/ebpf/types.go b/vendor/github.com/cilium/ebpf/types.go index 0daf9a7..84b83f9 100644 --- a/vendor/github.com/cilium/ebpf/types.go +++ b/vendor/github.com/cilium/ebpf/types.go @@ -1,11 +1,20 @@ package ebpf 
-//go:generate stringer -output types_string.go -type=MapType,ProgramType +import ( + "github.com/cilium/ebpf/internal/unix" +) + +//go:generate stringer -output types_string.go -type=MapType,ProgramType,PinType // MapType indicates the type map structure // that will be initialized in the kernel. type MapType uint32 +// Max returns the latest supported MapType. +func (_ MapType) Max() MapType { + return maxMapType - 1 +} + // All the various map types that can be created const ( UnspecifiedMap MapType = iota @@ -57,98 +66,108 @@ const ( // HashOfMaps - Each item in the hash map is another map. The inner map mustn't be a map of maps // itself. HashOfMaps + // DevMap - Specialized map to store references to network devices. + DevMap + // SockMap - Specialized map to store references to sockets. + SockMap + // CPUMap - Specialized map to store references to CPUs. + CPUMap + // XSKMap - Specialized map for XDP programs to store references to open sockets. + XSKMap + // SockHash - Specialized hash to store references to sockets. + SockHash + // CGroupStorage - Special map for CGroups. + CGroupStorage + // ReusePortSockArray - Specialized map to store references to sockets that can be reused. + ReusePortSockArray + // PerCPUCGroupStorage - Special per CPU map for CGroups. + PerCPUCGroupStorage + // Queue - FIFO storage for BPF programs. + Queue + // Stack - LIFO storage for BPF programs. + Stack + // SkStorage - Specialized map for local storage at SK for BPF programs. + SkStorage + // DevMapHash - Hash-based indexing scheme for references to network devices. + DevMapHash + // StructOpsMap - This map holds a kernel struct with its function pointer implemented in a BPF + // program. + StructOpsMap + // RingBuf - Similar to PerfEventArray, but shared across all CPUs. + RingBuf + // InodeStorage - Specialized local storage map for inodes. + InodeStorage + // TaskStorage - Specialized local storage map for task_struct. 
+ TaskStorage + // maxMapType - Bound enum of MapTypes, has to be last in enum. + maxMapType ) +// Deprecated: StructOpts was a typo, use StructOpsMap instead. +// +// Declared as a variable to prevent stringer from picking it up +// as an enum value. +var StructOpts MapType = StructOpsMap + // hasPerCPUValue returns true if the Map stores a value per CPU. func (mt MapType) hasPerCPUValue() bool { - if mt == PerCPUHash || mt == PerCPUArray { - return true - } - return false + return mt == PerCPUHash || mt == PerCPUArray || mt == LRUCPUHash || mt == PerCPUCGroupStorage } -const ( - _MapCreate = iota - _MapLookupElem - _MapUpdateElem - _MapDeleteElem - _MapGetNextKey - _ProgLoad - _ObjPin - _ObjGet - _ProgAttach - _ProgDetach - _ProgTestRun - _ProgGetNextID - _MapGetNextID - _ProgGetFDByID - _MapGetFDByID - _ObjGetInfoByFD -) +// canStoreMap returns true if the map type accepts a map fd +// for update and returns a map id for lookup. +func (mt MapType) canStoreMap() bool { + return mt == ArrayOfMaps || mt == HashOfMaps +} -const ( - _Any = iota - _NoExist - _Exist -) +// canStoreProgram returns true if the map type accepts a program fd +// for update and returns a program id for lookup. +func (mt MapType) canStoreProgram() bool { + return mt == ProgramArray +} // ProgramType of the eBPF program type ProgramType uint32 +// Max return the latest supported ProgramType. 
+func (_ ProgramType) Max() ProgramType { + return maxProgramType - 1 +} + // eBPF program types const ( - // Unrecognized program type UnspecifiedProgram ProgramType = iota - // SocketFilter socket or seccomp filter SocketFilter - // Kprobe program Kprobe - // SchedCLS traffic control shaper SchedCLS - // SchedACT routing control shaper SchedACT - // TracePoint program TracePoint - // XDP program XDP - // PerfEvent program PerfEvent - // CGroupSKB program CGroupSKB - // CGroupSock program CGroupSock - // LWTIn program LWTIn - // LWTOut program LWTOut - // LWTXmit program LWTXmit - // SockOps program SockOps - // SkSKB program SkSKB - // CGroupDevice program CGroupDevice - // SkMsg program SkMsg - // RawTracepoint program RawTracepoint - // CGroupSockAddr program CGroupSockAddr - // LWTSeg6Local program LWTSeg6Local - // LircMode2 program LircMode2 - // SkReuseport program SkReuseport - // FlowDissector program FlowDissector - // CGroupSysctl program CGroupSysctl - // RawTracepointWritable program RawTracepointWritable - // CGroupSockopt program CGroupSockopt + Tracing + StructOps + Extension + LSM + SkLookup + maxProgramType ) // AttachType of the eBPF program, needed to differentiate allowed context accesses in @@ -156,7 +175,9 @@ const ( // Will cause invalid argument (EINVAL) at program load time if set incorrectly. type AttachType uint32 -// AttachNone is an alias for AttachCGroupInetIngress for readability reasons +//go:generate stringer -type AttachType -trimprefix Attach + +// AttachNone is an alias for AttachCGroupInetIngress for readability reasons. 
const AttachNone AttachType = 0 const ( @@ -183,7 +204,75 @@ const ( AttachCGroupUDP6Recvmsg AttachCGroupGetsockopt AttachCGroupSetsockopt + AttachTraceRawTp + AttachTraceFEntry + AttachTraceFExit + AttachModifyReturn + AttachLSMMac + AttachTraceIter + AttachCgroupInet4GetPeername + AttachCgroupInet6GetPeername + AttachCgroupInet4GetSockname + AttachCgroupInet6GetSockname + AttachXDPDevMap + AttachCgroupInetSockRelease + AttachXDPCPUMap + AttachSkLookup + AttachXDP + AttachSkSKBVerdict + AttachSkReuseportSelect + AttachSkReuseportSelectOrMigrate + AttachPerfEvent ) // AttachFlags of the eBPF program used in BPF_PROG_ATTACH command type AttachFlags uint32 + +// PinType determines whether a map is pinned into a BPFFS. +type PinType int + +// Valid pin types. +// +// Mirrors enum libbpf_pin_type. +const ( + PinNone PinType = iota + // Pin an object by using its name as the filename. + PinByName +) + +// LoadPinOptions control how a pinned object is loaded. +type LoadPinOptions struct { + // Request a read-only or write-only object. The default is a read-write + // object. Only one of the flags may be set. + ReadOnly bool + WriteOnly bool + + // Raw flags for the syscall. Other fields of this struct take precedence. + Flags uint32 +} + +// Marshal returns a value suitable for BPF_OBJ_GET syscall file_flags parameter. +func (lpo *LoadPinOptions) Marshal() uint32 { + if lpo == nil { + return 0 + } + + flags := lpo.Flags + if lpo.ReadOnly { + flags |= unix.BPF_F_RDONLY + } + if lpo.WriteOnly { + flags |= unix.BPF_F_WRONLY + } + return flags +} + +// BatchOptions batch map operations options +// +// Mirrors libbpf struct bpf_map_batch_opts +// Currently BPF_F_FLAG is the only supported +// flag (for ElemFlags). 
+type BatchOptions struct { + ElemFlags uint64 + Flags uint64 +} diff --git a/vendor/github.com/cilium/ebpf/types_string.go b/vendor/github.com/cilium/ebpf/types_string.go index 4813437..81cbc9e 100644 --- a/vendor/github.com/cilium/ebpf/types_string.go +++ b/vendor/github.com/cilium/ebpf/types_string.go @@ -1,4 +1,4 @@ -// Code generated by "stringer -output types_string.go -type=MapType,ProgramType"; DO NOT EDIT. +// Code generated by "stringer -output types_string.go -type=MapType,ProgramType,PinType"; DO NOT EDIT. package ebpf @@ -22,11 +22,28 @@ func _() { _ = x[LPMTrie-11] _ = x[ArrayOfMaps-12] _ = x[HashOfMaps-13] + _ = x[DevMap-14] + _ = x[SockMap-15] + _ = x[CPUMap-16] + _ = x[XSKMap-17] + _ = x[SockHash-18] + _ = x[CGroupStorage-19] + _ = x[ReusePortSockArray-20] + _ = x[PerCPUCGroupStorage-21] + _ = x[Queue-22] + _ = x[Stack-23] + _ = x[SkStorage-24] + _ = x[DevMapHash-25] + _ = x[StructOpsMap-26] + _ = x[RingBuf-27] + _ = x[InodeStorage-28] + _ = x[TaskStorage-29] + _ = x[maxMapType-30] } -const _MapType_name = "UnspecifiedMapHashArrayProgramArrayPerfEventArrayPerCPUHashPerCPUArrayStackTraceCGroupArrayLRUHashLRUCPUHashLPMTrieArrayOfMapsHashOfMaps" +const _MapType_name = "UnspecifiedMapHashArrayProgramArrayPerfEventArrayPerCPUHashPerCPUArrayStackTraceCGroupArrayLRUHashLRUCPUHashLPMTrieArrayOfMapsHashOfMapsDevMapSockMapCPUMapXSKMapSockHashCGroupStorageReusePortSockArrayPerCPUCGroupStorageQueueStackSkStorageDevMapHashStructOpsMapRingBufInodeStorageTaskStoragemaxMapType" -var _MapType_index = [...]uint8{0, 14, 18, 23, 35, 49, 59, 70, 80, 91, 98, 108, 115, 126, 136} +var _MapType_index = [...]uint16{0, 14, 18, 23, 35, 49, 59, 70, 80, 91, 98, 108, 115, 126, 136, 142, 149, 155, 161, 169, 182, 200, 219, 224, 229, 238, 248, 260, 267, 279, 290, 300} func (i MapType) String() string { if i >= MapType(len(_MapType_index)-1) { @@ -64,11 +81,17 @@ func _() { _ = x[CGroupSysctl-23] _ = x[RawTracepointWritable-24] _ = x[CGroupSockopt-25] + _ = x[Tracing-26] + _ = 
x[StructOps-27] + _ = x[Extension-28] + _ = x[LSM-29] + _ = x[SkLookup-30] + _ = x[maxProgramType-31] } -const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockopt" +const _ProgramType_name = "UnspecifiedProgramSocketFilterKprobeSchedCLSSchedACTTracePointXDPPerfEventCGroupSKBCGroupSockLWTInLWTOutLWTXmitSockOpsSkSKBCGroupDeviceSkMsgRawTracepointCGroupSockAddrLWTSeg6LocalLircMode2SkReuseportFlowDissectorCGroupSysctlRawTracepointWritableCGroupSockoptTracingStructOpsExtensionLSMSkLookupmaxProgramType" -var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 188, 199, 212, 224, 245, 258} +var _ProgramType_index = [...]uint16{0, 18, 30, 36, 44, 52, 62, 65, 74, 83, 93, 98, 104, 111, 118, 123, 135, 140, 153, 167, 179, 188, 199, 212, 224, 245, 258, 265, 274, 283, 286, 294, 308} func (i ProgramType) String() string { if i >= ProgramType(len(_ProgramType_index)-1) { @@ -76,3 +99,21 @@ func (i ProgramType) String() string { } return _ProgramType_name[_ProgramType_index[i]:_ProgramType_index[i+1]] } +func _() { + // An "invalid array index" compiler error signifies that the constant values have changed. + // Re-run the stringer command to generate them again. 
+ var x [1]struct{} + _ = x[PinNone-0] + _ = x[PinByName-1] +} + +const _PinType_name = "PinNonePinByName" + +var _PinType_index = [...]uint8{0, 7, 16} + +func (i PinType) String() string { + if i < 0 || i >= PinType(len(_PinType_index)-1) { + return "PinType(" + strconv.FormatInt(int64(i), 10) + ")" + } + return _PinType_name[_PinType_index[i]:_PinType_index[i+1]] +} diff --git a/vendor/github.com/containerd/console/.golangci.yml b/vendor/github.com/containerd/console/.golangci.yml new file mode 100644 index 0000000..fcba5e8 --- /dev/null +++ b/vendor/github.com/containerd/console/.golangci.yml @@ -0,0 +1,20 @@ +linters: + enable: + - structcheck + - varcheck + - staticcheck + - unconvert + - gofmt + - goimports + - golint + - ineffassign + - vet + - unused + - misspell + disable: + - errcheck + +run: + timeout: 3m + skip-dirs: + - vendor diff --git a/vendor/github.com/containerd/console/README.md b/vendor/github.com/containerd/console/README.md index 5392fda..580b461 100644 --- a/vendor/github.com/containerd/console/README.md +++ b/vendor/github.com/containerd/console/README.md @@ -1,6 +1,8 @@ # console -[![Build Status](https://travis-ci.org/containerd/console.svg?branch=master)](https://travis-ci.org/containerd/console) +[![PkgGoDev](https://pkg.go.dev/badge/github.com/containerd/console)](https://pkg.go.dev/github.com/containerd/console) +[![Build Status](https://github.com/containerd/console/workflows/CI/badge.svg)](https://github.com/containerd/console/actions?query=workflow%3ACI) +[![Go Report Card](https://goreportcard.com/badge/github.com/containerd/console)](https://goreportcard.com/report/github.com/containerd/console) Golang package for dealing with consoles. Light on deps and a simple API. 
diff --git a/vendor/github.com/containerd/console/console.go b/vendor/github.com/containerd/console/console.go index c187a9b..f989d28 100644 --- a/vendor/github.com/containerd/console/console.go +++ b/vendor/github.com/containerd/console/console.go @@ -24,10 +24,17 @@ import ( var ErrNotAConsole = errors.New("provided file is not a console") +type File interface { + io.ReadWriteCloser + + // Fd returns its file descriptor + Fd() uintptr + // Name returns its file name + Name() string +} + type Console interface { - io.Reader - io.Writer - io.Closer + File // Resize resizes the console to the provided window size Resize(WinSize) error @@ -42,10 +49,6 @@ type Console interface { Reset() error // Size returns the window size of the console Size() (WinSize, error) - // Fd returns the console's file descriptor - Fd() uintptr - // Name returns the console's file name - Name() string } // WinSize specifies the window size of the console @@ -58,19 +61,25 @@ type WinSize struct { y uint16 } -// Current returns the current processes console -func Current() Console { - c, err := ConsoleFromFile(os.Stdin) - if err != nil { - // stdin should always be a console for the design - // of this function - panic(err) +// Current returns the current process' console +func Current() (c Console) { + var err error + // Usually all three streams (stdin, stdout, and stderr) + // are open to the same console, but some might be redirected, + // so try all three. + for _, s := range []*os.File{os.Stderr, os.Stdout, os.Stdin} { + if c, err = ConsoleFromFile(s); err == nil { + return c + } } - return c + // One of the std streams should always be a console + // for the design of this function. 
+ panic(err) } // ConsoleFromFile returns a console using the provided file -func ConsoleFromFile(f *os.File) (Console, error) { +// nolint:golint +func ConsoleFromFile(f File) (Console, error) { if err := checkConsole(f); err != nil { return nil, err } diff --git a/vendor/github.com/containerd/console/console_linux.go b/vendor/github.com/containerd/console/console_linux.go index 42274e1..c1c839e 100644 --- a/vendor/github.com/containerd/console/console_linux.go +++ b/vendor/github.com/containerd/console/console_linux.go @@ -58,6 +58,7 @@ type Epoller struct { efd int mu sync.Mutex fdMapping map[int]*EpollConsole + closeOnce sync.Once } // NewEpoller returns an instance of epoller with a valid epoll fd. @@ -151,7 +152,11 @@ func (e *Epoller) getConsole(sysfd int) *EpollConsole { // Close closes the epoll fd func (e *Epoller) Close() error { - return unix.Close(e.efd) + closeErr := os.ErrClosed // default to "file already closed" + e.closeOnce.Do(func() { + closeErr = unix.Close(e.efd) + }) + return closeErr } // EpollConsole acts like a console but registers its file descriptor with an diff --git a/vendor/github.com/containerd/console/console_unix.go b/vendor/github.com/containerd/console/console_unix.go index a4a8d12..a081176 100644 --- a/vendor/github.com/containerd/console/console_unix.go +++ b/vendor/github.com/containerd/console/console_unix.go @@ -1,4 +1,4 @@ -// +build darwin freebsd linux openbsd solaris +// +build darwin freebsd linux netbsd openbsd solaris /* Copyright The containerd Authors. 
@@ -19,8 +19,6 @@ package console import ( - "os" - "golang.org/x/sys/unix" ) @@ -28,7 +26,7 @@ import ( // The master is returned as the first console and a string // with the path to the pty slave is returned as the second func NewPty() (Console, string, error) { - f, err := os.OpenFile("/dev/ptmx", unix.O_RDWR|unix.O_NOCTTY|unix.O_CLOEXEC, 0) + f, err := openpt() if err != nil { return nil, "", err } @@ -47,7 +45,7 @@ func NewPty() (Console, string, error) { } type master struct { - f *os.File + f File original *unix.Termios } @@ -122,7 +120,7 @@ func (m *master) Name() string { } // checkConsole checks if the provided file is a console -func checkConsole(f *os.File) error { +func checkConsole(f File) error { var termios unix.Termios if tcget(f.Fd(), &termios) != nil { return ErrNotAConsole @@ -130,7 +128,7 @@ func checkConsole(f *os.File) error { return nil } -func newMaster(f *os.File) (Console, error) { +func newMaster(f File) (Console, error) { m := &master{ f: f, } diff --git a/vendor/github.com/containerd/console/console_windows.go b/vendor/github.com/containerd/console/console_windows.go index 62dbe1c..787c11f 100644 --- a/vendor/github.com/containerd/console/console_windows.go +++ b/vendor/github.com/containerd/console/console_windows.go @@ -17,10 +17,10 @@ package console import ( + "errors" "fmt" "os" - "github.com/pkg/errors" "golang.org/x/sys/windows" ) @@ -103,7 +103,7 @@ func (m *master) Reset() error { {m.err, m.errMode}, } { if err := windows.SetConsoleMode(s.fd, s.mode); err != nil { - return errors.Wrap(err, "unable to restore console mode") + return fmt.Errorf("unable to restore console mode: %w", err) } } @@ -114,7 +114,7 @@ func (m *master) Size() (WinSize, error) { var info windows.ConsoleScreenBufferInfo err := windows.GetConsoleScreenBufferInfo(m.out, &info) if err != nil { - return WinSize{}, errors.Wrap(err, "unable to get console info") + return WinSize{}, fmt.Errorf("unable to get console info: %w", err) } winsize := WinSize{ @@ 
-139,7 +139,7 @@ func (m *master) DisableEcho() error { mode |= windows.ENABLE_LINE_INPUT if err := windows.SetConsoleMode(m.in, mode); err != nil { - return errors.Wrap(err, "unable to set console to disable echo") + return fmt.Errorf("unable to set console to disable echo: %w", err) } return nil @@ -192,13 +192,13 @@ func makeInputRaw(fd windows.Handle, mode uint32) error { } if err := windows.SetConsoleMode(fd, mode); err != nil { - return errors.Wrap(err, "unable to set console to raw mode") + return fmt.Errorf("unable to set console to raw mode: %w", err) } return nil } -func checkConsole(f *os.File) error { +func checkConsole(f File) error { var mode uint32 if err := windows.GetConsoleMode(windows.Handle(f.Fd()), &mode); err != nil { return err @@ -206,7 +206,7 @@ func checkConsole(f *os.File) error { return nil } -func newMaster(f *os.File) (Console, error) { +func newMaster(f File) (Console, error) { if f != os.Stdin && f != os.Stdout && f != os.Stderr { return nil, errors.New("creating a console from a file is not supported on windows") } diff --git a/vendor/github.com/containerd/console/console_zos.go b/vendor/github.com/containerd/console/console_zos.go new file mode 100644 index 0000000..b348a83 --- /dev/null +++ b/vendor/github.com/containerd/console/console_zos.go @@ -0,0 +1,163 @@ +// +build zos + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package console + +import ( + "fmt" + "os" + + "golang.org/x/sys/unix" +) + +// NewPty creates a new pty pair +// The master is returned as the first console and a string +// with the path to the pty slave is returned as the second +func NewPty() (Console, string, error) { + var f File + var err error + var slave string + for i := 0;; i++ { + ptyp := fmt.Sprintf("/dev/ptyp%04d", i) + f, err = os.OpenFile(ptyp, os.O_RDWR, 0600) + if err == nil { + slave = fmt.Sprintf("/dev/ttyp%04d", i) + break + } + if os.IsNotExist(err) { + return nil, "", err + } + // else probably Resource Busy + } + m, err := newMaster(f) + if err != nil { + return nil, "", err + } + return m, slave, nil +} + +type master struct { + f File + original *unix.Termios +} + +func (m *master) Read(b []byte) (int, error) { + return m.f.Read(b) +} + +func (m *master) Write(b []byte) (int, error) { + return m.f.Write(b) +} + +func (m *master) Close() error { + return m.f.Close() +} + +func (m *master) Resize(ws WinSize) error { + return tcswinsz(m.f.Fd(), ws) +} + +func (m *master) ResizeFrom(c Console) error { + ws, err := c.Size() + if err != nil { + return err + } + return m.Resize(ws) +} + +func (m *master) Reset() error { + if m.original == nil { + return nil + } + return tcset(m.f.Fd(), m.original) +} + +func (m *master) getCurrent() (unix.Termios, error) { + var termios unix.Termios + if err := tcget(m.f.Fd(), &termios); err != nil { + return unix.Termios{}, err + } + return termios, nil +} + +func (m *master) SetRaw() error { + rawState, err := m.getCurrent() + if err != nil { + return err + } + rawState = cfmakeraw(rawState) + rawState.Oflag = rawState.Oflag | unix.OPOST + return tcset(m.f.Fd(), &rawState) +} + +func (m *master) DisableEcho() error { + rawState, err := m.getCurrent() + if err != nil { + return err + } + rawState.Lflag = rawState.Lflag &^ unix.ECHO + return tcset(m.f.Fd(), &rawState) +} + +func (m *master) Size() (WinSize, error) { + return tcgwinsz(m.f.Fd()) +} + +func 
(m *master) Fd() uintptr { + return m.f.Fd() +} + +func (m *master) Name() string { + return m.f.Name() +} + +// checkConsole checks if the provided file is a console +func checkConsole(f File) error { + var termios unix.Termios + if tcget(f.Fd(), &termios) != nil { + return ErrNotAConsole + } + return nil +} + +func newMaster(f File) (Console, error) { + m := &master{ + f: f, + } + t, err := m.getCurrent() + if err != nil { + return nil, err + } + m.original = &t + return m, nil +} + +// ClearONLCR sets the necessary tty_ioctl(4)s to ensure that a pty pair +// created by us acts normally. In particular, a not-very-well-known default of +// Linux unix98 ptys is that they have +onlcr by default. While this isn't a +// problem for terminal emulators, because we relay data from the terminal we +// also relay that funky line discipline. +func ClearONLCR(fd uintptr) error { + return setONLCR(fd, false) +} + +// SetONLCR sets the necessary tty_ioctl(4)s to ensure that a pty pair +// created by us acts as intended for a terminal emulator. 
+func SetONLCR(fd uintptr) error { + return setONLCR(fd, true) +} diff --git a/vendor/github.com/containerd/console/go.mod b/vendor/github.com/containerd/console/go.mod new file mode 100644 index 0000000..1fe5b7f --- /dev/null +++ b/vendor/github.com/containerd/console/go.mod @@ -0,0 +1,5 @@ +module github.com/containerd/console + +go 1.13 + +require golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c diff --git a/vendor/github.com/containerd/console/go.sum b/vendor/github.com/containerd/console/go.sum new file mode 100644 index 0000000..1225630 --- /dev/null +++ b/vendor/github.com/containerd/console/go.sum @@ -0,0 +1,2 @@ +golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c h1:VwygUrnw9jn88c4u8GD3rZQbqrP/tgas88tPUbBxQrk= +golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/vendor/github.com/containerd/console/pty_freebsd_cgo.go b/vendor/github.com/containerd/console/pty_freebsd_cgo.go new file mode 100644 index 0000000..cbd3cd7 --- /dev/null +++ b/vendor/github.com/containerd/console/pty_freebsd_cgo.go @@ -0,0 +1,45 @@ +// +build freebsd,cgo + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package console + +import ( + "fmt" + "os" +) + +/* +#include +#include +#include +*/ +import "C" + +// openpt allocates a new pseudo-terminal and establishes a connection with its +// control device. 
+func openpt() (*os.File, error) { + fd, err := C.posix_openpt(C.O_RDWR) + if err != nil { + return nil, fmt.Errorf("posix_openpt: %w", err) + } + if _, err := C.grantpt(fd); err != nil { + C.close(fd) + return nil, fmt.Errorf("grantpt: %w", err) + } + return os.NewFile(uintptr(fd), ""), nil +} diff --git a/vendor/github.com/containerd/console/pty_freebsd_nocgo.go b/vendor/github.com/containerd/console/pty_freebsd_nocgo.go new file mode 100644 index 0000000..b5e4318 --- /dev/null +++ b/vendor/github.com/containerd/console/pty_freebsd_nocgo.go @@ -0,0 +1,36 @@ +// +build freebsd,!cgo + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package console + +import ( + "os" +) + +// +// Implementing the functions below requires cgo support. Non-cgo stubs +// versions are defined below to enable cross-compilation of source code +// that depends on these functions, but the resultant cross-compiled +// binaries cannot actually be used. If the stub function(s) below are +// actually invoked they will display an error message and cause the +// calling process to exit. 
+// + +func openpt() (*os.File, error) { + panic("openpt() support requires cgo.") +} diff --git a/vendor/github.com/containerd/console/pty_unix.go b/vendor/github.com/containerd/console/pty_unix.go new file mode 100644 index 0000000..d5a6bd8 --- /dev/null +++ b/vendor/github.com/containerd/console/pty_unix.go @@ -0,0 +1,30 @@ +// +build darwin linux netbsd openbsd solaris + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package console + +import ( + "os" + + "golang.org/x/sys/unix" +) + +// openpt allocates a new pseudo-terminal by opening the /dev/ptmx device +func openpt() (*os.File, error) { + return os.OpenFile("/dev/ptmx", unix.O_RDWR|unix.O_NOCTTY|unix.O_CLOEXEC, 0) +} diff --git a/vendor/github.com/containerd/console/tc_darwin.go b/vendor/github.com/containerd/console/tc_darwin.go index b0128ab..7871545 100644 --- a/vendor/github.com/containerd/console/tc_darwin.go +++ b/vendor/github.com/containerd/console/tc_darwin.go @@ -19,7 +19,6 @@ package console import ( "fmt" "os" - "unsafe" "golang.org/x/sys/unix" ) @@ -29,18 +28,10 @@ const ( cmdTcSet = unix.TIOCSETA ) -func ioctl(fd, flag, data uintptr) error { - if _, _, err := unix.Syscall(unix.SYS_IOCTL, fd, flag, data); err != 0 { - return err - } - return nil -} - // unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. // unlockpt should be called before opening the slave side of a pty. 
func unlockpt(f *os.File) error { - var u int32 - return ioctl(f.Fd(), unix.TIOCPTYUNLK, uintptr(unsafe.Pointer(&u))) + return unix.IoctlSetPointerInt(int(f.Fd()), unix.TIOCPTYUNLK, 0) } // ptsname retrieves the name of the first available pts for the given master. diff --git a/vendor/github.com/containerd/console/tc_freebsd.go b/vendor/github.com/containerd/console/tc_freebsd_cgo.go similarity index 85% rename from vendor/github.com/containerd/console/tc_freebsd.go rename to vendor/github.com/containerd/console/tc_freebsd_cgo.go index 04583a6..0f3d272 100644 --- a/vendor/github.com/containerd/console/tc_freebsd.go +++ b/vendor/github.com/containerd/console/tc_freebsd_cgo.go @@ -1,3 +1,5 @@ +// +build freebsd,cgo + /* Copyright The containerd Authors. @@ -23,6 +25,12 @@ import ( "golang.org/x/sys/unix" ) +/* +#include +#include +*/ +import "C" + const ( cmdTcGet = unix.TIOCGETA cmdTcSet = unix.TIOCSETA @@ -30,8 +38,12 @@ const ( // unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. // unlockpt should be called before opening the slave side of a pty. -// This does not exist on FreeBSD, it does not allocate controlling terminals on open func unlockpt(f *os.File) error { + fd := C.int(f.Fd()) + if _, err := C.unlockpt(fd); err != nil { + C.close(fd) + return fmt.Errorf("unlockpt: %w", err) + } return nil } diff --git a/vendor/github.com/containerd/console/tc_freebsd_nocgo.go b/vendor/github.com/containerd/console/tc_freebsd_nocgo.go new file mode 100644 index 0000000..087fc15 --- /dev/null +++ b/vendor/github.com/containerd/console/tc_freebsd_nocgo.go @@ -0,0 +1,55 @@ +// +build freebsd,!cgo + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package console + +import ( + "fmt" + "os" + + "golang.org/x/sys/unix" +) + +const ( + cmdTcGet = unix.TIOCGETA + cmdTcSet = unix.TIOCSETA +) + +// +// Implementing the functions below requires cgo support. Non-cgo stubs +// versions are defined below to enable cross-compilation of source code +// that depends on these functions, but the resultant cross-compiled +// binaries cannot actually be used. If the stub function(s) below are +// actually invoked they will display an error message and cause the +// calling process to exit. +// + +// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. +// unlockpt should be called before opening the slave side of a pty. +func unlockpt(f *os.File) error { + panic("unlockpt() support requires cgo.") +} + +// ptsname retrieves the name of the first available pts for the given master. +func ptsname(f *os.File) (string, error) { + n, err := unix.IoctlGetInt(int(f.Fd()), unix.TIOCGPTN) + if err != nil { + return "", err + } + return fmt.Sprintf("/dev/pts/%d", n), nil +} diff --git a/vendor/github.com/containerd/console/tc_linux.go b/vendor/github.com/containerd/console/tc_linux.go index 1bdd68e..7d552ea 100644 --- a/vendor/github.com/containerd/console/tc_linux.go +++ b/vendor/github.com/containerd/console/tc_linux.go @@ -33,6 +33,7 @@ const ( // unlockpt should be called before opening the slave side of a pty. func unlockpt(f *os.File) error { var u int32 + // XXX do not use unix.IoctlSetPointerInt here, see commit dbd69c59b81. 
if _, _, err := unix.Syscall(unix.SYS_IOCTL, f.Fd(), unix.TIOCSPTLCK, uintptr(unsafe.Pointer(&u))); err != 0 { return err } @@ -42,6 +43,7 @@ func unlockpt(f *os.File) error { // ptsname retrieves the name of the first available pts for the given master. func ptsname(f *os.File) (string, error) { var u uint32 + // XXX do not use unix.IoctlGetInt here, see commit dbd69c59b81. if _, _, err := unix.Syscall(unix.SYS_IOCTL, f.Fd(), unix.TIOCGPTN, uintptr(unsafe.Pointer(&u))); err != 0 { return "", err } diff --git a/vendor/github.com/containerd/console/tc_netbsd.go b/vendor/github.com/containerd/console/tc_netbsd.go new file mode 100644 index 0000000..71227ae --- /dev/null +++ b/vendor/github.com/containerd/console/tc_netbsd.go @@ -0,0 +1,45 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package console + +import ( + "bytes" + "os" + + "golang.org/x/sys/unix" +) + +const ( + cmdTcGet = unix.TIOCGETA + cmdTcSet = unix.TIOCSETA +) + +// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. +// unlockpt should be called before opening the slave side of a pty. +// This does not exist on NetBSD, it does not allocate controlling terminals on open +func unlockpt(f *os.File) error { + return nil +} + +// ptsname retrieves the name of the first available pts for the given master. 
+func ptsname(f *os.File) (string, error) { + ptm, err := unix.IoctlGetPtmget(int(f.Fd()), unix.TIOCPTSNAME) + if err != nil { + return "", err + } + return string(ptm.Sn[:bytes.IndexByte(ptm.Sn[:], 0)]), nil +} diff --git a/vendor/github.com/containerd/console/tc_unix.go b/vendor/github.com/containerd/console/tc_unix.go index 7ae773c..a6bf01e 100644 --- a/vendor/github.com/containerd/console/tc_unix.go +++ b/vendor/github.com/containerd/console/tc_unix.go @@ -1,4 +1,4 @@ -// +build darwin freebsd linux openbsd solaris +// +build darwin freebsd linux netbsd openbsd solaris zos /* Copyright The containerd Authors. diff --git a/vendor/github.com/containerd/console/tc_zos.go b/vendor/github.com/containerd/console/tc_zos.go new file mode 100644 index 0000000..4262eaf --- /dev/null +++ b/vendor/github.com/containerd/console/tc_zos.go @@ -0,0 +1,26 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package console + +import ( + "golang.org/x/sys/unix" +) + +const ( + cmdTcGet = unix.TCGETS + cmdTcSet = unix.TCSETS +) diff --git a/vendor/github.com/coreos/go-systemd/README.md b/vendor/github.com/coreos/go-systemd/README.md deleted file mode 100644 index cde3a8f..0000000 --- a/vendor/github.com/coreos/go-systemd/README.md +++ /dev/null @@ -1,71 +0,0 @@ -# go-systemd - -[![Build Status](https://travis-ci.org/coreos/go-systemd.png?branch=master)](https://travis-ci.org/coreos/go-systemd) -[![godoc](https://godoc.org/github.com/coreos/go-systemd?status.svg)](http://godoc.org/github.com/coreos/go-systemd) -![minimum golang 1.10](https://img.shields.io/badge/golang-1.10%2B-orange.svg) - - -Go bindings to systemd. The project has several packages: - -- `activation` - for writing and using socket activation from Go -- `daemon` - for notifying systemd of service status changes -- `dbus` - for starting/stopping/inspecting running services and units -- `journal` - for writing to systemd's logging service, journald -- `sdjournal` - for reading from journald by wrapping its C API -- `login1` - for integration with the systemd logind API -- `machine1` - for registering machines/containers with systemd -- `unit` - for (de)serialization and comparison of unit files - -## Socket Activation - -An example HTTP server using socket activation can be quickly set up by following this README on a Linux machine running systemd: - -https://github.com/coreos/go-systemd/tree/master/examples/activation/httpserver - -## systemd Service Notification - -The `daemon` package is an implementation of the [sd_notify protocol](https://www.freedesktop.org/software/systemd/man/sd_notify.html#Description). It can be used to inform systemd of service start-up completion, watchdog events, and other status changes. - -## D-Bus - -The `dbus` package connects to the [systemd D-Bus API](http://www.freedesktop.org/wiki/Software/systemd/dbus/) and lets you start, stop and introspect systemd units. 
The API docs are here: - -http://godoc.org/github.com/coreos/go-systemd/dbus - -### Debugging - -Create `/etc/dbus-1/system-local.conf` that looks like this: - -``` - - - - - - - -``` - -## Journal - -### Writing to the Journal - -Using the pure-Go `journal` package you can submit journal entries directly to systemd's journal, taking advantage of features like indexed key/value pairs for each log entry. - -### Reading from the Journal - -The `sdjournal` package provides read access to the journal by wrapping around journald's native C API; consequently it requires cgo and the journal headers to be available. - -## logind - -The `login1` package provides functions to integrate with the [systemd logind API](http://www.freedesktop.org/wiki/Software/systemd/logind/). - -## machined - -The `machine1` package allows interaction with the [systemd machined D-Bus API](http://www.freedesktop.org/wiki/Software/systemd/machined/). - -## Units - -The `unit` package provides various functions for working with [systemd unit files](http://www.freedesktop.org/software/systemd/man/systemd.unit.html). diff --git a/vendor/github.com/coreos/go-systemd/dbus/methods.go b/vendor/github.com/coreos/go-systemd/dbus/methods.go deleted file mode 100644 index 5f2790a..0000000 --- a/vendor/github.com/coreos/go-systemd/dbus/methods.go +++ /dev/null @@ -1,594 +0,0 @@ -// Copyright 2015, 2018 CoreOS, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package dbus - -import ( - "errors" - "fmt" - "path" - "strconv" - - "github.com/godbus/dbus" -) - -func (c *Conn) jobComplete(signal *dbus.Signal) { - var id uint32 - var job dbus.ObjectPath - var unit string - var result string - dbus.Store(signal.Body, &id, &job, &unit, &result) - c.jobListener.Lock() - out, ok := c.jobListener.jobs[job] - if ok { - out <- result - delete(c.jobListener.jobs, job) - } - c.jobListener.Unlock() -} - -func (c *Conn) startJob(ch chan<- string, job string, args ...interface{}) (int, error) { - if ch != nil { - c.jobListener.Lock() - defer c.jobListener.Unlock() - } - - var p dbus.ObjectPath - err := c.sysobj.Call(job, 0, args...).Store(&p) - if err != nil { - return 0, err - } - - if ch != nil { - c.jobListener.jobs[p] = ch - } - - // ignore error since 0 is fine if conversion fails - jobID, _ := strconv.Atoi(path.Base(string(p))) - - return jobID, nil -} - -// StartUnit enqueues a start job and depending jobs, if any (unless otherwise -// specified by the mode string). -// -// Takes the unit to activate, plus a mode string. The mode needs to be one of -// replace, fail, isolate, ignore-dependencies, ignore-requirements. If -// "replace" the call will start the unit and its dependencies, possibly -// replacing already queued jobs that conflict with this. If "fail" the call -// will start the unit and its dependencies, but will fail if this would change -// an already queued job. If "isolate" the call will start the unit in question -// and terminate all units that aren't dependencies of it. If -// "ignore-dependencies" it will start a unit but ignore all its dependencies. -// If "ignore-requirements" it will start a unit but only ignore the -// requirement dependencies. It is not recommended to make use of the latter -// two options. -// -// If the provided channel is non-nil, a result string will be sent to it upon -// job completion: one of done, canceled, timeout, failed, dependency, skipped. 
-// done indicates successful execution of a job. canceled indicates that a job -// has been canceled before it finished execution. timeout indicates that the -// job timeout was reached. failed indicates that the job failed. dependency -// indicates that a job this job has been depending on failed and the job hence -// has been removed too. skipped indicates that a job was skipped because it -// didn't apply to the units current state. -// -// If no error occurs, the ID of the underlying systemd job will be returned. There -// does exist the possibility for no error to be returned, but for the returned job -// ID to be 0. In this case, the actual underlying ID is not 0 and this datapoint -// should not be considered authoritative. -// -// If an error does occur, it will be returned to the user alongside a job ID of 0. -func (c *Conn) StartUnit(name string, mode string, ch chan<- string) (int, error) { - return c.startJob(ch, "org.freedesktop.systemd1.Manager.StartUnit", name, mode) -} - -// StopUnit is similar to StartUnit but stops the specified unit rather -// than starting it. -func (c *Conn) StopUnit(name string, mode string, ch chan<- string) (int, error) { - return c.startJob(ch, "org.freedesktop.systemd1.Manager.StopUnit", name, mode) -} - -// ReloadUnit reloads a unit. Reloading is done only if the unit is already running and fails otherwise. -func (c *Conn) ReloadUnit(name string, mode string, ch chan<- string) (int, error) { - return c.startJob(ch, "org.freedesktop.systemd1.Manager.ReloadUnit", name, mode) -} - -// RestartUnit restarts a service. If a service is restarted that isn't -// running it will be started. -func (c *Conn) RestartUnit(name string, mode string, ch chan<- string) (int, error) { - return c.startJob(ch, "org.freedesktop.systemd1.Manager.RestartUnit", name, mode) -} - -// TryRestartUnit is like RestartUnit, except that a service that isn't running -// is not affected by the restart. 
-func (c *Conn) TryRestartUnit(name string, mode string, ch chan<- string) (int, error) { - return c.startJob(ch, "org.freedesktop.systemd1.Manager.TryRestartUnit", name, mode) -} - -// ReloadOrRestartUnit attempts a reload if the unit supports it and use a restart -// otherwise. -func (c *Conn) ReloadOrRestartUnit(name string, mode string, ch chan<- string) (int, error) { - return c.startJob(ch, "org.freedesktop.systemd1.Manager.ReloadOrRestartUnit", name, mode) -} - -// ReloadOrTryRestartUnit attempts a reload if the unit supports it and use a "Try" -// flavored restart otherwise. -func (c *Conn) ReloadOrTryRestartUnit(name string, mode string, ch chan<- string) (int, error) { - return c.startJob(ch, "org.freedesktop.systemd1.Manager.ReloadOrTryRestartUnit", name, mode) -} - -// StartTransientUnit() may be used to create and start a transient unit, which -// will be released as soon as it is not running or referenced anymore or the -// system is rebooted. name is the unit name including suffix, and must be -// unique. mode is the same as in StartUnit(), properties contains properties -// of the unit. -func (c *Conn) StartTransientUnit(name string, mode string, properties []Property, ch chan<- string) (int, error) { - return c.startJob(ch, "org.freedesktop.systemd1.Manager.StartTransientUnit", name, mode, properties, make([]PropertyCollection, 0)) -} - -// KillUnit takes the unit name and a UNIX signal number to send. All of the unit's -// processes are killed. -func (c *Conn) KillUnit(name string, signal int32) { - c.sysobj.Call("org.freedesktop.systemd1.Manager.KillUnit", 0, name, "all", signal).Store() -} - -// ResetFailedUnit resets the "failed" state of a specific unit. -func (c *Conn) ResetFailedUnit(name string) error { - return c.sysobj.Call("org.freedesktop.systemd1.Manager.ResetFailedUnit", 0, name).Store() -} - -// SystemState returns the systemd state. Equivalent to `systemctl is-system-running`. 
-func (c *Conn) SystemState() (*Property, error) { - var err error - var prop dbus.Variant - - obj := c.sysconn.Object("org.freedesktop.systemd1", "/org/freedesktop/systemd1") - err = obj.Call("org.freedesktop.DBus.Properties.Get", 0, "org.freedesktop.systemd1.Manager", "SystemState").Store(&prop) - if err != nil { - return nil, err - } - - return &Property{Name: "SystemState", Value: prop}, nil -} - -// getProperties takes the unit path and returns all of its dbus object properties, for the given dbus interface -func (c *Conn) getProperties(path dbus.ObjectPath, dbusInterface string) (map[string]interface{}, error) { - var err error - var props map[string]dbus.Variant - - if !path.IsValid() { - return nil, fmt.Errorf("invalid unit name: %v", path) - } - - obj := c.sysconn.Object("org.freedesktop.systemd1", path) - err = obj.Call("org.freedesktop.DBus.Properties.GetAll", 0, dbusInterface).Store(&props) - if err != nil { - return nil, err - } - - out := make(map[string]interface{}, len(props)) - for k, v := range props { - out[k] = v.Value() - } - - return out, nil -} - -// GetUnitProperties takes the (unescaped) unit name and returns all of its dbus object properties. -func (c *Conn) GetUnitProperties(unit string) (map[string]interface{}, error) { - path := unitPath(unit) - return c.getProperties(path, "org.freedesktop.systemd1.Unit") -} - -// GetUnitPathProperties takes the (escaped) unit path and returns all of its dbus object properties. 
-func (c *Conn) GetUnitPathProperties(path dbus.ObjectPath) (map[string]interface{}, error) { - return c.getProperties(path, "org.freedesktop.systemd1.Unit") -} - -func (c *Conn) getProperty(unit string, dbusInterface string, propertyName string) (*Property, error) { - var err error - var prop dbus.Variant - - path := unitPath(unit) - if !path.IsValid() { - return nil, errors.New("invalid unit name: " + unit) - } - - obj := c.sysconn.Object("org.freedesktop.systemd1", path) - err = obj.Call("org.freedesktop.DBus.Properties.Get", 0, dbusInterface, propertyName).Store(&prop) - if err != nil { - return nil, err - } - - return &Property{Name: propertyName, Value: prop}, nil -} - -func (c *Conn) GetUnitProperty(unit string, propertyName string) (*Property, error) { - return c.getProperty(unit, "org.freedesktop.systemd1.Unit", propertyName) -} - -// GetServiceProperty returns property for given service name and property name -func (c *Conn) GetServiceProperty(service string, propertyName string) (*Property, error) { - return c.getProperty(service, "org.freedesktop.systemd1.Service", propertyName) -} - -// GetUnitTypeProperties returns the extra properties for a unit, specific to the unit type. -// Valid values for unitType: Service, Socket, Target, Device, Mount, Automount, Snapshot, Timer, Swap, Path, Slice, Scope -// return "dbus.Error: Unknown interface" if the unitType is not the correct type of the unit -func (c *Conn) GetUnitTypeProperties(unit string, unitType string) (map[string]interface{}, error) { - path := unitPath(unit) - return c.getProperties(path, "org.freedesktop.systemd1."+unitType) -} - -// SetUnitProperties() may be used to modify certain unit properties at runtime. -// Not all properties may be changed at runtime, but many resource management -// settings (primarily those in systemd.cgroup(5)) may. 
The changes are applied -// instantly, and stored on disk for future boots, unless runtime is true, in which -// case the settings only apply until the next reboot. name is the name of the unit -// to modify. properties are the settings to set, encoded as an array of property -// name and value pairs. -func (c *Conn) SetUnitProperties(name string, runtime bool, properties ...Property) error { - return c.sysobj.Call("org.freedesktop.systemd1.Manager.SetUnitProperties", 0, name, runtime, properties).Store() -} - -func (c *Conn) GetUnitTypeProperty(unit string, unitType string, propertyName string) (*Property, error) { - return c.getProperty(unit, "org.freedesktop.systemd1."+unitType, propertyName) -} - -type UnitStatus struct { - Name string // The primary unit name as string - Description string // The human readable description string - LoadState string // The load state (i.e. whether the unit file has been loaded successfully) - ActiveState string // The active state (i.e. whether the unit is currently started or not) - SubState string // The sub state (a more fine-grained version of the active state that is specific to the unit type, which the active state is not) - Followed string // A unit that is being followed in its state by this unit, if there is any, otherwise the empty string. 
- Path dbus.ObjectPath // The unit object path - JobId uint32 // If there is a job queued for the job unit the numeric job id, 0 otherwise - JobType string // The job type as string - JobPath dbus.ObjectPath // The job object path -} - -type storeFunc func(retvalues ...interface{}) error - -func (c *Conn) listUnitsInternal(f storeFunc) ([]UnitStatus, error) { - result := make([][]interface{}, 0) - err := f(&result) - if err != nil { - return nil, err - } - - resultInterface := make([]interface{}, len(result)) - for i := range result { - resultInterface[i] = result[i] - } - - status := make([]UnitStatus, len(result)) - statusInterface := make([]interface{}, len(status)) - for i := range status { - statusInterface[i] = &status[i] - } - - err = dbus.Store(resultInterface, statusInterface...) - if err != nil { - return nil, err - } - - return status, nil -} - -// ListUnits returns an array with all currently loaded units. Note that -// units may be known by multiple names at the same time, and hence there might -// be more unit names loaded than actual units behind them. -// Also note that a unit is only loaded if it is active and/or enabled. -// Units that are both disabled and inactive will thus not be returned. -func (c *Conn) ListUnits() ([]UnitStatus, error) { - return c.listUnitsInternal(c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnits", 0).Store) -} - -// ListUnitsFiltered returns an array with units filtered by state. -// It takes a list of units' statuses to filter. -func (c *Conn) ListUnitsFiltered(states []string) ([]UnitStatus, error) { - return c.listUnitsInternal(c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnitsFiltered", 0, states).Store) -} - -// ListUnitsByPatterns returns an array with units. -// It takes a list of units' statuses and names to filter. -// Note that units may be known by multiple names at the same time, -// and hence there might be more unit names loaded than actual units behind them. 
-func (c *Conn) ListUnitsByPatterns(states []string, patterns []string) ([]UnitStatus, error) { - return c.listUnitsInternal(c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnitsByPatterns", 0, states, patterns).Store) -} - -// ListUnitsByNames returns an array with units. It takes a list of units' -// names and returns an UnitStatus array. Comparing to ListUnitsByPatterns -// method, this method returns statuses even for inactive or non-existing -// units. Input array should contain exact unit names, but not patterns. -// Note: Requires systemd v230 or higher -func (c *Conn) ListUnitsByNames(units []string) ([]UnitStatus, error) { - return c.listUnitsInternal(c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnitsByNames", 0, units).Store) -} - -type UnitFile struct { - Path string - Type string -} - -func (c *Conn) listUnitFilesInternal(f storeFunc) ([]UnitFile, error) { - result := make([][]interface{}, 0) - err := f(&result) - if err != nil { - return nil, err - } - - resultInterface := make([]interface{}, len(result)) - for i := range result { - resultInterface[i] = result[i] - } - - files := make([]UnitFile, len(result)) - fileInterface := make([]interface{}, len(files)) - for i := range files { - fileInterface[i] = &files[i] - } - - err = dbus.Store(resultInterface, fileInterface...) - if err != nil { - return nil, err - } - - return files, nil -} - -// ListUnitFiles returns an array of all available units on disk. -func (c *Conn) ListUnitFiles() ([]UnitFile, error) { - return c.listUnitFilesInternal(c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnitFiles", 0).Store) -} - -// ListUnitFilesByPatterns returns an array of all available units on disk matched the patterns. 
-func (c *Conn) ListUnitFilesByPatterns(states []string, patterns []string) ([]UnitFile, error) { - return c.listUnitFilesInternal(c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnitFilesByPatterns", 0, states, patterns).Store) -} - -type LinkUnitFileChange EnableUnitFileChange - -// LinkUnitFiles() links unit files (that are located outside of the -// usual unit search paths) into the unit search path. -// -// It takes a list of absolute paths to unit files to link and two -// booleans. The first boolean controls whether the unit shall be -// enabled for runtime only (true, /run), or persistently (false, -// /etc). -// The second controls whether symlinks pointing to other units shall -// be replaced if necessary. -// -// This call returns a list of the changes made. The list consists of -// structures with three strings: the type of the change (one of symlink -// or unlink), the file name of the symlink and the destination of the -// symlink. -func (c *Conn) LinkUnitFiles(files []string, runtime bool, force bool) ([]LinkUnitFileChange, error) { - result := make([][]interface{}, 0) - err := c.sysobj.Call("org.freedesktop.systemd1.Manager.LinkUnitFiles", 0, files, runtime, force).Store(&result) - if err != nil { - return nil, err - } - - resultInterface := make([]interface{}, len(result)) - for i := range result { - resultInterface[i] = result[i] - } - - changes := make([]LinkUnitFileChange, len(result)) - changesInterface := make([]interface{}, len(changes)) - for i := range changes { - changesInterface[i] = &changes[i] - } - - err = dbus.Store(resultInterface, changesInterface...) - if err != nil { - return nil, err - } - - return changes, nil -} - -// EnableUnitFiles() may be used to enable one or more units in the system (by -// creating symlinks to them in /etc or /run). 
-// -// It takes a list of unit files to enable (either just file names or full -// absolute paths if the unit files are residing outside the usual unit -// search paths), and two booleans: the first controls whether the unit shall -// be enabled for runtime only (true, /run), or persistently (false, /etc). -// The second one controls whether symlinks pointing to other units shall -// be replaced if necessary. -// -// This call returns one boolean and an array with the changes made. The -// boolean signals whether the unit files contained any enablement -// information (i.e. an [Install]) section. The changes list consists of -// structures with three strings: the type of the change (one of symlink -// or unlink), the file name of the symlink and the destination of the -// symlink. -func (c *Conn) EnableUnitFiles(files []string, runtime bool, force bool) (bool, []EnableUnitFileChange, error) { - var carries_install_info bool - - result := make([][]interface{}, 0) - err := c.sysobj.Call("org.freedesktop.systemd1.Manager.EnableUnitFiles", 0, files, runtime, force).Store(&carries_install_info, &result) - if err != nil { - return false, nil, err - } - - resultInterface := make([]interface{}, len(result)) - for i := range result { - resultInterface[i] = result[i] - } - - changes := make([]EnableUnitFileChange, len(result)) - changesInterface := make([]interface{}, len(changes)) - for i := range changes { - changesInterface[i] = &changes[i] - } - - err = dbus.Store(resultInterface, changesInterface...) - if err != nil { - return false, nil, err - } - - return carries_install_info, changes, nil -} - -type EnableUnitFileChange struct { - Type string // Type of the change (one of symlink or unlink) - Filename string // File name of the symlink - Destination string // Destination of the symlink -} - -// DisableUnitFiles() may be used to disable one or more units in the system (by -// removing symlinks to them from /etc or /run). 
-// -// It takes a list of unit files to disable (either just file names or full -// absolute paths if the unit files are residing outside the usual unit -// search paths), and one boolean: whether the unit was enabled for runtime -// only (true, /run), or persistently (false, /etc). -// -// This call returns an array with the changes made. The changes list -// consists of structures with three strings: the type of the change (one of -// symlink or unlink), the file name of the symlink and the destination of the -// symlink. -func (c *Conn) DisableUnitFiles(files []string, runtime bool) ([]DisableUnitFileChange, error) { - result := make([][]interface{}, 0) - err := c.sysobj.Call("org.freedesktop.systemd1.Manager.DisableUnitFiles", 0, files, runtime).Store(&result) - if err != nil { - return nil, err - } - - resultInterface := make([]interface{}, len(result)) - for i := range result { - resultInterface[i] = result[i] - } - - changes := make([]DisableUnitFileChange, len(result)) - changesInterface := make([]interface{}, len(changes)) - for i := range changes { - changesInterface[i] = &changes[i] - } - - err = dbus.Store(resultInterface, changesInterface...) - if err != nil { - return nil, err - } - - return changes, nil -} - -type DisableUnitFileChange struct { - Type string // Type of the change (one of symlink or unlink) - Filename string // File name of the symlink - Destination string // Destination of the symlink -} - -// MaskUnitFiles masks one or more units in the system -// -// It takes three arguments: -// * list of units to mask (either just file names or full -// absolute paths if the unit files are residing outside -// the usual unit search paths) -// * runtime to specify whether the unit was enabled for runtime -// only (true, /run/systemd/..), or persistently (false, /etc/systemd/..) 
-// * force flag -func (c *Conn) MaskUnitFiles(files []string, runtime bool, force bool) ([]MaskUnitFileChange, error) { - result := make([][]interface{}, 0) - err := c.sysobj.Call("org.freedesktop.systemd1.Manager.MaskUnitFiles", 0, files, runtime, force).Store(&result) - if err != nil { - return nil, err - } - - resultInterface := make([]interface{}, len(result)) - for i := range result { - resultInterface[i] = result[i] - } - - changes := make([]MaskUnitFileChange, len(result)) - changesInterface := make([]interface{}, len(changes)) - for i := range changes { - changesInterface[i] = &changes[i] - } - - err = dbus.Store(resultInterface, changesInterface...) - if err != nil { - return nil, err - } - - return changes, nil -} - -type MaskUnitFileChange struct { - Type string // Type of the change (one of symlink or unlink) - Filename string // File name of the symlink - Destination string // Destination of the symlink -} - -// UnmaskUnitFiles unmasks one or more units in the system -// -// It takes two arguments: -// * list of unit files to mask (either just file names or full -// absolute paths if the unit files are residing outside -// the usual unit search paths) -// * runtime to specify whether the unit was enabled for runtime -// only (true, /run/systemd/..), or persistently (false, /etc/systemd/..) -func (c *Conn) UnmaskUnitFiles(files []string, runtime bool) ([]UnmaskUnitFileChange, error) { - result := make([][]interface{}, 0) - err := c.sysobj.Call("org.freedesktop.systemd1.Manager.UnmaskUnitFiles", 0, files, runtime).Store(&result) - if err != nil { - return nil, err - } - - resultInterface := make([]interface{}, len(result)) - for i := range result { - resultInterface[i] = result[i] - } - - changes := make([]UnmaskUnitFileChange, len(result)) - changesInterface := make([]interface{}, len(changes)) - for i := range changes { - changesInterface[i] = &changes[i] - } - - err = dbus.Store(resultInterface, changesInterface...) 
- if err != nil { - return nil, err - } - - return changes, nil -} - -type UnmaskUnitFileChange struct { - Type string // Type of the change (one of symlink or unlink) - Filename string // File name of the symlink - Destination string // Destination of the symlink -} - -// Reload instructs systemd to scan for and reload unit files. This is -// equivalent to a 'systemctl daemon-reload'. -func (c *Conn) Reload() error { - return c.sysobj.Call("org.freedesktop.systemd1.Manager.Reload", 0).Store() -} - -func unitPath(name string) dbus.ObjectPath { - return dbus.ObjectPath("/org/freedesktop/systemd1/unit/" + PathBusEscape(name)) -} - -// unitName returns the unescaped base element of the supplied escaped path -func unitName(dpath dbus.ObjectPath) string { - return pathBusUnescape(path.Base(string(dpath))) -} diff --git a/vendor/github.com/coreos/go-systemd/LICENSE b/vendor/github.com/coreos/go-systemd/v22/LICENSE similarity index 100% rename from vendor/github.com/coreos/go-systemd/LICENSE rename to vendor/github.com/coreos/go-systemd/v22/LICENSE diff --git a/vendor/github.com/coreos/go-systemd/NOTICE b/vendor/github.com/coreos/go-systemd/v22/NOTICE similarity index 100% rename from vendor/github.com/coreos/go-systemd/NOTICE rename to vendor/github.com/coreos/go-systemd/v22/NOTICE diff --git a/vendor/github.com/coreos/go-systemd/activation/files.go b/vendor/github.com/coreos/go-systemd/v22/activation/files_unix.go similarity index 98% rename from vendor/github.com/coreos/go-systemd/activation/files.go rename to vendor/github.com/coreos/go-systemd/v22/activation/files_unix.go index 29dd18d..fc7db98 100644 --- a/vendor/github.com/coreos/go-systemd/activation/files.go +++ b/vendor/github.com/coreos/go-systemd/v22/activation/files_unix.go @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +// +build !windows + // Package activation implements primitives for systemd socket activation. 
package activation diff --git a/vendor/github.com/coreos/go-systemd/v22/activation/files_windows.go b/vendor/github.com/coreos/go-systemd/v22/activation/files_windows.go new file mode 100644 index 0000000..d391bf0 --- /dev/null +++ b/vendor/github.com/coreos/go-systemd/v22/activation/files_windows.go @@ -0,0 +1,21 @@ +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package activation + +import "os" + +func Files(unsetEnv bool) []*os.File { + return nil +} diff --git a/vendor/github.com/coreos/go-systemd/activation/listeners.go b/vendor/github.com/coreos/go-systemd/v22/activation/listeners.go similarity index 97% rename from vendor/github.com/coreos/go-systemd/activation/listeners.go rename to vendor/github.com/coreos/go-systemd/v22/activation/listeners.go index bb5cc23..3dbe2b0 100644 --- a/vendor/github.com/coreos/go-systemd/activation/listeners.go +++ b/vendor/github.com/coreos/go-systemd/v22/activation/listeners.go @@ -67,7 +67,7 @@ func TLSListeners(tlsConfig *tls.Config) ([]net.Listener, error) { return nil, err } - if tlsConfig != nil && err == nil { + if tlsConfig != nil { for i, l := range listeners { // Activate TLS only for TCP sockets if l.Addr().Network() == "tcp" { @@ -88,7 +88,7 @@ func TLSListenersWithNames(tlsConfig *tls.Config) (map[string][]net.Listener, er return nil, err } - if tlsConfig != nil && err == nil { + if tlsConfig != nil { for _, ll := range listeners { // Activate TLS only for TCP sockets for 
i, l := range ll { diff --git a/vendor/github.com/coreos/go-systemd/activation/packetconns.go b/vendor/github.com/coreos/go-systemd/v22/activation/packetconns.go similarity index 100% rename from vendor/github.com/coreos/go-systemd/activation/packetconns.go rename to vendor/github.com/coreos/go-systemd/v22/activation/packetconns.go diff --git a/vendor/github.com/coreos/go-systemd/dbus/dbus.go b/vendor/github.com/coreos/go-systemd/v22/dbus/dbus.go similarity index 74% rename from vendor/github.com/coreos/go-systemd/dbus/dbus.go rename to vendor/github.com/coreos/go-systemd/v22/dbus/dbus.go index f652582..cff5af1 100644 --- a/vendor/github.com/coreos/go-systemd/dbus/dbus.go +++ b/vendor/github.com/coreos/go-systemd/v22/dbus/dbus.go @@ -16,6 +16,7 @@ package dbus import ( + "context" "encoding/hex" "fmt" "os" @@ -23,7 +24,7 @@ import ( "strings" "sync" - "github.com/godbus/dbus" + "github.com/godbus/dbus/v5" ) const ( @@ -110,46 +111,66 @@ type Conn struct { } } -// New establishes a connection to any available bus and authenticates. -// Callers should call Close() when done with the connection. +// Deprecated: use NewWithContext instead. func New() (*Conn, error) { - conn, err := NewSystemConnection() + return NewWithContext(context.Background()) +} + +// NewWithContext establishes a connection to any available bus and authenticates. +// Callers should call Close() when done with the connection. +func NewWithContext(ctx context.Context) (*Conn, error) { + conn, err := NewSystemConnectionContext(ctx) if err != nil && os.Geteuid() == 0 { - return NewSystemdConnection() + return NewSystemdConnectionContext(ctx) } return conn, err } -// NewSystemConnection establishes a connection to the system bus and authenticates. -// Callers should call Close() when done with the connection +// Deprecated: use NewSystemConnectionContext instead. 
func NewSystemConnection() (*Conn, error) { + return NewSystemConnectionContext(context.Background()) +} + +// NewSystemConnectionContext establishes a connection to the system bus and authenticates. +// Callers should call Close() when done with the connection. +func NewSystemConnectionContext(ctx context.Context) (*Conn, error) { return NewConnection(func() (*dbus.Conn, error) { - return dbusAuthHelloConnection(dbus.SystemBusPrivate) + return dbusAuthHelloConnection(ctx, dbus.SystemBusPrivate) }) } -// NewUserConnection establishes a connection to the session bus and +// Deprecated: use NewUserConnectionContext instead. +func NewUserConnection() (*Conn, error) { + return NewUserConnectionContext(context.Background()) +} + +// NewUserConnectionContext establishes a connection to the session bus and // authenticates. This can be used to connect to systemd user instances. // Callers should call Close() when done with the connection. -func NewUserConnection() (*Conn, error) { +func NewUserConnectionContext(ctx context.Context) (*Conn, error) { return NewConnection(func() (*dbus.Conn, error) { - return dbusAuthHelloConnection(dbus.SessionBusPrivate) + return dbusAuthHelloConnection(ctx, dbus.SessionBusPrivate) }) } -// NewSystemdConnection establishes a private, direct connection to systemd. +// Deprecated: use NewSystemdConnectionContext instead. +func NewSystemdConnection() (*Conn, error) { + return NewSystemdConnectionContext(context.Background()) +} + +// NewSystemdConnectionContext establishes a private, direct connection to systemd. // This can be used for communicating with systemd without a dbus daemon. // Callers should call Close() when done with the connection. -func NewSystemdConnection() (*Conn, error) { +func NewSystemdConnectionContext(ctx context.Context) (*Conn, error) { return NewConnection(func() (*dbus.Conn, error) { // We skip Hello when talking directly to systemd. 
- return dbusAuthConnection(func(opts ...dbus.ConnOption) (*dbus.Conn, error) { - return dbus.Dial("unix:path=/run/systemd/private") + return dbusAuthConnection(ctx, func(opts ...dbus.ConnOption) (*dbus.Conn, error) { + return dbus.Dial("unix:path=/run/systemd/private", opts...) }) }) } -// Close closes an established connection +// Close closes an established connection. func (c *Conn) Close() { c.sysconn.Close() c.sigconn.Close() @@ -192,7 +213,7 @@ func NewConnection(dialBus func() (*dbus.Conn, error)) (*Conn, error) { // GetManagerProperty returns the value of a property on the org.freedesktop.systemd1.Manager // interface. The value is returned in its string representation, as defined at -// https://developer.gnome.org/glib/unstable/gvariant-text.html +// https://developer.gnome.org/glib/unstable/gvariant-text.html. func (c *Conn) GetManagerProperty(prop string) (string, error) { variant, err := c.sysobj.GetProperty("org.freedesktop.systemd1.Manager." + prop) if err != nil { @@ -201,8 +222,8 @@ func (c *Conn) GetManagerProperty(prop string) (string, error) { return variant.String(), nil } -func dbusAuthConnection(createBus func(opts ...dbus.ConnOption) (*dbus.Conn, error)) (*dbus.Conn, error) { - conn, err := createBus() +func dbusAuthConnection(ctx context.Context, createBus func(opts ...dbus.ConnOption) (*dbus.Conn, error)) (*dbus.Conn, error) { + conn, err := createBus(dbus.WithContext(ctx)) if err != nil { return nil, err } @@ -221,8 +242,8 @@ func dbusAuthConnection(createBus func(opts ...dbus.ConnOption) (*dbus.Conn, err return conn, nil } -func dbusAuthHelloConnection(createBus func(opts ...dbus.ConnOption) (*dbus.Conn, error)) (*dbus.Conn, error) { - conn, err := dbusAuthConnection(createBus) +func dbusAuthHelloConnection(ctx context.Context, createBus func(opts ...dbus.ConnOption) (*dbus.Conn, error)) (*dbus.Conn, error) { + conn, err := dbusAuthConnection(ctx, createBus) if err != nil { return nil, err } diff --git 
a/vendor/github.com/coreos/go-systemd/v22/dbus/methods.go b/vendor/github.com/coreos/go-systemd/v22/dbus/methods.go new file mode 100644 index 0000000..fa04afc --- /dev/null +++ b/vendor/github.com/coreos/go-systemd/v22/dbus/methods.go @@ -0,0 +1,830 @@ +// Copyright 2015, 2018 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package dbus + +import ( + "context" + "errors" + "fmt" + "path" + "strconv" + + "github.com/godbus/dbus/v5" +) + +// Who can be used to specify which process to kill in the unit via the KillUnitWithTarget API +type Who string + +const ( + // All sends the signal to all processes in the unit + All Who = "all" + // Main sends the signal to the main process of the unit + Main Who = "main" + // Control sends the signal to the control process of the unit + Control Who = "control" +) + +func (c *Conn) jobComplete(signal *dbus.Signal) { + var id uint32 + var job dbus.ObjectPath + var unit string + var result string + dbus.Store(signal.Body, &id, &job, &unit, &result) + c.jobListener.Lock() + out, ok := c.jobListener.jobs[job] + if ok { + out <- result + delete(c.jobListener.jobs, job) + } + c.jobListener.Unlock() +} + +func (c *Conn) startJob(ctx context.Context, ch chan<- string, job string, args ...interface{}) (int, error) { + if ch != nil { + c.jobListener.Lock() + defer c.jobListener.Unlock() + } + + var p dbus.ObjectPath + err := c.sysobj.CallWithContext(ctx, job, 0, args...).Store(&p) + if err != nil { + return 0, err + 
} + + if ch != nil { + c.jobListener.jobs[p] = ch + } + + // ignore error since 0 is fine if conversion fails + jobID, _ := strconv.Atoi(path.Base(string(p))) + + return jobID, nil +} + +// Deprecated: use StartUnitContext instead. +func (c *Conn) StartUnit(name string, mode string, ch chan<- string) (int, error) { + return c.StartUnitContext(context.Background(), name, mode, ch) +} + +// StartUnitContext enqueues a start job and depending jobs, if any (unless otherwise +// specified by the mode string). +// +// Takes the unit to activate, plus a mode string. The mode needs to be one of +// replace, fail, isolate, ignore-dependencies, ignore-requirements. If +// "replace" the call will start the unit and its dependencies, possibly +// replacing already queued jobs that conflict with this. If "fail" the call +// will start the unit and its dependencies, but will fail if this would change +// an already queued job. If "isolate" the call will start the unit in question +// and terminate all units that aren't dependencies of it. If +// "ignore-dependencies" it will start a unit but ignore all its dependencies. +// If "ignore-requirements" it will start a unit but only ignore the +// requirement dependencies. It is not recommended to make use of the latter +// two options. +// +// If the provided channel is non-nil, a result string will be sent to it upon +// job completion: one of done, canceled, timeout, failed, dependency, skipped. +// done indicates successful execution of a job. canceled indicates that a job +// has been canceled before it finished execution. timeout indicates that the +// job timeout was reached. failed indicates that the job failed. dependency +// indicates that a job this job has been depending on failed and the job hence +// has been removed too. skipped indicates that a job was skipped because it +// didn't apply to the units current state. +// +// If no error occurs, the ID of the underlying systemd job will be returned. 
There +// does exist the possibility for no error to be returned, but for the returned job +// ID to be 0. In this case, the actual underlying ID is not 0 and this datapoint +// should not be considered authoritative. +// +// If an error does occur, it will be returned to the user alongside a job ID of 0. +func (c *Conn) StartUnitContext(ctx context.Context, name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ctx, ch, "org.freedesktop.systemd1.Manager.StartUnit", name, mode) +} + +// Deprecated: use StopUnitContext instead. +func (c *Conn) StopUnit(name string, mode string, ch chan<- string) (int, error) { + return c.StopUnitContext(context.Background(), name, mode, ch) +} + +// StopUnitContext is similar to StartUnitContext, but stops the specified unit +// rather than starting it. +func (c *Conn) StopUnitContext(ctx context.Context, name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ctx, ch, "org.freedesktop.systemd1.Manager.StopUnit", name, mode) +} + +// Deprecated: use ReloadUnitContext instead. +func (c *Conn) ReloadUnit(name string, mode string, ch chan<- string) (int, error) { + return c.ReloadUnitContext(context.Background(), name, mode, ch) +} + +// ReloadUnitContext reloads a unit. Reloading is done only if the unit +// is already running, and fails otherwise. +func (c *Conn) ReloadUnitContext(ctx context.Context, name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ctx, ch, "org.freedesktop.systemd1.Manager.ReloadUnit", name, mode) +} + +// Deprecated: use RestartUnitContext instead. +func (c *Conn) RestartUnit(name string, mode string, ch chan<- string) (int, error) { + return c.RestartUnitContext(context.Background(), name, mode, ch) +} + +// RestartUnitContext restarts a service. If a service is restarted that isn't +// running it will be started. 
+func (c *Conn) RestartUnitContext(ctx context.Context, name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ctx, ch, "org.freedesktop.systemd1.Manager.RestartUnit", name, mode) +} + +// Deprecated: use TryRestartUnitContext instead. +func (c *Conn) TryRestartUnit(name string, mode string, ch chan<- string) (int, error) { + return c.TryRestartUnitContext(context.Background(), name, mode, ch) +} + +// TryRestartUnitContext is like RestartUnitContext, except that a service that +// isn't running is not affected by the restart. +func (c *Conn) TryRestartUnitContext(ctx context.Context, name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ctx, ch, "org.freedesktop.systemd1.Manager.TryRestartUnit", name, mode) +} + +// Deprecated: use ReloadOrRestartUnitContext instead. +func (c *Conn) ReloadOrRestartUnit(name string, mode string, ch chan<- string) (int, error) { + return c.ReloadOrRestartUnitContext(context.Background(), name, mode, ch) +} + +// ReloadOrRestartUnitContext attempts a reload if the unit supports it and use +// a restart otherwise. +func (c *Conn) ReloadOrRestartUnitContext(ctx context.Context, name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ctx, ch, "org.freedesktop.systemd1.Manager.ReloadOrRestartUnit", name, mode) +} + +// Deprecated: use ReloadOrTryRestartUnitContext instead. +func (c *Conn) ReloadOrTryRestartUnit(name string, mode string, ch chan<- string) (int, error) { + return c.ReloadOrTryRestartUnitContext(context.Background(), name, mode, ch) +} + +// ReloadOrTryRestartUnitContext attempts a reload if the unit supports it, +// and use a "Try" flavored restart otherwise. +func (c *Conn) ReloadOrTryRestartUnitContext(ctx context.Context, name string, mode string, ch chan<- string) (int, error) { + return c.startJob(ctx, ch, "org.freedesktop.systemd1.Manager.ReloadOrTryRestartUnit", name, mode) +} + +// Deprecated: use StartTransientUnitContext instead. 
+func (c *Conn) StartTransientUnit(name string, mode string, properties []Property, ch chan<- string) (int, error) { + return c.StartTransientUnitContext(context.Background(), name, mode, properties, ch) +} + +// StartTransientUnitContext may be used to create and start a transient unit, which +// will be released as soon as it is not running or referenced anymore or the +// system is rebooted. name is the unit name including suffix, and must be +// unique. mode is the same as in StartUnitContext, properties contains properties +// of the unit. +func (c *Conn) StartTransientUnitContext(ctx context.Context, name string, mode string, properties []Property, ch chan<- string) (int, error) { + return c.startJob(ctx, ch, "org.freedesktop.systemd1.Manager.StartTransientUnit", name, mode, properties, make([]PropertyCollection, 0)) +} + +// Deprecated: use KillUnitContext instead. +func (c *Conn) KillUnit(name string, signal int32) { + c.KillUnitContext(context.Background(), name, signal) +} + +// KillUnitContext takes the unit name and a UNIX signal number to send. +// All of the unit's processes are killed. +func (c *Conn) KillUnitContext(ctx context.Context, name string, signal int32) { + c.KillUnitWithTarget(ctx, name, All, signal) +} + +// KillUnitWithTarget is like KillUnitContext, but allows you to specify which +// process in the unit to send the signal to. +func (c *Conn) KillUnitWithTarget(ctx context.Context, name string, target Who, signal int32) error { + return c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.KillUnit", 0, name, string(target), signal).Store() +} + +// Deprecated: use ResetFailedUnitContext instead. +func (c *Conn) ResetFailedUnit(name string) error { + return c.ResetFailedUnitContext(context.Background(), name) +} + +// ResetFailedUnitContext resets the "failed" state of a specific unit. 
+func (c *Conn) ResetFailedUnitContext(ctx context.Context, name string) error { + return c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.ResetFailedUnit", 0, name).Store() +} + +// Deprecated: use SystemStateContext instead. +func (c *Conn) SystemState() (*Property, error) { + return c.SystemStateContext(context.Background()) +} + +// SystemStateContext returns the systemd state. Equivalent to +// systemctl is-system-running. +func (c *Conn) SystemStateContext(ctx context.Context) (*Property, error) { + var err error + var prop dbus.Variant + + obj := c.sysconn.Object("org.freedesktop.systemd1", "/org/freedesktop/systemd1") + err = obj.CallWithContext(ctx, "org.freedesktop.DBus.Properties.Get", 0, "org.freedesktop.systemd1.Manager", "SystemState").Store(&prop) + if err != nil { + return nil, err + } + + return &Property{Name: "SystemState", Value: prop}, nil +} + +// getProperties takes the unit path and returns all of its dbus object properties, for the given dbus interface. +func (c *Conn) getProperties(ctx context.Context, path dbus.ObjectPath, dbusInterface string) (map[string]interface{}, error) { + var err error + var props map[string]dbus.Variant + + if !path.IsValid() { + return nil, fmt.Errorf("invalid unit name: %v", path) + } + + obj := c.sysconn.Object("org.freedesktop.systemd1", path) + err = obj.CallWithContext(ctx, "org.freedesktop.DBus.Properties.GetAll", 0, dbusInterface).Store(&props) + if err != nil { + return nil, err + } + + out := make(map[string]interface{}, len(props)) + for k, v := range props { + out[k] = v.Value() + } + + return out, nil +} + +// Deprecated: use GetUnitPropertiesContext instead. +func (c *Conn) GetUnitProperties(unit string) (map[string]interface{}, error) { + return c.GetUnitPropertiesContext(context.Background(), unit) +} + +// GetUnitPropertiesContext takes the (unescaped) unit name and returns all of +// its dbus object properties. 
+func (c *Conn) GetUnitPropertiesContext(ctx context.Context, unit string) (map[string]interface{}, error) { + path := unitPath(unit) + return c.getProperties(ctx, path, "org.freedesktop.systemd1.Unit") +} + +// Deprecated: use GetUnitPathPropertiesContext instead. +func (c *Conn) GetUnitPathProperties(path dbus.ObjectPath) (map[string]interface{}, error) { + return c.GetUnitPathPropertiesContext(context.Background(), path) +} + +// GetUnitPathPropertiesContext takes the (escaped) unit path and returns all +// of its dbus object properties. +func (c *Conn) GetUnitPathPropertiesContext(ctx context.Context, path dbus.ObjectPath) (map[string]interface{}, error) { + return c.getProperties(ctx, path, "org.freedesktop.systemd1.Unit") +} + +// Deprecated: use GetAllPropertiesContext instead. +func (c *Conn) GetAllProperties(unit string) (map[string]interface{}, error) { + return c.GetAllPropertiesContext(context.Background(), unit) +} + +// GetAllPropertiesContext takes the (unescaped) unit name and returns all of +// its dbus object properties. +func (c *Conn) GetAllPropertiesContext(ctx context.Context, unit string) (map[string]interface{}, error) { + path := unitPath(unit) + return c.getProperties(ctx, path, "") +} + +func (c *Conn) getProperty(ctx context.Context, unit string, dbusInterface string, propertyName string) (*Property, error) { + var err error + var prop dbus.Variant + + path := unitPath(unit) + if !path.IsValid() { + return nil, errors.New("invalid unit name: " + unit) + } + + obj := c.sysconn.Object("org.freedesktop.systemd1", path) + err = obj.CallWithContext(ctx, "org.freedesktop.DBus.Properties.Get", 0, dbusInterface, propertyName).Store(&prop) + if err != nil { + return nil, err + } + + return &Property{Name: propertyName, Value: prop}, nil +} + +// Deprecated: use GetUnitPropertyContext instead. 
+func (c *Conn) GetUnitProperty(unit string, propertyName string) (*Property, error) { + return c.GetUnitPropertyContext(context.Background(), unit, propertyName) +} + +// GetUnitPropertyContext takes an (unescaped) unit name, and a property name, +// and returns the property value. +func (c *Conn) GetUnitPropertyContext(ctx context.Context, unit string, propertyName string) (*Property, error) { + return c.getProperty(ctx, unit, "org.freedesktop.systemd1.Unit", propertyName) +} + +// Deprecated: use GetServicePropertyContext instead. +func (c *Conn) GetServiceProperty(service string, propertyName string) (*Property, error) { + return c.GetServicePropertyContext(context.Background(), service, propertyName) +} + +// GetServiceProperty returns property for given service name and property name. +func (c *Conn) GetServicePropertyContext(ctx context.Context, service string, propertyName string) (*Property, error) { + return c.getProperty(ctx, service, "org.freedesktop.systemd1.Service", propertyName) +} + +// Deprecated: use GetUnitTypePropertiesContext instead. +func (c *Conn) GetUnitTypeProperties(unit string, unitType string) (map[string]interface{}, error) { + return c.GetUnitTypePropertiesContext(context.Background(), unit, unitType) +} + +// GetUnitTypePropertiesContext returns the extra properties for a unit, specific to the unit type. +// Valid values for unitType: Service, Socket, Target, Device, Mount, Automount, Snapshot, Timer, Swap, Path, Slice, Scope. +// Returns "dbus.Error: Unknown interface" error if the unitType is not the correct type of the unit. +func (c *Conn) GetUnitTypePropertiesContext(ctx context.Context, unit string, unitType string) (map[string]interface{}, error) { + path := unitPath(unit) + return c.getProperties(ctx, path, "org.freedesktop.systemd1."+unitType) +} + +// Deprecated: use SetUnitPropertiesContext instead. 
+func (c *Conn) SetUnitProperties(name string, runtime bool, properties ...Property) error { + return c.SetUnitPropertiesContext(context.Background(), name, runtime, properties...) +} + +// SetUnitPropertiesContext may be used to modify certain unit properties at runtime. +// Not all properties may be changed at runtime, but many resource management +// settings (primarily those in systemd.cgroup(5)) may. The changes are applied +// instantly, and stored on disk for future boots, unless runtime is true, in which +// case the settings only apply until the next reboot. name is the name of the unit +// to modify. properties are the settings to set, encoded as an array of property +// name and value pairs. +func (c *Conn) SetUnitPropertiesContext(ctx context.Context, name string, runtime bool, properties ...Property) error { + return c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.SetUnitProperties", 0, name, runtime, properties).Store() +} + +// Deprecated: use GetUnitTypePropertyContext instead. +func (c *Conn) GetUnitTypeProperty(unit string, unitType string, propertyName string) (*Property, error) { + return c.GetUnitTypePropertyContext(context.Background(), unit, unitType, propertyName) +} + +// GetUnitTypePropertyContext takes a property name, a unit name, and a unit type, +// and returns a property value. For valid values of unitType, see GetUnitTypePropertiesContext. +func (c *Conn) GetUnitTypePropertyContext(ctx context.Context, unit string, unitType string, propertyName string) (*Property, error) { + return c.getProperty(ctx, unit, "org.freedesktop.systemd1."+unitType, propertyName) +} + +type UnitStatus struct { + Name string // The primary unit name as string + Description string // The human readable description string + LoadState string // The load state (i.e. whether the unit file has been loaded successfully) + ActiveState string // The active state (i.e. 
whether the unit is currently started or not) + SubState string // The sub state (a more fine-grained version of the active state that is specific to the unit type, which the active state is not) + Followed string // A unit that is being followed in its state by this unit, if there is any, otherwise the empty string. + Path dbus.ObjectPath // The unit object path + JobId uint32 // If there is a job queued for the job unit the numeric job id, 0 otherwise + JobType string // The job type as string + JobPath dbus.ObjectPath // The job object path +} + +type storeFunc func(retvalues ...interface{}) error + +func (c *Conn) listUnitsInternal(f storeFunc) ([]UnitStatus, error) { + result := make([][]interface{}, 0) + err := f(&result) + if err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + status := make([]UnitStatus, len(result)) + statusInterface := make([]interface{}, len(status)) + for i := range status { + statusInterface[i] = &status[i] + } + + err = dbus.Store(resultInterface, statusInterface...) + if err != nil { + return nil, err + } + + return status, nil +} + +// Deprecated: use ListUnitsContext instead. +func (c *Conn) ListUnits() ([]UnitStatus, error) { + return c.ListUnitsContext(context.Background()) +} + +// ListUnitsContext returns an array with all currently loaded units. Note that +// units may be known by multiple names at the same time, and hence there might +// be more unit names loaded than actual units behind them. +// Also note that a unit is only loaded if it is active and/or enabled. +// Units that are both disabled and inactive will thus not be returned. +func (c *Conn) ListUnitsContext(ctx context.Context) ([]UnitStatus, error) { + return c.listUnitsInternal(c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.ListUnits", 0).Store) +} + +// Deprecated: use ListUnitsFilteredContext instead. 
+func (c *Conn) ListUnitsFiltered(states []string) ([]UnitStatus, error) { + return c.ListUnitsFilteredContext(context.Background(), states) +} + +// ListUnitsFilteredContext returns an array with units filtered by state. +// It takes a list of units' statuses to filter. +func (c *Conn) ListUnitsFilteredContext(ctx context.Context, states []string) ([]UnitStatus, error) { + return c.listUnitsInternal(c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.ListUnitsFiltered", 0, states).Store) +} + +// Deprecated: use ListUnitsByPatternsContext instead. +func (c *Conn) ListUnitsByPatterns(states []string, patterns []string) ([]UnitStatus, error) { + return c.ListUnitsByPatternsContext(context.Background(), states, patterns) +} + +// ListUnitsByPatternsContext returns an array with units. +// It takes a list of units' statuses and names to filter. +// Note that units may be known by multiple names at the same time, +// and hence there might be more unit names loaded than actual units behind them. +func (c *Conn) ListUnitsByPatternsContext(ctx context.Context, states []string, patterns []string) ([]UnitStatus, error) { + return c.listUnitsInternal(c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.ListUnitsByPatterns", 0, states, patterns).Store) +} + +// Deprecated: use ListUnitsByNamesContext instead. +func (c *Conn) ListUnitsByNames(units []string) ([]UnitStatus, error) { + return c.ListUnitsByNamesContext(context.Background(), units) +} + +// ListUnitsByNamesContext returns an array with units. It takes a list of units' +// names and returns an UnitStatus array. Comparing to ListUnitsByPatternsContext +// method, this method returns statuses even for inactive or non-existing +// units. Input array should contain exact unit names, but not patterns. +// +// Requires systemd v230 or higher. 
+func (c *Conn) ListUnitsByNamesContext(ctx context.Context, units []string) ([]UnitStatus, error) { + return c.listUnitsInternal(c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.ListUnitsByNames", 0, units).Store) +} + +type UnitFile struct { + Path string + Type string +} + +func (c *Conn) listUnitFilesInternal(f storeFunc) ([]UnitFile, error) { + result := make([][]interface{}, 0) + err := f(&result) + if err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + files := make([]UnitFile, len(result)) + fileInterface := make([]interface{}, len(files)) + for i := range files { + fileInterface[i] = &files[i] + } + + err = dbus.Store(resultInterface, fileInterface...) + if err != nil { + return nil, err + } + + return files, nil +} + +// Deprecated: use ListUnitFilesContext instead. +func (c *Conn) ListUnitFiles() ([]UnitFile, error) { + return c.ListUnitFilesContext(context.Background()) +} + +// ListUnitFiles returns an array of all available units on disk. +func (c *Conn) ListUnitFilesContext(ctx context.Context) ([]UnitFile, error) { + return c.listUnitFilesInternal(c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.ListUnitFiles", 0).Store) +} + +// Deprecated: use ListUnitFilesByPatternsContext instead. +func (c *Conn) ListUnitFilesByPatterns(states []string, patterns []string) ([]UnitFile, error) { + return c.ListUnitFilesByPatternsContext(context.Background(), states, patterns) +} + +// ListUnitFilesByPatternsContext returns an array of all available units on disk matched the patterns. 
+func (c *Conn) ListUnitFilesByPatternsContext(ctx context.Context, states []string, patterns []string) ([]UnitFile, error) { + return c.listUnitFilesInternal(c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.ListUnitFilesByPatterns", 0, states, patterns).Store) +} + +type LinkUnitFileChange EnableUnitFileChange + +// Deprecated: use LinkUnitFilesContext instead. +func (c *Conn) LinkUnitFiles(files []string, runtime bool, force bool) ([]LinkUnitFileChange, error) { + return c.LinkUnitFilesContext(context.Background(), files, runtime, force) +} + +// LinkUnitFilesContext links unit files (that are located outside of the +// usual unit search paths) into the unit search path. +// +// It takes a list of absolute paths to unit files to link and two +// booleans. +// +// The first boolean controls whether the unit shall be +// enabled for runtime only (true, /run), or persistently (false, +// /etc). +// +// The second controls whether symlinks pointing to other units shall +// be replaced if necessary. +// +// This call returns a list of the changes made. The list consists of +// structures with three strings: the type of the change (one of symlink +// or unlink), the file name of the symlink and the destination of the +// symlink. +func (c *Conn) LinkUnitFilesContext(ctx context.Context, files []string, runtime bool, force bool) ([]LinkUnitFileChange, error) { + result := make([][]interface{}, 0) + err := c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.LinkUnitFiles", 0, files, runtime, force).Store(&result) + if err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + changes := make([]LinkUnitFileChange, len(result)) + changesInterface := make([]interface{}, len(changes)) + for i := range changes { + changesInterface[i] = &changes[i] + } + + err = dbus.Store(resultInterface, changesInterface...) 
+ if err != nil { + return nil, err + } + + return changes, nil +} + +// Deprecated: use EnableUnitFilesContext instead. +func (c *Conn) EnableUnitFiles(files []string, runtime bool, force bool) (bool, []EnableUnitFileChange, error) { + return c.EnableUnitFilesContext(context.Background(), files, runtime, force) +} + +// EnableUnitFilesContext may be used to enable one or more units in the system +// (by creating symlinks to them in /etc or /run). +// +// It takes a list of unit files to enable (either just file names or full +// absolute paths if the unit files are residing outside the usual unit +// search paths), and two booleans: the first controls whether the unit shall +// be enabled for runtime only (true, /run), or persistently (false, /etc). +// The second one controls whether symlinks pointing to other units shall +// be replaced if necessary. +// +// This call returns one boolean and an array with the changes made. The +// boolean signals whether the unit files contained any enablement +// information (i.e. an [Install]) section. The changes list consists of +// structures with three strings: the type of the change (one of symlink +// or unlink), the file name of the symlink and the destination of the +// symlink. 
+func (c *Conn) EnableUnitFilesContext(ctx context.Context, files []string, runtime bool, force bool) (bool, []EnableUnitFileChange, error) { + var carries_install_info bool + + result := make([][]interface{}, 0) + err := c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.EnableUnitFiles", 0, files, runtime, force).Store(&carries_install_info, &result) + if err != nil { + return false, nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + changes := make([]EnableUnitFileChange, len(result)) + changesInterface := make([]interface{}, len(changes)) + for i := range changes { + changesInterface[i] = &changes[i] + } + + err = dbus.Store(resultInterface, changesInterface...) + if err != nil { + return false, nil, err + } + + return carries_install_info, changes, nil +} + +type EnableUnitFileChange struct { + Type string // Type of the change (one of symlink or unlink) + Filename string // File name of the symlink + Destination string // Destination of the symlink +} + +// Deprecated: use DisableUnitFilesContext instead. +func (c *Conn) DisableUnitFiles(files []string, runtime bool) ([]DisableUnitFileChange, error) { + return c.DisableUnitFilesContext(context.Background(), files, runtime) +} + +// DisableUnitFilesContext may be used to disable one or more units in the +// system (by removing symlinks to them from /etc or /run). +// +// It takes a list of unit files to disable (either just file names or full +// absolute paths if the unit files are residing outside the usual unit +// search paths), and one boolean: whether the unit was enabled for runtime +// only (true, /run), or persistently (false, /etc). +// +// This call returns an array with the changes made. The changes list +// consists of structures with three strings: the type of the change (one of +// symlink or unlink), the file name of the symlink and the destination of the +// symlink. 
+func (c *Conn) DisableUnitFilesContext(ctx context.Context, files []string, runtime bool) ([]DisableUnitFileChange, error) { + result := make([][]interface{}, 0) + err := c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.DisableUnitFiles", 0, files, runtime).Store(&result) + if err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + changes := make([]DisableUnitFileChange, len(result)) + changesInterface := make([]interface{}, len(changes)) + for i := range changes { + changesInterface[i] = &changes[i] + } + + err = dbus.Store(resultInterface, changesInterface...) + if err != nil { + return nil, err + } + + return changes, nil +} + +type DisableUnitFileChange struct { + Type string // Type of the change (one of symlink or unlink) + Filename string // File name of the symlink + Destination string // Destination of the symlink +} + +// Deprecated: use MaskUnitFilesContext instead. +func (c *Conn) MaskUnitFiles(files []string, runtime bool, force bool) ([]MaskUnitFileChange, error) { + return c.MaskUnitFilesContext(context.Background(), files, runtime, force) +} + +// MaskUnitFilesContext masks one or more units in the system. +// +// The files argument contains a list of units to mask (either just file names +// or full absolute paths if the unit files are residing outside the usual unit +// search paths). +// +// The runtime argument is used to specify whether the unit was enabled for +// runtime only (true, /run/systemd/..), or persistently (false, +// /etc/systemd/..). 
+func (c *Conn) MaskUnitFilesContext(ctx context.Context, files []string, runtime bool, force bool) ([]MaskUnitFileChange, error) { + result := make([][]interface{}, 0) + err := c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.MaskUnitFiles", 0, files, runtime, force).Store(&result) + if err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + changes := make([]MaskUnitFileChange, len(result)) + changesInterface := make([]interface{}, len(changes)) + for i := range changes { + changesInterface[i] = &changes[i] + } + + err = dbus.Store(resultInterface, changesInterface...) + if err != nil { + return nil, err + } + + return changes, nil +} + +type MaskUnitFileChange struct { + Type string // Type of the change (one of symlink or unlink) + Filename string // File name of the symlink + Destination string // Destination of the symlink +} + +// Deprecated: use UnmaskUnitFilesContext instead. +func (c *Conn) UnmaskUnitFiles(files []string, runtime bool) ([]UnmaskUnitFileChange, error) { + return c.UnmaskUnitFilesContext(context.Background(), files, runtime) +} + +// UnmaskUnitFilesContext unmasks one or more units in the system. +// +// It takes the list of unit files to mask (either just file names or full +// absolute paths if the unit files are residing outside the usual unit search +// paths), and a boolean runtime flag to specify whether the unit was enabled +// for runtime only (true, /run/systemd/..), or persistently (false, +// /etc/systemd/..). 
+func (c *Conn) UnmaskUnitFilesContext(ctx context.Context, files []string, runtime bool) ([]UnmaskUnitFileChange, error) { + result := make([][]interface{}, 0) + err := c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.UnmaskUnitFiles", 0, files, runtime).Store(&result) + if err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + changes := make([]UnmaskUnitFileChange, len(result)) + changesInterface := make([]interface{}, len(changes)) + for i := range changes { + changesInterface[i] = &changes[i] + } + + err = dbus.Store(resultInterface, changesInterface...) + if err != nil { + return nil, err + } + + return changes, nil +} + +type UnmaskUnitFileChange struct { + Type string // Type of the change (one of symlink or unlink) + Filename string // File name of the symlink + Destination string // Destination of the symlink +} + +// Deprecated: use ReloadContext instead. +func (c *Conn) Reload() error { + return c.ReloadContext(context.Background()) +} + +// ReloadContext instructs systemd to scan for and reload unit files. This is +// an equivalent to systemctl daemon-reload. +func (c *Conn) ReloadContext(ctx context.Context) error { + return c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.Reload", 0).Store() +} + +func unitPath(name string) dbus.ObjectPath { + return dbus.ObjectPath("/org/freedesktop/systemd1/unit/" + PathBusEscape(name)) +} + +// unitName returns the unescaped base element of the supplied escaped path. +func unitName(dpath dbus.ObjectPath) string { + return pathBusUnescape(path.Base(string(dpath))) +} + +// JobStatus holds a currently queued job definition. 
+type JobStatus struct { + Id uint32 // The numeric job id + Unit string // The primary unit name for this job + JobType string // The job type as string + Status string // The job state as string + JobPath dbus.ObjectPath // The job object path + UnitPath dbus.ObjectPath // The unit object path +} + +// Deprecated: use ListJobsContext instead. +func (c *Conn) ListJobs() ([]JobStatus, error) { + return c.ListJobsContext(context.Background()) +} + +// ListJobsContext returns an array with all currently queued jobs. +func (c *Conn) ListJobsContext(ctx context.Context) ([]JobStatus, error) { + return c.listJobsInternal(ctx) +} + +func (c *Conn) listJobsInternal(ctx context.Context) ([]JobStatus, error) { + result := make([][]interface{}, 0) + if err := c.sysobj.CallWithContext(ctx, "org.freedesktop.systemd1.Manager.ListJobs", 0).Store(&result); err != nil { + return nil, err + } + + resultInterface := make([]interface{}, len(result)) + for i := range result { + resultInterface[i] = result[i] + } + + status := make([]JobStatus, len(result)) + statusInterface := make([]interface{}, len(status)) + for i := range status { + statusInterface[i] = &status[i] + } + + if err := dbus.Store(resultInterface, statusInterface...); err != nil { + return nil, err + } + + return status, nil +} diff --git a/vendor/github.com/coreos/go-systemd/dbus/properties.go b/vendor/github.com/coreos/go-systemd/v22/dbus/properties.go similarity index 99% rename from vendor/github.com/coreos/go-systemd/dbus/properties.go rename to vendor/github.com/coreos/go-systemd/v22/dbus/properties.go index 6c81895..fb42b62 100644 --- a/vendor/github.com/coreos/go-systemd/dbus/properties.go +++ b/vendor/github.com/coreos/go-systemd/v22/dbus/properties.go @@ -15,7 +15,7 @@ package dbus import ( - "github.com/godbus/dbus" + "github.com/godbus/dbus/v5" ) // From the systemd docs: @@ -56,7 +56,7 @@ type execStart struct { // http://www.freedesktop.org/software/systemd/man/systemd.service.html#ExecStart= func 
PropExecStart(command []string, uncleanIsFailure bool) Property { execStarts := []execStart{ - execStart{ + { Path: command[0], Args: command, UncleanIsFailure: uncleanIsFailure, diff --git a/vendor/github.com/coreos/go-systemd/dbus/set.go b/vendor/github.com/coreos/go-systemd/v22/dbus/set.go similarity index 100% rename from vendor/github.com/coreos/go-systemd/dbus/set.go rename to vendor/github.com/coreos/go-systemd/v22/dbus/set.go diff --git a/vendor/github.com/coreos/go-systemd/dbus/subscription.go b/vendor/github.com/coreos/go-systemd/v22/dbus/subscription.go similarity index 99% rename from vendor/github.com/coreos/go-systemd/dbus/subscription.go rename to vendor/github.com/coreos/go-systemd/v22/dbus/subscription.go index f6d7a08..7e370fe 100644 --- a/vendor/github.com/coreos/go-systemd/dbus/subscription.go +++ b/vendor/github.com/coreos/go-systemd/v22/dbus/subscription.go @@ -19,7 +19,7 @@ import ( "log" "time" - "github.com/godbus/dbus" + "github.com/godbus/dbus/v5" ) const ( diff --git a/vendor/github.com/coreos/go-systemd/dbus/subscription_set.go b/vendor/github.com/coreos/go-systemd/v22/dbus/subscription_set.go similarity index 100% rename from vendor/github.com/coreos/go-systemd/dbus/subscription_set.go rename to vendor/github.com/coreos/go-systemd/v22/dbus/subscription_set.go diff --git a/vendor/github.com/cpuguy83/go-md2man/v2/LICENSE.md b/vendor/github.com/cpuguy83/go-md2man/v2/LICENSE.md new file mode 100644 index 0000000..1cade6c --- /dev/null +++ b/vendor/github.com/cpuguy83/go-md2man/v2/LICENSE.md @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Brian Goff + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is 
+furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/vendor/github.com/cpuguy83/go-md2man/v2/md2man/md2man.go b/vendor/github.com/cpuguy83/go-md2man/v2/md2man/md2man.go new file mode 100644 index 0000000..b480056 --- /dev/null +++ b/vendor/github.com/cpuguy83/go-md2man/v2/md2man/md2man.go @@ -0,0 +1,14 @@ +package md2man + +import ( + "github.com/russross/blackfriday/v2" +) + +// Render converts a markdown document into a roff formatted document. +func Render(doc []byte) []byte { + renderer := NewRoffRenderer() + + return blackfriday.Run(doc, + []blackfriday.Option{blackfriday.WithRenderer(renderer), + blackfriday.WithExtensions(renderer.GetExtensions())}...) 
+} diff --git a/vendor/github.com/cpuguy83/go-md2man/v2/md2man/roff.go b/vendor/github.com/cpuguy83/go-md2man/v2/md2man/roff.go new file mode 100644 index 0000000..0668a66 --- /dev/null +++ b/vendor/github.com/cpuguy83/go-md2man/v2/md2man/roff.go @@ -0,0 +1,345 @@ +package md2man + +import ( + "fmt" + "io" + "os" + "strings" + + "github.com/russross/blackfriday/v2" +) + +// roffRenderer implements the blackfriday.Renderer interface for creating +// roff format (manpages) from markdown text +type roffRenderer struct { + extensions blackfriday.Extensions + listCounters []int + firstHeader bool + defineTerm bool + listDepth int +} + +const ( + titleHeader = ".TH " + topLevelHeader = "\n\n.SH " + secondLevelHdr = "\n.SH " + otherHeader = "\n.SS " + crTag = "\n" + emphTag = "\\fI" + emphCloseTag = "\\fP" + strongTag = "\\fB" + strongCloseTag = "\\fP" + breakTag = "\n.br\n" + paraTag = "\n.PP\n" + hruleTag = "\n.ti 0\n\\l'\\n(.lu'\n" + linkTag = "\n\\[la]" + linkCloseTag = "\\[ra]" + codespanTag = "\\fB\\fC" + codespanCloseTag = "\\fR" + codeTag = "\n.PP\n.RS\n\n.nf\n" + codeCloseTag = "\n.fi\n.RE\n" + quoteTag = "\n.PP\n.RS\n" + quoteCloseTag = "\n.RE\n" + listTag = "\n.RS\n" + listCloseTag = "\n.RE\n" + arglistTag = "\n.TP\n" + tableStart = "\n.TS\nallbox;\n" + tableEnd = ".TE\n" + tableCellStart = "T{\n" + tableCellEnd = "\nT}\n" +) + +// NewRoffRenderer creates a new blackfriday Renderer for generating roff documents +// from markdown +func NewRoffRenderer() *roffRenderer { // nolint: golint + var extensions blackfriday.Extensions + + extensions |= blackfriday.NoIntraEmphasis + extensions |= blackfriday.Tables + extensions |= blackfriday.FencedCode + extensions |= blackfriday.SpaceHeadings + extensions |= blackfriday.Footnotes + extensions |= blackfriday.Titleblock + extensions |= blackfriday.DefinitionLists + return &roffRenderer{ + extensions: extensions, + } +} + +// GetExtensions returns the list of extensions used by this renderer implementation +func (r 
*roffRenderer) GetExtensions() blackfriday.Extensions { + return r.extensions +} + +// RenderHeader handles outputting the header at document start +func (r *roffRenderer) RenderHeader(w io.Writer, ast *blackfriday.Node) { + // disable hyphenation + out(w, ".nh\n") +} + +// RenderFooter handles outputting the footer at the document end; the roff +// renderer has no footer information +func (r *roffRenderer) RenderFooter(w io.Writer, ast *blackfriday.Node) { +} + +// RenderNode is called for each node in a markdown document; based on the node +// type the equivalent roff output is sent to the writer +func (r *roffRenderer) RenderNode(w io.Writer, node *blackfriday.Node, entering bool) blackfriday.WalkStatus { + + var walkAction = blackfriday.GoToNext + + switch node.Type { + case blackfriday.Text: + r.handleText(w, node, entering) + case blackfriday.Softbreak: + out(w, crTag) + case blackfriday.Hardbreak: + out(w, breakTag) + case blackfriday.Emph: + if entering { + out(w, emphTag) + } else { + out(w, emphCloseTag) + } + case blackfriday.Strong: + if entering { + out(w, strongTag) + } else { + out(w, strongCloseTag) + } + case blackfriday.Link: + if !entering { + out(w, linkTag+string(node.LinkData.Destination)+linkCloseTag) + } + case blackfriday.Image: + // ignore images + walkAction = blackfriday.SkipChildren + case blackfriday.Code: + out(w, codespanTag) + escapeSpecialChars(w, node.Literal) + out(w, codespanCloseTag) + case blackfriday.Document: + break + case blackfriday.Paragraph: + // roff .PP markers break lists + if r.listDepth > 0 { + return blackfriday.GoToNext + } + if entering { + out(w, paraTag) + } else { + out(w, crTag) + } + case blackfriday.BlockQuote: + if entering { + out(w, quoteTag) + } else { + out(w, quoteCloseTag) + } + case blackfriday.Heading: + r.handleHeading(w, node, entering) + case blackfriday.HorizontalRule: + out(w, hruleTag) + case blackfriday.List: + r.handleList(w, node, entering) + case blackfriday.Item: + r.handleItem(w, node, 
entering) + case blackfriday.CodeBlock: + out(w, codeTag) + escapeSpecialChars(w, node.Literal) + out(w, codeCloseTag) + case blackfriday.Table: + r.handleTable(w, node, entering) + case blackfriday.TableCell: + r.handleTableCell(w, node, entering) + case blackfriday.TableHead: + case blackfriday.TableBody: + case blackfriday.TableRow: + // no action as cell entries do all the nroff formatting + return blackfriday.GoToNext + default: + fmt.Fprintln(os.Stderr, "WARNING: go-md2man does not handle node type "+node.Type.String()) + } + return walkAction +} + +func (r *roffRenderer) handleText(w io.Writer, node *blackfriday.Node, entering bool) { + var ( + start, end string + ) + // handle special roff table cell text encapsulation + if node.Parent.Type == blackfriday.TableCell { + if len(node.Literal) > 30 { + start = tableCellStart + end = tableCellEnd + } else { + // end rows that aren't terminated by "tableCellEnd" with a cr if end of row + if node.Parent.Next == nil && !node.Parent.IsHeader { + end = crTag + } + } + } + out(w, start) + escapeSpecialChars(w, node.Literal) + out(w, end) +} + +func (r *roffRenderer) handleHeading(w io.Writer, node *blackfriday.Node, entering bool) { + if entering { + switch node.Level { + case 1: + if !r.firstHeader { + out(w, titleHeader) + r.firstHeader = true + break + } + out(w, topLevelHeader) + case 2: + out(w, secondLevelHdr) + default: + out(w, otherHeader) + } + } +} + +func (r *roffRenderer) handleList(w io.Writer, node *blackfriday.Node, entering bool) { + openTag := listTag + closeTag := listCloseTag + if node.ListFlags&blackfriday.ListTypeDefinition != 0 { + // tags for definition lists handled within Item node + openTag = "" + closeTag = "" + } + if entering { + r.listDepth++ + if node.ListFlags&blackfriday.ListTypeOrdered != 0 { + r.listCounters = append(r.listCounters, 1) + } + out(w, openTag) + } else { + if node.ListFlags&blackfriday.ListTypeOrdered != 0 { + r.listCounters = r.listCounters[:len(r.listCounters)-1] + } 
+ out(w, closeTag) + r.listDepth-- + } +} + +func (r *roffRenderer) handleItem(w io.Writer, node *blackfriday.Node, entering bool) { + if entering { + if node.ListFlags&blackfriday.ListTypeOrdered != 0 { + out(w, fmt.Sprintf(".IP \"%3d.\" 5\n", r.listCounters[len(r.listCounters)-1])) + r.listCounters[len(r.listCounters)-1]++ + } else if node.ListFlags&blackfriday.ListTypeDefinition != 0 { + // state machine for handling terms and following definitions + // since blackfriday does not distinguish them properly, nor + // does it seperate them into separate lists as it should + if !r.defineTerm { + out(w, arglistTag) + r.defineTerm = true + } else { + r.defineTerm = false + } + } else { + out(w, ".IP \\(bu 2\n") + } + } else { + out(w, "\n") + } +} + +func (r *roffRenderer) handleTable(w io.Writer, node *blackfriday.Node, entering bool) { + if entering { + out(w, tableStart) + //call walker to count cells (and rows?) so format section can be produced + columns := countColumns(node) + out(w, strings.Repeat("l ", columns)+"\n") + out(w, strings.Repeat("l ", columns)+".\n") + } else { + out(w, tableEnd) + } +} + +func (r *roffRenderer) handleTableCell(w io.Writer, node *blackfriday.Node, entering bool) { + var ( + start, end string + ) + if node.IsHeader { + start = codespanTag + end = codespanCloseTag + } + if entering { + if node.Prev != nil && node.Prev.Type == blackfriday.TableCell { + out(w, "\t"+start) + } else { + out(w, start) + } + } else { + // need to carriage return if we are at the end of the header row + if node.IsHeader && node.Next == nil { + end = end + crTag + } + out(w, end) + } +} + +// because roff format requires knowing the column count before outputting any table +// data we need to walk a table tree and count the columns +func countColumns(node *blackfriday.Node) int { + var columns int + + node.Walk(func(node *blackfriday.Node, entering bool) blackfriday.WalkStatus { + switch node.Type { + case blackfriday.TableRow: + if !entering { + return 
blackfriday.Terminate + } + case blackfriday.TableCell: + if entering { + columns++ + } + default: + } + return blackfriday.GoToNext + }) + return columns +} + +func out(w io.Writer, output string) { + io.WriteString(w, output) // nolint: errcheck +} + +func needsBackslash(c byte) bool { + for _, r := range []byte("-_&\\~") { + if c == r { + return true + } + } + return false +} + +func escapeSpecialChars(w io.Writer, text []byte) { + for i := 0; i < len(text); i++ { + // escape initial apostrophe or period + if len(text) >= 1 && (text[0] == '\'' || text[0] == '.') { + out(w, "\\&") + } + + // directly copy normal characters + org := i + + for i < len(text) && !needsBackslash(text[i]) { + i++ + } + if i > org { + w.Write(text[org:i]) // nolint: errcheck + } + + // escape a character + if i >= len(text) { + break + } + + w.Write([]byte{'\\', text[i]}) // nolint: errcheck + } +} diff --git a/vendor/github.com/cyphar/filepath-securejoin/.travis.yml b/vendor/github.com/cyphar/filepath-securejoin/.travis.yml new file mode 100644 index 0000000..b94ff8c --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/.travis.yml @@ -0,0 +1,21 @@ +# Copyright (C) 2017 SUSE LLC. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +language: go +go: + - 1.13.x + - 1.16.x + - tip +arch: + - AMD64 + - ppc64le +os: + - linux + - osx + +script: + - go test -cover -v ./... + +notifications: + email: false diff --git a/vendor/github.com/cyphar/filepath-securejoin/README.md b/vendor/github.com/cyphar/filepath-securejoin/README.md index 49b2baa..3624617 100644 --- a/vendor/github.com/cyphar/filepath-securejoin/README.md +++ b/vendor/github.com/cyphar/filepath-securejoin/README.md @@ -7,6 +7,19 @@ standard library][go#20126]. The purpose of this function is to be a "secure" alternative to `filepath.Join`, and in particular it provides certain guarantees that are not provided by `filepath.Join`. 
+> **NOTE**: This code is *only* safe if you are not at risk of other processes +> modifying path components after you've used `SecureJoin`. If it is possible +> for a malicious process to modify path components of the resolved path, then +> you will be vulnerable to some fairly trivial TOCTOU race conditions. [There +> are some Linux kernel patches I'm working on which might allow for a better +> solution.][lwn-obeneath] +> +> In addition, with a slightly modified API it might be possible to use +> `O_PATH` and verify that the opened path is actually the resolved one -- but +> I have not done that yet. I might add it in the future as a helper function +> to help users verify the path (we can't just return `/proc/self/fd/` +> because that doesn't always work transparently for all users). + This is the function prototype: ```go @@ -16,8 +29,8 @@ func SecureJoin(root, unsafePath string) (string, error) This library **guarantees** the following: * If no error is set, the resulting string **must** be a child path of - `SecureJoin` and will not contain any symlink path components (they will all - be expanded). + `root` and will not contain any symlink path components (they will all be + expanded). * When expanding symlinks, all symlink path components **must** be resolved relative to the provided root. In particular, this can be considered a @@ -25,7 +38,7 @@ This library **guarantees** the following: these symlinks will **not** be expanded lexically (`filepath.Clean` is not called on the input before processing). -* Non-existant path components are unaffected by `SecureJoin` (similar to +* Non-existent path components are unaffected by `SecureJoin` (similar to `filepath.EvalSymlinks`'s semantics). 
* The returned path will always be `filepath.Clean`ed and thus not contain any @@ -57,6 +70,7 @@ func SecureJoin(root, unsafePath string) (string, error) { } ``` +[lwn-obeneath]: https://lwn.net/Articles/767547/ [go#20126]: https://github.com/golang/go/issues/20126 ### License ### diff --git a/vendor/github.com/cyphar/filepath-securejoin/VERSION b/vendor/github.com/cyphar/filepath-securejoin/VERSION new file mode 100644 index 0000000..7179039 --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/VERSION @@ -0,0 +1 @@ +0.2.3 diff --git a/vendor/github.com/cyphar/filepath-securejoin/go.mod b/vendor/github.com/cyphar/filepath-securejoin/go.mod new file mode 100644 index 0000000..0607c1f --- /dev/null +++ b/vendor/github.com/cyphar/filepath-securejoin/go.mod @@ -0,0 +1,3 @@ +module github.com/cyphar/filepath-securejoin + +go 1.13 diff --git a/vendor/github.com/cyphar/filepath-securejoin/join.go b/vendor/github.com/cyphar/filepath-securejoin/join.go index c4ca3d7..7dd08db 100644 --- a/vendor/github.com/cyphar/filepath-securejoin/join.go +++ b/vendor/github.com/cyphar/filepath-securejoin/join.go @@ -12,39 +12,20 @@ package securejoin import ( "bytes" + "errors" "os" "path/filepath" "strings" "syscall" - - "github.com/pkg/errors" ) -// ErrSymlinkLoop is returned by SecureJoinVFS when too many symlinks have been -// evaluated in attempting to securely join the two given paths. -var ErrSymlinkLoop = errors.Wrap(syscall.ELOOP, "secure join") - // IsNotExist tells you if err is an error that implies that either the path // accessed does not exist (or path components don't exist). This is // effectively a more broad version of os.IsNotExist. func IsNotExist(err error) bool { - // If it's a bone-fide ENOENT just bail. - if os.IsNotExist(errors.Cause(err)) { - return true - } - // Check that it's not actually an ENOTDIR, which in some cases is a more // convoluted case of ENOENT (usually involving weird paths). 
- var errno error - switch err := errors.Cause(err).(type) { - case *os.PathError: - errno = err.Err - case *os.LinkError: - errno = err.Err - case *os.SyscallError: - errno = err.Err - } - return errno == syscall.ENOTDIR || errno == syscall.ENOENT + return errors.Is(err, os.ErrNotExist) || errors.Is(err, syscall.ENOTDIR) || errors.Is(err, syscall.ENOENT) } // SecureJoinVFS joins the two given path components (similar to Join) except @@ -68,7 +49,7 @@ func SecureJoinVFS(root, unsafePath string, vfs VFS) (string, error) { n := 0 for unsafePath != "" { if n > 255 { - return "", ErrSymlinkLoop + return "", &os.PathError{Op: "SecureJoin", Path: root + "/" + unsafePath, Err: syscall.ELOOP} } // Next path component, p. diff --git a/vendor/github.com/cyphar/filepath-securejoin/vendor.conf b/vendor/github.com/cyphar/filepath-securejoin/vendor.conf deleted file mode 100644 index 66bb574..0000000 --- a/vendor/github.com/cyphar/filepath-securejoin/vendor.conf +++ /dev/null @@ -1 +0,0 @@ -github.com/pkg/errors v0.8.0 diff --git a/vendor/github.com/docker/go-units/CONTRIBUTING.md b/vendor/github.com/docker/go-units/CONTRIBUTING.md new file mode 100644 index 0000000..9ea86d7 --- /dev/null +++ b/vendor/github.com/docker/go-units/CONTRIBUTING.md @@ -0,0 +1,67 @@ +# Contributing to go-units + +Want to hack on go-units? Awesome! Here are instructions to get you started. + +go-units is a part of the [Docker](https://www.docker.com) project, and follows +the same rules and principles. If you're already familiar with the way +Docker does things, you'll feel right at home. + +Otherwise, go read Docker's +[contributions guidelines](https://github.com/docker/docker/blob/master/CONTRIBUTING.md), +[issue triaging](https://github.com/docker/docker/blob/master/project/ISSUE-TRIAGE.md), +[review process](https://github.com/docker/docker/blob/master/project/REVIEWING.md) and +[branches and tags](https://github.com/docker/docker/blob/master/project/BRANCHES-AND-TAGS.md). 
+ +### Sign your work + +The sign-off is a simple line at the end of the explanation for the patch. Your +signature certifies that you wrote the patch or otherwise have the right to pass +it on as an open-source patch. The rules are pretty simple: if you can certify +the below (from [developercertificate.org](http://developercertificate.org/)): + +``` +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. +660 York Street, Suite 102, +San Francisco, CA 94110 USA + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. +``` + +Then you just add a line to every git commit message: + + Signed-off-by: Joe Smith + +Use your real name (sorry, no pseudonyms or anonymous contributions.) 
+ +If you set your `user.name` and `user.email` git configs, you can sign your +commit automatically with `git commit -s`. diff --git a/vendor/github.com/docker/go-units/MAINTAINERS b/vendor/github.com/docker/go-units/MAINTAINERS new file mode 100644 index 0000000..4aac7c7 --- /dev/null +++ b/vendor/github.com/docker/go-units/MAINTAINERS @@ -0,0 +1,46 @@ +# go-units maintainers file +# +# This file describes who runs the docker/go-units project and how. +# This is a living document - if you see something out of date or missing, speak up! +# +# It is structured to be consumable by both humans and programs. +# To extract its contents programmatically, use any TOML-compliant parser. +# +# This file is compiled into the MAINTAINERS file in docker/opensource. +# +[Org] + [Org."Core maintainers"] + people = [ + "akihirosuda", + "dnephin", + "thajeztah", + "vdemeester", + ] + +[people] + +# A reference list of all people associated with the project. +# All other sections should refer to people by their canonical key +# in the people section. + + # ADD YOURSELF HERE IN ALPHABETICAL ORDER + + [people.akihirosuda] + Name = "Akihiro Suda" + Email = "akihiro.suda.cz@hco.ntt.co.jp" + GitHub = "AkihiroSuda" + + [people.dnephin] + Name = "Daniel Nephin" + Email = "dnephin@gmail.com" + GitHub = "dnephin" + + [people.thajeztah] + Name = "Sebastiaan van Stijn" + Email = "github@gone.nl" + GitHub = "thaJeztah" + + [people.vdemeester] + Name = "Vincent Demeester" + Email = "vincent@sbr.pm" + GitHub = "vdemeester" \ No newline at end of file diff --git a/vendor/github.com/docker/go-units/circle.yml b/vendor/github.com/docker/go-units/circle.yml new file mode 100644 index 0000000..af9d605 --- /dev/null +++ b/vendor/github.com/docker/go-units/circle.yml @@ -0,0 +1,11 @@ +dependencies: + post: + # install golint + - go get golang.org/x/lint/golint + +test: + pre: + # run analysis before tests + - go vet ./... + - test -z "$(golint ./... | tee /dev/stderr)" + - test -z "$(gofmt -s -l . 
| tee /dev/stderr)" diff --git a/vendor/github.com/docker/go-units/duration.go b/vendor/github.com/docker/go-units/duration.go index ba02af2..48dd874 100644 --- a/vendor/github.com/docker/go-units/duration.go +++ b/vendor/github.com/docker/go-units/duration.go @@ -18,7 +18,7 @@ func HumanDuration(d time.Duration) string { return fmt.Sprintf("%d seconds", seconds) } else if minutes := int(d.Minutes()); minutes == 1 { return "About a minute" - } else if minutes < 46 { + } else if minutes < 60 { return fmt.Sprintf("%d minutes", minutes) } else if hours := int(d.Hours() + 0.5); hours == 1 { return "About an hour" diff --git a/vendor/github.com/docker/go-units/ulimit.go b/vendor/github.com/docker/go-units/ulimit.go index 5ac7fd8..fca0400 100644 --- a/vendor/github.com/docker/go-units/ulimit.go +++ b/vendor/github.com/docker/go-units/ulimit.go @@ -96,8 +96,13 @@ func ParseUlimit(val string) (*Ulimit, error) { return nil, fmt.Errorf("too many limit value arguments - %s, can only have up to two, `soft[:hard]`", parts[1]) } - if soft > *hard { - return nil, fmt.Errorf("ulimit soft limit must be less than or equal to hard limit: %d > %d", soft, *hard) + if *hard != -1 { + if soft == -1 { + return nil, fmt.Errorf("ulimit soft limit must be less than or equal to hard limit: soft: -1 (unlimited), hard: %d", *hard) + } + if soft > *hard { + return nil, fmt.Errorf("ulimit soft limit must be less than or equal to hard limit: %d > %d", soft, *hard) + } } return &Ulimit{Name: parts[0], Soft: soft, Hard: *hard}, nil diff --git a/vendor/github.com/godbus/dbus/go.mod b/vendor/github.com/godbus/dbus/go.mod deleted file mode 100644 index bdcd125..0000000 --- a/vendor/github.com/godbus/dbus/go.mod +++ /dev/null @@ -1 +0,0 @@ -module github.com/godbus/dbus diff --git a/vendor/github.com/godbus/dbus/v5/CONTRIBUTING.md b/vendor/github.com/godbus/dbus/v5/CONTRIBUTING.md new file mode 100644 index 0000000..c88f9b2 --- /dev/null +++ b/vendor/github.com/godbus/dbus/v5/CONTRIBUTING.md @@ -0,0 
+1,50 @@ +# How to Contribute + +## Getting Started + +- Fork the repository on GitHub +- Read the [README](README.markdown) for build and test instructions +- Play with the project, submit bugs, submit patches! + +## Contribution Flow + +This is a rough outline of what a contributor's workflow looks like: + +- Create a topic branch from where you want to base your work (usually master). +- Make commits of logical units. +- Make sure your commit messages are in the proper format (see below). +- Push your changes to a topic branch in your fork of the repository. +- Make sure the tests pass, and add any new tests as appropriate. +- Submit a pull request to the original repository. + +Thanks for your contributions! + +### Format of the Commit Message + +We follow a rough convention for commit messages that is designed to answer two +questions: what changed and why. The subject line should feature the what and +the body of the commit should describe the why. + +``` +scripts: add the test-cluster command + +this uses tmux to setup a test cluster that you can easily kill and +start for debugging. + +Fixes #38 +``` + +The format can be described more formally as follows: + +``` +: + + + +