merge upstream 1.1.0

This commit is contained in:
luoyaoming 2022-12-30 11:21:19 +08:00
parent f67506f80e
commit 1dc29861c3
1285 changed files with 161305 additions and 143317 deletions

158
.cirrus.yml Normal file
View File

@ -0,0 +1,158 @@
---
# We use Cirrus for Vagrant tests and native CentOS 7 and 8, because macOS
# instances of GHA are too slow and flaky, and Linux instances of GHA do not
# support KVM.
# NOTE Cirrus execution environments lack a terminal, needed for
# some integration tests. So we use `ssh -tt` command to fake a terminal.
task:
timeout_in: 30m
env:
DEBIAN_FRONTEND: noninteractive
HOME: /root
# yamllint disable rule:key-duplicates
matrix:
DISTRO: fedora
name: vagrant DISTRO:$DISTRO
compute_engine_instance:
image_project: cirrus-images
image: family/docker-kvm
platform: linux
nested_virtualization: true
# CPU limit: `16 / NTASK`: see https://cirrus-ci.org/faq/#are-there-any-limits
cpu: 8
# Memory limit: `4GB * NCPU`
memory: 32G
host_info_script: |
uname -a
echo "-----"
cat /etc/os-release
echo "-----"
cat /proc/cpuinfo
echo "-----"
df -T
install_libvirt_vagrant_script: |
apt-get update
apt-get install -y libvirt-daemon libvirt-daemon-system vagrant vagrant-libvirt
systemctl enable --now libvirtd
vagrant_cache:
fingerprint_script: uname -s ; cat Vagrantfile.$DISTRO
folder: /root/.vagrant.d
vagrant_up_script: |
ln -sf Vagrantfile.$DISTRO Vagrantfile
# Retry if it fails (download.fedoraproject.org returns 404 sometimes)
vagrant up --no-tty || vagrant up --no-tty
mkdir -p -m 0700 /root/.ssh
vagrant ssh-config >> /root/.ssh/config
guest_info_script: |
ssh default 'sh -exc "uname -a && systemctl --version && df -T && cat /etc/os-release"'
unit_tests_script: |
ssh default 'sudo -i make -C /vagrant localunittest'
integration_systemd_script: |
ssh -tt default "sudo -i make -C /vagrant localintegration RUNC_USE_SYSTEMD=yes"
integration_fs_script: |
ssh -tt default "sudo -i make -C /vagrant localintegration"
integration_systemd_rootless_script: |
ssh -tt default "sudo -i make -C /vagrant localrootlessintegration RUNC_USE_SYSTEMD=yes"
integration_fs_rootless_script: |
ssh -tt default "sudo -i make -C /vagrant localrootlessintegration"
task:
timeout_in: 30m
env:
HOME: /root
CIRRUS_WORKING_DIR: /home/runc
GO_VERSION: "1.17.3"
BATS_VERSION: "v1.3.0"
# yamllint disable rule:key-duplicates
matrix:
DISTRO: centos-7
DISTRO: centos-stream-8
name: ci / $DISTRO
compute_engine_instance:
image_project: centos-cloud
image: family/$DISTRO
platform: linux
cpu: 4
memory: 8G
install_dependencies_script: |
case $DISTRO in
centos-7)
(cd /etc/yum.repos.d && curl -O https://copr.fedorainfracloud.org/coprs/adrian/criu-el7/repo/epel-7/adrian-criu-el7-epel-7.repo)
# sysctl
echo "user.max_user_namespaces=15076" > /etc/sysctl.d/userns.conf
sysctl --system
;;
centos-stream-8)
yum config-manager --set-enabled powertools # for glibc-static
;;
esac
# Work around dnf mirror failures by retrying a few times.
for i in $(seq 0 2); do
sleep $i
yum install -y -q gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs && break
done
[ $? -eq 0 ] # fail if yum failed
# install Go
curl -fsSL "https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz" | tar Cxz /usr/local
# install bats
cd /tmp
git clone https://github.com/bats-core/bats-core
cd bats-core
git checkout $BATS_VERSION
./install.sh /usr/local
cd -
# Add a user for rootless tests
useradd -u2000 -m -d/home/rootless -s/bin/bash rootless
# Allow root and rootless itself to execute `ssh rootless@localhost` in tests/rootless.sh
ssh-keygen -t ecdsa -N "" -f /root/rootless.key
mkdir -m 0700 -p /home/rootless/.ssh
cp /root/rootless.key /home/rootless/.ssh/id_ecdsa
cat /root/rootless.key.pub >> /home/rootless/.ssh/authorized_keys
chown -R rootless.rootless /home/rootless
# set PATH
echo 'export PATH=/usr/local/go/bin:/usr/local/bin:$PATH' >> /root/.bashrc
# Setup ssh localhost for terminal emulation (script -e did not work)
ssh-keygen -t ed25519 -f /root/.ssh/id_ed25519 -N ""
cat /root/.ssh/id_ed25519.pub >> /root/.ssh/authorized_keys
chmod 400 /root/.ssh/authorized_keys
ssh-keyscan localhost >> /root/.ssh/known_hosts
echo -e "Host localhost\n\tStrictHostKeyChecking no\t\nIdentityFile /root/.ssh/id_ed25519\n" >> /root/.ssh/config
sed -e "s,PermitRootLogin.*,PermitRootLogin prohibit-password,g" -i /etc/ssh/sshd_config
systemctl restart sshd
host_info_script: |
uname -a
echo "-----"
cat /etc/os-release
echo "-----"
cat /proc/cpuinfo
echo "-----"
df -T
echo "-----"
systemctl --version
unit_tests_script: |
ssh -tt localhost "make -C /home/runc localunittest"
integration_systemd_script: |
ssh -tt localhost "make -C /home/runc localintegration RUNC_USE_SYSTEMD=yes"
integration_fs_script: |
ssh -tt localhost "make -C /home/runc localintegration"
integration_systemd_rootless_script: |
echo "SKIP: integration_systemd_rootless_script requires cgroup v2"
integration_fs_rootless_script: |
case $DISTRO in
centos-7)
echo "SKIP: FIXME: integration_fs_rootless_script is skipped because of EPERM on writing cgroup.procs"
;;
centos-stream-8)
ssh -tt localhost "make -C /home/runc localrootlessintegration"
;;
esac

3
.codespellrc Normal file
View File

@ -0,0 +1,3 @@
[codespell]
skip = ./vendor,./.git
ignore-words-list = clos,creat

25
.github/dependabot.yml vendored Normal file
View File

@ -0,0 +1,25 @@
# Please see the documentation for all configuration options:
# https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
# Dependencies listed in go.mod
- package-ecosystem: "gomod"
directory: "/" # Location of package manifests
schedule:
interval: "daily"
ignore:
# a regression in v1.22.2, see https://github.com/urfave/cli/issues/1092
- dependency-name: "github.com/urfave/cli"
# Dependencies listed in .github/workflows/*.yml
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "daily"
# Dependencies listed in Dockerfile
- package-ecosystem: "docker"
directory: "/"
schedule:
interval: "daily"

129
.github/workflows/test.yml vendored Normal file
View File

@ -0,0 +1,129 @@
# NOTE Github Actions execution environments lack a terminal, needed for
# some integration tests. So we use `script` command to fake a terminal.
name: ci
on:
push:
tags:
- v*
branches:
- master
- release-*
pull_request:
env:
# Don't ignore C warnings. Note that the output of "go env CGO_CFLAGS" by default is "-g -O2", so we keep them.
CGO_CFLAGS: -g -O2 -Werror
jobs:
test:
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
go-version: [1.16.x, 1.17.x]
rootless: ["rootless", ""]
race: ["-race", ""]
criu: [""]
include:
# Also test against latest criu-dev
- go-version: 1.17.x
rootless: ""
race: ""
criu: "criu-dev"
steps:
- name: checkout
uses: actions/checkout@v2
- name: install deps
if: matrix.criu == ''
env:
REPO: https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_20.04
run: |
# criu repo
curl -fSsl $REPO/Release.key | sudo apt-key add -
echo "deb $REPO/ /" | sudo tee /etc/apt/sources.list.d/criu.list
sudo apt update
sudo apt install libseccomp-dev criu sshfs
- name: install deps (criu ${{ matrix.criu }})
if: matrix.criu != ''
run: |
sudo apt -q update
sudo apt -q install libseccomp-dev sshfs \
libcap-dev libnet1-dev libnl-3-dev \
libprotobuf-c-dev libprotobuf-dev protobuf-c-compiler protobuf-compiler
git clone https://github.com/checkpoint-restore/criu.git ~/criu
(cd ~/criu && git checkout ${{ matrix.criu }} && sudo make install-criu)
rm -rf ~/criu
- name: install go ${{ matrix.go-version }}
uses: actions/setup-go@v2
with:
stable: '!contains(${{ matrix.go-version }}, "beta") && !contains(${{ matrix.go-version }}, "rc")'
go-version: ${{ matrix.go-version }}
- name: build
run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all
- name: install bats
uses: mig4/setup-bats@v1
with:
bats-version: 1.3.0
- name: unit test
if: matrix.rootless != 'rootless'
run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest
- name: add rootless user
if: matrix.rootless == 'rootless'
run: |
sudo useradd -u2000 -m -d/home/rootless -s/bin/bash rootless
# Allow root and rootless itself to execute `ssh rootless@localhost` in tests/rootless.sh
ssh-keygen -t ecdsa -N "" -f $HOME/rootless.key
sudo mkdir -m 0700 -p /home/rootless/.ssh
sudo cp $HOME/rootless.key /home/rootless/.ssh/id_ecdsa
sudo cp $HOME/rootless.key.pub /home/rootless/.ssh/authorized_keys
sudo chown -R rootless.rootless /home/rootless
- name: integration test (fs driver)
run: sudo -E PATH="$PATH" script -e -c 'make local${{ matrix.rootless }}integration'
- name: integration test (systemd driver)
# can't use systemd driver with cgroupv1
if: matrix.rootless != 'rootless'
run: sudo -E PATH="$PATH" script -e -c 'make RUNC_USE_SYSTEMD=yes local${{ matrix.rootless }}integration'
# We need to continue support for 32-bit ARM.
# However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff.
# We are not interested in providing official support for i386.
cross-i386:
runs-on: ubuntu-20.04
steps:
- name: checkout
uses: actions/checkout@v2
- name: install deps
run: |
sudo dpkg --add-architecture i386
# add criu repo
sudo add-apt-repository -y ppa:criu/ppa
# apt-add-repository runs apt update so we don't have to.
# Due to a bug in apt, we have to update it first
# (see https://bugs.launchpad.net/ubuntu-cdimage/+bug/1871268)
sudo apt -q install apt
sudo apt -q install libseccomp-dev libseccomp-dev:i386 gcc-multilib criu
- name: install go
uses: actions/setup-go@v2
with:
go-version: 1.x # Latest stable
- name: unit test
# cgo is disabled by default when cross-compiling
run: sudo -E PATH="$PATH" -- make GOARCH=386 CGO_ENABLED=1 localunittest

198
.github/workflows/validate.yml vendored Normal file
View File

@ -0,0 +1,198 @@
name: validate
on:
push:
tags:
- v*
branches:
- master
- release-*
pull_request:
jobs:
lint:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- name: install deps
run: |
sudo apt -q update
sudo apt -q install libseccomp-dev
- uses: golangci/golangci-lint-action@v2
with:
# must be specified without patch version
version: v1.42
lint-extra:
# Extra linters, only checking new code from pull requests.
if: github.event_name == 'pull_request'
runs-on: ubuntu-20.04
permissions:
contents: read
steps:
- uses: actions/checkout@v2
- name: install deps
run: |
sudo apt -q update
sudo apt -q install libseccomp-dev
- uses: golangci/golangci-lint-action@v2
with:
only-new-issues: true
args: --config .golangci-extra.yml
# must be specified without patch version
version: v1.43
compile-buildtags:
runs-on: ubuntu-20.04
env:
# Don't ignore C warnings. Note that the output of "go env CGO_CFLAGS" by default is "-g -O2", so we keep them.
CGO_CFLAGS: -g -O2 -Werror
steps:
- uses: actions/checkout@v2
- name: install go
uses: actions/setup-go@v2
with:
go-version: 1.x # Latest stable
- name: compile with no build tags
run: make BUILDTAGS=""
codespell:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- name: install deps
# Version of codespell bundled with Ubuntu is way old, so use pip.
run: pip install codespell
- name: run codespell
run: codespell
shfmt:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- name: vars
run: |
echo "VERSION=3.3.1" >> $GITHUB_ENV
echo "$(go env GOPATH)/bin" >> $GITHUB_PATH
- name: cache go mod and $GOCACHE
uses: actions/cache@v2
with:
path: |
~/go/pkg/mod
~/.cache/go-build
key: ${{ runner.os }}-shfmt-${{ env.VERSION }}
restore-keys: ${{ runner.os }}-shfmt-
- name: install shfmt
run: |
command -v shfmt || \
(cd ~ && GO111MODULE=on time go get mvdan.cc/sh/v3/cmd/shfmt@v$VERSION)
- name: shfmt
run: make shfmt
shellcheck:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- name: vars
run: |
echo 'VERSION=v0.7.2' >> $GITHUB_ENV
echo 'BASEURL=https://github.com/koalaman/shellcheck/releases/download' >> $GITHUB_ENV
echo 'SHA256SUM=12ee2e0b90a3d1e9cae24ac9b2838be66b48573cb2c8e8f3c566b959df6f050c' >> $GITHUB_ENV
echo ~/bin >> $GITHUB_PATH
- name: install shellcheck
run: |
mkdir ~/bin
curl -sSfL --retry 5 $BASEURL/$VERSION/shellcheck-$VERSION.linux.x86_64.tar.xz |
tar xfJ - -C ~/bin --strip 1 shellcheck-$VERSION/shellcheck
sha256sum ~/bin/shellcheck | grep -q $SHA256SUM
# make sure to remove the old version
sudo rm -f /usr/bin/shellcheck
- uses: lumaxis/shellcheck-problem-matchers@v1
- name: shellcheck
run: |
make shellcheck
deps:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- name: install go
uses: actions/setup-go@v2
with:
go-version: 1.x # Latest stable
- name: cache go mod and $GOCACHE
uses: actions/cache@v2
with:
path: |
~/go/pkg/mod
~/.cache/go-build
key: ${{ runner.os }}-go.sum-${{ hashFiles('**/go.sum') }}
restore-keys: ${{ runner.os }}-go.sum-
- name: verify deps
run: make verify-dependencies
commit:
runs-on: ubuntu-20.04
# Only check commits on pull requests.
if: github.event_name == 'pull_request'
steps:
- name: get pr commits
id: 'get-pr-commits'
uses: tim-actions/get-pr-commits@v1.1.0
with:
token: ${{ secrets.GITHUB_TOKEN }}
- name: check subject line length
uses: tim-actions/commit-message-checker-with-regex@v0.3.1
with:
commits: ${{ steps.get-pr-commits.outputs.commits }}
pattern: '^.{0,72}(\n.*)*$'
error: 'Subject too long (max 72)'
cfmt:
runs-on: ubuntu-20.04
steps:
- name: checkout
uses: actions/checkout@v2
with:
fetch-depth: 0
- name: install deps
run: |
sudo apt -qq update
sudo apt -qq install indent
- name: cfmt
run: |
make cfmt
git diff --exit-code
release:
runs-on: ubuntu-20.04
steps:
- name: checkout
uses: actions/checkout@v2
with:
fetch-depth: 0
# We have to run this under Docker as Ubuntu (host) does not support all
# the architectures we want to compile test against, and Dockerfile uses
# Debian (which does).
#
# XXX: as currently this is the only job that is using Docker, we are
# building and using the runcimage locally. In case more jobs running
# under Docker will emerge, it will be good to have a separate make
# runcimage job and share its result (the docker image) with whoever
# needs it.
- uses: satackey/action-docker-layer-caching@v0.0.11
continue-on-error: true
- name: build docker image
run: make runcimage
- name: make releaseall
run: make releaseall
- name: upload artifacts
uses: actions/upload-artifact@v2
with:
name: release-${{ github.run_id }}
path: release/*

4
.gitignore vendored
View File

@ -2,5 +2,9 @@ vendor/pkg
/runc
/runc-*
contrib/cmd/recvtty/recvtty
contrib/cmd/sd-helper/sd-helper
contrib/cmd/seccompagent/seccompagent
man/man8
release
Vagrantfile
.vagrant

15
.golangci-extra.yml Normal file
View File

@ -0,0 +1,15 @@
# This is golangci-lint config file which is used to check new code in
# github PRs only (see lint-extra job in .github/workflows/validate.yml).
#
# For the default linter config, see .golangci.yml. This config should
# only enable additional linters not enabled in the default config.
run:
build-tags:
- seccomp
linters:
disable-all: true
enable:
- godot
- revive

12
.golangci.yml Normal file
View File

@ -0,0 +1,12 @@
# For documentation, see https://golangci-lint.run/usage/configuration/
run:
build-tags:
- seccomp
linters:
enable:
- gofumpt
- errorlint
- unconvert
- unparam

View File

@ -1,10 +0,0 @@
approve_by_comment: true
approve_regex: ^LGTM
reject_regex: ^Rejected
reset_on_push: true
author_approval: ignored
reviewers:
teams:
- runc-maintainers
name: default
required: 2

View File

@ -1,54 +0,0 @@
dist: bionic
language: go
go:
- 1.11.x
- 1.12.x
- tip
matrix:
include:
- go: 1.12.x
env:
- RUNC_USE_SYSTEMD=1
script:
- make BUILDTAGS="${BUILDTAGS}" all
- sudo PATH="$PATH" make localintegration RUNC_USE_SYSTEMD=1
- go: 1.12.x
env:
- VIRTUALBOX_VERSION=6.0
- VAGRANT_VERSION=2.2.6
- FEDORA_VERSION=31
before_install:
- cat /proc/cpuinfo
- wget -q https://www.virtualbox.org/download/oracle_vbox_2016.asc -O- | sudo apt-key add - && sudo sh -c "echo deb https://download.virtualbox.org/virtualbox/debian $(lsb_release -cs) contrib >> /etc/apt/sources.list" && sudo apt-get update && sudo apt-get install -yq build-essential gcc make linux-headers-$(uname -r) virtualbox-${VIRTUALBOX_VERSION} && sudo usermod -aG vboxusers $(whoami)
- wget https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}_$(uname -m).deb && sudo dpkg -i vagrant_${VAGRANT_VERSION}_$(uname -m).deb
- vagrant init bento/fedora-${FEDORA_VERSION} && vagrant up && mkdir -p ~/.ssh && vagrant ssh-config >> ~/.ssh/config
- ssh default sudo dnf install -y podman
script:
- ssh default sudo podman build -t test /vagrant
- ssh default sudo podman run --privileged --cgroupns=private test make localunittest
allow_failures:
- go: tip
go_import_path: github.com/opencontainers/runc
# `make ci` uses Docker.
sudo: required
services:
- docker
env:
global:
- BUILDTAGS="seccomp apparmor selinux ambient"
before_install:
- sudo apt-get -qq update
- sudo apt-get install -y libseccomp-dev
- go get -u golang.org/x/lint/golint
- go get -u github.com/vbatts/git-validation
- env | grep TRAVIS_
script:
- git-validation -run DCO,short-subject -v
- make BUILDTAGS="${BUILDTAGS}"
- make BUILDTAGS="${BUILDTAGS}" clean ci cross

248
CHANGELOG.md Normal file
View File

@ -0,0 +1,248 @@
# Changelog/
This file documents all notable changes made to this project since runc 1.0.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [1.1.0] - 2022-01-14
> A plan depends as much upon execution as it does upon concept.
## Changed
* libcontainer will now refuse to build without the nsenter package being
correctly compiled (specifically this requires CGO to be enabled). This
should avoid folks accidentally creating broken runc binaries (and
incorrectly importing our internal libraries into their projects). (#3331)
## [1.1.0-rc.1] - 2021-12-14
> He who controls the spice controls the universe.
### Deprecated
* runc run/start now warns if a new container cgroup is non-empty or frozen;
this warning will become an error in runc 1.2. (#3132, #3223)
* runc can only be built with Go 1.16 or later from this release onwards.
(#3100, #3245, #3325)
### Removed
* `cgroup.GetHugePageSizes` has been removed entirely, and been replaced with
`cgroup.HugePageSizes` which is more efficient. (#3234)
* `intelrdt.GetIntelRdtPath` has been removed. Users who were using this
function to get the intelrdt root should use the new `intelrdt.Root`
instead. (#2920, #3239)
### Added
* Add support for RDMA cgroup added in Linux 4.11. (#2883)
* runc exec now produces exit code of 255 when the exec failed.
This may help in distinguishing between runc exec failures
(such as invalid options, non-running container or non-existent
binary etc.) and failures of the command being executed. (#3073)
* runc run: new `--keep` option to skip removal exited containers artefacts.
This might be useful to check the state (e.g. of cgroup controllers) after
the container hasexited. (#2817, #2825)
* seccomp: add support for `SCMP_ACT_KILL_PROCESS` and `SCMP_ACT_KILL_THREAD`
(the latter is just an alias for `SCMP_ACT_KILL`). (#3204)
* seccomp: add support for `SCMP_ACT_NOTIFY` (seccomp actions). This allows
users to create sophisticated seccomp filters where syscalls can be
efficiently emulated by privileged processes on the host. (#2682)
* checkpoint/restore: add an option (`--lsm-mount-context`) to set
a different LSM mount context on restore. (#3068)
* runc releases are now cross-compiled for several architectures. Static
builds for said architectures will be available for all future releases.
(#3197)
* intelrdt: support ClosID parameter. (#2920)
* runc exec --cgroup: an option to specify a (non-top) in-container cgroup
to use for the process being executed. (#3040, #3059)
* cgroup v1 controllers now support hybrid hierarchy (i.e. when on a cgroup v1
machine a cgroup2 filesystem is mounted to /sys/fs/cgroup/unified, runc
run/exec now adds the container to the appropriate cgroup under it). (#2087,
#3059)
* sysctl: allow slashes in sysctl names, to better match `sysctl(8)`'s
behaviour. (#3254, #3257)
* mounts: add support for bind-mounts which are inaccessible after switching
the user namespace. Note that this does not permit the container any
additional access to the host filesystem, it simply allows containers to
have bind-mounts configured for paths the user can access but have
restrictive access control settings for other users. (#2576)
* Add support for recursive mount attributes using `mount_setattr(2)`. These
have the same names as the proposed `mount(8)` options -- just prepend `r`
to the option name (such as `rro`). (#3272)
* Add `runc features` subcommand to allow runc users to detect what features
runc has been built with. This includes critical information such as
supported mount flags, hook names, and so on. Note that the output of this
command is subject to change and will not be considered stable until runc
1.2 at the earliest. The runtime-spec specification for this feature is
being developed in [opencontainers/runtime-spec#1130]. (#3296)
[opencontainers/runtime-spec#1130]: https://github.com/opencontainers/runtime-spec/pull/1130
### Changed
* system: improve performance of `/proc/$pid/stat` parsing. (#2696)
* cgroup2: when `/sys/fs/cgroup` is configured as a read-write mount, change
the ownership of certain cgroup control files (as per
`/sys/kernel/cgroup/delegate`) to allow for proper deferral to the container
process. (#3057)
* docs: series of improvements to man pages to make them easier to read and
use. (#3032)
#### libcontainer API
* internal api: remove internal error types and handling system, switch to Go
wrapped errors. (#3033)
* New configs.Cgroup structure fields (#3177):
* Systemd (whether to use systemd cgroup manager); and
* Rootless (whether to use rootless cgroups).
* New cgroups/manager package aiming to simplify cgroup manager instantiation.
(#3177)
* All cgroup managers' instantiation methods now initialize cgroup paths and
can return errors. This allows to use any cgroup manager method (e.g.
Exists, Destroy, Set, GetStats) right after instantiation, which was not
possible before (as paths were initialized in Apply only). (#3178)
### Fixed
* nsenter: do not try to close already-closed fds during container setup and
bail on close(2) failures. (#3058)
* runc checkpoint/restore: fixed for containers with an external bind mount
which destination is a symlink. (#3047).
* cgroup: improve openat2 handling for cgroup directory handle hardening.
(#3030)
* `runc delete -f` now succeeds (rather than timing out) on a paused
container. (#3134)
* runc run/start/exec now refuses a frozen cgroup (paused container in case of
exec). Users can disable this using `--ignore-paused`. (#3132, #3223)
* config: do not permit null bytes in mount fields. (#3287)
## [1.0.3] - 2021-12-06
> If you were waiting for the opportune moment, that was it.
### Security
* A potential vulnerability was discovered in runc (related to an internal
usage of netlink), however upon further investigation we discovered that
while this bug was exploitable on the master branch of runc, no released
version of runc could be exploited using this bug. The exploit required being
able to create a netlink attribute with a length that would overflow a uint16
but this was not possible in any released version of runc. For more
information, see [GHSA-v95c-p5hm-xq8f][] and CVE-2021-43784.
### Fixed
* Fixed inability to start a container with read-write bind mount of a
read-only fuse host mount. (#3283, #3292)
* Fixed inability to start when read-only /dev in set in spec (#3276, #3277)
* Fixed not removing sub-cgroups upon container delete, when rootless cgroup v2
is used with older systemd. (#3226, #3297)
* Fixed returning error from GetStats when hugetlb is unsupported (which causes
excessive logging for Kubernetes). (#3233, #3295)
* Improved an error message when dbus-user-session is not installed and
rootless + cgroup2 + systemd are used (#3212)
[GHSA-v95c-p5hm-xq8f]: https://github.com/opencontainers/runc/security/advisories/GHSA-v95c-p5hm-xq8f
## [1.0.2] - 2021-07-16
> Given the right lever, you can move a planet.
### Changed
* Made release builds reproducible from now on. (#3099, #3142)
### Fixed
* Fixed a failure to set CPU quota period in some cases on cgroup v1. (#3090
#3115)
* Fixed the inability to start a container with the "adding seccomp filter
rule for syscall ..." error, caused by redundant seccomp rules (i.e. those
that has action equal to the default one). Such redundant rules are now
skipped. (#3109, #3129)
* Fixed a rare debug log race in runc init, which can result in occasional
harmful "failed to decode ..." errors from runc run or exec. (#3120, #3130)
* Fixed the check in cgroup v1 systemd manager if a container needs to be
frozen before Set, and add a setting to skip such freeze unconditionally.
The previous fix for that issue, done in runc 1.0.1, was not working.
(#3166, #3167)
## [1.0.1] - 2021-07-16
> If in doubt, Meriadoc, always follow your nose.
### Fixed
* Fixed occasional runc exec/run failure ("interrupted system call") on an
Azure volume. (#3045, #3074)
* Fixed "unable to find groups ... token too long" error with /etc/group
containing lines longer than 64K characters. (#3062, #3079)
* cgroup/systemd/v1: fix leaving cgroup frozen after Set if a parent cgroup is
frozen. This is a regression in 1.0.0, not affecting runc itself but some
of libcontainer users (e.g Kubernetes). (#3081, #3085)
* cgroupv2: bpf: Ignore inaccessible existing programs in case of
permission error when handling replacement of existing bpf cgroup
programs. This fixes a regression in 1.0.0, where some SELinux
policies would block runc from being able to run entirely. (#3055, #3087)
* cgroup/systemd/v2: don't freeze cgroup on Set. (#3067, #3092)
* cgroup/systemd/v1: avoid unnecessary freeze on Set. (#3082, #3093)
## [1.0.0] - 2021-06-22
> A wizard is never late, nor is he early, he arrives precisely when he means
> to.
As runc follows Semantic Versioning, we will endeavour to not make any
breaking changes without bumping the major version number of runc.
However, it should be noted that Go API usage of runc's internal
implementation (libcontainer) is *not* covered by this policy.
### Removed
* Removed libcontainer/configs.Device* identifiers (deprecated since rc94,
use libcontainer/devices). (#2999)
* Removed libcontainer/system.RunningInUserNS function (deprecated since
rc94, use libcontainer/userns). (#2999)
### Deprecated
* The usage of relative paths for mountpoints will now produce a warning
(such configurations are outside of the spec, and in future runc will
produce an error when given such configurations). (#2917, #3004)
### Fixed
* cgroupv2: devices: rework the filter generation to produce consistent
results with cgroupv1, and always clobber any existing eBPF
program(s) to fix `runc update` and avoid leaking eBPF programs
(resulting in errors when managing containers). (#2951)
* cgroupv2: correctly convert "number of IOs" statistics in a
cgroupv1-compatible way. (#2965, #2967, #2968, #2964)
* cgroupv2: support larger than 32-bit IO statistics on 32-bit architectures.
* cgroupv2: wait for freeze to finish before returning from the freezing
code, optimize the method for checking whether a cgroup is frozen. (#2955)
* cgroups/systemd: fixed "retry on dbus disconnect" logic introduced in rc94
* cgroups/systemd: fixed returning "unit already exists" error from a systemd
cgroup manager (regression in rc94) (#2997, #2996)
### Added
* cgroupv2: support SkipDevices with systemd driver. (#2958, #3019)
* cgroup1: blkio: support BFQ weights. (#3010)
* cgroupv2: set per-device io weights if BFQ IO scheduler is available.
(#3022)
### Changed
* cgroup/systemd: return, not ignore, stop unit error from Destroy (#2946)
* Fix all golangci-lint failures. (#2781, #2962)
* Make `runc --version` output sane even when built with `go get` or
otherwise outside of our build scripts. (#2962)
* cgroups: set SkipDevices during runc update (so we don't modify
cgroups at all during `runc update`). (#2994)
<!-- minor releases -->
[Unreleased]: https://github.com/opencontainers/runc/compare/v1.1.0...HEAD
[1.1.0]: https://github.com/opencontainers/runc/compare/v1.1.0-rc.1...v1.1.0
[1.0.0]: https://github.com/opencontainers/runc/releases/tag/v1.0.0
<!-- 1.0.z patch releases -->
[Unreleased 1.0.z]: https://github.com/opencontainers/runc/compare/v1.0.3...release-1.0
[1.0.3]: https://github.com/opencontainers/runc/compare/v1.0.2...v1.0.3
[1.0.2]: https://github.com/opencontainers/runc/compare/v1.0.1...v1.0.2
[1.0.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.0.1
<!-- 1.1.z patch releases -->
[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.0...release-1.1
[1.1.0-rc.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.1.0-rc.1

View File

@ -1,34 +1,41 @@
FROM golang:1.12-stretch
ARG GO_VERSION=1.17
ARG BATS_VERSION=v1.3.0
ARG LIBSECCOMP_VERSION=2.5.3
RUN dpkg --add-architecture armel \
FROM golang:${GO_VERSION}-bullseye
ARG DEBIAN_FRONTEND=noninteractive
ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debian_11
RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \
&& echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \
&& dpkg --add-architecture armel \
&& dpkg --add-architecture armhf \
&& dpkg --add-architecture arm64 \
&& dpkg --add-architecture ppc64el \
&& apt-get update && apt-get install -y \
build-essential \
curl \
sudo \
gawk \
iptables \
jq \
pkg-config \
libaio-dev \
libcap-dev \
libprotobuf-dev \
libprotobuf-c0-dev \
libnl-3-dev \
libnet-dev \
libseccomp2 \
libseccomp-dev \
protobuf-c-compiler \
protobuf-compiler \
python-minimal \
uidmap \
kmod \
crossbuild-essential-armel crossbuild-essential-armhf crossbuild-essential-arm64 crossbuild-essential-ppc64el \
libseccomp-dev:armel libseccomp-dev:armhf libseccomp-dev:arm64 libseccomp-dev:ppc64el \
--no-install-recommends \
&& apt-get clean
&& apt-get update \
&& apt-get install -y --no-install-recommends \
build-essential \
criu \
crossbuild-essential-arm64 \
crossbuild-essential-armel \
crossbuild-essential-armhf \
crossbuild-essential-ppc64el \
crossbuild-essential-s390x \
curl \
gawk \
gcc \
gperf \
iptables \
jq \
kmod \
pkg-config \
python3-minimal \
sshfs \
sudo \
uidmap \
&& apt-get clean \
&& rm -rf /var/cache/apt /var/lib/apt/lists/* /etc/apt/sources.list.d/*.list
# Add a dummy user for the rootless integration tests. While runC does
# not require an entry in /etc/passwd to operate, one of the tests uses
@ -37,30 +44,21 @@ RUN dpkg --add-architecture armel \
RUN useradd -u1000 -m -d/home/rootless -s/bin/bash rootless
# install bats
ARG BATS_VERSION
RUN cd /tmp \
&& git clone https://github.com/sstephenson/bats.git \
&& cd bats \
&& git reset --hard 03608115df2071fff4eaaff1605768c275e5f81f \
&& git clone https://github.com/bats-core/bats-core.git \
&& cd bats-core \
&& git reset --hard "${BATS_VERSION}" \
&& ./install.sh /usr/local \
&& rm -rf /tmp/bats
&& rm -rf /tmp/bats-core
# install criu
ENV CRIU_VERSION v3.12
RUN mkdir -p /usr/src/criu \
&& curl -sSL https://github.com/checkpoint-restore/criu/archive/${CRIU_VERSION}.tar.gz | tar -v -C /usr/src/criu/ -xz --strip-components=1 \
&& cd /usr/src/criu \
&& make install-criu \
&& rm -rf /usr/src/criu
# install libseccomp
ARG LIBSECCOMP_VERSION
COPY script/* /tmp/script/
RUN mkdir -p /opt/libseccomp \
&& /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le s390x
ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION
ENV LD_LIBRARY_PATH=/opt/libseccomp/lib
ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig
# setup a playground for us to spawn containers in
ENV ROOTFS /busybox
RUN mkdir -p ${ROOTFS}
COPY script/tmpmount /
WORKDIR /go/src/github.com/opencontainers/runc
ENTRYPOINT ["/tmpmount"]
ADD . /go/src/github.com/opencontainers/runc
RUN . tests/integration/multi-arch.bash \
&& curl -o- -sSL `get_busybox` | tar xfJC - ${ROOTFS}

11
EMERITUS.md Normal file
View File

@ -0,0 +1,11 @@
## Emeritus ##
We would like to acknowledge previous runc maintainers and their huge
contributions to our collective success:
* Alexander Morozov (@lk4d4)
* Andrei Vagin (@avagin)
* Rohit Jnagal (@rjnagal)
* Victor Marmol (@vmarmol)
We thank these members for their service to the OCI community.

View File

@ -1,5 +1,8 @@
Michael Crosby <michael@docker.com> (@crosbymichael)
Michael Crosby <michael@thepasture.io> (@crosbymichael)
Mrunal Patel <mpatel@redhat.com> (@mrunalp)
Daniel, Dao Quang Minh <dqminh89@gmail.com> (@dqminh)
Qiang Huang <h.huangqiang@huawei.com> (@hqhq)
Aleksa Sarai <asarai@suse.de> (@cyphar)
Aleksa Sarai <cyphar@cyphar.com> (@cyphar)
Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp> (@AkihiroSuda)
Kir Kolyshkin <kolyshkin@gmail.com> (@kolyshkin)
Sebastiaan van Stijn <github@gone.nl> (@thaJeztah)

175
Makefile
View File

@ -1,133 +1,158 @@
.PHONY: all shell dbuild man release \
localtest localunittest localintegration \
test unittest integration \
cross localcross
CONTAINER_ENGINE := docker
GO := go
GO ?= go
SOURCES := $(shell find . 2>&1 | grep -E '.*\.(c|h|go)$$')
PREFIX := $(DESTDIR)/usr/local
PREFIX ?= /usr/local
BINDIR := $(PREFIX)/sbin
MANDIR := $(PREFIX)/share/man
GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null)
GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
PROJECT := github.com/opencontainers/runc
BUILDTAGS ?= seccomp
COMMIT_NO := $(shell git rev-parse HEAD 2> /dev/null || true)
COMMIT ?= $(if $(shell git status --porcelain --untracked-files=no),"${COMMIT_NO}-dirty","${COMMIT_NO}")
COMMIT ?= $(shell git describe --dirty --long --always)
VERSION := $(shell cat ./VERSION)
MAN_DIR := $(CURDIR)/man/man8
MAN_PAGES = $(shell ls $(MAN_DIR)/*.8)
MAN_PAGES_BASE = $(notdir $(MAN_PAGES))
MAN_INSTALL_PATH := ${PREFIX}/share/man/man8/
ifeq ($(shell $(GO) env GOOS),linux)
ifeq (,$(filter $(shell $(GO) env GOARCH),mips mipsle mips64 mips64le ppc64))
ifeq (,$(findstring -race,$(EXTRA_FLAGS)))
GO_BUILDMODE := "-buildmode=pie"
endif
endif
endif
GO_BUILD := $(GO) build -trimpath $(GO_BUILDMODE) $(EXTRA_FLAGS) -tags "$(BUILDTAGS)" \
-ldflags "-X main.gitCommit=$(COMMIT) -X main.version=$(VERSION) $(EXTRA_LDFLAGS)"
GO_BUILD_STATIC := CGO_ENABLED=1 $(GO) build -trimpath $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" \
-ldflags "-extldflags -static -X main.gitCommit=$(COMMIT) -X main.version=$(VERSION) $(EXTRA_LDFLAGS)"
RELEASE_DIR := $(CURDIR)/release
VERSION := ${shell cat ./VERSION}
SHELL := $(shell command -v bash 2>/dev/null)
GPG_KEYID ?= asarai@suse.de
.DEFAULT: runc
runc: $(SOURCES)
$(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc .
runc:
$(GO_BUILD) -o runc .
all: runc recvtty
all: runc recvtty sd-helper seccompagent
recvtty: contrib/cmd/recvtty/recvtty
recvtty sd-helper seccompagent:
$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@
contrib/cmd/recvtty/recvtty: $(SOURCES)
$(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty
static:
$(GO_BUILD_STATIC) -o runc .
static: $(SOURCES)
CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o runc .
CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty
releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a s390x"
releaseall: release
release:
script/release.sh -r release/$(VERSION) -v $(VERSION)
release: runcimage
$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
--rm -v $(CURDIR):/go/src/$(PROJECT) \
-e RELEASE_ARGS=$(RELEASE_ARGS) \
$(RUNC_IMAGE) make localrelease
script/release_sign.sh -S $(GPG_KEYID) -r release/$(VERSION) -v $(VERSION)
localrelease:
script/release_build.sh -r release/$(VERSION) -v $(VERSION) $(RELEASE_ARGS)
dbuild: runcimage
$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} --rm -v $(CURDIR):/go/src/$(PROJECT) --privileged $(RUNC_IMAGE) make clean all
$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
--privileged --rm \
-v $(CURDIR):/go/src/$(PROJECT) \
$(RUNC_IMAGE) make clean all
lint:
$(GO) vet $(allpackages)
$(GO) fmt $(allpackages)
golangci-lint run ./...
man:
man/md2man-all.sh
runcimage:
$(CONTAINER_ENGINE) build ${CONTAINER_ENGINE_BUILD_FLAGS} -t $(RUNC_IMAGE) .
$(CONTAINER_ENGINE) build $(CONTAINER_ENGINE_BUILD_FLAGS) -t $(RUNC_IMAGE) .
test:
make unittest integration rootlessintegration
test: unittest integration rootlessintegration
localtest:
make localunittest localintegration localrootlessintegration
localtest: localunittest localintegration localrootlessintegration
unittest: runcimage
$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localunittest TESTFLAGS=${TESTFLAGS}
$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
-t --privileged --rm \
-v /lib/modules:/lib/modules:ro \
-v $(CURDIR):/go/src/$(PROJECT) \
$(RUNC_IMAGE) make localunittest TESTFLAGS=$(TESTFLAGS)
localunittest: all
$(GO) test -timeout 3m -tags "$(BUILDTAGS)" ${TESTFLAGS} -v $(allpackages)
$(GO) test -timeout 3m -tags "$(BUILDTAGS)" $(TESTFLAGS) -v ./...
integration: runcimage
$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localintegration TESTPATH=${TESTPATH}
$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
-t --privileged --rm \
-v /lib/modules:/lib/modules:ro \
-v $(CURDIR):/go/src/$(PROJECT) \
$(RUNC_IMAGE) make localintegration TESTPATH=$(TESTPATH)
localintegration: all
bats -t tests/integration${TESTPATH}
bats -t tests/integration$(TESTPATH)
rootlessintegration: runcimage
$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localrootlessintegration
$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
-t --privileged --rm \
-v $(CURDIR):/go/src/$(PROJECT) \
-e ROOTLESS_TESTPATH \
$(RUNC_IMAGE) make localrootlessintegration
localrootlessintegration: all
tests/rootless.sh
shell: runcimage
$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -ti --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) bash
$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
-ti --privileged --rm \
-v $(CURDIR):/go/src/$(PROJECT) \
$(RUNC_IMAGE) bash
install:
install -D -m0755 runc $(BINDIR)/runc
install -D -m0755 runc $(DESTDIR)$(BINDIR)/runc
install-bash:
install -D -m0644 contrib/completions/bash/runc $(PREFIX)/share/bash-completion/completions/runc
install -D -m0644 contrib/completions/bash/runc $(DESTDIR)$(PREFIX)/share/bash-completion/completions/runc
install-man:
install -d -m 755 $(MAN_INSTALL_PATH)
install -m 644 $(MAN_PAGES) $(MAN_INSTALL_PATH)
uninstall:
rm -f $(BINDIR)/runc
uninstall-bash:
rm -f $(PREFIX)/share/bash-completion/completions/runc
uninstall-man:
rm -f $(addprefix $(MAN_INSTALL_PATH),$(MAN_PAGES_BASE))
install-man: man
install -d -m 755 $(DESTDIR)$(MANDIR)/man8
install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8
clean:
rm -f runc runc-*
rm -f contrib/cmd/recvtty/recvtty
rm -rf $(RELEASE_DIR)
rm -rf $(MAN_DIR)
rm -f contrib/cmd/sd-helper/sd-helper
rm -f contrib/cmd/seccompagent/seccompagent
rm -rf release
rm -rf man/man8
validate:
script/validate-gofmt
script/validate-c
$(GO) vet $(allpackages)
cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
cfmt:
indent -linux -l120 -il0 -ppi2 -cp1 -T size_t -T jmp_buf $(C_SRC)
ci: validate test release
shellcheck:
shellcheck tests/integration/*.bats tests/integration/*.sh \
tests/integration/*.bash tests/*.sh \
script/release_*.sh script/seccomp.sh script/lib.sh
# TODO: add shellcheck for more sh files
cross: runcimage
$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -e BUILDTAGS="$(BUILDTAGS)" --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localcross
shfmt:
shfmt -ln bats -d -w tests/integration/*.bats
shfmt -ln bash -d -w man/*.sh script/* tests/*.sh tests/integration/*.bash
localcross:
CGO_ENABLED=1 GOARCH=arm GOARM=6 CC=arm-linux-gnueabi-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armel .
CGO_ENABLED=1 GOARCH=arm GOARM=7 CC=arm-linux-gnueabihf-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armhf .
CGO_ENABLED=1 GOARCH=arm64 CC=aarch64-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-arm64 .
CGO_ENABLED=1 GOARCH=ppc64le CC=powerpc64le-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-ppc64le .
vendor:
$(GO) mod tidy
$(GO) mod vendor
$(GO) mod verify
# memoize allpackages, so that it's executed only once and only if used
_allpackages = $(shell $(GO) list ./... | grep -v vendor)
allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages)
verify-dependencies: vendor
@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
&& echo "all vendor files are up to date."
.PHONY: runc all recvtty sd-helper seccompagent static releaseall release \
localrelease dbuild lint man runcimage \
test localtest unittest localunittest integration localintegration \
rootlessintegration localrootlessintegration shell install install-bash \
install-man clean cfmt shfmt shellcheck \
vendor verify-dependencies

View File

@ -1,39 +1,33 @@
# runc
[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc)
[![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
[![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588)
[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate)
[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci)
## Introduction
`runc` is a CLI tool for spawning and running containers according to the OCI specification.
`runc` is a CLI tool for spawning and running containers on Linux according to the OCI specification.
## Releases
`runc` depends on and tracks the [runtime-spec](https://github.com/opencontainers/runtime-spec) repository.
We will try to make sure that `runc` and the OCI specification major versions stay in lockstep.
This means that `runc` 1.0.0 should implement the 1.0 version of the specification.
You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.
Currently, the following features are not considered to be production-ready:
* Support for cgroup v2
## Security
The reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/).
The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
### Security Audit
A third party security audit was performed by Cure53, you can see the full report [here](https://github.com/opencontainers/runc/blob/master/docs/Security-Audit.pdf).
## Building
`runc` currently supports the Linux platform with various architecture support.
It must be built with Go version 1.6 or higher in order for some features to function properly.
`runc` only supports Linux. It must be built with Go version 1.16 or higher.
In order to enable seccomp support you will need to install `libseccomp` on your platform.
> e.g. `libseccomp-devel` for CentOS, or `libseccomp-dev` for Ubuntu
Otherwise, if you do not want to build `runc` with seccomp support you can add `BUILDTAGS=""` when running make.
```bash
# create a 'github.com/opencontainers' in your GOPATH/src
cd github.com/opencontainers
@ -58,21 +52,24 @@ sudo make install
#### Build Tags
`runc` supports optional build tags for compiling support of various features.
To add build tags to the make option the `BUILDTAGS` variable must be set.
`runc` supports optional build tags for compiling support of various features,
with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`).
To change build tags from the default, set the `BUILDTAGS` variable for make,
e.g. to disable seccomp:
```bash
make BUILDTAGS='seccomp apparmor'
make BUILDTAGS=""
```
| Build Tag | Feature | Dependency |
|-----------|------------------------------------|-------------|
| seccomp | Syscall filtering | libseccomp |
| selinux | selinux process and mount labeling | <none> |
| apparmor | apparmor profile support | <none> |
| ambient | ambient capability support | kernel 4.3 |
| nokmem | disable kernel memory account | <none> |
| Build Tag | Feature | Enabled by default | Dependency |
|-----------|------------------------------------|--------------------|------------|
| seccomp | Syscall filtering | yes | libseccomp |
The following build tags were used earlier, but are now obsoleted:
- **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
- **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
- **selinux** (since runc v1.0.0-rc93 the feature is always enabled)
### Running the test suite
@ -97,20 +94,41 @@ You can run a specific integration test by setting the `TESTPATH` variable.
# make test TESTPATH="/checkpoint.bats"
```
You can run a test in your proxy environment by setting `DOCKER_BUILD_PROXY` and `DOCKER_RUN_PROXY` variables.
You can run a specific rootless integration test by setting the `ROOTLESS_TESTPATH` variable.
```bash
# make test DOCKER_BUILD_PROXY="--build-arg HTTP_PROXY=http://yourproxy/" DOCKER_RUN_PROXY="-e HTTP_PROXY=http://yourproxy/"
# make test ROOTLESS_TESTPATH="/checkpoint.bats"
```
You can run a test using your container engine's flags by setting `CONTAINER_ENGINE_BUILD_FLAGS` and `CONTAINER_ENGINE_RUN_FLAGS` variables.
```bash
# make test CONTAINER_ENGINE_BUILD_FLAGS="--build-arg http_proxy=http://yourproxy/" CONTAINER_ENGINE_RUN_FLAGS="-e http_proxy=http://yourproxy/"
```
### Dependencies Management
`runc` uses [vndr](https://github.com/LK4D4/vndr) for dependencies management.
Please refer to [vndr](https://github.com/LK4D4/vndr) for how to add or update
`runc` uses [Go Modules](https://github.com/golang/go/wiki/Modules) for dependencies management.
Please refer to [Go Modules](https://github.com/golang/go/wiki/Modules) for how to add or update
new dependencies.
```
# Update vendored dependencies
make vendor
# Verify all dependencies
make verify-dependencies
```
## Using runc
Please note that runc is a low level tool not designed with an end user
in mind. It is mostly employed by other higher level container software.
Therefore, unless there is some specific use case that prevents the use
of tools like Docker or Podman, it is not recommended to use runc directly.
If you still want to use runc, here's how.
### Creating an OCI Bundle
In order to use runc you must have your container in the format of an OCI bundle.
@ -152,7 +170,9 @@ If you used the unmodified `runc spec` template this should give you a `sh` sess
The second way to start a container is using the specs lifecycle operations.
This gives you more power over how the container is created and managed while it is running.
This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here.
This will also launch the container in the background so you will have to edit
the `config.json` to remove the `terminal` setting for the simple examples
below (see more details about [runc terminal handling](docs/terminals.md)).
Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`.
@ -275,6 +295,14 @@ PIDFile=/run/mycontainerid.pid
WantedBy=multi-user.target
```
## More documentation
* [cgroup v2](./docs/cgroup-v2.md)
* [Checkpoint and restore](./docs/checkpoint-restore.md)
* [systemd cgroup driver](./docs/systemd.md)
* [Terminals and standard IO](./docs/terminals.md)
* [Experimental features](./docs/experimental.md)
## License
The code and docs are released under the [Apache 2.0 license](LICENSE).

View File

@ -1,3 +1,3 @@
# Security
The reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/).
The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).

View File

@ -1 +1 @@
1.0.0-rc10
1.1.0

52
Vagrantfile.fedora Normal file
View File

@ -0,0 +1,52 @@
# -*- mode: ruby -*-
# vi: set ft=ruby :
Vagrant.configure("2") do |config|
# Fedora box is used for testing cgroup v2 support
config.vm.box = "fedora/35-cloud-base"
config.vm.provider :virtualbox do |v|
v.memory = 2048
v.cpus = 2
end
config.vm.provider :libvirt do |v|
v.memory = 2048
v.cpus = 2
end
config.vm.provision "shell", inline: <<-SHELL
set -e -u -o pipefail
# Work around dnf mirror failures by retrying a few times
for i in $(seq 0 2); do
sleep $i
# "config exclude" dnf shell command is not working in Fedora 35
# (see https://bugzilla.redhat.com/show_bug.cgi?id=2022571);
# the workaround is to specify it as an option.
cat << EOF | dnf -y --exclude=kernel,kernel-core shell && break
config install_weak_deps false
update
install iptables gcc make golang-go glibc-static libseccomp-devel bats jq git-core criu fuse-sshfs
ts run
EOF
done
dnf clean all
# Add a user for rootless tests
useradd -u2000 -m -d/home/rootless -s/bin/bash rootless
# Allow root and rootless itself to execute `ssh rootless@localhost` in tests/rootless.sh
ssh-keygen -t ecdsa -N "" -f /root/rootless.key
mkdir -m 0700 -p /home/rootless/.ssh
cp /root/rootless.key /home/rootless/.ssh/id_ecdsa
cat /root/rootless.key.pub >> /home/rootless/.ssh/authorized_keys
chown -R rootless.rootless /home/rootless
# Delegate cgroup v2 controllers to rootless user via --systemd-cgroup
mkdir -p /etc/systemd/system/user@.service.d
cat > /etc/systemd/system/user@.service.d/delegate.conf << EOF
[Service]
# default: Delegate=pids memory
# NOTE: delegation of cpuset requires systemd >= 244 (Fedora >= 32, Ubuntu >= 20.04).
Delegate=yes
EOF
systemctl daemon-reload
SHELL
end

View File

@ -1,19 +1,19 @@
// +build linux
package main
import (
"errors"
"fmt"
"net"
"os"
"path/filepath"
"strconv"
"strings"
criu "github.com/checkpoint-restore/go-criu/v5/rpc"
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"github.com/urfave/cli"
"golang.org/x/sys/unix"
)
@ -34,7 +34,7 @@ checkpointed.`,
cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external unix sockets"},
cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"},
cli.BoolFlag{Name: "lazy-pages", Usage: "use userfaultfd to lazily restore memory pages"},
cli.StringFlag{Name: "status-fd", Value: "", Usage: "criu writes \\0 to this FD once lazy-pages is ready"},
cli.IntFlag{Name: "status-fd", Value: -1, Usage: "criu writes \\0 to this FD once lazy-pages is ready"},
cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"},
cli.BoolFlag{Name: "file-locks", Usage: "handle file locks, for safety"},
cli.BoolFlag{Name: "pre-dump", Usage: "dump container's memory information only, leave the container running after this"},
@ -47,7 +47,7 @@ checkpointed.`,
return err
}
// XXX: Currently this is untested with rootless containers.
if os.Geteuid() != 0 || system.RunningInUserNS() {
if os.Geteuid() != 0 || userns.RunningInUserNS() {
logrus.Warn("runc checkpoint is untested with rootless containers")
}
@ -60,10 +60,13 @@ checkpointed.`,
return err
}
if status == libcontainer.Created || status == libcontainer.Stopped {
fatalf("Container cannot be checkpointed in %s state", status.String())
fatal(fmt.Errorf("Container cannot be checkpointed in %s state", status.String()))
}
defer destroy(container)
options := criuOptions(context)
if !(options.LeaveRunning || options.PreDump) {
// destroy container unless we tell CRIU to keep it
defer destroy(container)
}
// these are the mandatory criu options for a container
setPageServer(context, options)
setManageCgroupsMode(context, options)
@ -74,28 +77,53 @@ checkpointed.`,
},
}
func getCheckpointImagePath(context *cli.Context) string {
func prepareImagePaths(context *cli.Context) (string, string, error) {
imagePath := context.String("image-path")
if imagePath == "" {
imagePath = getDefaultImagePath(context)
imagePath = getDefaultImagePath()
}
return imagePath
if err := os.MkdirAll(imagePath, 0o600); err != nil {
return "", "", err
}
parentPath := context.String("parent-path")
if parentPath == "" {
return imagePath, parentPath, nil
}
if filepath.IsAbs(parentPath) {
return "", "", errors.New("--parent-path must be relative")
}
realParent := filepath.Join(imagePath, parentPath)
fi, err := os.Stat(realParent)
if err == nil && !fi.IsDir() {
err = &os.PathError{Path: realParent, Err: unix.ENOTDIR}
}
if err != nil {
return "", "", fmt.Errorf("invalid --parent-path: %w", err)
}
return imagePath, parentPath, nil
}
func setPageServer(context *cli.Context, options *libcontainer.CriuOpts) {
// xxx following criu opts are optional
// The dump image can be sent to a criu page server
if psOpt := context.String("page-server"); psOpt != "" {
addressPort := strings.Split(psOpt, ":")
if len(addressPort) != 2 {
fatal(fmt.Errorf("Use --page-server ADDRESS:PORT to specify page server"))
address, port, err := net.SplitHostPort(psOpt)
if err != nil || address == "" || port == "" {
fatal(errors.New("Use --page-server ADDRESS:PORT to specify page server"))
}
portInt, err := strconv.Atoi(addressPort[1])
portInt, err := strconv.Atoi(port)
if err != nil {
fatal(fmt.Errorf("Invalid port number"))
fatal(errors.New("Invalid port number"))
}
options.PageServer = libcontainer.CriuPageServerInfo{
Address: addressPort[0],
Address: address,
Port: int32(portInt),
}
}
@ -105,13 +133,13 @@ func setManageCgroupsMode(context *cli.Context, options *libcontainer.CriuOpts)
if cgOpt := context.String("manage-cgroups-mode"); cgOpt != "" {
switch cgOpt {
case "soft":
options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_SOFT
options.ManageCgroupsMode = criu.CriuCgMode_SOFT
case "full":
options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_FULL
options.ManageCgroupsMode = criu.CriuCgMode_FULL
case "strict":
options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_STRICT
options.ManageCgroupsMode = criu.CriuCgMode_STRICT
default:
fatal(fmt.Errorf("Invalid manage cgroups mode"))
fatal(errors.New("Invalid manage cgroups mode"))
}
}
}

View File

@ -17,12 +17,13 @@
package main
import (
"errors"
"fmt"
"io"
"io/ioutil"
"net"
"os"
"strings"
"sync"
"github.com/containerd/console"
"github.com/opencontainers/runc/libcontainer/utils"
@ -65,7 +66,7 @@ func bail(err error) {
os.Exit(1)
}
func handleSingle(path string) error {
func handleSingle(path string, noStdin bool) error {
// Open a socket.
ln, err := net.Listen("unix", path)
if err != nil {
@ -87,7 +88,7 @@ func handleSingle(path string) error {
// Get the fd of the connection.
unixconn, ok := conn.(*net.UnixConn)
if !ok {
return fmt.Errorf("failed to cast to unixconn")
return errors.New("failed to cast to unixconn")
}
socket, err := unixconn.File()
@ -105,23 +106,37 @@ func handleSingle(path string) error {
if err != nil {
return err
}
console.ClearONLCR(c.Fd())
if err := console.ClearONLCR(c.Fd()); err != nil {
return err
}
// Copy from our stdio to the master fd.
quitChan := make(chan struct{})
var (
wg sync.WaitGroup
inErr, outErr error
)
wg.Add(1)
go func() {
io.Copy(os.Stdout, c)
quitChan <- struct{}{}
}()
go func() {
io.Copy(c, os.Stdin)
quitChan <- struct{}{}
_, outErr = io.Copy(os.Stdout, c)
wg.Done()
}()
if !noStdin {
wg.Add(1)
go func() {
_, inErr = io.Copy(c, os.Stdin)
wg.Done()
}()
}
// Only close the master fd once we've stopped copying.
<-quitChan
wg.Wait()
c.Close()
return nil
if outErr != nil {
return outErr
}
return inErr
}
func handleNull(path string) error {
@ -161,15 +176,7 @@ func handleNull(path string) error {
return
}
// Just do a dumb copy to /dev/null.
devnull, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
if err != nil {
// TODO: Handle this nicely.
return
}
io.Copy(devnull, master)
devnull.Close()
_, _ = io.Copy(io.Discard, master)
}(conn)
}
}
@ -185,7 +192,7 @@ func main() {
v = append(v, version)
}
if gitCommit != "" {
v = append(v, fmt.Sprintf("commit: %s", gitCommit))
v = append(v, "commit: "+gitCommit)
}
app.Version = strings.Join(v, "\n")
@ -201,26 +208,31 @@ func main() {
Value: "",
Usage: "Path to write daemon process ID to",
},
cli.BoolFlag{
Name: "no-stdin",
Usage: "Disable stdin handling (no-op for null mode)",
},
}
app.Action = func(ctx *cli.Context) error {
args := ctx.Args()
if len(args) != 1 {
return fmt.Errorf("need to specify a single socket path")
return errors.New("need to specify a single socket path")
}
path := ctx.Args()[0]
pidPath := ctx.String("pid-file")
if pidPath != "" {
pid := fmt.Sprintf("%d\n", os.Getpid())
if err := ioutil.WriteFile(pidPath, []byte(pid), 0644); err != nil {
if err := os.WriteFile(pidPath, []byte(pid), 0o644); err != nil {
return err
}
}
noStdin := ctx.Bool("no-stdin")
switch ctx.String("mode") {
case "single":
if err := handleSingle(path); err != nil {
if err := handleSingle(path, noStdin); err != nil {
return err
}
case "null":

View File

@ -0,0 +1,86 @@
package main
import (
"flag"
"fmt"
"os"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
)
func usage() {
fmt.Print(`Open Container Initiative contrib/cmd/sd-helper
sd-helper is a tool that uses runc/libcontainer/cgroups/systemd package
functionality to communicate to systemd in order to perform various operations.
Currently this is limited to starting and stopping systemd transient slice
units.
Usage:
sd-helper [-debug] [-parent <pname>] {start|stop} <name>
Example:
sd-helper -parent system.slice start system-pod123.slice
`)
os.Exit(1)
}
var (
debug = flag.Bool("debug", false, "enable debug output")
parent = flag.String("parent", "", "parent unit name")
)
func main() {
if !systemd.IsRunningSystemd() {
logrus.Fatal("systemd is required")
}
// Set the flags.
flag.Parse()
if *debug {
logrus.SetLevel(logrus.DebugLevel)
}
if flag.NArg() != 2 {
usage()
}
cmd := flag.Arg(0)
unit := flag.Arg(1)
err := unitCommand(cmd, unit, *parent)
if err != nil {
logrus.Fatal(err)
}
}
func newManager(config *configs.Cgroup) (cgroups.Manager, error) {
if cgroups.IsCgroup2UnifiedMode() {
return systemd.NewUnifiedManager(config, "")
}
return systemd.NewLegacyManager(config, nil)
}
func unitCommand(cmd, name, parent string) error {
podConfig := &configs.Cgroup{
Name: name,
Parent: parent,
Resources: &configs.Resources{},
}
pm, err := newManager(podConfig)
if err != nil {
return err
}
switch cmd {
case "start":
return pm.Apply(-1)
case "stop":
return pm.Destroy()
}
return fmt.Errorf("unknown command: %s", cmd)
}

View File

@ -0,0 +1,70 @@
# Seccomp Agent
## Warning
Please note this is an example agent, as such it is possible that specially
crafted messages can produce bad behaviour. Please use it as an example only.
Also, this agent is used for integration tests. Be aware that changing the
behaviour can break the integration tests.
## Get started
Compile runc and seccompagent:
```bash
make all
```
Run the seccomp agent in the background:
```bash
sudo ./contrib/cmd/seccompagent/seccompagent &
```
Prepare a container:
```bash
mkdir container-seccomp-notify
cd container-seccomp-notify
mkdir rootfs
docker export $(docker create busybox) | tar -C rootfs -xvf -
```
Then, generate a config.json by running the script gen-seccomp-example-cfg.sh
from the directory where this README.md is in the container directory you
prepared earlier (`container-seccomp-notify`).
Then start the container:
```bash
runc run mycontainerid
```
The container will output something like this:
```bash
+ cd /dev/shm
+ mkdir test-dir
+ touch test-file
+ chmod 777 test-file
chmod: changing permissions of 'test-file': No medium found
+ stat /dev/shm/test-dir-foo
File: /dev/shm/test-dir-foo
Size: 40 Blocks: 0 IO Block: 4096 directory
Device: 3eh/62d Inode: 2 Links: 2
Access: (0755/drwxr-xr-x) Uid: ( 0/ root) Gid: ( 0/ root)
Access: 2021-09-09 15:03:13.043716040 +0000
Modify: 2021-09-09 15:03:13.043716040 +0000
Change: 2021-09-09 15:03:13.043716040 +0000
Birth: -
+ ls -l /dev/shm
total 0
drwxr-xr-x 2 root root 40 Sep 9 15:03 test-dir-foo
-rw-r--r-- 1 root root 0 Sep 9 15:03 test-file
+ echo Note the agent added a suffix for the directory name and chmod fails
Note the agent added a suffix for the directory name and chmod fails
```
This shows a simple example that runs in /dev/shm just because it is a tmpfs in
the example config.json.
The agent makes all chmod calls fail with ENOMEDIUM, as the example output shows.
For mkdir, the agent adds a "-foo" suffix: the container runs "mkdir test-dir"
but the directory created is "test-dir-foo".

View File

@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Detect if we are running inside bats (i.e. inside integration tests) or just
# called by an end-user
# bats-core v1.2.1 defines BATS_RUN_TMPDIR
if [ -z "$BATS_RUN_TMPDIR" ]; then
# When not running in bats, we create the config.json
set -e
runc spec
fi
# We can't source $(dirname $0)/../../../tests/integration/helpers.bash as that
# exits when not running inside bats. We can do hacks, but just to redefine
# update_config() seems clearer. We don't even really need to keep them in sync.
function update_config() {
jq "$1" "./config.json" | awk 'BEGIN{RS="";getline<"-";print>ARGV[1]}' "./config.json"
}
update_config '.linux.seccomp = {
"defaultAction": "SCMP_ACT_ALLOW",
"listenerPath": "/run/seccomp-agent.socket",
"listenerMetadata": "foo",
"architectures": [ "SCMP_ARCH_X86", "SCMP_ARCH_X32", "SCMP_ARCH_X86_64" ],
"syscalls": [
{
"names": [ "chmod", "fchmod", "fchmodat", "mkdir" ],
"action": "SCMP_ACT_NOTIFY"
}
]
}'
update_config '.process.args = [
"sh",
"-c",
"set -x; cd /dev/shm; mkdir test-dir; touch test-file; chmod 777 test-file; stat /dev/shm/test-dir-foo && ls -l /dev/shm && echo \"Note the agent added a suffix for the directory name and chmod fails\" "
]'

View File

@ -0,0 +1,291 @@
//go:build linux && seccomp
// +build linux,seccomp
package main
import (
"bytes"
"encoding/json"
"errors"
"flag"
"fmt"
"net"
"os"
"path/filepath"
"strings"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/opencontainers/runtime-spec/specs-go"
libseccomp "github.com/seccomp/libseccomp-golang"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
var (
socketFile string
pidFile string
)
func closeStateFds(recvFds []int) {
for i := range recvFds {
unix.Close(i)
}
}
// parseStateFds returns the seccomp-fd and closes the rest of the fds in recvFds.
// In case of error, no fd is closed.
// StateFds is assumed to be formatted as specs.ContainerProcessState.Fds and
// recvFds the corresponding list of received fds in the same SCM_RIGHT message.
func parseStateFds(stateFds []string, recvFds []int) (uintptr, error) {
// Let's find the index in stateFds of the seccomp-fd.
idx := -1
err := false
for i, name := range stateFds {
if name == specs.SeccompFdName && idx == -1 {
idx = i
continue
}
// We found the seccompFdName twice. Error out!
if name == specs.SeccompFdName && idx != -1 {
err = true
}
}
if idx == -1 || err {
return 0, errors.New("seccomp fd not found or malformed containerProcessState.Fds")
}
if idx >= len(recvFds) || idx < 0 {
return 0, errors.New("seccomp fd index out of range")
}
fd := uintptr(recvFds[idx])
for i := range recvFds {
if i == idx {
continue
}
unix.Close(recvFds[i])
}
return fd, nil
}
func handleNewMessage(sockfd int) (uintptr, string, error) {
const maxNameLen = 4096
stateBuf := make([]byte, maxNameLen)
oobSpace := unix.CmsgSpace(4)
oob := make([]byte, oobSpace)
n, oobn, _, _, err := unix.Recvmsg(sockfd, stateBuf, oob, 0)
if err != nil {
return 0, "", err
}
if n >= maxNameLen || oobn != oobSpace {
return 0, "", fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
}
// Truncate.
stateBuf = stateBuf[:n]
oob = oob[:oobn]
scms, err := unix.ParseSocketControlMessage(oob)
if err != nil {
return 0, "", err
}
if len(scms) != 1 {
return 0, "", fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
}
scm := scms[0]
fds, err := unix.ParseUnixRights(&scm)
if err != nil {
return 0, "", err
}
containerProcessState := &specs.ContainerProcessState{}
err = json.Unmarshal(stateBuf, containerProcessState)
if err != nil {
closeStateFds(fds)
return 0, "", fmt.Errorf("cannot parse OCI state: %w", err)
}
fd, err := parseStateFds(containerProcessState.Fds, fds)
if err != nil {
closeStateFds(fds)
return 0, "", err
}
return fd, containerProcessState.Metadata, nil
}
func readArgString(pid uint32, offset int64) (string, error) {
buffer := make([]byte, 4096) // PATH_MAX
memfd, err := unix.Open(fmt.Sprintf("/proc/%d/mem", pid), unix.O_RDONLY, 0o777)
if err != nil {
return "", err
}
defer unix.Close(memfd)
_, err = unix.Pread(memfd, buffer, offset)
if err != nil {
return "", err
}
buffer[len(buffer)-1] = 0
s := buffer[:bytes.IndexByte(buffer, 0)]
return string(s), nil
}
func runMkdirForContainer(pid uint32, fileName string, mode uint32, metadata string) error {
// We validated before that metadata is not a string that can make
// newFile a file in a different location other than root.
newFile := fmt.Sprintf("%s-%s", fileName, metadata)
root := fmt.Sprintf("/proc/%d/cwd/", pid)
if strings.HasPrefix(fileName, "/") {
// If it starts with /, use the rootfs as base
root = fmt.Sprintf("/proc/%d/root/", pid)
}
path, err := securejoin.SecureJoin(root, newFile)
if err != nil {
return err
}
return unix.Mkdir(path, mode)
}
// notifHandler handles seccomp notifications and responses
func notifHandler(fd libseccomp.ScmpFd, metadata string) {
defer unix.Close(int(fd))
for {
req, err := libseccomp.NotifReceive(fd)
if err != nil {
logrus.Errorf("Error in NotifReceive(): %s", err)
continue
}
syscallName, err := req.Data.Syscall.GetName()
if err != nil {
logrus.Errorf("Error decoding syscall %v(): %s", req.Data.Syscall, err)
continue
}
logrus.Debugf("Received syscall %q, pid %v, arch %q, args %+v", syscallName, req.Pid, req.Data.Arch, req.Data.Args)
resp := &libseccomp.ScmpNotifResp{
ID: req.ID,
Error: 0,
Val: 0,
Flags: libseccomp.NotifRespFlagContinue,
}
// TOCTOU check
if err := libseccomp.NotifIDValid(fd, req.ID); err != nil {
logrus.Errorf("TOCTOU check failed: req.ID is no longer valid: %s", err)
continue
}
switch syscallName {
case "mkdir":
fileName, err := readArgString(req.Pid, int64(req.Data.Args[0]))
if err != nil {
logrus.Errorf("Cannot read argument: %s", err)
resp.Error = int32(unix.ENOSYS)
resp.Val = ^uint64(0) // -1
goto sendResponse
}
logrus.Debugf("mkdir: %q", fileName)
// TOCTOU check
if err := libseccomp.NotifIDValid(fd, req.ID); err != nil {
logrus.Errorf("TOCTOU check failed: req.ID is no longer valid: %s", err)
continue
}
err = runMkdirForContainer(req.Pid, fileName, uint32(req.Data.Args[1]), metadata)
if err != nil {
resp.Error = int32(unix.ENOSYS)
resp.Val = ^uint64(0) // -1
}
resp.Flags = 0
case "chmod", "fchmod", "fchmodat":
resp.Error = int32(unix.ENOMEDIUM)
resp.Val = ^uint64(0) // -1
resp.Flags = 0
}
sendResponse:
if err = libseccomp.NotifRespond(fd, resp); err != nil {
logrus.Errorf("Error in notification response: %s", err)
continue
}
}
}
func main() {
flag.StringVar(&socketFile, "socketfile", "/run/seccomp-agent.socket", "Socket file")
flag.StringVar(&pidFile, "pid-file", "", "Pid file")
logrus.SetLevel(logrus.DebugLevel)
// Parse arguments
flag.Parse()
if flag.NArg() > 0 {
flag.PrintDefaults()
logrus.Fatal("Invalid command")
}
if err := os.Remove(socketFile); err != nil && !errors.Is(err, os.ErrNotExist) {
logrus.Fatalf("Cannot cleanup socket file: %v", err)
}
if pidFile != "" {
pid := fmt.Sprintf("%d", os.Getpid())
if err := os.WriteFile(pidFile, []byte(pid), 0o644); err != nil {
logrus.Fatalf("Cannot write pid file: %v", err)
}
}
logrus.Info("Waiting for seccomp file descriptors")
l, err := net.Listen("unix", socketFile)
if err != nil {
logrus.Fatalf("Cannot listen: %s", err)
}
defer l.Close()
for {
conn, err := l.Accept()
if err != nil {
logrus.Errorf("Cannot accept connection: %s", err)
continue
}
socket, err := conn.(*net.UnixConn).File()
conn.Close()
if err != nil {
logrus.Errorf("Cannot get socket: %v", err)
continue
}
newFd, metadata, err := handleNewMessage(int(socket.Fd()))
socket.Close()
if err != nil {
logrus.Errorf("Error receiving seccomp file descriptor: %v", err)
continue
}
// Make sure we don't allow strings like "/../p", as that means
// a file in a different location than expected. We just want
// safe things to use as a suffix for a file name.
metadata = filepath.Base(metadata)
if strings.Contains(metadata, "/") {
// Fallback to a safe string.
metadata = "agent-generated-suffix"
}
logrus.Infof("Received new seccomp fd: %v", newFd)
go notifHandler(libseccomp.ScmpFd(newFd), metadata)
}
}

View File

@ -0,0 +1,10 @@
//go:build !linux || !seccomp
// +build !linux !seccomp
package main
import "fmt"
func main() {
fmt.Println("Not supported, to use this compile with build tag: seccomp.")
}

View File

@ -113,6 +113,8 @@ __runc_complete_capabilities() {
AUDIT_WRITE
AUDIT_READ
BLOCK_SUSPEND
BPF
CHECKPOINT_RESTORE
CHOWN
DAC_OVERRIDE
DAC_READ_SEARCH
@ -130,6 +132,7 @@ __runc_complete_capabilities() {
NET_BIND_SERVICE
NET_BROADCAST
NET_RAW
PERFMON
SETFCAP
SETGID
SETPCAP
@ -170,6 +173,7 @@ _runc_exec() {
--apparmor
--cap, -c
--preserve-fds
--ignore-paused
"
local all_options="$options_with_args $boolean_options"
@ -221,6 +225,7 @@ _runc_runc() {
--help
--version -v
--debug
--systemd-cgroup
"
local options_with_args="
--log
@ -733,8 +738,6 @@ _runc_update() {
--cpu-share
--cpuset-cpus
--cpuset-mems
--kernel-memory
--kernel-memory-tcp
--memory
--memory-reservation
--memory-swap
@ -769,7 +772,6 @@ _runc() {
delete
events
exec
init
kill
list
pause

View File

@ -1,6 +1,7 @@
package main
import (
"fmt"
"os"
"github.com/urfave/cli"
@ -55,20 +56,12 @@ command(s) that get executed on start, edit the args parameter of the spec. See
if err := checkArgs(context, 1, exactArgs); err != nil {
return err
}
if err := revisePidFile(context); err != nil {
return err
status, err := startContainer(context, CT_ACT_CREATE, nil)
if err == nil {
// exit with the container's exit status so any external supervisor
// is notified of the exit with the correct exit status.
os.Exit(status)
}
spec, err := setupSpec(context)
if err != nil {
return err
}
status, err := startContainer(context, spec, CT_ACT_CREATE, nil)
if err != nil {
return err
}
// exit with the container's exit status so any external supervisor is
// notified of the exit with the correct exit status.
os.Exit(status)
return nil
return fmt.Errorf("runc create failed: %w", err)
},
}

6
debian/changelog vendored
View File

@ -1,3 +1,9 @@
runc (1.1.0-ok1) yangtze; urgency=medium
* Merge new upstream version 1.1.0
-- Luoyaoming <luoyaoming@kylinos.cn> Fri, 30 Dec 2022 11:11:29 +0800
runc (1.0.0~rc10-ok2) yangtze; urgency=medium
* Update version.

View File

@ -0,0 +1,39 @@
From: Dmitry Smirnov <onlyjob@debian.org>
Date: Thu, 28 Jul 2022 16:28:22 +0800
Subject: fix FTBFS on i686
src/github.com/opencontainers/runc/libcontainer/user/user_test.go:448:36: constant 2147483648 overflows int
Last-Update: 2018-06-16
Forwarded: https://github.com/opencontainers/runc/pull/1821
Bug-Upstream: https://github.com/opencontainers/runc/issues/941
---
libcontainer/user/user.go | 2 +-
libcontainer/user/user_test.go | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/libcontainer/user/user.go b/libcontainer/user/user.go
index 7b912bb..38caded 100644
--- a/libcontainer/user/user.go
+++ b/libcontainer/user/user.go
@@ -473,7 +473,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
return nil, fmt.Errorf("Unable to find group %s", ag)
}
// Ensure gid is inside gid range.
- if gid < minId || gid > maxId {
+ if gid < minId || gid >= maxId {
return nil, ErrRange
}
gidMap[gid] = struct{}{}
diff --git a/libcontainer/user/user_test.go b/libcontainer/user/user_test.go
index 24ee559..a4aabdc 100644
--- a/libcontainer/user/user_test.go
+++ b/libcontainer/user/user_test.go
@@ -445,7 +445,7 @@ this is just some garbage data
if utils.GetIntSize() > 4 {
tests = append(tests, foo{
// groups with too large id
- groups: []string{strconv.Itoa(1 << 31)},
+ groups: []string{strconv.Itoa( 1<<31 -1 )},
expected: nil,
hasError: true,
})

48
debian/patches/test--skip-Hugetlb.patch vendored Normal file
View File

@ -0,0 +1,48 @@
From: Dmitry Smirnov <onlyjob@debian.org>
Date: Thu, 28 Jul 2022 16:28:22 +0800
Subject: disabled unreliable tests due to random failures on [ppc64el,
s390x].
Last-Update: 2018-09-27
Forwarded: not-needed
Bug-Upstream: https://github.com/opencontainers/runc/issues/1822
---
libcontainer/cgroups/fs/hugetlb_test.go | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/libcontainer/cgroups/fs/hugetlb_test.go b/libcontainer/cgroups/fs/hugetlb_test.go
index 9ddacfe..9b60650 100644
--- a/libcontainer/cgroups/fs/hugetlb_test.go
+++ b/libcontainer/cgroups/fs/hugetlb_test.go
@@ -89,6 +89,7 @@ func TestHugetlbStats(t *testing.T) {
}
func TestHugetlbStatsNoUsageFile(t *testing.T) {
+t.Skip("Disabled unreliable test")
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
@@ -104,6 +105,7 @@ func TestHugetlbStatsNoUsageFile(t *testing.T) {
}
func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
+t.Skip("Disabled unreliable test")
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
for _, pageSize := range HugePageSizes {
@@ -121,6 +123,7 @@ func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
}
func TestHugetlbStatsBadUsageFile(t *testing.T) {
+t.Skip("Disabled unreliable test")
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
for _, pageSize := range HugePageSizes {
@@ -139,6 +142,7 @@ func TestHugetlbStatsBadUsageFile(t *testing.T) {
}
func TestHugetlbStatsBadMaxUsageFile(t *testing.T) {
+t.Skip("Disabled unreliable test")
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{

View File

@ -0,0 +1,22 @@
From: Dmitry Smirnov <onlyjob@debian.org>
Date: Thu, 28 Jul 2022 16:28:22 +0800
Subject: disable test (requires root)
Last-Update: 2018-06-15
Forwarded: not-needed
---
libcontainer/factory_linux_test.go | 1 +
1 file changed, 1 insertion(+)
diff --git a/libcontainer/factory_linux_test.go b/libcontainer/factory_linux_test.go
index 8d0ca8a..1dc0180 100644
--- a/libcontainer/factory_linux_test.go
+++ b/libcontainer/factory_linux_test.go
@@ -78,6 +78,7 @@ func TestFactoryNewIntelRdt(t *testing.T) {
}
func TestFactoryNewTmpfs(t *testing.T) {
+t.Skip("DM - skipping privileged test")
root, rerr := newTestRoot()
if rerr != nil {
t.Fatal(rerr)

View File

@ -1,12 +1,10 @@
// +build !solaris
package main
import (
"errors"
"fmt"
"os"
"path/filepath"
"syscall"
"time"
"github.com/opencontainers/runc/libcontainer"
@ -19,12 +17,12 @@ func killContainer(container libcontainer.Container) error {
_ = container.Signal(unix.SIGKILL, false)
for i := 0; i < 100; i++ {
time.Sleep(100 * time.Millisecond)
if err := container.Signal(syscall.Signal(0), false); err != nil {
if err := container.Signal(unix.Signal(0), false); err != nil {
destroy(container)
return nil
}
}
return fmt.Errorf("container init still running")
return errors.New("container init still running")
}
var deleteCommand = cli.Command{
@ -55,7 +53,7 @@ status of "ubuntu01" as "stopped" the following will delete resources held for
force := context.Bool("force")
container, err := getContainer(context)
if err != nil {
if lerr, ok := err.(libcontainer.Error); ok && lerr.Code() == libcontainer.ContainerNotExists {
if errors.Is(err, libcontainer.ErrNotExist) {
// if there was an aborted start or something of the sort then the container's directory could exist but
// libcontainer does not see it because the state.json file inside that directory was never created.
path := filepath.Join(context.GlobalString("root"), id)
@ -81,7 +79,7 @@ status of "ubuntu01" as "stopped" the following will delete resources held for
if force {
return killContainer(container)
}
return fmt.Errorf("cannot delete container %s that is not stopped: %s\n", id, s)
return fmt.Errorf("cannot delete container %s that is not stopped: %s", id, s)
}
return nil

BIN
docs/Security-Audit.pdf Normal file

Binary file not shown.

62
docs/cgroup-v2.md Normal file
View File

@ -0,0 +1,62 @@
# cgroup v2
runc fully supports cgroup v2 (unified mode) since v1.0.0-rc93.
To use cgroup v2, you might need to change the configuration of the host init system.
Fedora (>= 31) uses cgroup v2 by default and no extra configuration is required.
On other systemd-based distros, cgroup v2 can be enabled by adding `systemd.unified_cgroup_hierarchy=1` to the kernel cmdline.
## Am I using cgroup v2?
Yes if `/sys/fs/cgroup/cgroup.controllers` is present.
## Host Requirements
### Kernel
* Recommended version: 5.2 or later
* Minimum version: 4.15
Kernel older than 5.2 is not recommended due to lack of freezer.
Notably, kernel older than 4.15 MUST NOT be used (unless you are running containers with user namespaces), as it lacks support for controlling permissions of devices.
### Systemd
On cgroup v2 hosts, it is highly recommended to run runc with the systemd cgroup driver (`runc --systemd-cgroup`), though not mandatory.
The recommended systemd version is 244 or later. Older systemd does not support delegation of `cpuset` controller.
Make sure you also have the `dbus-user-session` (Debian/Ubuntu) or `dbus-daemon` (CentOS/Fedora) package installed, and that `dbus` is running. On Debian-flavored distros, this can be accomplished like so:
```console
$ sudo apt install -y dbus-user-session
$ systemctl --user start dbus
```
## Rootless
On cgroup v2 hosts, rootless runc can talk to systemd to get cgroup permissions to be delegated.
```console
$ runc spec --rootless
$ jq '.linux.cgroupsPath="user.slice:runc:foo"' config.json | sponge config.json
$ runc --systemd-cgroup run foo
```
The container processes are executed in a cgroup like `/user.slice/user-$(id -u).slice/user@$(id -u).service/user.slice/runc-foo.scope`.
### Configuring delegation
Typically, only `memory` and `pids` controllers are delegated to non-root users by default.
```console
$ cat /sys/fs/cgroup/user.slice/user-$(id -u).slice/user@$(id -u).service/cgroup.controllers
memory pids
```
To allow delegation of other controllers, you need to change the systemd configuration as follows:
```console
# mkdir -p /etc/systemd/system/user@.service.d
# cat > /etc/systemd/system/user@.service.d/delegate.conf << EOF
[Service]
Delegate=cpu cpuset io memory pids
EOF
# systemctl daemon-reload
```

11
docs/experimental.md Normal file
View File

@ -0,0 +1,11 @@
# Experimental features
The following features are experimental and subject to change:
- The `runc features` command (since runc v1.1.0)
The following features were experimental in the past:
Feature | Experimental release | Graduation release
---------------------------------------- | -------------------- | ------------------
cgroup v2 | v1.0.0-rc91 | v1.0.0-rc93

130
docs/systemd.md Normal file
View File

@ -0,0 +1,130 @@
## systemd cgroup driver
By default, runc creates cgroups and sets cgroup limits on its own (this mode
is known as fs cgroup driver). When `--systemd-cgroup` global option is given
(as in e.g. `runc --systemd-cgroup run ...`), runc switches to systemd cgroup
driver. This document describes its features and peculiarities.
### systemd unit name and placement
When creating a container, runc requests systemd (over dbus) to create
a transient unit for the container, and place it into a specified slice.
The name of the unit and the containing slice is derived from the container
runtime spec in the following way:
1. If `Linux.CgroupsPath` is set, it is expected to be in the form
`[slice]:[prefix]:[name]`.
Here `slice` is a systemd slice under which the container is placed.
If empty, it defaults to `system.slice`, except when cgroup v2 is
used and rootless container is created, in which case it defaults
to `user.slice`.
Note that `slice` can contain dashes to denote a sub-slice
(e.g. `user-1000.slice` is a correct notation, meaning a subslice
of `user.slice`), but it must not contain slashes (e.g.
`user.slice/user-1000.slice` is invalid).
A `slice` of `-` represents a root slice.
Next, `prefix` and `name` are used to compose the unit name, which
is `<prefix>-<name>.scope`, unless `name` has `.slice` suffix, in
which case `prefix` is ignored and the `name` is used as is.
2. If `Linux.CgroupsPath` is not set or empty, it works the same way as if it
would be set to `:runc:<container-id>`. See the description above to see
what it transforms to.
As described above, a unit being created can either be a scope or a slice.
For a scope, runc specifies its parent slice via a _Slice=_ systemd property,
and also sets _Delegate=true_. For a slice, runc specifies a weak dependency on
the parent slice via a _Wants=_ property.
### Resource limits
runc always enables accounting for all controllers, regardless of any limits
being set. This means it unconditionally sets the following properties for the
systemd unit being created:
* _CPUAccounting=true_
* _IOAccounting=true_ (_BlockIOAccounting_ for cgroup v1)
* _MemoryAccounting=true_
* _TasksAccounting=true_
The resource limits of the systemd unit are set by runc by translating the
runtime spec resources to systemd unit properties.
Such translation is by no means complete, as there are some cgroup properties
that can not be set via systemd. Therefore, runc systemd cgroup driver is
backed by fs driver (in other words, cgroup limits are first set via systemd
unit properties, and when by writing to cgroupfs files).
The set of runtime spec resources which is translated by runc to systemd unit
properties depends on kernel cgroup version being used (v1 or v2), and on the
systemd version being run. If an older systemd version (which does not support
some resources) is used, runc do not set those resources.
The following tables summarize which properties are translated.
#### cgroup v1
| runtime spec resource | systemd property name | min systemd version |
|-----------------------|-----------------------|---------------------|
| memory.limit | MemoryLimit | |
| cpu.shares | CPUShares | |
| blockIO.weight | BlockIOWeight | |
| pids.limit | TasksMax | |
| cpu.cpus | AllowedCPUs | v244 |
| cpu.mems | AllowedMemoryNodes | v244 |
#### cgroup v2
| runtime spec resource | systemd property name | min systemd version |
|-------------------------|-----------------------|---------------------|
| memory.limit | MemoryMax | |
| memory.reservation | MemoryLow | |
| memory.swap | MemorySwapMax | |
| cpu.shares | CPUWeight | |
| pids.limit | TasksMax | |
| cpu.cpus | AllowedCPUs | v244 |
| cpu.mems | AllowedMemoryNodes | v244 |
| unified.cpu.max | CPUQuota, CPUQuotaPeriodSec | v242 |
| unified.cpu.weight | CPUWeight | |
| unified.cpuset.cpus | AllowedCPUs | v244 |
| unified.cpuset.mems | AllowedMemoryNodes | v244 |
| unified.memory.high | MemoryHigh | |
| unified.memory.low | MemoryLow | |
| unified.memory.min | MemoryMin | |
| unified.memory.max | MemoryMax | |
| unified.memory.swap.max | MemorySwapMax | |
| unified.pids.max | TasksMax | |
For documentation on systemd unit resource properties, see
`systemd.resource-control(5)` man page.
### Auxiliary properties
Auxiliary properties of a systemd unit (as shown by `systemctl show
<unit-name>` after the container is created) can be set (or overwritten) by
adding annotations to the container runtime spec (`config.json`).
For example:
```json
"annotations": {
"org.systemd.property.TimeoutStopUSec": "uint64 123456789",
"org.systemd.property.CollectMode":"'inactive-or-failed'"
},
```
The above will set the following properties:
* `TimeoutStopSec` to 2 minutes and 3 seconds;
* `CollectMode` to "inactive-or-failed".
The values must be in the gvariant format (for details, see
[gvariant documentation](https://developer.gnome.org/glib/stable/gvariant-text.html)).
To find out which type systemd expects for a particular parameter, please
consult systemd sources.

View File

@ -113,6 +113,33 @@ interact with pseudo-terminal `stdio`][tty_ioctl(4)].
> means that it is not really possible to uniquely distinguish between `stdout`
> and `stderr` from the caller's perspective.
#### Issues
If you see an error like
```
open /dev/tty: no such device or address
```
from runc, it means it can't open a terminal (because there isn't one). This
can happen when stdin (and possibly also stdout and stderr) are redirected,
or in some environments that lack a tty (such as GitHub Actions runners).
The solution to this is to *not* use a terminal for the container, i.e. have
`terminal: false` in `config.json`. If the container really needs a terminal
(some programs require one), you can provide one, using one of the following
methods.
One way is to use `ssh` with the `-tt` flag. The second `t` forces a terminal
allocation even if there's no local one -- and so it is required when stdin is
not a terminal (some `ssh` implementations only look for a terminal on stdin).
Another way is to run runc under the `script` utility, like this
```console
$ script -e -c 'runc run <container>'
```
[tty_ioctl(4)]: https://linux.die.net/man/4/tty_ioctl
### <a name="pass-through"> Pass-Through ###
@ -124,7 +151,7 @@ passing of file descriptors -- [details below](#runc-modes)). As an example
(assuming that `terminal: false` is set in `config.json`):
```
% echo input | runc run some_container > /tmp/log.out 2>& /tmp/log.err
% echo input | runc run some_container > /tmp/log.out 2> /tmp/log.err
```
Here the container's various `stdio` file descriptors will be substituted with
@ -228,6 +255,19 @@ Unfortunately using detached mode is a bit more complicated and requires more
care than the foreground mode -- mainly because it is now up to the caller to
handle the `stdio` of the container.
Another complication is that the parent process is responsible for acting as
the subreaper for the container. In short, you need to call
`prctl(PR_SET_CHILD_SUBREAPER, 1, ...)` in the parent process and correctly
handle the implications of being a subreaper. Failing to do so may result in
zombie processes being accumulated on your host.
These tasks are usually performed by a dedicated (and minimal) monitor process
per-container. For the sake of comparison, other runtimes such as LXC do not
have an equivalent detached mode and instead integrate this monitor process
into the container runtime itself -- this has several tradeoffs, and runc has
opted to support delegating the monitoring responsibility to the parent process
through this detached mode.
#### Detached Pass-Through ####
In detached mode, pass-through actually does what it says on the tin -- the

View File

@ -1,9 +1,8 @@
// +build linux
package main
import (
"encoding/json"
"errors"
"fmt"
"os"
"sync"
@ -40,7 +39,7 @@ information is displayed once every 5 seconds.`,
}
duration := context.Duration("interval")
if duration <= 0 {
return fmt.Errorf("duration interval must be greater than 0")
return errors.New("duration interval must be greater than 0")
}
status, err := container.Status()
if err != nil {
@ -125,10 +124,14 @@ func convertLibcontainerStats(ls *libcontainer.Stats) *types.Stats {
s.CPU.Usage.User = cg.CpuStats.CpuUsage.UsageInUsermode
s.CPU.Usage.Total = cg.CpuStats.CpuUsage.TotalUsage
s.CPU.Usage.Percpu = cg.CpuStats.CpuUsage.PercpuUsage
s.CPU.Usage.PercpuKernel = cg.CpuStats.CpuUsage.PercpuUsageInKernelmode
s.CPU.Usage.PercpuUser = cg.CpuStats.CpuUsage.PercpuUsageInUsermode
s.CPU.Throttling.Periods = cg.CpuStats.ThrottlingData.Periods
s.CPU.Throttling.ThrottledPeriods = cg.CpuStats.ThrottlingData.ThrottledPeriods
s.CPU.Throttling.ThrottledTime = cg.CpuStats.ThrottlingData.ThrottledTime
s.CPUSet = types.CPUSet(cg.CPUSetStats)
s.Memory.Cache = cg.MemoryStats.Cache
s.Memory.Kernel = convertMemoryEntry(cg.MemoryStats.KernelUsage)
s.Memory.KernelTCP = convertMemoryEntry(cg.MemoryStats.KernelTCPUsage)
@ -151,16 +154,22 @@ func convertLibcontainerStats(ls *libcontainer.Stats) *types.Stats {
}
if is := ls.IntelRdtStats; is != nil {
if intelrdt.IsCatEnabled() {
if intelrdt.IsCATEnabled() {
s.IntelRdt.L3CacheInfo = convertL3CacheInfo(is.L3CacheInfo)
s.IntelRdt.L3CacheSchemaRoot = is.L3CacheSchemaRoot
s.IntelRdt.L3CacheSchema = is.L3CacheSchema
}
if intelrdt.IsMbaEnabled() {
if intelrdt.IsMBAEnabled() {
s.IntelRdt.MemBwInfo = convertMemBwInfo(is.MemBwInfo)
s.IntelRdt.MemBwSchemaRoot = is.MemBwSchemaRoot
s.IntelRdt.MemBwSchema = is.MemBwSchema
}
if intelrdt.IsMBMEnabled() {
s.IntelRdt.MBMStats = is.MBMStats
}
if intelrdt.IsCMTEnabled() {
s.IntelRdt.CMTStats = is.CMTStats
}
}
s.NetworkInterfaces = ls.Interfaces
@ -187,29 +196,17 @@ func convertMemoryEntry(c cgroups.MemoryData) types.MemoryEntry {
func convertBlkioEntry(c []cgroups.BlkioStatEntry) []types.BlkioEntry {
var out []types.BlkioEntry
for _, e := range c {
out = append(out, types.BlkioEntry{
Major: e.Major,
Minor: e.Minor,
Op: e.Op,
Value: e.Value,
})
out = append(out, types.BlkioEntry(e))
}
return out
}
func convertL3CacheInfo(i *intelrdt.L3CacheInfo) *types.L3CacheInfo {
return &types.L3CacheInfo{
CbmMask: i.CbmMask,
MinCbmBits: i.MinCbmBits,
NumClosids: i.NumClosids,
}
ci := types.L3CacheInfo(*i)
return &ci
}
func convertMemBwInfo(i *intelrdt.MemBwInfo) *types.MemBwInfo {
return &types.MemBwInfo{
BandwidthGran: i.BandwidthGran,
DelayLinear: i.DelayLinear,
MinBandwidth: i.MinBandwidth,
NumClosids: i.NumClosids,
}
mi := types.MemBwInfo(*i)
return &mi
}

68
exec.go
View File

@ -1,9 +1,8 @@
// +build linux
package main
import (
"encoding/json"
"errors"
"fmt"
"os"
"strconv"
@ -84,15 +83,18 @@ following will output a list of processes running in the container:
Value: &cli.StringSlice{},
Usage: "add a capability to the bounding set for the process",
},
cli.BoolFlag{
Name: "no-subreaper",
Usage: "disable the use of the subreaper used to reap reparented processes",
Hidden: true,
},
cli.IntFlag{
Name: "preserve-fds",
Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
},
cli.StringSliceFlag{
Name: "cgroup",
Usage: "run the process in an (existing) sub-cgroup(s). Format is [<controller>:]<cgroup>.",
},
cli.BoolFlag{
Name: "ignore-paused",
Usage: "allow exec in a paused container",
},
},
Action: func(context *cli.Context) error {
if err := checkArgs(context, 1, minArgs); err != nil {
@ -105,11 +107,38 @@ following will output a list of processes running in the container:
if err == nil {
os.Exit(status)
}
return fmt.Errorf("exec failed: %v", err)
fatalWithCode(fmt.Errorf("exec failed: %w", err), 255)
return nil // to satisfy the linter
},
SkipArgReorder: true,
}
func getSubCgroupPaths(args []string) (map[string]string, error) {
if len(args) == 0 {
return nil, nil
}
paths := make(map[string]string, len(args))
for _, c := range args {
// Split into controller:path.
cs := strings.SplitN(c, ":", 3)
if len(cs) > 2 {
return nil, fmt.Errorf("invalid --cgroup argument: %s", c)
}
if len(cs) == 1 { // no controller: prefix
if len(args) != 1 {
return nil, fmt.Errorf("invalid --cgroup argument: %s (missing <controller>: prefix)", c)
}
paths[""] = c
} else {
// There may be a few comma-separated controllers.
for _, ctrl := range strings.Split(cs[0], ",") {
paths[ctrl] = cs[1]
}
}
}
return paths, nil
}
func execProcess(context *cli.Context) (int, error) {
container, err := getContainer(context)
if err != nil {
@ -120,13 +149,15 @@ func execProcess(context *cli.Context) (int, error) {
return -1, err
}
if status == libcontainer.Stopped {
return -1, fmt.Errorf("cannot exec a container that has stopped")
return -1, errors.New("cannot exec in a stopped container")
}
if status == libcontainer.Paused && !context.Bool("ignore-paused") {
return -1, errors.New("cannot exec in a paused container (use --ignore-paused to override)")
}
path := context.String("process")
if path == "" && len(context.Args()) == 1 {
return -1, fmt.Errorf("process args cannot be empty")
return -1, errors.New("process args cannot be empty")
}
detach := context.Bool("detach")
state, err := container.State()
if err != nil {
return -1, err
@ -137,9 +168,9 @@ func execProcess(context *cli.Context) (int, error) {
return -1, err
}
logLevel := "info"
if context.GlobalBool("debug") {
logLevel = "debug"
cgPaths, err := getSubCgroupPaths(context.StringSlice("cgroup"))
if err != nil {
return -1, err
}
r := &runner{
@ -147,12 +178,12 @@ func execProcess(context *cli.Context) (int, error) {
shouldDestroy: false,
container: container,
consoleSocket: context.String("console-socket"),
detach: detach,
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
action: CT_ACT_RUN,
init: false,
preserveFDs: context.Int("preserve-fds"),
logLevel: logLevel,
subCgroupPaths: cgPaths,
}
return r.run(p)
}
@ -203,6 +234,7 @@ func getProcess(context *cli.Context, bundle string) (*specs.Process, error) {
p.Env = append(p.Env, context.StringSlice("env")...)
// set the tty
p.Terminal = false
if context.IsSet("tty") {
p.Terminal = context.Bool("tty")
}
@ -215,13 +247,13 @@ func getProcess(context *cli.Context, bundle string) (*specs.Process, error) {
if len(u) > 1 {
gid, err := strconv.Atoi(u[1])
if err != nil {
return nil, fmt.Errorf("parsing %s as int for gid failed: %v", u[1], err)
return nil, fmt.Errorf("parsing %s as int for gid failed: %w", u[1], err)
}
p.User.GID = uint32(gid)
}
uid, err := strconv.Atoi(u[0])
if err != nil {
return nil, fmt.Errorf("parsing %s as int for uid failed: %v", u[0], err)
return nil, fmt.Errorf("parsing %s as int for uid failed: %w", u[0], err)
}
p.User.UID = uint32(uid)
}

75
features.go Normal file
View File

@ -0,0 +1,75 @@
package main
import (
"encoding/json"
"fmt"
"github.com/opencontainers/runc/libcontainer/capabilities"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/specconv"
"github.com/opencontainers/runc/types/features"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/urfave/cli"
)
var featuresCommand = cli.Command{
Name: "features",
Usage: "show the enabled features",
ArgsUsage: "",
Description: `Show the enabled features.
The result is parsable as a JSON.
See https://pkg.go.dev/github.com/opencontainers/runc/types/features for the type definition.
The types are experimental and subject to change.
`,
Action: func(context *cli.Context) error {
if err := checkArgs(context, 0, exactArgs); err != nil {
return err
}
tru := true
feat := features.Features{
OCIVersionMin: "1.0.0",
OCIVersionMax: specs.Version,
Annotations: map[string]string{
features.AnnotationRuncVersion: version,
features.AnnotationRuncCommit: gitCommit,
features.AnnotationRuncCheckpointEnabled: "true",
},
Hooks: configs.KnownHookNames(),
MountOptions: specconv.KnownMountOptions(),
Linux: &features.Linux{
Namespaces: specconv.KnownNamespaces(),
Capabilities: capabilities.KnownCapabilities(),
Cgroup: &features.Cgroup{
V1: &tru,
V2: &tru,
Systemd: &tru,
SystemdUser: &tru,
},
Apparmor: &features.Apparmor{
Enabled: &tru,
},
Selinux: &features.Selinux{
Enabled: &tru,
},
},
}
if seccomp.Enabled {
feat.Linux.Seccomp = &features.Seccomp{
Enabled: &tru,
Actions: seccomp.KnownActions(),
Operators: seccomp.KnownOperators(),
Archs: seccomp.KnownArchs(),
}
major, minor, patch := seccomp.Version()
feat.Annotations[features.AnnotationLibseccompVersion] = fmt.Sprintf("%d.%d.%d", major, minor, patch)
}
enc := json.NewEncoder(context.App.Writer)
enc.SetIndent("", " ")
return enc.Encode(feat)
},
}

26
go.mod Normal file
View File

@ -0,0 +1,26 @@
module github.com/opencontainers/runc
go 1.16
require (
github.com/checkpoint-restore/go-criu/v5 v5.3.0
github.com/cilium/ebpf v0.7.0
github.com/containerd/console v1.0.3
github.com/coreos/go-systemd/v22 v22.3.2
github.com/cyphar/filepath-securejoin v0.2.3
github.com/docker/go-units v0.4.0
github.com/godbus/dbus/v5 v5.0.6
github.com/moby/sys/mountinfo v0.5.0
github.com/mrunalp/fileutils v0.5.0
github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
github.com/opencontainers/selinux v1.10.0
github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921
github.com/sirupsen/logrus v1.8.1
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
// NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092
github.com/urfave/cli v1.22.1
github.com/vishvananda/netlink v1.1.0
golang.org/x/net v0.0.0-20201224014010-6772e930b67b
golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c
google.golang.org/protobuf v1.27.1
)

80
go.sum Normal file
View File

@ -0,0 +1,80 @@
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/checkpoint-restore/go-criu/v5 v5.3.0 h1:wpFFOoomK3389ue2lAb0Boag6XPht5QYpipxmSNL4d8=
github.com/checkpoint-restore/go-criu/v5 v5.3.0/go.mod h1:E/eQpaFtUKGOOSEBZgmKAcn+zUUwWxqcaKZlF54wK8E=
github.com/cilium/ebpf v0.7.0 h1:1k/q3ATgxSXRdrmPfH8d7YK0GfqVsEKZAX9dQZvs56k=
github.com/cilium/ebpf v0.7.0/go.mod h1:/oI2+1shJiTGAMgl6/RgJr36Eo1jzrRcAWbcXO2usCA=
github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw=
github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI=
github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI=
github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw=
github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY=
github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/godbus/dbus/v5 v5.0.6 h1:mkgN1ofwASrYnJ5W6U/BxG15eXXXjirgZc7CLqkcaro=
github.com/godbus/dbus/v5 v5.0.6/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9ObI=
github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU=
github.com/mrunalp/fileutils v0.5.0 h1:NKzVxiH7eSk+OQ4M+ZYW1K6h27RUV3MI6NUTsHhU6Z4=
github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 h1:3snG66yBm59tKhhSPQrQ/0bCrv1LQbKt40LnUPiUxdc=
github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU=
github.com/opencontainers/selinux v1.10.0/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921 h1:58EBmR2dMNL2n/FnbQewK3D14nXr0V9CObDSvMJLq+Y=
github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg=
github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE=
github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI=
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
github.com/urfave/cli v1.22.1 h1:+mkCCcOFKPnCmVYVcURKps1Xe+3zP90gSYGNfRkjoIY=
github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJH8j0=
github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE=
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k=
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU=
golang.org/x/net v0.0.0-20201224014010-6772e930b67b h1:iFwSg7t5GZmB/Q5TjiEAsdoLDrdJRC1RiF2WhuV29Qw=
golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c h1:DHcbWVXeY+0Y8HHKR+rbLwnoh2F4tNCY7rTiHJ30RmA=
golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.27.1 h1:SnqbnDw1V7RiZcXPx5MEeqPv2s79L9i7BJUlG/+RurQ=
google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

31
init.go
View File

@ -1,44 +1,37 @@
package main
import (
"fmt"
"os"
"runtime"
"strconv"
"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/logs"
_ "github.com/opencontainers/runc/libcontainer/nsenter"
"github.com/sirupsen/logrus"
"github.com/urfave/cli"
)
func init() {
if len(os.Args) > 1 && os.Args[1] == "init" {
// This is the golang entry point for runc init, executed
// before main() but after libcontainer/nsenter's nsexec().
runtime.GOMAXPROCS(1)
runtime.LockOSThread()
level := os.Getenv("_LIBCONTAINER_LOGLEVEL")
logLevel, err := logrus.ParseLevel(level)
level, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGLEVEL"))
if err != nil {
panic(fmt.Sprintf("libcontainer: failed to parse log level: %q: %v", level, err))
panic(err)
}
err = logs.ConfigureLogging(logs.Config{
LogPipeFd: os.Getenv("_LIBCONTAINER_LOGPIPE"),
LogFormat: "json",
LogLevel: logLevel,
})
logPipeFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGPIPE"))
if err != nil {
panic(fmt.Sprintf("libcontainer: failed to configure logging: %v", err))
panic(err)
}
logrus.SetLevel(logrus.Level(level))
logrus.SetOutput(os.NewFile(uintptr(logPipeFd), "logpipe"))
logrus.SetFormatter(new(logrus.JSONFormatter))
logrus.Debug("child process in init()")
}
}
var initCommand = cli.Command{
Name: "init",
Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
Action: func(context *cli.Context) error {
factory, _ := libcontainer.New("")
if err := factory.StartInitialization(); err != nil {
// as the error is sent back to the parent there is no need to log
@ -46,5 +39,5 @@ var initCommand = cli.Command{
os.Exit(1)
}
panic("libcontainer: container init failed to exec")
},
}
}

16
kill.go
View File

@ -1,14 +1,12 @@
// +build linux
package main
import (
"fmt"
"strconv"
"strings"
"syscall"
"github.com/urfave/cli"
"golang.org/x/sys/unix"
)
var killCommand = cli.Command{
@ -55,13 +53,17 @@ signal to the init process of the "ubuntu01" container:
},
}
func parseSignal(rawSignal string) (syscall.Signal, error) {
func parseSignal(rawSignal string) (unix.Signal, error) {
s, err := strconv.Atoi(rawSignal)
if err == nil {
return syscall.Signal(s), nil
return unix.Signal(s), nil
}
signal, ok := signalMap[strings.TrimPrefix(strings.ToUpper(rawSignal), "SIG")]
if !ok {
sig := strings.ToUpper(rawSignal)
if !strings.HasPrefix(sig, "SIG") {
sig = "SIG" + sig
}
signal := unix.SignalNum(sig)
if signal == 0 {
return -1, fmt.Errorf("unknown signal %q", rawSignal)
}
return signal, nil

View File

@ -57,90 +57,94 @@ struct describing how the container is to be created. A sample would look simila
```go
defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
var devices []*configs.DeviceRule
for _, device := range specconv.AllowedDevices {
devices = append(devices, &device.Rule)
}
config := &configs.Config{
Rootfs: "/your/path/to/rootfs",
Capabilities: &configs.Capabilities{
Bounding: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Effective: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Inheritable: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Permitted: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Ambient: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
},
Bounding: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Effective: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Inheritable: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Permitted: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Ambient: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
},
Namespaces: configs.Namespaces([]configs.Namespace{
{Type: configs.NEWNS},
{Type: configs.NEWUTS},
@ -155,8 +159,7 @@ config := &configs.Config{
Parent: "system",
Resources: &configs.Resources{
MemorySwappiness: nil,
AllowAllDevices: nil,
AllowedDevices: configs.DefaultAllowedDevices,
Devices: devices,
},
},
MaskPaths: []string{
@ -166,7 +169,7 @@ config := &configs.Config{
ReadonlyPaths: []string{
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
},
Devices: configs.DefaultAutoCreatedDevices,
Devices: specconv.AllowedDevices,
Hostname: "testing",
Mounts: []*configs.Mount{
{
@ -314,7 +317,7 @@ state, err := container.State()
#### Checkpoint & Restore
libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
This let's you save the state of a process running inside a container to disk, and then restore
This lets you save the state of a process running inside a container to disk, and then restore
that state into a new process, on the same machine or on another machine.
`criu` version 1.5.2 or higher is required to use checkpoint and restore.

View File

@ -1,60 +1,16 @@
// +build apparmor,linux
package apparmor
import (
"fmt"
"io/ioutil"
"os"
import "errors"
"github.com/opencontainers/runc/libcontainer/utils"
var (
// IsEnabled returns true if apparmor is enabled for the host.
IsEnabled = isEnabled
// ApplyProfile will apply the profile with the specified name to the process after
// the next exec. It is only supported on Linux and produces an ErrApparmorNotEnabled
// on other platforms.
ApplyProfile = applyProfile
// ErrApparmorNotEnabled indicates that AppArmor is not enabled or not supported.
ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
)
// IsEnabled returns true if apparmor is enabled for the host.
func IsEnabled() bool {
if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" {
if _, err = os.Stat("/sbin/apparmor_parser"); err == nil {
buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
return err == nil && len(buf) > 1 && buf[0] == 'Y'
}
}
return false
}
func setProcAttr(attr, value string) error {
// Under AppArmor you can only change your own attr, so use /proc/self/
// instead of /proc/<tid>/ like libapparmor does
path := fmt.Sprintf("/proc/self/attr/%s", attr)
f, err := os.OpenFile(path, os.O_WRONLY, 0)
if err != nil {
return err
}
defer f.Close()
if err := utils.EnsureProcHandle(f); err != nil {
return err
}
_, err = fmt.Fprintf(f, "%s", value)
return err
}
// changeOnExec reimplements aa_change_onexec from libapparmor in Go
func changeOnExec(name string) error {
value := "exec " + name
if err := setProcAttr("exec", value); err != nil {
return fmt.Errorf("apparmor failed to apply profile: %s", err)
}
return nil
}
// ApplyProfile will apply the profile with the specified name to the process after
// the next exec.
func ApplyProfile(name string) error {
if name == "" {
return nil
}
return changeOnExec(name)
}

View File

@ -1,20 +0,0 @@
// +build !apparmor !linux
package apparmor
import (
"errors"
)
var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
func IsEnabled() bool {
return false
}
func ApplyProfile(name string) error {
if name != "" {
return ErrApparmorNotEnabled
}
return nil
}

View File

@ -0,0 +1,68 @@
package apparmor
import (
"errors"
"fmt"
"os"
"sync"
"github.com/opencontainers/runc/libcontainer/utils"
)
var (
appArmorEnabled bool
checkAppArmor sync.Once
)
// isEnabled returns true if apparmor is enabled for the host.
func isEnabled() bool {
checkAppArmor.Do(func() {
if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil {
buf, err := os.ReadFile("/sys/module/apparmor/parameters/enabled")
appArmorEnabled = err == nil && len(buf) > 1 && buf[0] == 'Y'
}
})
return appArmorEnabled
}
func setProcAttr(attr, value string) error {
// Under AppArmor you can only change your own attr, so use /proc/self/
// instead of /proc/<tid>/ like libapparmor does
attrPath := "/proc/self/attr/apparmor/" + attr
if _, err := os.Stat(attrPath); errors.Is(err, os.ErrNotExist) {
// fall back to the old convention
attrPath = "/proc/self/attr/" + attr
}
f, err := os.OpenFile(attrPath, os.O_WRONLY, 0)
if err != nil {
return err
}
defer f.Close()
if err := utils.EnsureProcHandle(f); err != nil {
return err
}
_, err = f.WriteString(value)
return err
}
// changeOnExec reimplements aa_change_onexec from libapparmor in Go
func changeOnExec(name string) error {
if err := setProcAttr("exec", "exec "+name); err != nil {
return fmt.Errorf("apparmor failed to apply profile: %w", err)
}
return nil
}
// applyProfile will apply the profile with the specified name to the process after
// the next exec. It is only supported on Linux and produces an error on other
// platforms.
func applyProfile(name string) error {
if name == "" {
return nil
}
return changeOnExec(name)
}

View File

@ -0,0 +1,15 @@
//go:build !linux
// +build !linux
package apparmor
func isEnabled() bool {
return false
}
func applyProfile(name string) error {
if name != "" {
return ErrApparmorNotEnabled
}
return nil
}

View File

@ -0,0 +1,123 @@
//go:build linux
// +build linux
package capabilities
import (
"sort"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
"github.com/syndtr/gocapability/capability"
)
const allCapabilityTypes = capability.CAPS | capability.BOUNDING | capability.AMBIENT
var (
capabilityMap map[string]capability.Cap
capTypes = []capability.CapType{
capability.BOUNDING,
capability.PERMITTED,
capability.INHERITABLE,
capability.EFFECTIVE,
capability.AMBIENT,
}
)
func init() {
capabilityMap = make(map[string]capability.Cap, capability.CAP_LAST_CAP+1)
for _, c := range capability.List() {
if c > capability.CAP_LAST_CAP {
continue
}
capabilityMap["CAP_"+strings.ToUpper(c.String())] = c
}
}
// KnownCapabilities returns the list of the known capabilities.
// Used by `runc features`.
func KnownCapabilities() []string {
list := capability.List()
res := make([]string, len(list))
for i, c := range list {
res[i] = "CAP_" + strings.ToUpper(c.String())
}
return res
}
// New creates a new Caps from the given Capabilities config. Unknown Capabilities
// or Capabilities that are unavailable in the current environment are ignored,
// printing a warning instead.
func New(capConfig *configs.Capabilities) (*Caps, error) {
var (
err error
c Caps
)
unknownCaps := make(map[string]struct{})
c.caps = map[capability.CapType][]capability.Cap{
capability.BOUNDING: capSlice(capConfig.Bounding, unknownCaps),
capability.EFFECTIVE: capSlice(capConfig.Effective, unknownCaps),
capability.INHERITABLE: capSlice(capConfig.Inheritable, unknownCaps),
capability.PERMITTED: capSlice(capConfig.Permitted, unknownCaps),
capability.AMBIENT: capSlice(capConfig.Ambient, unknownCaps),
}
if c.pid, err = capability.NewPid2(0); err != nil {
return nil, err
}
if err = c.pid.Load(); err != nil {
return nil, err
}
if len(unknownCaps) > 0 {
logrus.Warn("ignoring unknown or unavailable capabilities: ", mapKeys(unknownCaps))
}
return &c, nil
}
// capSlice converts the slice of capability names in caps, to their numeric
// equivalent, and returns them as a slice. Unknown or unavailable capabilities
// are not returned, but appended to unknownCaps.
func capSlice(caps []string, unknownCaps map[string]struct{}) []capability.Cap {
var out []capability.Cap
for _, c := range caps {
if v, ok := capabilityMap[c]; !ok {
unknownCaps[c] = struct{}{}
} else {
out = append(out, v)
}
}
return out
}
// mapKeys returns the keys of input in sorted order
func mapKeys(input map[string]struct{}) []string {
var keys []string
for c := range input {
keys = append(keys, c)
}
sort.Strings(keys)
return keys
}
// Caps holds the capabilities for a container.
type Caps struct {
pid capability.Capabilities
caps map[capability.CapType][]capability.Cap
}
// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
func (c *Caps) ApplyBoundingSet() error {
c.pid.Clear(capability.BOUNDING)
c.pid.Set(capability.BOUNDING, c.caps[capability.BOUNDING]...)
return c.pid.Apply(capability.BOUNDING)
}
// Apply sets all the capabilities for the current process in the config.
func (c *Caps) ApplyCaps() error {
c.pid.Clear(allCapabilityTypes)
for _, g := range capTypes {
c.pid.Set(g, c.caps[g]...)
}
return c.pid.Apply(allCapabilityTypes)
}

View File

@ -0,0 +1,71 @@
package capabilities
import (
"io"
"os"
"testing"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
"github.com/sirupsen/logrus/hooks/test"
"github.com/syndtr/gocapability/capability"
)
func TestNew(t *testing.T) {
cs := []string{"CAP_CHOWN", "CAP_UNKNOWN", "CAP_UNKNOWN2"}
conf := configs.Capabilities{
Bounding: cs,
Effective: cs,
Inheritable: cs,
Permitted: cs,
Ambient: cs,
}
hook := test.NewGlobal()
defer hook.Reset()
logrus.SetOutput(io.Discard)
caps, err := New(&conf)
logrus.SetOutput(os.Stderr)
if err != nil {
t.Error(err)
}
e := hook.AllEntries()
if len(e) != 1 {
t.Errorf("expected 1 warning, got %d", len(e))
}
expectedLogs := logrus.Entry{
Level: logrus.WarnLevel,
Message: "ignoring unknown or unavailable capabilities: [CAP_UNKNOWN CAP_UNKNOWN2]",
}
l := hook.LastEntry()
if l == nil {
t.Fatal("expected a warning, but got none")
}
if l.Level != expectedLogs.Level {
t.Errorf("expected %q, got %q", expectedLogs.Level, l.Level)
}
if l.Message != expectedLogs.Message {
t.Errorf("expected %q, got %q", expectedLogs.Message, l.Message)
}
if len(caps.caps) != len(capTypes) {
t.Errorf("expected %d capability types, got %d: %v", len(capTypes), len(caps.caps), caps.caps)
}
for _, cType := range capTypes {
if i := len(caps.caps[cType]); i != 1 {
t.Errorf("expected 1 capability for %s, got %d: %v", cType, i, caps.caps[cType])
continue
}
if caps.caps[cType][0] != capability.CAP_CHOWN {
t.Errorf("expected CAP_CHOWN, got %s: ", caps.caps[cType][0])
continue
}
}
hook.Reset()
}

View File

@ -0,0 +1,4 @@
//go:build !linux
// +build !linux
package capabilities

View File

@ -1,117 +0,0 @@
// +build linux
package libcontainer
import (
"fmt"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/syndtr/gocapability/capability"
)
const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS
var capabilityMap map[string]capability.Cap
func init() {
capabilityMap = make(map[string]capability.Cap)
last := capability.CAP_LAST_CAP
// workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
if last == capability.Cap(63) {
last = capability.CAP_BLOCK_SUSPEND
}
for _, cap := range capability.List() {
if cap > last {
continue
}
capKey := fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String()))
capabilityMap[capKey] = cap
}
}
func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) {
bounding := []capability.Cap{}
for _, c := range capConfig.Bounding {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
bounding = append(bounding, v)
}
effective := []capability.Cap{}
for _, c := range capConfig.Effective {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
effective = append(effective, v)
}
inheritable := []capability.Cap{}
for _, c := range capConfig.Inheritable {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
inheritable = append(inheritable, v)
}
permitted := []capability.Cap{}
for _, c := range capConfig.Permitted {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
permitted = append(permitted, v)
}
ambient := []capability.Cap{}
for _, c := range capConfig.Ambient {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
ambient = append(ambient, v)
}
pid, err := capability.NewPid2(0)
if err != nil {
return nil, err
}
err = pid.Load()
if err != nil {
return nil, err
}
return &containerCapabilities{
bounding: bounding,
effective: effective,
inheritable: inheritable,
permitted: permitted,
ambient: ambient,
pid: pid,
}, nil
}
type containerCapabilities struct {
pid capability.Capabilities
bounding []capability.Cap
effective []capability.Cap
inheritable []capability.Cap
permitted []capability.Cap
ambient []capability.Cap
}
// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
func (c *containerCapabilities) ApplyBoundingSet() error {
c.pid.Clear(capability.BOUNDS)
c.pid.Set(capability.BOUNDS, c.bounding...)
return c.pid.Apply(capability.BOUNDS)
}
// Apply sets all the capabilities for the current process in the config.
func (c *containerCapabilities) ApplyCaps() error {
c.pid.Clear(allCapabilityTypes)
c.pid.Set(capability.BOUNDS, c.bounding...)
c.pid.Set(capability.PERMITTED, c.permitted...)
c.pid.Set(capability.INHERITABLE, c.inheritable...)
c.pid.Set(capability.EFFECTIVE, c.effective...)
c.pid.Set(capability.AMBIENT, c.ambient...)
return c.pid.Apply(allCapabilityTypes)
}

View File

@ -1,74 +1,59 @@
// +build linux
package cgroups
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/configs"
)
type Manager interface {
// Applies cgroup configuration to the process with the specified pid
// Apply creates a cgroup, if not yet created, and adds a process
// with the specified pid into that cgroup. A special value of -1
// can be used to merely create a cgroup.
Apply(pid int) error
// Returns the PIDs inside the cgroup set
// GetPids returns the PIDs of all processes inside the cgroup.
GetPids() ([]int, error)
// Returns the PIDs inside the cgroup set & all sub-cgroups
// GetAllPids returns the PIDs of all processes inside the cgroup
// any all its sub-cgroups.
GetAllPids() ([]int, error)
// Returns statistics for the cgroup set
// GetStats returns cgroups statistics.
GetStats() (*Stats, error)
// Toggles the freezer cgroup according with specified state
// Freeze sets the freezer cgroup to the specified state.
Freeze(state configs.FreezerState) error
// Destroys the cgroup set
// Destroy removes cgroup.
Destroy() error
// The option func SystemdCgroups() and Cgroupfs() require following attributes:
// Paths map[string]string
// Cgroups *configs.Cgroup
// Paths maps cgroup subsystem to path at which it is mounted.
// Cgroups specifies specific cgroup settings for the various subsystems
// Path returns a cgroup path to the specified controller/subsystem.
// For cgroupv2, the argument is unused and can be empty.
Path(string) string
// Returns cgroup paths to save in a state file and to be able to
// restore the object later.
// Set sets cgroup resources parameters/limits. If the argument is nil,
// the resources specified during Manager creation (or the previous call
// to Set) are used.
Set(r *configs.Resources) error
// GetPaths returns cgroup path(s) to save in a state file in order to
// restore later.
//
// For cgroup v1, a key is cgroup subsystem name, and the value is the
// path to the cgroup for this subsystem.
//
// For cgroup v2 unified hierarchy, a key is "", and the value is the
// unified path.
GetPaths() map[string]string
// GetUnifiedPath returns the unified path when running in unified mode.
// The value corresponds to the all values of GetPaths() map.
//
// GetUnifiedPath returns error when running in hybrid mode as well as
// in legacy mode.
GetUnifiedPath() (string, error)
// Sets the cgroup as configured.
Set(container *configs.Config) error
// Gets the cgroup as configured.
// GetCgroups returns the cgroup data as configured.
GetCgroups() (*configs.Cgroup, error)
}
type NotFoundError struct {
Subsystem string
}
// GetFreezerState retrieves the current FreezerState of the cgroup.
GetFreezerState() (configs.FreezerState, error)
func (e *NotFoundError) Error() string {
return fmt.Sprintf("mountpoint for %s not found", e.Subsystem)
}
// Exists returns whether the cgroup path exists or not.
Exists() bool
func NewNotFoundError(sub string) error {
return &NotFoundError{
Subsystem: sub,
}
}
func IsNotFound(err error) bool {
if err == nil {
return false
}
_, ok := err.(*NotFoundError)
return ok
// OOMKillCount reports OOM kill count for the cgroup.
OOMKillCount() (uint64, error)
}

View File

@ -1,5 +1,3 @@
// +build linux
package cgroups
import (

View File

@ -1,3 +0,0 @@
// +build !linux
package cgroups

View File

@ -0,0 +1,386 @@
// SPDX-License-Identifier: Apache-2.0
/*
* Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com>
* Copyright (C) 2020 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package devices
import (
"bufio"
"fmt"
"io"
"sort"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/devices"
)
// deviceMeta is a Rule without the Allow or Permissions fields, and no
// wildcard-type support. It's effectively the "match" portion of a metadata
// rule, for the purposes of our emulation.
type deviceMeta struct {
node devices.Type
major int64
minor int64
}
// deviceRule is effectively the tuple (deviceMeta, Permissions).
type deviceRule struct {
meta deviceMeta
perms devices.Permissions
}
// deviceRules is a mapping of device metadata rules to the associated
// permissions in the ruleset.
type deviceRules map[deviceMeta]devices.Permissions
func (r deviceRules) orderedEntries() []deviceRule {
var rules []deviceRule
for meta, perms := range r {
rules = append(rules, deviceRule{meta: meta, perms: perms})
}
sort.Slice(rules, func(i, j int) bool {
// Sort by (major, minor, type).
a, b := rules[i].meta, rules[j].meta
return a.major < b.major ||
(a.major == b.major && a.minor < b.minor) ||
(a.major == b.major && a.minor == b.minor && a.node < b.node)
})
return rules
}
type Emulator struct {
defaultAllow bool
rules deviceRules
}
func (e *Emulator) IsBlacklist() bool {
return e.defaultAllow
}
func (e *Emulator) IsAllowAll() bool {
return e.IsBlacklist() && len(e.rules) == 0
}
func parseLine(line string) (*deviceRule, error) {
// Input: node major:minor perms.
fields := strings.FieldsFunc(line, func(r rune) bool {
return r == ' ' || r == ':'
})
if len(fields) != 4 {
return nil, fmt.Errorf("malformed devices.list rule %s", line)
}
var (
rule deviceRule
node = fields[0]
major = fields[1]
minor = fields[2]
perms = fields[3]
)
// Parse the node type.
switch node {
case "a":
// Super-special case -- "a" always means every device with every
// access mode. In fact, for devices.list this actually indicates that
// the cgroup is in black-list mode.
// TODO: Double-check that the entire file is "a *:* rwm".
return nil, nil
case "b":
rule.meta.node = devices.BlockDevice
case "c":
rule.meta.node = devices.CharDevice
default:
return nil, fmt.Errorf("unknown device type %q", node)
}
// Parse the major number.
if major == "*" {
rule.meta.major = devices.Wildcard
} else {
val, err := strconv.ParseUint(major, 10, 32)
if err != nil {
return nil, fmt.Errorf("invalid major number: %w", err)
}
rule.meta.major = int64(val)
}
// Parse the minor number.
if minor == "*" {
rule.meta.minor = devices.Wildcard
} else {
val, err := strconv.ParseUint(minor, 10, 32)
if err != nil {
return nil, fmt.Errorf("invalid minor number: %w", err)
}
rule.meta.minor = int64(val)
}
// Parse the access permissions.
rule.perms = devices.Permissions(perms)
if !rule.perms.IsValid() || rule.perms.IsEmpty() {
return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
}
return &rule, nil
}
func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam
if e.rules == nil {
e.rules = make(map[deviceMeta]devices.Permissions)
}
// Merge with any pre-existing permissions.
oldPerms := e.rules[rule.meta]
newPerms := rule.perms.Union(oldPerms)
e.rules[rule.meta] = newPerms
return nil
}
func (e *Emulator) rmRule(rule deviceRule) error {
// Give an error if any of the permissions requested to be removed are
// present in a partially-matching wildcard rule, because such rules will
// be ignored by cgroupv1.
//
// This is a diversion from cgroupv1, but is necessary to avoid leading
// users into a false sense of security. cgroupv1 will silently(!) ignore
// requests to remove partial exceptions, but we really shouldn't do that.
//
// It may seem like we could just "split" wildcard rules which hit this
// issue, but unfortunately there are 2^32 possible major and minor
// numbers, which would exhaust kernel memory quickly if we did this. Not
// to mention it'd be really slow (the kernel side is implemented as a
// linked-list of exceptions).
for _, partialMeta := range []deviceMeta{
{node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor},
{node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard},
{node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard},
} {
// This wildcard rule is equivalent to the requested rule, so skip it.
if rule.meta == partialMeta {
continue
}
// Only give an error if the set of permissions overlap.
partialPerms := e.rules[partialMeta]
if !partialPerms.Intersection(rule.perms).IsEmpty() {
return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
}
}
// Subtract all of the permissions listed from the full match rule. If the
// rule didn't exist, all of this is a no-op.
newPerms := e.rules[rule.meta].Difference(rule.perms)
if newPerms.IsEmpty() {
delete(e.rules, rule.meta)
} else {
e.rules[rule.meta] = newPerms
}
// TODO: The actual cgroup code doesn't care if an exception didn't exist
// during removal, so not erroring out here is /accurate/ but quite
// worrying. Maybe we should do additional validation, but again we
// have to worry about backwards-compatibility.
return nil
}
func (e *Emulator) allow(rule *deviceRule) error {
// This cgroup is configured as a black-list. Reset the entire emulator,
// and put is into black-list mode.
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = Emulator{
defaultAllow: true,
rules: nil,
}
return nil
}
var err error
if e.defaultAllow {
err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception")
} else {
err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception")
}
return err
}
func (e *Emulator) deny(rule *deviceRule) error {
// This cgroup is configured as a white-list. Reset the entire emulator,
// and put is into white-list mode.
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = Emulator{
defaultAllow: false,
rules: nil,
}
return nil
}
var err error
if e.defaultAllow {
err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception")
} else {
err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception")
}
return err
}
func (e *Emulator) Apply(rule devices.Rule) error {
if !rule.Type.CanCgroup() {
return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
}
innerRule := &deviceRule{
meta: deviceMeta{
node: rule.Type,
major: rule.Major,
minor: rule.Minor,
},
perms: rule.Permissions,
}
if innerRule.meta.node == devices.WildcardDevice {
innerRule = nil
}
if rule.Allow {
return e.allow(innerRule)
}
return e.deny(innerRule)
}
// EmulatorFromList takes a reader to a "devices.list"-like source, and returns
// a new Emulator that represents the state of the devices cgroup. Note that
// black-list devices cgroups cannot be fully reconstructed, due to limitations
// in the devices cgroup API. Instead, such cgroups are always treated as
// "allow all" cgroups.
func EmulatorFromList(list io.Reader) (*Emulator, error) {
// Normally cgroups are in black-list mode by default, but the way we
// figure out the current mode is whether or not devices.list has an
// allow-all rule. So we default to a white-list, and the existence of an
// "a *:* rwm" entry will tell us otherwise.
e := &Emulator{
defaultAllow: false,
}
// Parse the "devices.list".
s := bufio.NewScanner(list)
for s.Scan() {
line := s.Text()
deviceRule, err := parseLine(line)
if err != nil {
return nil, fmt.Errorf("error parsing line %q: %w", line, err)
}
// "devices.list" is an allow list. Note that this means that in
// black-list mode, we have no idea what rules are in play. As a
// result, we need to be very careful in Transition().
if err := e.allow(deviceRule); err != nil {
return nil, fmt.Errorf("error adding devices.list rule: %w", err)
}
}
if err := s.Err(); err != nil {
return nil, fmt.Errorf("error reading devices.list lines: %w", err)
}
return e, nil
}
// Transition calculates what is the minimally-disruptive set of rules need to
// be applied to a devices cgroup in order to transition to the given target.
// This means that any already-existing rules will not be applied, and
// disruptive rules (like denying all device access) will only be applied if
// necessary.
//
// This function is the sole reason for all of Emulator -- to allow us
// to figure out how to update a containers' cgroups without causing spurious
// device errors (if possible).
func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
var transitionRules []*devices.Rule
oldRules := source.rules
// If the default policy doesn't match, we need to include a "disruptive"
// rule (either allow-all or deny-all) in order to switch the cgroup to the
// correct default policy.
//
// However, due to a limitation in "devices.list" we cannot be sure what
// deny rules are in place in a black-list cgroup. Thus if the source is a
// black-list we also have to include a disruptive rule.
if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
transitionRules = append(transitionRules, &devices.Rule{
Type: 'a',
Major: -1,
Minor: -1,
Permissions: devices.Permissions("rwm"),
Allow: target.defaultAllow,
})
// The old rules are only relevant if we aren't starting out with a
// disruptive rule.
oldRules = nil
}
// NOTE: We traverse through the rules in a sorted order so we always write
// the same set of rules (this is to aid testing).
// First, we create inverse rules for any old rules not in the new set.
// This includes partial-inverse rules for specific permissions. This is a
// no-op if we added a disruptive rule, since oldRules will be empty.
for _, rule := range oldRules.orderedEntries() {
meta, oldPerms := rule.meta, rule.perms
newPerms := target.rules[meta]
droppedPerms := oldPerms.Difference(newPerms)
if !droppedPerms.IsEmpty() {
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
Permissions: droppedPerms,
Allow: target.defaultAllow,
})
}
}
// Add any additional rules which weren't in the old set. We happen to
// filter out rules which are present in both sets, though this isn't
// strictly necessary.
for _, rule := range target.rules.orderedEntries() {
meta, newPerms := rule.meta, rule.perms
oldPerms := oldRules[meta]
gainedPerms := newPerms.Difference(oldPerms)
if !gainedPerms.IsEmpty() {
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
Permissions: gainedPerms,
Allow: !target.defaultAllow,
})
}
}
return transitionRules, nil
}
// Rules returns the minimum set of rules necessary to convert a *deny-all*
// cgroup to the emulated filter state (note that this is not the same as a
// default cgroupv1 cgroup -- which is allow-all). This is effectively just a
// wrapper around Transition() with the source emulator being an empty cgroup.
func (e *Emulator) Rules() ([]*devices.Rule, error) {
defaultCgroup := &Emulator{defaultAllow: false}
return defaultCgroup.Transition(e)
}
func wrapErr(err error, text string) error {
if err == nil {
return nil
}
return fmt.Errorf(text+": %w", err)
}

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
// Package devicefilter containes eBPF device filter program
// Package devicefilter contains eBPF device filter program
//
// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
//
@ -7,12 +7,14 @@
package devicefilter
import (
"errors"
"fmt"
"math"
"strconv"
"github.com/cilium/ebpf/asm"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/devices"
"golang.org/x/sys/unix"
)
@ -22,22 +24,54 @@ const (
)
// DeviceFilter returns eBPF device filter program and its license string
func DeviceFilter(devices []*configs.Device) (asm.Instructions, string, error) {
p := &program{}
p.init()
for i := len(devices) - 1; i >= 0; i-- {
if err := p.appendDevice(devices[i]); err != nil {
func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
// Generate the minimum ruleset for the device rules we are given. While we
// don't care about minimum transitions in cgroupv2, using the emulator
// gives us a guarantee that the behaviour of devices filtering is the same
// as cgroupv1, including security hardenings to avoid misconfiguration
// (such as punching holes in wildcard rules).
emu := new(devicesemulator.Emulator)
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, "", err
}
}
insts, err := p.finalize()
return insts, license, err
cleanRules, err := emu.Rules()
if err != nil {
return nil, "", err
}
p := &program{
defaultAllow: emu.IsBlacklist(),
}
p.init()
for idx, rule := range cleanRules {
if rule.Type == devices.WildcardDevice {
// We can safely skip over wildcard entries because there should
// only be one (at most) at the very start to instruct cgroupv1 to
// go into allow-list mode. However we do double-check this here.
if idx != 0 || rule.Allow != emu.IsBlacklist() {
return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
}
continue
}
if rule.Allow == p.defaultAllow {
// There should be no rules which have an action equal to the
// default action, the emulator removes those.
return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
}
if err := p.appendRule(rule); err != nil {
return nil, "", err
}
}
return p.finalize(), license, nil
}
type program struct {
insts asm.Instructions
hasWildCard bool
blockID int
insts asm.Instructions
defaultAllow bool
blockID int
}
func (p *program) init() {
@ -49,7 +83,8 @@ func (p *program) init() {
*/
// R2 <- type (lower 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R2, asm.R1, 0, asm.Half))
asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
asm.And.Imm32(asm.R2, 0xFFFF))
// R3 <- access (upper 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
@ -66,39 +101,35 @@ func (p *program) init() {
asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
}
// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
func (p *program) appendDevice(dev *configs.Device) error {
// appendRule rule converts an OCI rule to the relevant eBPF block and adds it
// to the in-progress filter program. In order to operate properly, it must be
// called with a "clean" rule list (generated by devices.Emulator.Rules() --
// with any "a" rules removed).
func (p *program) appendRule(rule *devices.Rule) error {
if p.blockID < 0 {
return errors.New("the program is finalized")
}
if p.hasWildCard {
// All entries after wildcard entry are ignored
return nil
}
bpfType := int32(-1)
hasType := true
switch dev.Type {
case 'c':
var bpfType int32
switch rule.Type {
case devices.CharDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
case 'b':
case devices.BlockDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
case 'a':
hasType = false
default:
// if not specified in OCI json, typ is set to DeviceTypeAll
return errors.Errorf("invalid DeviceType %q", string(dev.Type))
// We do not permit 'a', nor any other types we don't know about.
return fmt.Errorf("invalid type %q", string(rule.Type))
}
if dev.Major > math.MaxUint32 {
return errors.Errorf("invalid major %d", dev.Major)
if rule.Major > math.MaxUint32 {
return fmt.Errorf("invalid major %d", rule.Major)
}
if dev.Minor > math.MaxUint32 {
return errors.Errorf("invalid minor %d", dev.Major)
if rule.Minor > math.MaxUint32 {
return fmt.Errorf("invalid minor %d", rule.Major)
}
hasMajor := dev.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := dev.Minor >= 0
hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := rule.Minor >= 0
bpfAccess := int32(0)
for _, r := range dev.Permissions {
for _, r := range rule.Permissions {
switch r {
case 'r':
bpfAccess |= unix.BPF_DEVCG_ACC_READ
@ -107,68 +138,65 @@ func (p *program) appendDevice(dev *configs.Device) error {
case 'm':
bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
default:
return errors.Errorf("unknown device access %v", r)
return fmt.Errorf("unknown device access %v", r)
}
}
// If the access is rwm, skip the check.
hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
blockSym := fmt.Sprintf("block-%d", p.blockID)
nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1)
prevBlockLastIdx := len(p.insts) - 1
if hasType {
p.insts = append(p.insts,
// if (R2 != bpfType) goto next
asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
)
}
var (
blockSym = "block-" + strconv.Itoa(p.blockID)
nextBlockSym = "block-" + strconv.Itoa(p.blockID+1)
prevBlockLastIdx = len(p.insts) - 1
)
p.insts = append(p.insts,
// if (R2 != bpfType) goto next
asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
)
if hasAccess {
p.insts = append(p.insts,
// if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next
// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
asm.Mov.Reg32(asm.R1, asm.R3),
asm.And.Imm32(asm.R1, bpfAccess),
asm.JEq.Imm(asm.R1, 0, nextBlockSym),
asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
)
}
if hasMajor {
p.insts = append(p.insts,
// if (R4 != major) goto next
asm.JNE.Imm(asm.R4, int32(dev.Major), nextBlockSym),
asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
)
}
if hasMinor {
p.insts = append(p.insts,
// if (R5 != minor) goto next
asm.JNE.Imm(asm.R5, int32(dev.Minor), nextBlockSym),
asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
)
}
if !hasType && !hasAccess && !hasMajor && !hasMinor {
p.hasWildCard = true
}
p.insts = append(p.insts, acceptBlock(dev.Allow)...)
p.insts = append(p.insts, acceptBlock(rule.Allow)...)
// set blockSym to the first instruction we added in this iteration
p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
p.blockID++
return nil
}
func (p *program) finalize() (asm.Instructions, error) {
if p.hasWildCard {
// acceptBlock with asm.Return() is already inserted
return p.insts, nil
func (p *program) finalize() asm.Instructions {
var v int32
if p.defaultAllow {
v = 1
}
blockSym := fmt.Sprintf("block-%d", p.blockID)
blockSym := "block-" + strconv.Itoa(p.blockID)
p.insts = append(p.insts,
// R0 <- 0
asm.Mov.Imm32(asm.R0, 0).Sym(blockSym),
// R0 <- v
asm.Mov.Imm32(asm.R0, v).Sym(blockSym),
asm.Return(),
)
p.blockID = -1
return p.insts, nil
return p.insts
}
func acceptBlock(accept bool) asm.Instructions {
v := int32(0)
var v int32
if accept {
v = 1
}

View File

@ -4,7 +4,7 @@ import (
"strings"
"testing"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/specconv"
)
@ -20,13 +20,12 @@ func hash(s, comm string) string {
return strings.Join(res, "\n")
}
func testDeviceFilter(t testing.TB, devices []*configs.Device, expectedStr string) {
func testDeviceFilter(t testing.TB, devices []*devices.Rule, expectedStr string) {
insts, _, err := DeviceFilter(devices)
if err != nil {
t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices)
}
s := insts.String()
t.Logf("%s: devices: %+v\n%s", t.Name(), devices, s)
if expectedStr != "" {
hashed := hash(s, "//")
expectedHashed := hash(expectedStr, "//")
@ -39,15 +38,16 @@ func testDeviceFilter(t testing.TB, devices []*configs.Device, expectedStr strin
func TestDeviceFilter_Nil(t *testing.T) {
expected := `
// load parameters into registers
0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
2: RSh32Imm dst: r3 imm: 16
3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
1: And32Imm dst: r2 imm: 65535
2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
3: RSh32Imm dst: r3 imm: 16
4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
block-0:
// return 0 (reject)
5: Mov32Imm dst: r0 imm: 0
6: Exit
6: Mov32Imm dst: r0 imm: 0
7: Exit
`
testDeviceFilter(t, nil, expected)
}
@ -55,97 +55,96 @@ block-0:
func TestDeviceFilter_BuiltInAllowList(t *testing.T) {
expected := `
// load parameters into registers
0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
2: RSh32Imm dst: r3 imm: 16
3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
1: And32Imm dst: r2 imm: 65535
2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
3: RSh32Imm dst: r3 imm: 16
4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
block-0:
// tuntap (c, 10, 200, rwm, allow)
5: JNEImm dst: r2 off: -1 imm: 2 <block-1>
6: JNEImm dst: r4 off: -1 imm: 10 <block-1>
7: JNEImm dst: r5 off: -1 imm: 200 <block-1>
8: Mov32Imm dst: r0 imm: 1
9: Exit
block-1:
10: JNEImm dst: r2 off: -1 imm: 2 <block-2>
11: JNEImm dst: r4 off: -1 imm: 5 <block-2>
12: JNEImm dst: r5 off: -1 imm: 2 <block-2>
13: Mov32Imm dst: r0 imm: 1
14: Exit
block-2:
// /dev/pts (c, 136, wildcard, rwm, true)
15: JNEImm dst: r2 off: -1 imm: 2 <block-3>
16: JNEImm dst: r4 off: -1 imm: 136 <block-3>
17: Mov32Imm dst: r0 imm: 1
18: Exit
block-3:
19: JNEImm dst: r2 off: -1 imm: 2 <block-4>
20: JNEImm dst: r4 off: -1 imm: 5 <block-4>
21: JNEImm dst: r5 off: -1 imm: 1 <block-4>
22: Mov32Imm dst: r0 imm: 1
23: Exit
block-4:
24: JNEImm dst: r2 off: -1 imm: 2 <block-5>
25: JNEImm dst: r4 off: -1 imm: 1 <block-5>
26: JNEImm dst: r5 off: -1 imm: 9 <block-5>
27: Mov32Imm dst: r0 imm: 1
28: Exit
block-5:
29: JNEImm dst: r2 off: -1 imm: 2 <block-6>
30: JNEImm dst: r4 off: -1 imm: 1 <block-6>
31: JNEImm dst: r5 off: -1 imm: 5 <block-6>
32: Mov32Imm dst: r0 imm: 1
33: Exit
block-6:
34: JNEImm dst: r2 off: -1 imm: 2 <block-7>
35: JNEImm dst: r4 off: -1 imm: 5 <block-7>
36: JNEImm dst: r5 off: -1 imm: 0 <block-7>
37: Mov32Imm dst: r0 imm: 1
38: Exit
block-7:
39: JNEImm dst: r2 off: -1 imm: 2 <block-8>
40: JNEImm dst: r4 off: -1 imm: 1 <block-8>
41: JNEImm dst: r5 off: -1 imm: 7 <block-8>
42: Mov32Imm dst: r0 imm: 1
43: Exit
block-8:
44: JNEImm dst: r2 off: -1 imm: 2 <block-9>
45: JNEImm dst: r4 off: -1 imm: 1 <block-9>
46: JNEImm dst: r5 off: -1 imm: 8 <block-9>
47: Mov32Imm dst: r0 imm: 1
48: Exit
block-9:
49: JNEImm dst: r2 off: -1 imm: 2 <block-10>
50: JNEImm dst: r4 off: -1 imm: 1 <block-10>
51: JNEImm dst: r5 off: -1 imm: 3 <block-10>
52: Mov32Imm dst: r0 imm: 1
53: Exit
block-10:
// (b, wildcard, wildcard, m, true)
54: JNEImm dst: r2 off: -1 imm: 1 <block-11>
55: Mov32Reg dst: r1 src: r3
56: And32Imm dst: r1 imm: 1
57: JEqImm dst: r1 off: -1 imm: 0 <block-11>
58: Mov32Imm dst: r0 imm: 1
59: Exit
block-11:
6: JNEImm dst: r2 off: -1 imm: 1 <block-1>
7: Mov32Reg dst: r1 src: r3
8: And32Imm dst: r1 imm: 1
9: JNEReg dst: r1 off: -1 src: r3 <block-1>
10: Mov32Imm dst: r0 imm: 1
11: Exit
block-1:
// (c, wildcard, wildcard, m, true)
60: JNEImm dst: r2 off: -1 imm: 2 <block-12>
61: Mov32Reg dst: r1 src: r3
62: And32Imm dst: r1 imm: 1
63: JEqImm dst: r1 off: -1 imm: 0 <block-12>
64: Mov32Imm dst: r0 imm: 1
65: Exit
block-12:
66: Mov32Imm dst: r0 imm: 0
67: Exit
12: JNEImm dst: r2 off: -1 imm: 2 <block-2>
13: Mov32Reg dst: r1 src: r3
14: And32Imm dst: r1 imm: 1
15: JNEReg dst: r1 off: -1 src: r3 <block-2>
16: Mov32Imm dst: r0 imm: 1
17: Exit
block-2:
18: JNEImm dst: r2 off: -1 imm: 2 <block-3>
19: JNEImm dst: r4 off: -1 imm: 1 <block-3>
20: JNEImm dst: r5 off: -1 imm: 3 <block-3>
21: Mov32Imm dst: r0 imm: 1
22: Exit
block-3:
23: JNEImm dst: r2 off: -1 imm: 2 <block-4>
24: JNEImm dst: r4 off: -1 imm: 1 <block-4>
25: JNEImm dst: r5 off: -1 imm: 5 <block-4>
26: Mov32Imm dst: r0 imm: 1
27: Exit
block-4:
28: JNEImm dst: r2 off: -1 imm: 2 <block-5>
29: JNEImm dst: r4 off: -1 imm: 1 <block-5>
30: JNEImm dst: r5 off: -1 imm: 7 <block-5>
31: Mov32Imm dst: r0 imm: 1
32: Exit
block-5:
33: JNEImm dst: r2 off: -1 imm: 2 <block-6>
34: JNEImm dst: r4 off: -1 imm: 1 <block-6>
35: JNEImm dst: r5 off: -1 imm: 8 <block-6>
36: Mov32Imm dst: r0 imm: 1
37: Exit
block-6:
38: JNEImm dst: r2 off: -1 imm: 2 <block-7>
39: JNEImm dst: r4 off: -1 imm: 1 <block-7>
40: JNEImm dst: r5 off: -1 imm: 9 <block-7>
41: Mov32Imm dst: r0 imm: 1
42: Exit
block-7:
43: JNEImm dst: r2 off: -1 imm: 2 <block-8>
44: JNEImm dst: r4 off: -1 imm: 5 <block-8>
45: JNEImm dst: r5 off: -1 imm: 0 <block-8>
46: Mov32Imm dst: r0 imm: 1
47: Exit
block-8:
48: JNEImm dst: r2 off: -1 imm: 2 <block-9>
49: JNEImm dst: r4 off: -1 imm: 5 <block-9>
50: JNEImm dst: r5 off: -1 imm: 2 <block-9>
51: Mov32Imm dst: r0 imm: 1
52: Exit
block-9:
// tuntap (c, 10, 200, rwm, allow)
53: JNEImm dst: r2 off: -1 imm: 2 <block-10>
54: JNEImm dst: r4 off: -1 imm: 10 <block-10>
55: JNEImm dst: r5 off: -1 imm: 200 <block-10>
56: Mov32Imm dst: r0 imm: 1
57: Exit
block-10:
// /dev/pts (c, 136, wildcard, rwm, true)
58: JNEImm dst: r2 off: -1 imm: 2 <block-11>
59: JNEImm dst: r4 off: -1 imm: 136 <block-11>
60: Mov32Imm dst: r0 imm: 1
61: Exit
block-11:
62: Mov32Imm dst: r0 imm: 0
63: Exit
`
testDeviceFilter(t, specconv.AllowedDevices, expected)
var devices []*devices.Rule
for _, device := range specconv.AllowedDevices {
devices = append(devices, &device.Rule)
}
testDeviceFilter(t, devices, expected)
}
func TestDeviceFilter_Privileged(t *testing.T) {
devices := []*configs.Device{
devices := []*devices.Rule{
{
Type: 'a',
Major: -1,
@ -157,21 +156,22 @@ func TestDeviceFilter_Privileged(t *testing.T) {
expected :=
`
// load parameters into registers
0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
2: RSh32Imm dst: r3 imm: 16
3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
1: And32Imm dst: r2 imm: 65535
2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
3: RSh32Imm dst: r3 imm: 16
4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
block-0:
// return 1 (accept)
5: Mov32Imm dst: r0 imm: 1
6: Exit
6: Mov32Imm dst: r0 imm: 1
7: Exit
`
testDeviceFilter(t, devices, expected)
}
func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) {
devices := []*configs.Device{
devices := []*devices.Rule{
{
Type: 'a',
Major: -1,
@ -189,28 +189,29 @@ func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) {
}
expected := `
// load parameters into registers
0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
2: RSh32Imm dst: r3 imm: 16
3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
1: And32Imm dst: r2 imm: 65535
2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
3: RSh32Imm dst: r3 imm: 16
4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
block-0:
// return 0 (reject) if type==b && major == 8 && minor == 0
5: JNEImm dst: r2 off: -1 imm: 1 <block-1>
6: JNEImm dst: r4 off: -1 imm: 8 <block-1>
7: JNEImm dst: r5 off: -1 imm: 0 <block-1>
8: Mov32Imm dst: r0 imm: 0
9: Exit
6: JNEImm dst: r2 off: -1 imm: 1 <block-1>
7: JNEImm dst: r4 off: -1 imm: 8 <block-1>
8: JNEImm dst: r5 off: -1 imm: 0 <block-1>
9: Mov32Imm dst: r0 imm: 0
10: Exit
block-1:
// return 1 (accept)
10: Mov32Imm dst: r0 imm: 1
11: Exit
11: Mov32Imm dst: r0 imm: 1
12: Exit
`
testDeviceFilter(t, devices, expected)
}
func TestDeviceFilter_Weird(t *testing.T) {
devices := []*configs.Device{
devices := []*devices.Rule{
{
Type: 'b',
Major: 8,
@ -237,22 +238,23 @@ func TestDeviceFilter_Weird(t *testing.T) {
// This conforms to runc v1.0.0-rc.9 (cgroup1) behavior.
expected := `
// load parameters into registers
0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
2: RSh32Imm dst: r3 imm: 16
3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
1: And32Imm dst: r2 imm: 65535
2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
3: RSh32Imm dst: r3 imm: 16
4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
block-0:
// return 0 (reject) if type==b && major == 8 && minor == 2
5: JNEImm dst: r2 off: -1 imm: 1 <block-1>
6: JNEImm dst: r4 off: -1 imm: 8 <block-1>
7: JNEImm dst: r5 off: -1 imm: 2 <block-1>
8: Mov32Imm dst: r0 imm: 0
9: Exit
6: JNEImm dst: r2 off: -1 imm: 1 <block-1>
7: JNEImm dst: r4 off: -1 imm: 8 <block-1>
8: JNEImm dst: r5 off: -1 imm: 2 <block-1>
9: Mov32Imm dst: r0 imm: 0
10: Exit
block-1:
// return 1 (accept)
10: Mov32Imm dst: r0 imm: 1
11: Exit
11: Mov32Imm dst: r0 imm: 1
12: Exit
`
testDeviceFilter(t, devices, expected)
}

View File

@ -1,45 +0,0 @@
package ebpf
import (
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
//
// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
//
// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) {
nilCloser := func() error {
return nil
}
// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
// This limit is not inherited into the container.
memlockLimit := &unix.Rlimit{
Cur: unix.RLIM_INFINITY,
Max: unix.RLIM_INFINITY,
}
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
spec := &ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
Instructions: insts,
License: license,
}
prog, err := ebpf.NewProgram(spec)
if err != nil {
return nilCloser, err
}
if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
}
closer := func() error {
if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
}
return nil
}
return closer, nil
}

View File

@ -0,0 +1,253 @@
package ebpf
import (
"errors"
"fmt"
"os"
"runtime"
"sync"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func nilCloser() error {
return nil
}
func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
type bpfAttrQuery struct {
TargetFd uint32
AttachType uint32
QueryType uint32
AttachFlags uint32
ProgIds uint64 // __aligned_u64
ProgCnt uint32
}
// Currently you can only have 64 eBPF programs attached to a cgroup.
size := 64
retries := 0
for retries < 10 {
progIds := make([]uint32, size)
query := bpfAttrQuery{
TargetFd: uint32(dirFd),
AttachType: uint32(unix.BPF_CGROUP_DEVICE),
ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))),
ProgCnt: uint32(len(progIds)),
}
// Fetch the list of program ids.
_, _, errno := unix.Syscall(unix.SYS_BPF,
uintptr(unix.BPF_PROG_QUERY),
uintptr(unsafe.Pointer(&query)),
unsafe.Sizeof(query))
size = int(query.ProgCnt)
runtime.KeepAlive(query)
if errno != 0 {
// On ENOSPC we get the correct number of programs.
if errno == unix.ENOSPC {
retries++
continue
}
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
}
// Convert the ids to program handles.
progIds = progIds[:size]
programs := make([]*ebpf.Program, 0, len(progIds))
for _, progId := range progIds {
program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
if err != nil {
// We skip over programs that give us -EACCES or -EPERM. This
// is necessary because there may be BPF programs that have
// been attached (such as with --systemd-cgroup) which have an
// LSM label that blocks us from interacting with the program.
//
// Because additional BPF_CGROUP_DEVICE programs only can add
// restrictions, there's no real issue with just ignoring these
// programs (and stops runc from breaking on distributions with
// very strict SELinux policies).
if errors.Is(err, os.ErrPermission) {
logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
continue
}
return nil, fmt.Errorf("cannot fetch program from id: %w", err)
}
programs = append(programs, program)
}
runtime.KeepAlive(progIds)
return programs, nil
}
return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
}
var (
haveBpfProgReplaceBool bool
haveBpfProgReplaceOnce sync.Once
)
// Loosely based on the BPF_F_REPLACE support check in
// <https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go>.
//
// TODO: move this logic to cilium/ebpf
func haveBpfProgReplace() bool {
haveBpfProgReplaceOnce.Do(func() {
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
License: "MIT",
Instructions: asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
},
})
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
return
}
defer prog.Close()
devnull, err := os.Open("/dev/null")
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
return
}
defer devnull.Close()
// We know that we have BPF_PROG_ATTACH since we can load
// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
// we know that the feature isn't present.
err = link.RawAttachProgram(link.RawAttachProgramOptions{
// We rely on this fd being checked after attachFlags.
Target: int(devnull.Fd()),
// Attempt to "replace" bad fds with this program.
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE,
})
if errors.Is(err, unix.EINVAL) {
// not supported
return
}
// attach_flags test succeeded.
if !errors.Is(err, unix.EBADF) {
logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
}
haveBpfProgReplaceBool = true
})
return haveBpfProgReplaceBool
}
// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
//
// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
//
// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
// This limit is not inherited into the container.
memlockLimit := &unix.Rlimit{
Cur: unix.RLIM_INFINITY,
Max: unix.RLIM_INFINITY,
}
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
// Get the list of existing programs.
oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
if err != nil {
return nilCloser, err
}
useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
// Generate new program.
spec := &ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
Instructions: insts,
License: license,
}
prog, err := ebpf.NewProgram(spec)
if err != nil {
return nilCloser, err
}
// If there is only one old program, we can just replace it directly.
var (
replaceProg *ebpf.Program
attachFlags uint32 = unix.BPF_F_ALLOW_MULTI
)
if useReplaceProg {
replaceProg = oldProgs[0]
attachFlags |= unix.BPF_F_REPLACE
}
err = link.RawAttachProgram(link.RawAttachProgramOptions{
Target: dirFd,
Program: prog,
Replace: replaceProg,
Attach: ebpf.AttachCGroupDevice,
Flags: attachFlags,
})
if err != nil {
return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
}
closer := func() error {
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
}
// TODO: Should we attach the old filters back in this case? Otherwise
// we fail-open on a security feature, which is a bit scary.
return nil
}
if !useReplaceProg {
logLevel := logrus.DebugLevel
// If there was more than one old program, give a warning (since this
// really shouldn't happen with runc-managed cgroups) and then detach
// all the old programs.
if len(oldProgs) > 1 {
// NOTE: Ideally this should be a warning but it turns out that
// systemd-managed cgroups trigger this warning (apparently
// systemd doesn't delete old non-systemd programs when
// setting properties).
logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
logLevel = logrus.InfoLevel
}
for idx, oldProg := range oldProgs {
// Output some extra debug info.
if info, err := oldProg.Info(); err == nil {
fields := logrus.Fields{
"type": info.Type.String(),
"tag": info.Tag,
"name": info.Name,
}
if id, ok := info.ID(); ok {
fields["id"] = id
}
if runCount, ok := info.RunCount(); ok {
fields["run_count"] = runCount
}
if runtime, ok := info.Runtime(); ok {
fields["runtime"] = runtime.String()
}
logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
}
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: oldProg,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
}
}
}
return closer, nil
}

View File

@ -0,0 +1,190 @@
package cgroups
import (
"bytes"
"errors"
"fmt"
"os"
"path"
"strconv"
"strings"
"sync"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// OpenFile opens a cgroup file in a given dir with given flags.
// It is supposed to be used for cgroup files only, and returns
// an error if the file is not a cgroup file.
//
// Arguments dir and file are joined together to form an absolute path
// to a file being opened.
func OpenFile(dir, file string, flags int) (*os.File, error) {
if dir == "" {
return nil, fmt.Errorf("no directory specified for %s", file)
}
return openFile(dir, file, flags)
}
// ReadFile reads data from a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func ReadFile(dir, file string) (string, error) {
fd, err := OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return "", err
}
defer fd.Close()
var buf bytes.Buffer
_, err = buf.ReadFrom(fd)
return buf.String(), err
}
// WriteFile writes data to a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func WriteFile(dir, file, data string) error {
fd, err := OpenFile(dir, file, unix.O_WRONLY)
if err != nil {
return err
}
defer fd.Close()
if err := retryingWriteFile(fd, data); err != nil {
// Having data in the error message helps in debugging.
return fmt.Errorf("failed to write %q: %w", data, err)
}
return nil
}
func retryingWriteFile(fd *os.File, data string) error {
for {
_, err := fd.Write([]byte(data))
if errors.Is(err, unix.EINTR) {
logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
continue
}
return err
}
}
const (
cgroupfsDir = "/sys/fs/cgroup"
cgroupfsPrefix = cgroupfsDir + "/"
)
var (
// TestMode is set to true by unit tests that need "fake" cgroupfs.
TestMode bool
cgroupFd int = -1
prepOnce sync.Once
prepErr error
resolveFlags uint64
)
func prepareOpenat2() error {
prepOnce.Do(func() {
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
Flags: unix.O_DIRECTORY | unix.O_PATH,
})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare
logrus.Warnf("falling back to securejoin: %s", prepErr)
} else {
logrus.Debug("openat2 not available, falling back to securejoin")
}
return
}
var st unix.Statfs_t
if err = unix.Fstatfs(fd, &st); err != nil {
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
logrus.Warnf("falling back to securejoin: %s", prepErr)
return
}
cgroupFd = fd
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
if st.Type == unix.CGROUP2_SUPER_MAGIC {
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
}
})
return prepErr
}
func openFile(dir, file string, flags int) (*os.File, error) {
mode := os.FileMode(0)
if TestMode && flags&os.O_WRONLY != 0 {
// "emulate" cgroup fs for unit tests
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
path := path.Join(dir, file)
if prepareOpenat2() != nil {
return openFallback(path, flags, mode)
}
relPath := strings.TrimPrefix(path, cgroupfsPrefix)
if len(relPath) == len(path) { // non-standard path, old system?
return openFallback(path, flags, mode)
}
fd, err := unix.Openat2(cgroupFd, relPath,
&unix.OpenHow{
Resolve: resolveFlags,
Flags: uint64(flags) | unix.O_CLOEXEC,
Mode: uint64(mode),
})
if err != nil {
err = &os.PathError{Op: "openat2", Path: path, Err: err}
// Check if cgroupFd is still opened to cgroupfsDir
// (happens when this package is incorrectly used
// across the chroot/pivot_root/mntns boundary, or
// when /sys/fs/cgroup is remounted).
//
// TODO: if such usage will ever be common, amend this
// to reopen cgroupFd and retry openat2.
fdStr := strconv.Itoa(cgroupFd)
fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
if fdDest != cgroupfsDir {
// Wrap the error so it is clear that cgroupFd
// is opened to an unexpected/wrong directory.
err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w",
fdStr, fdDest, cgroupfsDir, err)
}
return nil, err
}
return os.NewFile(uintptr(fd), path), nil
}
var errNotCgroupfs = errors.New("not a cgroup file")
// Can be changed by unit tests.
var openFallback = openAndCheck
// openAndCheck is used when openat2(2) is not available. It checks the opened
// file is on cgroupfs, returning an error otherwise.
func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) {
fd, err := os.OpenFile(path, flags, mode)
if err != nil {
return nil, err
}
if TestMode {
return fd, nil
}
// Check this is a cgroupfs file.
var st unix.Statfs_t
if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
_ = fd.Close()
return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
}
if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
_ = fd.Close()
return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
}
return fd, nil
}

View File

@ -0,0 +1,73 @@
package cgroups
import (
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
"testing"
"time"
)
func TestWriteCgroupFileHandlesInterrupt(t *testing.T) {
const (
memoryCgroupMount = "/sys/fs/cgroup/memory"
memoryLimit = "memory.limit_in_bytes"
)
if _, err := os.Stat(memoryCgroupMount); err != nil {
// most probably cgroupv2
t.Skip(err)
}
cgroupName := fmt.Sprintf("test-eint-%d", time.Now().Nanosecond())
cgroupPath := filepath.Join(memoryCgroupMount, cgroupName)
if err := os.MkdirAll(cgroupPath, 0o755); err != nil {
t.Fatal(err)
}
defer os.RemoveAll(cgroupPath)
if _, err := os.Stat(filepath.Join(cgroupPath, memoryLimit)); err != nil {
// either cgroupv2, or memory controller is not available
t.Skip(err)
}
for i := 0; i < 100000; i++ {
limit := 1024*1024 + i
if err := WriteFile(cgroupPath, memoryLimit, strconv.Itoa(limit)); err != nil {
t.Fatalf("Failed to write %d on attempt %d: %+v", limit, i, err)
}
}
}
func TestOpenat2(t *testing.T) {
if !IsCgroup2UnifiedMode() {
// The reason is many test cases below test opening files from
// the top-level directory, where cgroup v1 has no files.
t.Skip("test requires cgroup v2")
}
// Make sure we test openat2, not its fallback.
openFallback = func(_ string, _ int, _ os.FileMode) (*os.File, error) {
return nil, errors.New("fallback")
}
defer func() { openFallback = openAndCheck }()
for _, tc := range []struct{ dir, file string }{
{"/sys/fs/cgroup", "cgroup.controllers"},
{"/sys/fs/cgroup", "/cgroup.controllers"},
{"/sys/fs/cgroup/", "cgroup.controllers"},
{"/sys/fs/cgroup/", "/cgroup.controllers"},
{"/sys/fs/cgroup/user.slice", "cgroup.controllers"},
{"/sys/fs/cgroup/user.slice/", "/cgroup.controllers"},
{"/", "/sys/fs/cgroup/cgroup.controllers"},
{"/", "sys/fs/cgroup/cgroup.controllers"},
{"/sys/fs/cgroup/cgroup.controllers", ""},
} {
fd, err := OpenFile(tc.dir, tc.file, os.O_RDONLY)
if err != nil {
t.Errorf("case %+v: %v", tc, err)
}
fd.Close()
}
}

View File

@ -1,411 +0,0 @@
// +build linux
package fs
import (
"fmt"
"io"
"os"
"path/filepath"
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
var (
subsystemsLegacy = subsystemSet{
&CpusetGroup{},
&DevicesGroup{},
&MemoryGroup{},
&CpuGroup{},
&CpuacctGroup{},
&PidsGroup{},
&BlkioGroup{},
&HugetlbGroup{},
&NetClsGroup{},
&NetPrioGroup{},
&PerfEventGroup{},
&FreezerGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
}
HugePageSizes, _ = cgroups.GetHugePageSize()
)
var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist")
type subsystemSet []subsystem
func (s subsystemSet) Get(name string) (subsystem, error) {
for _, ss := range s {
if ss.Name() == name {
return ss, nil
}
}
return nil, errSubsystemDoesNotExist
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
GetStats(path string, stats *cgroups.Stats) error
// Removes the cgroup represented by 'cgroupData'.
Remove(*cgroupData) error
// Creates and joins the cgroup represented by 'cgroupData'.
Apply(*cgroupData) error
// Set the cgroup represented by cgroup.
Set(path string, cgroup *configs.Cgroup) error
}
type Manager struct {
mu sync.Mutex
Cgroups *configs.Cgroup
Rootless bool // ignore permission-related errors
Paths map[string]string
}
// The absolute path to the root of the cgroup hierarchies.
var cgroupRootLock sync.Mutex
var cgroupRoot string
// Gets the cgroupRoot.
func getCgroupRoot() (string, error) {
cgroupRootLock.Lock()
defer cgroupRootLock.Unlock()
if cgroupRoot != "" {
return cgroupRoot, nil
}
root, err := cgroups.FindCgroupMountpointDir()
if err != nil {
return "", err
}
if _, err := os.Stat(root); err != nil {
return "", err
}
cgroupRoot = root
return cgroupRoot, nil
}
type cgroupData struct {
root string
innerPath string
config *configs.Cgroup
pid int
}
// isIgnorableError returns whether err is a permission error (in the loose
// sense of the word). This includes EROFS (which for an unprivileged user is
// basically a permission error) and EACCES (for similar reasons) as well as
// the normal EPERM.
func isIgnorableError(rootless bool, err error) bool {
// We do not ignore errors if we are root.
if !rootless {
return false
}
// Is it an ordinary EPERM?
if os.IsPermission(errors.Cause(err)) {
return true
}
// Try to handle other errnos.
var errno error
switch err := errors.Cause(err).(type) {
case *os.PathError:
errno = err.Err
case *os.LinkError:
errno = err.Err
case *os.SyscallError:
errno = err.Err
}
return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
}
func (m *Manager) getSubsystems() subsystemSet {
return subsystemsLegacy
}
func (m *Manager) Apply(pid int) (err error) {
if m.Cgroups == nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
var c = m.Cgroups
d, err := getCgroupData(m.Cgroups, pid)
if err != nil {
return err
}
m.Paths = make(map[string]string)
if c.Paths != nil {
for name, path := range c.Paths {
_, err := d.path(name)
if err != nil {
if cgroups.IsNotFound(err) {
continue
}
return err
}
m.Paths[name] = path
}
return cgroups.EnterPid(m.Paths, pid)
}
for _, sys := range m.getSubsystems() {
// TODO: Apply should, ideally, be reentrant or be broken up into a separate
// create and join phase so that the cgroup hierarchy for a container can be
// created then join consists of writing the process pids to cgroup.procs
p, err := d.path(sys.Name())
if err != nil {
// The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && sys.Name() != "devices" {
continue
}
return err
}
m.Paths[sys.Name()] = p
if err := sys.Apply(d); err != nil {
// In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't
// been set, we don't bail on error in case of permission problems.
// Cases where limits have been set (and we couldn't create our own
// cgroup) are handled by Set.
if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" {
delete(m.Paths, sys.Name())
continue
}
return err
}
}
return nil
}
func (m *Manager) Destroy() error {
if m.Cgroups == nil || m.Cgroups.Paths != nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
if err := cgroups.RemovePaths(m.Paths); err != nil {
return err
}
m.Paths = make(map[string]string)
return nil
}
func (m *Manager) GetPaths() map[string]string {
m.mu.Lock()
paths := m.Paths
m.mu.Unlock()
return paths
}
func (m *Manager) GetUnifiedPath() (string, error) {
return "", errors.New("unified path is only supported when running in unified mode")
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
for name, path := range m.Paths {
sys, err := m.getSubsystems().Get(name)
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
continue
}
if err := sys.GetStats(path, stats); err != nil {
return nil, err
}
}
return stats, nil
}
func (m *Manager) Set(container *configs.Config) error {
if container.Cgroups == nil {
return nil
}
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.Cgroups != nil && m.Cgroups.Paths != nil {
return nil
}
paths := m.GetPaths()
for _, sys := range m.getSubsystems() {
path := paths[sys.Name()]
if err := sys.Set(path, container.Cgroups); err != nil {
if m.Rootless && sys.Name() == "devices" {
continue
}
// When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if path == "" {
// We never created a path for this cgroup, so we cannot set
// limits for it (though we have already tried at this point).
return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
}
return err
}
}
if m.Paths["cpu"] != "" {
if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
return err
}
}
return nil
}
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *Manager) Freeze(state configs.FreezerState) error {
if m.Cgroups == nil {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
paths := m.GetPaths()
dir := paths["freezer"]
prevState := m.Cgroups.Resources.Freezer
m.Cgroups.Resources.Freezer = state
freezer, err := m.getSubsystems().Get("freezer")
if err != nil {
return err
}
err = freezer.Set(dir, m.Cgroups)
if err != nil {
m.Cgroups.Resources.Freezer = prevState
return err
}
return nil
}
func (m *Manager) GetPids() ([]int, error) {
paths := m.GetPaths()
return cgroups.GetPids(paths["devices"])
}
func (m *Manager) GetAllPids() ([]int, error) {
paths := m.GetPaths()
return cgroups.GetAllPids(paths["devices"])
}
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
root, err := getCgroupRoot()
if err != nil {
return nil, err
}
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove this code. Path safety is important! -- cyphar
cgPath := libcontainerUtils.CleanPath(c.Path)
cgParent := libcontainerUtils.CleanPath(c.Parent)
cgName := libcontainerUtils.CleanPath(c.Name)
innerPath := cgPath
if innerPath == "" {
innerPath = filepath.Join(cgParent, cgName)
}
return &cgroupData{
root: root,
innerPath: innerPath,
config: c,
pid: pid,
}, nil
}
func (raw *cgroupData) path(subsystem string) (string, error) {
mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem)
// If we didn't mount the subsystem, there is no point we make the path.
if err != nil {
return "", err
}
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(raw.innerPath) {
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
}
// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err
}
return filepath.Join(parentPath, raw.innerPath), nil
}
func (raw *cgroupData) join(subsystem string) (string, error) {
path, err := raw.path(subsystem)
if err != nil {
return "", err
}
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
}
if err := cgroups.WriteCgroupProc(path, raw.pid); err != nil {
return "", err
}
return path, nil
}
func removePath(p string, err error) error {
if err != nil {
return err
}
if p != "" {
return os.RemoveAll(p)
}
return nil
}
func CheckCpushares(path string, c uint64) error {
var cpuShares uint64
if c == 0 {
return nil
}
fd, err := os.Open(filepath.Join(path, "cpu.shares"))
if err != nil {
return err
}
defer fd.Close()
_, err = fmt.Fscanf(fd, "%d", &cpuShares)
if err != nil && err != io.EOF {
return err
}
if c > cpuShares {
return fmt.Errorf("The maximum allowed cpu-shares is %d", cpuShares)
} else if c < cpuShares {
return fmt.Errorf("The minimum allowed cpu-shares is %d", cpuShares)
}
return nil
}
func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
return m.Cgroups, nil
}

View File

@ -1,297 +0,0 @@
// +build linux
package fs
import (
"path/filepath"
"strings"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
func TestInvalidCgroupPath(t *testing.T) {
if cgroups.IsCgroup2UnifiedMode() {
t.Skip("cgroup v1 is not supported")
}
root, err := getCgroupRoot()
if err != nil {
t.Errorf("couldn't get cgroup root: %v", err)
}
config := &configs.Cgroup{
Path: "../../../../../../../../../../some/path",
}
data, err := getCgroupData(config, 0)
if err != nil {
t.Errorf("couldn't get cgroup data: %v", err)
}
// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
if strings.HasPrefix(data.innerPath, "..") {
t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
}
// Double-check, using an actual cgroup.
deviceRoot := filepath.Join(root, "devices")
devicePath, err := data.path("devices")
if err != nil {
t.Errorf("couldn't get cgroup path: %v", err)
}
if !strings.HasPrefix(devicePath, deviceRoot) {
t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
}
}
func TestInvalidAbsoluteCgroupPath(t *testing.T) {
if cgroups.IsCgroup2UnifiedMode() {
t.Skip("cgroup v1 is not supported")
}
root, err := getCgroupRoot()
if err != nil {
t.Errorf("couldn't get cgroup root: %v", err)
}
config := &configs.Cgroup{
Path: "/../../../../../../../../../../some/path",
}
data, err := getCgroupData(config, 0)
if err != nil {
t.Errorf("couldn't get cgroup data: %v", err)
}
// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
if strings.HasPrefix(data.innerPath, "..") {
t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
}
// Double-check, using an actual cgroup.
deviceRoot := filepath.Join(root, "devices")
devicePath, err := data.path("devices")
if err != nil {
t.Errorf("couldn't get cgroup path: %v", err)
}
if !strings.HasPrefix(devicePath, deviceRoot) {
t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
}
}
// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
func TestInvalidCgroupParent(t *testing.T) {
if cgroups.IsCgroup2UnifiedMode() {
t.Skip("cgroup v1 is not supported")
}
root, err := getCgroupRoot()
if err != nil {
t.Errorf("couldn't get cgroup root: %v", err)
}
config := &configs.Cgroup{
Parent: "../../../../../../../../../../some/path",
Name: "name",
}
data, err := getCgroupData(config, 0)
if err != nil {
t.Errorf("couldn't get cgroup data: %v", err)
}
// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
if strings.HasPrefix(data.innerPath, "..") {
t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
}
// Double-check, using an actual cgroup.
deviceRoot := filepath.Join(root, "devices")
devicePath, err := data.path("devices")
if err != nil {
t.Errorf("couldn't get cgroup path: %v", err)
}
if !strings.HasPrefix(devicePath, deviceRoot) {
t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
}
}
// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
func TestInvalidAbsoluteCgroupParent(t *testing.T) {
if cgroups.IsCgroup2UnifiedMode() {
t.Skip("cgroup v1 is not supported")
}
root, err := getCgroupRoot()
if err != nil {
t.Errorf("couldn't get cgroup root: %v", err)
}
config := &configs.Cgroup{
Parent: "/../../../../../../../../../../some/path",
Name: "name",
}
data, err := getCgroupData(config, 0)
if err != nil {
t.Errorf("couldn't get cgroup data: %v", err)
}
// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
if strings.HasPrefix(data.innerPath, "..") {
t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
}
// Double-check, using an actual cgroup.
deviceRoot := filepath.Join(root, "devices")
devicePath, err := data.path("devices")
if err != nil {
t.Errorf("couldn't get cgroup path: %v", err)
}
if !strings.HasPrefix(devicePath, deviceRoot) {
t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
}
}
// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
func TestInvalidCgroupName(t *testing.T) {
if cgroups.IsCgroup2UnifiedMode() {
t.Skip("cgroup v1 is not supported")
}
root, err := getCgroupRoot()
if err != nil {
t.Errorf("couldn't get cgroup root: %v", err)
}
config := &configs.Cgroup{
Parent: "parent",
Name: "../../../../../../../../../../some/path",
}
data, err := getCgroupData(config, 0)
if err != nil {
t.Errorf("couldn't get cgroup data: %v", err)
}
// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
if strings.HasPrefix(data.innerPath, "..") {
t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
}
// Double-check, using an actual cgroup.
deviceRoot := filepath.Join(root, "devices")
devicePath, err := data.path("devices")
if err != nil {
t.Errorf("couldn't get cgroup path: %v", err)
}
if !strings.HasPrefix(devicePath, deviceRoot) {
t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
}
}
// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
func TestInvalidAbsoluteCgroupName(t *testing.T) {
if cgroups.IsCgroup2UnifiedMode() {
t.Skip("cgroup v1 is not supported")
}
root, err := getCgroupRoot()
if err != nil {
t.Errorf("couldn't get cgroup root: %v", err)
}
config := &configs.Cgroup{
Parent: "parent",
Name: "/../../../../../../../../../../some/path",
}
data, err := getCgroupData(config, 0)
if err != nil {
t.Errorf("couldn't get cgroup data: %v", err)
}
// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
if strings.HasPrefix(data.innerPath, "..") {
t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
}
// Double-check, using an actual cgroup.
deviceRoot := filepath.Join(root, "devices")
devicePath, err := data.path("devices")
if err != nil {
t.Errorf("couldn't get cgroup path: %v", err)
}
if !strings.HasPrefix(devicePath, deviceRoot) {
t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
}
}
// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
func TestInvalidCgroupNameAndParent(t *testing.T) {
if cgroups.IsCgroup2UnifiedMode() {
t.Skip("cgroup v1 is not supported")
}
root, err := getCgroupRoot()
if err != nil {
t.Errorf("couldn't get cgroup root: %v", err)
}
config := &configs.Cgroup{
Parent: "../../../../../../../../../../some/path",
Name: "../../../../../../../../../../some/path",
}
data, err := getCgroupData(config, 0)
if err != nil {
t.Errorf("couldn't get cgroup data: %v", err)
}
// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
if strings.HasPrefix(data.innerPath, "..") {
t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
}
// Double-check, using an actual cgroup.
deviceRoot := filepath.Join(root, "devices")
devicePath, err := data.path("devices")
if err != nil {
t.Errorf("couldn't get cgroup path: %v", err)
}
if !strings.HasPrefix(devicePath, deviceRoot) {
t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
}
}
// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
func TestInvalidAbsoluteCgroupNameAndParent(t *testing.T) {
if cgroups.IsCgroup2UnifiedMode() {
t.Skip("cgroup v1 is not supported")
}
root, err := getCgroupRoot()
if err != nil {
t.Errorf("couldn't get cgroup root: %v", err)
}
config := &configs.Cgroup{
Parent: "/../../../../../../../../../../some/path",
Name: "/../../../../../../../../../../some/path",
}
data, err := getCgroupData(config, 0)
if err != nil {
t.Errorf("couldn't get cgroup data: %v", err)
}
// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
if strings.HasPrefix(data.innerPath, "..") {
t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
}
// Double-check, using an actual cgroup.
deviceRoot := filepath.Join(root, "devices")
devicePath, err := data.path("devices")
if err != nil {
t.Errorf("couldn't get cgroup path: %v", err)
}
if !strings.HasPrefix(devicePath, deviceRoot) {
t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
}
}

View File

@ -1,72 +1,71 @@
// +build linux
package fs
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type BlkioGroup struct {
weightFilename string
weightDeviceFilename string
}
func (s *BlkioGroup) Name() string {
return "blkio"
}
func (s *BlkioGroup) Apply(d *cgroupData) error {
_, err := d.join("blkio")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
func (s *BlkioGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.BlkioWeight != 0 {
if err := fscommon.WriteFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
func (s *BlkioGroup) Set(path string, r *configs.Resources) error {
s.detectWeightFilenames(path)
if r.BlkioWeight != 0 {
if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
return err
}
}
if cgroup.Resources.BlkioLeafWeight != 0 {
if err := fscommon.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil {
if r.BlkioLeafWeight != 0 {
if err := cgroups.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range cgroup.Resources.BlkioWeightDevice {
if err := fscommon.WriteFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
return err
for _, wd := range r.BlkioWeightDevice {
if wd.Weight != 0 {
if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil {
return err
}
}
if err := fscommon.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
if wd.LeafWeight != 0 {
if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
return err
}
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
if err := fscommon.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
for _, td := range r.BlkioThrottleWriteBpsDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
if err := fscommon.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
for _, td := range r.BlkioThrottleReadIOPSDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
if err := fscommon.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
if err := fscommon.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
for _, td := range r.BlkioThrottleWriteIOPSDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}
}
@ -74,10 +73,6 @@ func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}
func (s *BlkioGroup) Remove(d *cgroupData) error {
return removePath(d.path("blkio"))
}
/*
examples:
@ -113,9 +108,9 @@ func splitBlkioStatLine(r rune) bool {
return r == ' ' || r == ':'
}
func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) {
var blkioStats []cgroups.BlkioStatEntry
f, err := os.Open(path)
f, err := cgroups.OpenFile(dir, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return blkioStats, nil
@ -133,19 +128,19 @@ func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
// skip total line
continue
} else {
return nil, fmt.Errorf("Invalid line found while parsing %s: %s", path, sc.Text())
return nil, malformedLine(dir, file, sc.Text())
}
}
v, err := strconv.ParseUint(fields[0], 10, 64)
if err != nil {
return nil, err
return nil, &parseError{Path: dir, File: file, Err: err}
}
major := v
v, err = strconv.ParseUint(fields[1], 10, 64)
if err != nil {
return nil, err
return nil, &parseError{Path: dir, File: file, Err: err}
}
minor := v
@ -157,82 +152,160 @@ func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
}
v, err = strconv.ParseUint(fields[valueField], 10, 64)
if err != nil {
return nil, err
return nil, &parseError{Path: dir, File: file, Err: err}
}
blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v})
}
if err := sc.Err(); err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
return blkioStats, nil
}
func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
// Try to read CFQ stats available on all CFQ enabled kernels first
if blkioStats, err := getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err == nil && blkioStats != nil {
return getCFQStats(path, stats)
type blkioStatInfo struct {
filename string
blkioStatEntriesPtr *[]cgroups.BlkioStatEntry
}
bfqDebugStats := []blkioStatInfo{
{
filename: "blkio.bfq.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.bfq.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.bfq.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.bfq.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.bfq.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.bfq.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
bfqStats := []blkioStatInfo{
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
cfqStats := []blkioStatInfo{
{
filename: "blkio.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
throttleRecursiveStats := []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
baseStats := []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
orderedStats := [][]blkioStatInfo{
bfqDebugStats,
bfqStats,
cfqStats,
throttleRecursiveStats,
baseStats,
}
return getStats(path, stats) // Use generic stats as fallback
}
func getCFQStats(path string, stats *cgroups.Stats) error {
var blkioStats []cgroups.BlkioStatEntry
var err error
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.sectors_recursive")); err != nil {
return err
for _, statGroup := range orderedStats {
for i, statInfo := range statGroup {
if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil {
// if error occurs on first file, move to next group
if i == 0 {
break
}
return err
}
*statInfo.blkioStatEntriesPtr = blkioStats
// finish if all stats are gathered
if i == len(statGroup)-1 {
return nil
}
}
}
stats.BlkioStats.SectorsRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_bytes_recursive")); err != nil {
return err
}
stats.BlkioStats.IoServiceBytesRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err != nil {
return err
}
stats.BlkioStats.IoServicedRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_queued_recursive")); err != nil {
return err
}
stats.BlkioStats.IoQueuedRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_time_recursive")); err != nil {
return err
}
stats.BlkioStats.IoServiceTimeRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_wait_time_recursive")); err != nil {
return err
}
stats.BlkioStats.IoWaitTimeRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_merged_recursive")); err != nil {
return err
}
stats.BlkioStats.IoMergedRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.time_recursive")); err != nil {
return err
}
stats.BlkioStats.IoTimeRecursive = blkioStats
return nil
}
func getStats(path string, stats *cgroups.Stats) error {
var blkioStats []cgroups.BlkioStatEntry
var err error
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_service_bytes")); err != nil {
return err
func (s *BlkioGroup) detectWeightFilenames(path string) {
if s.weightFilename != "" {
// Already detected.
return
}
stats.BlkioStats.IoServiceBytesRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_serviced")); err != nil {
return err
if cgroups.PathExists(filepath.Join(path, "blkio.weight")) {
s.weightFilename = "blkio.weight"
s.weightDeviceFilename = "blkio.weight_device"
} else {
s.weightFilename = "blkio.bfq.weight"
s.weightDeviceFilename = "blkio.bfq.weight_device"
}
stats.BlkioStats.IoServicedRecursive = blkioStats
return nil
}

File diff suppressed because it is too large Load Diff

View File

@ -1,94 +1,105 @@
// +build linux
package fs
import (
"bufio"
"errors"
"fmt"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
type CpuGroup struct {
}
type CpuGroup struct{}
func (s *CpuGroup) Name() string {
return "cpu"
}
func (s *CpuGroup) Apply(d *cgroupData) error {
// We always want to join the cpu group, to allow fair cpu scheduling
// on a container basis
path, err := d.path("cpu")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return s.ApplyDir(path, d.config, d.pid)
}
func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error {
// This might happen if we have no cpu cgroup mounted.
// Just do nothing and don't fail.
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0755); err != nil {
func (s *CpuGroup) Apply(path string, r *configs.Resources, pid int) error {
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
// We should set the real-Time group scheduling settings before moving
// in the process because if the process is already in SCHED_RR mode
// and no RT bandwidth is set, adding it will fail.
if err := s.SetRtSched(path, cgroup); err != nil {
if err := s.SetRtSched(path, r); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
// Since we are not using apply(), we need to place the pid
// into the procs file.
return cgroups.WriteCgroupProc(path, pid)
}
func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuRtPeriod != 0 {
if err := fscommon.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error {
if r.CpuRtPeriod != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuRtRuntime != 0 {
if err := fscommon.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
if r.CpuRtRuntime != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil {
return err
}
}
return nil
}
func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuShares != 0 {
if err := fscommon.WriteFile(path, "cpu.shares", strconv.FormatUint(cgroup.Resources.CpuShares, 10)); err != nil {
func (s *CpuGroup) Set(path string, r *configs.Resources) error {
if r.CpuShares != 0 {
shares := r.CpuShares
if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuPeriod != 0 {
if err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil {
// read it back
sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares")
if err != nil {
return err
}
}
if cgroup.Resources.CpuQuota != 0 {
if err := fscommon.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil {
return err
// ... and check
if shares > sharesRead {
return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead)
} else if shares < sharesRead {
return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead)
}
}
return s.SetRtSched(path, cgroup)
}
func (s *CpuGroup) Remove(d *cgroupData) error {
return removePath(d.path("cpu"))
var period string
if r.CpuPeriod != 0 {
period = strconv.FormatUint(r.CpuPeriod, 10)
if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil {
// Sometimes when the period to be set is smaller
// than the current one, it is rejected by the kernel
// (EINVAL) as old_quota/new_period exceeds the parent
// cgroup quota limit. If this happens and the quota is
// going to be set, ignore the error for now and retry
// after setting the quota.
if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 {
return err
}
} else {
period = ""
}
}
if r.CpuQuota != 0 {
if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil {
return err
}
if period != "" {
if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil {
return err
}
}
}
return s.SetRtSched(path, r)
}
func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
f, err := os.Open(filepath.Join(path, "cpu.stat"))
const file = "cpu.stat"
f, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
@ -99,9 +110,9 @@ func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return err
return &parseError{Path: path, File: file, Err: err}
}
switch t {
case "nr_periods":

View File

@ -1,5 +1,3 @@
// +build linux
package fs
import (
@ -9,40 +7,40 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func TestCpuSetShares(t *testing.T) {
helper := NewCgroupTestUtil("cpu", t)
defer helper.cleanup()
path := tempDir(t, "cpu")
const (
sharesBefore = 1024
sharesAfter = 512
)
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"cpu.shares": strconv.Itoa(sharesBefore),
})
helper.CgroupData.config.Resources.CpuShares = sharesAfter
r := &configs.Resources{
CpuShares: sharesAfter,
}
cpu := &CpuGroup{}
if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := cpu.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.shares")
value, err := fscommon.GetCgroupParamUint(path, "cpu.shares")
if err != nil {
t.Fatalf("Failed to parse cpu.shares - %s", err)
t.Fatal(err)
}
if value != sharesAfter {
t.Fatal("Got the wrong value, set cpu.shares failed.")
}
}
func TestCpuSetBandWidth(t *testing.T) {
helper := NewCgroupTestUtil("cpu", t)
defer helper.cleanup()
path := tempDir(t, "cpu")
const (
quotaBefore = 8000
@ -55,47 +53,51 @@ func TestCpuSetBandWidth(t *testing.T) {
rtPeriodAfter = 7000
)
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"cpu.cfs_quota_us": strconv.Itoa(quotaBefore),
"cpu.cfs_period_us": strconv.Itoa(periodBefore),
"cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore),
"cpu.rt_period_us": strconv.Itoa(rtPeriodBefore),
})
helper.CgroupData.config.Resources.CpuQuota = quotaAfter
helper.CgroupData.config.Resources.CpuPeriod = periodAfter
helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter
helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter
r := &configs.Resources{
CpuQuota: quotaAfter,
CpuPeriod: periodAfter,
CpuRtRuntime: rtRuntimeAfter,
CpuRtPeriod: rtPeriodAfter,
}
cpu := &CpuGroup{}
if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := cpu.Set(path, r); err != nil {
t.Fatal(err)
}
quota, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.cfs_quota_us")
quota, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_quota_us")
if err != nil {
t.Fatalf("Failed to parse cpu.cfs_quota_us - %s", err)
t.Fatal(err)
}
if quota != quotaAfter {
t.Fatal("Got the wrong value, set cpu.cfs_quota_us failed.")
}
period, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.cfs_period_us")
period, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_period_us")
if err != nil {
t.Fatalf("Failed to parse cpu.cfs_period_us - %s", err)
t.Fatal(err)
}
if period != periodAfter {
t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.")
}
rtRuntime, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us")
rtRuntime, err := fscommon.GetCgroupParamUint(path, "cpu.rt_runtime_us")
if err != nil {
t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err)
t.Fatal(err)
}
if rtRuntime != rtRuntimeAfter {
t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.")
}
rtPeriod, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us")
rtPeriod, err := fscommon.GetCgroupParamUint(path, "cpu.rt_period_us")
if err != nil {
t.Fatalf("Failed to parse cpu.rt_period_us - %s", err)
t.Fatal(err)
}
if rtPeriod != rtPeriodAfter {
t.Fatal("Got the wrong value, set cpu.rt_period_us failed.")
@ -103,8 +105,7 @@ func TestCpuSetBandWidth(t *testing.T) {
}
func TestCpuStats(t *testing.T) {
helper := NewCgroupTestUtil("cpu", t)
defer helper.cleanup()
path := tempDir(t, "cpu")
const (
nrPeriods = 2000
@ -112,15 +113,15 @@ func TestCpuStats(t *testing.T) {
throttledTime = uint64(18446744073709551615)
)
cpuStatContent := fmt.Sprintf("nr_periods %d\n nr_throttled %d\n throttled_time %d\n",
cpuStatContent := fmt.Sprintf("nr_periods %d\nnr_throttled %d\nthrottled_time %d\n",
nrPeriods, nrThrottled, throttledTime)
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"cpu.stat": cpuStatContent,
})
cpu := &CpuGroup{}
actualStats := *cgroups.NewStats()
err := cpu.GetStats(helper.CgroupPath, &actualStats)
err := cpu.GetStats(path, &actualStats)
if err != nil {
t.Fatal(err)
}
@ -128,44 +129,43 @@ func TestCpuStats(t *testing.T) {
expectedStats := cgroups.ThrottlingData{
Periods: nrPeriods,
ThrottledPeriods: nrThrottled,
ThrottledTime: throttledTime}
ThrottledTime: throttledTime,
}
expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData)
}
func TestNoCpuStatFile(t *testing.T) {
helper := NewCgroupTestUtil("cpu", t)
defer helper.cleanup()
path := tempDir(t, "cpu")
cpu := &CpuGroup{}
actualStats := *cgroups.NewStats()
err := cpu.GetStats(helper.CgroupPath, &actualStats)
err := cpu.GetStats(path, &actualStats)
if err != nil {
t.Fatal("Expected not to fail, but did")
}
}
func TestInvalidCpuStat(t *testing.T) {
helper := NewCgroupTestUtil("cpu", t)
defer helper.cleanup()
path := tempDir(t, "cpu")
cpuStatContent := `nr_periods 2000
nr_throttled 200
throttled_time fortytwo`
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"cpu.stat": cpuStatContent,
})
cpu := &CpuGroup{}
actualStats := *cgroups.NewStats()
err := cpu.GetStats(helper.CgroupPath, &actualStats)
err := cpu.GetStats(path, &actualStats)
if err == nil {
t.Fatal("Expected failed stat parsing.")
}
}
func TestCpuSetRtSchedAtApply(t *testing.T) {
helper := NewCgroupTestUtil("cpu", t)
defer helper.cleanup()
path := tempDir(t, "cpu")
const (
rtRuntimeBefore = 0
@ -174,35 +174,40 @@ func TestCpuSetRtSchedAtApply(t *testing.T) {
rtPeriodAfter = 7000
)
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore),
"cpu.rt_period_us": strconv.Itoa(rtPeriodBefore),
})
helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter
helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter
r := &configs.Resources{
CpuRtRuntime: rtRuntimeAfter,
CpuRtPeriod: rtPeriodAfter,
}
cpu := &CpuGroup{}
if err := cpu.ApplyDir(helper.CgroupPath, helper.CgroupData.config, 1234); err != nil {
if err := cpu.Apply(path, r, 1234); err != nil {
t.Fatal(err)
}
rtRuntime, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us")
rtRuntime, err := fscommon.GetCgroupParamUint(path, "cpu.rt_runtime_us")
if err != nil {
t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err)
t.Fatal(err)
}
if rtRuntime != rtRuntimeAfter {
t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.")
}
rtPeriod, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us")
rtPeriod, err := fscommon.GetCgroupParamUint(path, "cpu.rt_period_us")
if err != nil {
t.Fatalf("Failed to parse cpu.rt_period_us - %s", err)
t.Fatal(err)
}
if rtPeriod != rtPeriodAfter {
t.Fatal("Got the wrong value, set cpu.rt_period_us failed.")
}
pid, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cgroup.procs")
pid, err := fscommon.GetCgroupParamUint(path, "cgroup.procs")
if err != nil {
t.Fatalf("Failed to parse cgroup.procs - %s", err)
t.Fatal(err)
}
if pid != 1234 {
t.Fatal("Got the wrong value, set cgroup.procs failed.")

View File

@ -1,52 +1,51 @@
// +build linux
package fs
import (
"fmt"
"io/ioutil"
"path/filepath"
"bufio"
"os"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
)
const (
cgroupCpuacctStat = "cpuacct.stat"
cgroupCpuacctStat = "cpuacct.stat"
cgroupCpuacctUsageAll = "cpuacct.usage_all"
nanosecondsInSecond = 1000000000
userModeColumn = 1
kernelModeColumn = 2
cuacctUsageAllColumnsNumber = 3
// The value comes from `C.sysconf(C._SC_CLK_TCK)`, and
// on Linux it's a constant which is safe to be hard coded,
// so we can avoid using cgo here. For details, see:
// https://github.com/containerd/cgroups/pull/12
clockTicks uint64 = 100
)
var clockTicks = uint64(system.GetClockTicks())
type CpuacctGroup struct {
}
type CpuacctGroup struct{}
func (s *CpuacctGroup) Name() string {
return "cpuacct"
}
func (s *CpuacctGroup) Apply(d *cgroupData) error {
// we just want to join this group even though we don't set anything
if _, err := d.join("cpuacct"); err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
func (s *CpuacctGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *CpuacctGroup) Set(path string, cgroup *configs.Cgroup) error {
func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error {
return nil
}
func (s *CpuacctGroup) Remove(d *cgroupData) error {
return removePath(d.path("cpuacct"))
}
func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path)
if err != nil {
return err
@ -62,8 +61,15 @@ func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
return err
}
percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path)
if err != nil {
return err
}
stats.CpuStats.CpuUsage.TotalUsage = totalUsage
stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage
stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode
stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode
stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage
stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage
return nil
@ -71,52 +77,90 @@ func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
// Returns user and kernel usage breakdown in nanoseconds.
func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
userModeUsage := uint64(0)
kernelModeUsage := uint64(0)
var userModeUsage, kernelModeUsage uint64
const (
userField = "user"
systemField = "system"
file = cgroupCpuacctStat
)
// Expected format:
// user <usage in ticks>
// system <usage in ticks>
data, err := ioutil.ReadFile(filepath.Join(path, cgroupCpuacctStat))
data, err := cgroups.ReadFile(path, file)
if err != nil {
return 0, 0, err
}
fields := strings.Fields(string(data))
if len(fields) < 4 {
return 0, 0, fmt.Errorf("failure - %s is expected to have at least 4 fields", filepath.Join(path, cgroupCpuacctStat))
}
if fields[0] != userField {
return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField)
}
if fields[2] != systemField {
return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField)
// TODO: use strings.SplitN instead.
fields := strings.Fields(data)
if len(fields) < 4 || fields[0] != userField || fields[2] != systemField {
return 0, 0, malformedLine(path, file, data)
}
if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil {
return 0, 0, err
return 0, 0, &parseError{Path: path, File: file, Err: err}
}
if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil {
return 0, 0, err
return 0, 0, &parseError{Path: path, File: file, Err: err}
}
return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil
}
func getPercpuUsage(path string) ([]uint64, error) {
const file = "cpuacct.usage_percpu"
percpuUsage := []uint64{}
data, err := ioutil.ReadFile(filepath.Join(path, "cpuacct.usage_percpu"))
data, err := cgroups.ReadFile(path, file)
if err != nil {
return percpuUsage, err
}
for _, value := range strings.Fields(string(data)) {
// TODO: use strings.SplitN instead.
for _, value := range strings.Fields(data) {
value, err := strconv.ParseUint(value, 10, 64)
if err != nil {
return percpuUsage, fmt.Errorf("Unable to convert param value to uint64: %s", err)
return percpuUsage, &parseError{Path: path, File: file, Err: err}
}
percpuUsage = append(percpuUsage, value)
}
return percpuUsage, nil
}
func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
usageKernelMode := []uint64{}
usageUserMode := []uint64{}
const file = cgroupCpuacctUsageAll
fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if os.IsNotExist(err) {
return usageKernelMode, usageUserMode, nil
} else if err != nil {
return nil, nil, err
}
defer fd.Close()
scanner := bufio.NewScanner(fd)
scanner.Scan() // skipping header line
for scanner.Scan() {
lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1)
if len(lineFields) != cuacctUsageAllColumnsNumber {
continue
}
usageInKernelMode, err := strconv.ParseUint(lineFields[kernelModeColumn], 10, 64)
if err != nil {
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
usageKernelMode = append(usageKernelMode, usageInKernelMode)
usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64)
if err != nil {
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
usageUserMode = append(usageUserMode, usageInUserMode)
}
if err := scanner.Err(); err != nil {
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
return usageKernelMode, usageUserMode, nil
}

View File

@ -0,0 +1,97 @@
package fs
import (
"reflect"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
const (
cpuAcctUsageContents = "12262454190222160"
cpuAcctUsagePerCPUContents = "1564936537989058 1583937096487821 1604195415465681 1596445226820187 1481069084155629 1478735613864327 1477610593414743 1476362015778086"
cpuAcctStatContents = "user 452278264\nsystem 291429664"
cpuAcctUsageAll = `cpu user system
0 962250696038415 637727786389114
1 981956408513304 638197595421064
2 1002658817529022 638956774598358
3 994937703492523 637985531181620
4 874843781648690 638837766495476
5 872544369885276 638763309884944
6 870104915696359 640081778921247
7 870202363887496 638716766259495
`
)
func TestCpuacctStats(t *testing.T) {
path := tempDir(t, "cpuacct")
writeFileContents(t, path, map[string]string{
"cpuacct.usage": cpuAcctUsageContents,
"cpuacct.usage_percpu": cpuAcctUsagePerCPUContents,
"cpuacct.stat": cpuAcctStatContents,
"cpuacct.usage_all": cpuAcctUsageAll,
})
cpuacct := &CpuacctGroup{}
actualStats := *cgroups.NewStats()
err := cpuacct.GetStats(path, &actualStats)
if err != nil {
t.Fatal(err)
}
expectedStats := cgroups.CpuUsage{
TotalUsage: uint64(12262454190222160),
PercpuUsage: []uint64{
1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187,
1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086,
},
PercpuUsageInKernelmode: []uint64{
637727786389114, 638197595421064, 638956774598358, 637985531181620,
638837766495476, 638763309884944, 640081778921247, 638716766259495,
},
PercpuUsageInUsermode: []uint64{
962250696038415, 981956408513304, 1002658817529022, 994937703492523,
874843781648690, 872544369885276, 870104915696359, 870202363887496,
},
UsageInKernelmode: (uint64(291429664) * nanosecondsInSecond) / clockTicks,
UsageInUsermode: (uint64(452278264) * nanosecondsInSecond) / clockTicks,
}
if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) {
t.Errorf("Expected CPU usage %#v but found %#v\n",
expectedStats, actualStats.CpuStats.CpuUsage)
}
}
func TestCpuacctStatsWithoutUsageAll(t *testing.T) {
path := tempDir(t, "cpuacct")
writeFileContents(t, path, map[string]string{
"cpuacct.usage": cpuAcctUsageContents,
"cpuacct.usage_percpu": cpuAcctUsagePerCPUContents,
"cpuacct.stat": cpuAcctStatContents,
})
cpuacct := &CpuacctGroup{}
actualStats := *cgroups.NewStats()
err := cpuacct.GetStats(path, &actualStats)
if err != nil {
t.Fatal(err)
}
expectedStats := cgroups.CpuUsage{
TotalUsage: uint64(12262454190222160),
PercpuUsage: []uint64{
1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187,
1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086,
},
PercpuUsageInKernelmode: []uint64{},
PercpuUsageInUsermode: []uint64{},
UsageInKernelmode: (uint64(291429664) * nanosecondsInSecond) / clockTicks,
UsageInUsermode: (uint64(452278264) * nanosecondsInSecond) / clockTicks,
}
if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) {
t.Errorf("Expected CPU usage %#v but found %#v\n",
expectedStats, actualStats.CpuStats.CpuUsage)
}
}

View File

@ -1,75 +1,159 @@
// +build linux
package fs
import (
"bytes"
"fmt"
"io/ioutil"
"errors"
"os"
"path/filepath"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
)
type CpusetGroup struct {
}
type CpusetGroup struct{}
func (s *CpusetGroup) Name() string {
return "cpuset"
}
func (s *CpusetGroup) Apply(d *cgroupData) error {
dir, err := d.path("cpuset")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return s.ApplyDir(dir, d.config, d.pid)
func (s *CpusetGroup) Apply(path string, r *configs.Resources, pid int) error {
return s.ApplyDir(path, r, pid)
}
func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpusetCpus != "" {
if err := fscommon.WriteFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
func (s *CpusetGroup) Set(path string, r *configs.Resources) error {
if r.CpusetCpus != "" {
if err := cgroups.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil {
return err
}
}
if cgroup.Resources.CpusetMems != "" {
if err := fscommon.WriteFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
if r.CpusetMems != "" {
if err := cgroups.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil {
return err
}
}
return nil
}
func (s *CpusetGroup) Remove(d *cgroupData) error {
return removePath(d.path("cpuset"))
func getCpusetStat(path string, file string) ([]uint16, error) {
var extracted []uint16
fileContent, err := fscommon.GetCgroupParamString(path, file)
if err != nil {
return extracted, err
}
if len(fileContent) == 0 {
return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")}
}
for _, s := range strings.Split(fileContent, ",") {
sp := strings.SplitN(s, "-", 3)
switch len(sp) {
case 3:
return extracted, &parseError{Path: path, File: file, Err: errors.New("extra dash")}
case 2:
min, err := strconv.ParseUint(sp[0], 10, 16)
if err != nil {
return extracted, &parseError{Path: path, File: file, Err: err}
}
max, err := strconv.ParseUint(sp[1], 10, 16)
if err != nil {
return extracted, &parseError{Path: path, File: file, Err: err}
}
if min > max {
return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, min > max")}
}
for i := min; i <= max; i++ {
extracted = append(extracted, uint16(i))
}
case 1:
value, err := strconv.ParseUint(s, 10, 16)
if err != nil {
return extracted, &parseError{Path: path, File: file, Err: err}
}
extracted = append(extracted, uint16(value))
}
}
return extracted, nil
}
func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
var err error
stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.Mems, err = getCpusetStat(path, "cpuset.mems")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
return nil
}
func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error {
// This might happen if we have no cpuset cgroup mounted.
// Just do nothing and don't fail.
if dir == "" {
return nil
}
mountInfo, err := ioutil.ReadFile("/proc/self/mountinfo")
if err != nil {
return err
}
root := filepath.Dir(cgroups.GetClosestMountpointAncestor(dir, string(mountInfo)))
// 'ensureParent' start with parent because we don't want to
// explicitly inherit from parent, it could conflict with
// 'cpuset.cpu_exclusive'.
if err := s.ensureParent(filepath.Dir(dir), root); err != nil {
if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil {
return err
}
if err := os.MkdirAll(dir, 0755); err != nil {
if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) {
return err
}
// We didn't inherit cpuset configs from parent, but we have
@ -79,82 +163,83 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatibility.
if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
if err := s.ensureCpusAndMems(dir, r); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
// Since we are not using apply(), we need to place the pid
// into the procs file.
return cgroups.WriteCgroupProc(dir, pid)
}
func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) {
if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil {
func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) {
if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil {
return
}
if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil {
if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil {
return
}
return cpus, mems, nil
}
// ensureParent makes sure that the parent directory of current is created
// and populated with the proper cpus and mems files copied from
// it's parent.
func (s *CpusetGroup) ensureParent(current, root string) error {
// cpusetEnsureParent makes sure that the parent directories of current
// are created and populated with the proper cpus and mems files copied
// from their respective parent. It does that recursively, starting from
// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point).
func cpusetEnsureParent(current string) error {
var st unix.Statfs_t
parent := filepath.Dir(current)
if libcontainerUtils.CleanPath(parent) == root {
err := unix.Statfs(parent, &st)
if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC {
return nil
}
// Avoid infinite recursion.
if parent == current {
return fmt.Errorf("cpuset: cgroup parent path outside cgroup root")
// Treat non-existing directory as cgroupfs as it will be created,
// and the root cpuset directory obviously exists.
if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare
return &os.PathError{Op: "statfs", Path: parent, Err: err}
}
if err := s.ensureParent(parent, root); err != nil {
if err := cpusetEnsureParent(parent); err != nil {
return err
}
if err := os.MkdirAll(current, 0755); err != nil {
if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) {
return err
}
return s.copyIfNeeded(current, parent)
return cpusetCopyIfNeeded(current, parent)
}
// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// directory to the current directory if the file's contents are 0
func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
var (
err error
currentCpus, currentMems []byte
parentCpus, parentMems []byte
)
if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil {
func cpusetCopyIfNeeded(current, parent string) error {
currentCpus, currentMems, err := getCpusetSubsystemSettings(current)
if err != nil {
return err
}
if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil {
parentCpus, parentMems, err := getCpusetSubsystemSettings(parent)
if err != nil {
return err
}
if s.isEmpty(currentCpus) {
if err := fscommon.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil {
if isEmptyCpuset(currentCpus) {
if err := cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil {
return err
}
}
if s.isEmpty(currentMems) {
if err := fscommon.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil {
if isEmptyCpuset(currentMems) {
if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil {
return err
}
}
return nil
}
func (s *CpusetGroup) isEmpty(b []byte) bool {
return len(bytes.Trim(b, "\n")) == 0
func isEmptyCpuset(str string) bool {
return str == "" || str == "\n"
}
func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
if err := s.Set(path, cgroup); err != nil {
func (s *CpusetGroup) ensureCpusAndMems(path string, r *configs.Resources) error {
if err := s.Set(path, r); err != nil {
return err
}
return s.copyIfNeeded(path, filepath.Dir(path))
return cpusetCopyIfNeeded(path, filepath.Dir(path))
}

View File

@ -1,67 +1,242 @@
// +build linux
package fs
import (
"reflect"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func TestCpusetSetCpus(t *testing.T) {
helper := NewCgroupTestUtil("cpuset", t)
defer helper.cleanup()
const (
cpus = "0-2,7,12-14\n"
cpuExclusive = "1\n"
mems = "1-4,6,9\n"
memHardwall = "0\n"
memExclusive = "0\n"
memoryMigrate = "1\n"
memorySpreadPage = "0\n"
memorySpeadSlab = "1\n"
memoryPressure = "34377\n"
schedLoadBalance = "1\n"
schedRelaxDomainLevel = "-1\n"
)
var cpusetTestFiles = map[string]string{
"cpuset.cpus": cpus,
"cpuset.cpu_exclusive": cpuExclusive,
"cpuset.mems": mems,
"cpuset.mem_hardwall": memHardwall,
"cpuset.mem_exclusive": memExclusive,
"cpuset.memory_migrate": memoryMigrate,
"cpuset.memory_spread_page": memorySpreadPage,
"cpuset.memory_spread_slab": memorySpeadSlab,
"cpuset.memory_pressure": memoryPressure,
"cpuset.sched_load_balance": schedLoadBalance,
"cpuset.sched_relax_domain_level": schedRelaxDomainLevel,
}
func TestCPUSetSetCpus(t *testing.T) {
path := tempDir(t, "cpuset")
const (
cpusBefore = "0"
cpusAfter = "1-3"
)
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"cpuset.cpus": cpusBefore,
})
helper.CgroupData.config.Resources.CpusetCpus = cpusAfter
r := &configs.Resources{
CpusetCpus: cpusAfter,
}
cpuset := &CpusetGroup{}
if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := cpuset.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.cpus")
value, err := fscommon.GetCgroupParamString(path, "cpuset.cpus")
if err != nil {
t.Fatalf("Failed to parse cpuset.cpus - %s", err)
t.Fatal(err)
}
if value != cpusAfter {
t.Fatal("Got the wrong value, set cpuset.cpus failed.")
}
}
func TestCpusetSetMems(t *testing.T) {
helper := NewCgroupTestUtil("cpuset", t)
defer helper.cleanup()
func TestCPUSetSetMems(t *testing.T) {
path := tempDir(t, "cpuset")
const (
memsBefore = "0"
memsAfter = "1"
)
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"cpuset.mems": memsBefore,
})
helper.CgroupData.config.Resources.CpusetMems = memsAfter
r := &configs.Resources{
CpusetMems: memsAfter,
}
cpuset := &CpusetGroup{}
if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := cpuset.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.mems")
value, err := fscommon.GetCgroupParamString(path, "cpuset.mems")
if err != nil {
t.Fatalf("Failed to parse cpuset.mems - %s", err)
t.Fatal(err)
}
if value != memsAfter {
t.Fatal("Got the wrong value, set cpuset.mems failed.")
}
}
func TestCPUSetStatsCorrect(t *testing.T) {
path := tempDir(t, "cpuset")
writeFileContents(t, path, cpusetTestFiles)
cpuset := &CpusetGroup{}
actualStats := *cgroups.NewStats()
err := cpuset.GetStats(path, &actualStats)
if err != nil {
t.Fatal(err)
}
expectedStats := cgroups.CPUSetStats{
CPUs: []uint16{0, 1, 2, 7, 12, 13, 14},
CPUExclusive: 1,
Mems: []uint16{1, 2, 3, 4, 6, 9},
MemoryMigrate: 1,
MemHardwall: 0,
MemExclusive: 0,
MemorySpreadPage: 0,
MemorySpreadSlab: 1,
MemoryPressure: 34377,
SchedLoadBalance: 1,
SchedRelaxDomainLevel: -1,
}
if !reflect.DeepEqual(expectedStats, actualStats.CPUSetStats) {
t.Fatalf("Expected Cpuset stats usage %#v but found %#v",
expectedStats, actualStats.CPUSetStats)
}
}
func TestCPUSetStatsMissingFiles(t *testing.T) {
for _, testCase := range []struct {
desc string
filename, contents string
removeFile bool
}{
{
desc: "empty cpus file",
filename: "cpuset.cpus",
contents: "",
removeFile: false,
},
{
desc: "empty mems file",
filename: "cpuset.mems",
contents: "",
removeFile: false,
},
{
desc: "corrupted cpus file",
filename: "cpuset.cpus",
contents: "0-3,*4^2",
removeFile: false,
},
{
desc: "corrupted mems file",
filename: "cpuset.mems",
contents: "0,1,2-5,8-7",
removeFile: false,
},
{
desc: "missing cpu_exclusive file",
filename: "cpuset.cpu_exclusive",
contents: "",
removeFile: true,
},
{
desc: "missing memory_migrate file",
filename: "cpuset.memory_migrate",
contents: "",
removeFile: true,
},
{
desc: "missing mem_hardwall file",
filename: "cpuset.mem_hardwall",
contents: "",
removeFile: true,
},
{
desc: "missing mem_exclusive file",
filename: "cpuset.mem_exclusive",
contents: "",
removeFile: true,
},
{
desc: "missing memory_spread_page file",
filename: "cpuset.memory_spread_page",
contents: "",
removeFile: true,
},
{
desc: "missing memory_spread_slab file",
filename: "cpuset.memory_spread_slab",
contents: "",
removeFile: true,
},
{
desc: "missing memory_pressure file",
filename: "cpuset.memory_pressure",
contents: "",
removeFile: true,
},
{
desc: "missing sched_load_balance file",
filename: "cpuset.sched_load_balance",
contents: "",
removeFile: true,
},
{
desc: "missing sched_relax_domain_level file",
filename: "cpuset.sched_relax_domain_level",
contents: "",
removeFile: true,
},
} {
t.Run(testCase.desc, func(t *testing.T) {
path := tempDir(t, "cpuset")
tempCpusetTestFiles := map[string]string{}
for i, v := range cpusetTestFiles {
tempCpusetTestFiles[i] = v
}
if testCase.removeFile {
delete(tempCpusetTestFiles, testCase.filename)
writeFileContents(t, path, tempCpusetTestFiles)
cpuset := &CpusetGroup{}
actualStats := *cgroups.NewStats()
err := cpuset.GetStats(path, &actualStats)
if err != nil {
t.Errorf("failed unexpectedly: %q", err)
}
} else {
tempCpusetTestFiles[testCase.filename] = testCase.contents
writeFileContents(t, path, tempCpusetTestFiles)
cpuset := &CpusetGroup{}
actualStats := *cgroups.NewStats()
err := cpuset.GetStats(path, &actualStats)
if err == nil {
t.Error("failed to return expected error")
}
}
})
}
}

View File

@ -1,81 +1,109 @@
// +build linux
package fs
import (
"bytes"
"errors"
"reflect"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
)
type DevicesGroup struct {
TestingSkipFinalCheck bool
}
func (s *DevicesGroup) Name() string {
return "devices"
}
func (s *DevicesGroup) Apply(d *cgroupData) error {
_, err := d.join("devices")
func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error {
if r.SkipDevices {
return nil
}
if path == "" {
// Return error here, since devices cgroup
// is a hard requirement for container's security.
return errSubsystemDoesNotExist
}
return apply(path, pid)
}
func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
list, err := cgroups.ReadFile(path, "devices.list")
if err != nil {
return nil, err
}
return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list))
}
func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
// This defaults to a white-list -- which is what we want!
emu := &cgroupdevices.Emulator{}
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, err
}
}
return emu, nil
}
func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if userns.RunningInUserNS() || r.SkipDevices {
return nil
}
// Generate two emulators, one for the current state of the cgroup and one
// for the requested state by the user.
current, err := loadEmulator(path)
if err != nil {
// We will return error even it's `not found` error, devices
// cgroup is hard requirement for container's security.
return err
}
return nil
}
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if system.RunningInUserNS() {
return nil
target, err := buildEmulator(r.Devices)
if err != nil {
return err
}
devices := cgroup.Resources.Devices
if len(devices) > 0 {
for _, dev := range devices {
file := "devices.deny"
if dev.Allow {
file = "devices.allow"
}
if err := fscommon.WriteFile(path, file, dev.CgroupString()); err != nil {
return err
}
}
return nil
// Compute the minimal set of transition rules needed to achieve the
// requested state.
transitionRules, err := current.Transition(target)
if err != nil {
return err
}
if cgroup.Resources.AllowAllDevices != nil {
if *cgroup.Resources.AllowAllDevices == false {
if err := fscommon.WriteFile(path, "devices.deny", "a"); err != nil {
return err
}
for _, dev := range cgroup.Resources.AllowedDevices {
if err := fscommon.WriteFile(path, "devices.allow", dev.CgroupString()); err != nil {
return err
}
}
return nil
for _, rule := range transitionRules {
file := "devices.deny"
if rule.Allow {
file = "devices.allow"
}
if err := fscommon.WriteFile(path, "devices.allow", "a"); err != nil {
if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil {
return err
}
}
for _, dev := range cgroup.Resources.DeniedDevices {
if err := fscommon.WriteFile(path, "devices.deny", dev.CgroupString()); err != nil {
// Final safety check -- ensure that the resulting state is what was
// requested. This is only really correct for white-lists, but for
// black-lists we can at least check that the cgroup is in the right mode.
//
// This safety-check is skipped for the unit tests because we cannot
// currently mock devices.list correctly.
if !s.TestingSkipFinalCheck {
currentAfter, err := loadEmulator(path)
if err != nil {
return err
}
if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
return errors.New("resulting devices cgroup doesn't precisely match target")
} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
return errors.New("resulting devices cgroup doesn't match target mode")
}
}
return nil
}
func (s *DevicesGroup) Remove(d *cgroupData) error {
return removePath(d.path("devices"))
}
func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View File

@ -1,5 +1,3 @@
// +build linux
package fs
import (
@ -7,93 +5,48 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
allowedDevices = []*configs.Device{
{
Path: "/dev/zero",
Type: 'c',
Major: 1,
Minor: 5,
Permissions: "rwm",
FileMode: 0666,
},
}
allowedList = "c 1:5 rwm"
deniedDevices = []*configs.Device{
{
Path: "/dev/null",
Type: 'c',
Major: 1,
Minor: 3,
Permissions: "rwm",
FileMode: 0666,
},
}
deniedList = "c 1:3 rwm"
"github.com/opencontainers/runc/libcontainer/devices"
)
func TestDevicesSetAllow(t *testing.T) {
helper := NewCgroupTestUtil("devices", t)
defer helper.cleanup()
path := tempDir(t, "devices")
helper.writeFileContents(map[string]string{
"devices.deny": "a",
})
allowAllDevices := false
helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices
helper.CgroupData.config.Resources.AllowedDevices = allowedDevices
devices := &DevicesGroup{}
if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow")
if err != nil {
t.Fatalf("Failed to parse devices.allow - %s", err)
}
if value != allowedList {
t.Fatal("Got the wrong value, set devices.allow failed.")
}
// When AllowAllDevices is nil, devices.allow file should not be modified.
helper.CgroupData.config.Resources.AllowAllDevices = nil
if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err = fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow")
if err != nil {
t.Fatalf("Failed to parse devices.allow - %s", err)
}
if value != allowedList {
t.Fatal("devices policy shouldn't have changed on AllowedAllDevices=nil.")
}
}
func TestDevicesSetDeny(t *testing.T) {
helper := NewCgroupTestUtil("devices", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
"devices.allow": "a",
writeFileContents(t, path, map[string]string{
"devices.allow": "",
"devices.deny": "",
"devices.list": "a *:* rwm",
})
allowAllDevices := true
helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices
helper.CgroupData.config.Resources.DeniedDevices = deniedDevices
devices := &DevicesGroup{}
if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
r := &configs.Resources{
Devices: []*devices.Rule{
{
Type: devices.CharDevice,
Major: 1,
Minor: 5,
Permissions: devices.Permissions("rwm"),
Allow: true,
},
},
}
d := &DevicesGroup{TestingSkipFinalCheck: true}
if err := d.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.deny")
// The default deny rule must be written.
value, err := fscommon.GetCgroupParamString(path, "devices.deny")
if err != nil {
t.Fatalf("Failed to parse devices.deny - %s", err)
t.Fatal(err)
}
if value[0] != 'a' {
t.Errorf("Got the wrong value (%q), set devices.deny failed.", value)
}
if value != deniedList {
t.Fatal("Got the wrong value, set devices.deny failed.")
// Permitted rule must be written.
if value, err := fscommon.GetCgroupParamString(path, "devices.allow"); err != nil {
t.Fatal(err)
} else if value != "c 1:5 rwm" {
t.Errorf("Got the wrong value (%q), set devices.allow failed.", value)
}
}

View File

@ -0,0 +1,15 @@
package fs
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
type parseError = fscommon.ParseError
// malformedLine is used by all cgroupfs file parsers that expect a line
// in a particular format but get some garbage instead.
func malformedLine(path, file, line string) error {
return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)}
}

View File

@ -1,67 +1,158 @@
// +build linux
package fs
import (
"errors"
"fmt"
"os"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
type FreezerGroup struct {
}
type FreezerGroup struct{}
func (s *FreezerGroup) Name() string {
return "freezer"
}
func (s *FreezerGroup) Apply(d *cgroupData) error {
_, err := d.join("freezer")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
func (s *FreezerGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
switch cgroup.Resources.Freezer {
case configs.Frozen, configs.Thawed:
for {
// In case this loop does not exit because it doesn't get the expected
// state, let's write again this state, hoping it's going to be properly
// set this time. Otherwise, this loop could run infinitely, waiting for
// a state change that would never happen.
if err := fscommon.WriteFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
switch r.Freezer {
case configs.Frozen:
defer func() {
if Err != nil {
// Freezing failed, and it is bad and dangerous
// to leave the cgroup in FROZEN or FREEZING
// state, so (try to) thaw it back.
_ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
}
}()
// As per older kernel docs (freezer-subsystem.txt before
// kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
// userspace should either retry or thaw. While current
// kernel cgroup v1 docs no longer mention a need to retry,
// even a recent kernel (v5.4, Ubuntu 20.04) can't reliably
// freeze a cgroup v1 while new processes keep appearing in it
// (either via fork/clone or by writing new PIDs to
// cgroup.procs).
//
// The numbers below are empirically chosen to have a decent
// chance to succeed in various scenarios ("runc pause/unpause
// with parallel runc exec" and "bare freeze/unfreeze on a very
// slow system"), tested on RHEL7 and Ubuntu 20.04 kernels.
//
// Adding any amount of sleep in between retries did not
// increase the chances of successful freeze in "pause/unpause
// with parallel exec" reproducer. OTOH, adding an occasional
// sleep helped for the case where the system is extremely slow
// (CentOS 7 VM on GHA CI).
//
// Alas, this is still a game of chances, since the real fix
// belong to the kernel (cgroup v2 do not have this bug).
for i := 0; i < 1000; i++ {
if i%50 == 49 {
// Occasional thaw and sleep improves
// the chances to succeed in freezing
// in case new processes keep appearing
// in the cgroup.
_ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
time.Sleep(10 * time.Millisecond)
}
if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
return err
}
state, err := fscommon.ReadFile(path, "freezer.state")
if i%25 == 24 {
// Occasional short sleep before reading
// the state back also improves the chances to
// succeed in freezing in case of a very slow
// system.
time.Sleep(10 * time.Microsecond)
}
state, err := cgroups.ReadFile(path, "freezer.state")
if err != nil {
return err
}
if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) {
break
state = strings.TrimSpace(state)
switch state {
case "FREEZING":
continue
case string(configs.Frozen):
if i > 1 {
logrus.Debugf("frozen after %d retries", i)
}
return nil
default:
// should never happen
return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state))
}
time.Sleep(1 * time.Millisecond)
}
// Despite our best efforts, it got stuck in FREEZING.
return errors.New("unable to freeze")
case configs.Thawed:
return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
case configs.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer))
}
return nil
}
func (s *FreezerGroup) Remove(d *cgroupData) error {
return removePath(d.path("freezer"))
}
func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) {
for {
state, err := cgroups.ReadFile(path, "freezer.state")
if err != nil {
// If the kernel is too old, then we just treat the freezer as
// being in an "undefined" state.
if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Undefined, err
}
switch strings.TrimSpace(state) {
case "THAWED":
return configs.Thawed, nil
case "FROZEN":
// Find out whether the cgroup is frozen directly,
// or indirectly via an ancestor.
self, err := cgroups.ReadFile(path, "freezer.self_freezing")
if err != nil {
// If the kernel is too old, then we just treat
// it as being frozen.
if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Frozen, err
}
switch self {
case "0\n":
return configs.Thawed, nil
case "1\n":
return configs.Frozen, nil
default:
return configs.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self)
}
case "FREEZING":
// Make sure we get a stable freezer state, so retry if the cgroup
// is still undergoing freezing. This should be a temporary delay.
time.Sleep(1 * time.Millisecond)
continue
default:
return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state)
}
}
}

View File

@ -1,5 +1,3 @@
// +build linux
package fs
import (
@ -10,22 +8,23 @@ import (
)
func TestFreezerSetState(t *testing.T) {
helper := NewCgroupTestUtil("freezer", t)
defer helper.cleanup()
path := tempDir(t, "freezer")
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"freezer.state": string(configs.Frozen),
})
helper.CgroupData.config.Resources.Freezer = configs.Thawed
r := &configs.Resources{
Freezer: configs.Thawed,
}
freezer := &FreezerGroup{}
if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := freezer.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "freezer.state")
value, err := fscommon.GetCgroupParamString(path, "freezer.state")
if err != nil {
t.Fatalf("Failed to parse freezer.state - %s", err)
t.Fatal(err)
}
if value != string(configs.Thawed) {
t.Fatal("Got the wrong value, set freezer.state failed.")
@ -33,16 +32,15 @@ func TestFreezerSetState(t *testing.T) {
}
func TestFreezerSetInvalidState(t *testing.T) {
helper := NewCgroupTestUtil("freezer", t)
defer helper.cleanup()
path := tempDir(t, "freezer")
const (
invalidArg configs.FreezerState = "Invalid"
)
const invalidArg configs.FreezerState = "Invalid"
helper.CgroupData.config.Resources.Freezer = invalidArg
r := &configs.Resources{
Freezer: invalidArg,
}
freezer := &FreezerGroup{}
if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err == nil {
if err := freezer.Set(path, r); err == nil {
t.Fatal("Failed to return invalid argument error")
}
}

View File

@ -0,0 +1,264 @@
package fs
import (
"errors"
"fmt"
"os"
"sync"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
var subsystems = []subsystem{
&CpusetGroup{},
&DevicesGroup{},
&MemoryGroup{},
&CpuGroup{},
&CpuacctGroup{},
&PidsGroup{},
&BlkioGroup{},
&HugetlbGroup{},
&NetClsGroup{},
&NetPrioGroup{},
&PerfEventGroup{},
&FreezerGroup{},
&RdmaGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
}
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
func init() {
// If using cgroups-hybrid mode then add a "" controller indicating
// it should join the cgroups v2.
if cgroups.IsCgroup2HybridMode() {
subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true})
}
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// GetStats fills in the stats for the subsystem.
GetStats(path string, stats *cgroups.Stats) error
// Apply creates and joins a cgroup, adding pid into it. Some
// subsystems use resources to pre-configure the cgroup parents
// before creating or joining it.
Apply(path string, r *configs.Resources, pid int) error
// Set sets the cgroup resources.
Set(path string, r *configs.Resources) error
}
type manager struct {
mu sync.Mutex
cgroups *configs.Cgroup
paths map[string]string
}
func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
// Some v1 controllers (cpu, cpuset, and devices) expect
// cgroups.Resources to not be nil in Apply.
if cg.Resources == nil {
return nil, errors.New("cgroup v1 manager needs configs.Resources to be set during manager creation")
}
if cg.Resources.Unified != nil {
return nil, cgroups.ErrV1NoUnified
}
if paths == nil {
var err error
paths, err = initPaths(cg)
if err != nil {
return nil, err
}
}
return &manager{
cgroups: cg,
paths: paths,
}, nil
}
// isIgnorableError returns whether err is a permission error (in the loose
// sense of the word). This includes EROFS (which for an unprivileged user is
// basically a permission error) and EACCES (for similar reasons) as well as
// the normal EPERM.
func isIgnorableError(rootless bool, err error) bool {
// We do not ignore errors if we are root.
if !rootless {
return false
}
// Is it an ordinary EPERM?
if errors.Is(err, os.ErrPermission) {
return true
}
// Handle some specific syscall errors.
var errno unix.Errno
if errors.As(err, &errno) {
return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
}
return false
}
func (m *manager) Apply(pid int) (err error) {
m.mu.Lock()
defer m.mu.Unlock()
c := m.cgroups
for _, sys := range subsystems {
name := sys.Name()
p, ok := m.paths[name]
if !ok {
continue
}
if err := sys.Apply(p, c.Resources, pid); err != nil {
// In the case of rootless (including euid=0 in userns), where an
// explicit cgroup path hasn't been set, we don't bail on error in
// case of permission problems here, but do delete the path from
// the m.paths map, since it is either non-existent and could not
// be created, or the pid could not be added to it.
//
// Cases where limits for the subsystem have been set are handled
// later by Set, which fails with a friendly error (see
// if path == "" in Set).
if isIgnorableError(c.Rootless, err) && c.Path == "" {
delete(m.paths, name)
continue
}
return err
}
}
return nil
}
func (m *manager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
return cgroups.RemovePaths(m.paths)
}
func (m *manager) Path(subsys string) string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths[subsys]
}
func (m *manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if path == "" {
continue
}
if err := sys.GetStats(path, stats); err != nil {
return nil, err
}
}
return stats, nil
}
func (m *manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
if r.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.mu.Lock()
defer m.mu.Unlock()
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if err := sys.Set(path, r); err != nil {
// When rootless is true, errors from the device subsystem
// are ignored, as it is really not expected to work.
if m.cgroups.Rootless && sys.Name() == "devices" {
continue
}
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if path == "" {
// We never created a path for this cgroup, so we cannot set
// limits for it (though we have already tried at this point).
return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
}
return err
}
}
return nil
}
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *manager) Freeze(state configs.FreezerState) error {
path := m.Path("freezer")
if path == "" {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
prevState := m.cgroups.Resources.Freezer
m.cgroups.Resources.Freezer = state
freezer := &FreezerGroup{}
if err := freezer.Set(path, m.cgroups.Resources); err != nil {
m.cgroups.Resources.Freezer = prevState
return err
}
return nil
}
func (m *manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.Path("devices"))
}
func (m *manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.Path("devices"))
}
func (m *manager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths
}
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
return m.cgroups, nil
}
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
dir := m.Path("freezer")
// If the container doesn't have the freezer cgroup, say it's undefined.
if dir == "" {
return configs.Undefined, nil
}
freezer := &FreezerGroup{}
return freezer.GetState(dir)
}
func (m *manager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.Path("memory"))
// Ignore ENOENT when rootless as it couldn't create cgroup.
if err != nil && m.cgroups.Rootless && os.IsNotExist(err) {
err = nil
}
return c, err
}

View File

@ -0,0 +1,50 @@
package fs
import (
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
func BenchmarkGetStats(b *testing.B) {
if cgroups.IsCgroup2UnifiedMode() {
b.Skip("cgroup v2 is not supported")
}
// Unset TestMode as we work with real cgroupfs here,
// and we want OpenFile to perform the fstype check.
cgroups.TestMode = false
defer func() {
cgroups.TestMode = true
}()
cg := &configs.Cgroup{
Path: "/some/kind/of/a/path/here",
Resources: &configs.Resources{},
}
m, err := NewManager(cg, nil)
if err != nil {
b.Fatal(err)
}
err = m.Apply(-1)
if err != nil {
b.Fatal(err)
}
defer func() {
_ = m.Destroy()
}()
var st *cgroups.Stats
b.ResetTimer()
for i := 0; i < b.N; i++ {
st, err = m.GetStats()
if err != nil {
b.Fatal(err)
}
}
if st.CpuStats.CpuUsage.TotalUsage != 0 {
b.Fatalf("stats: %+v", st)
}
}

View File

@ -1,3 +0,0 @@
// +build !linux
package fs

View File

@ -1,35 +1,26 @@
// +build linux
package fs
import (
"fmt"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type HugetlbGroup struct {
}
type HugetlbGroup struct{}
func (s *HugetlbGroup) Name() string {
return "hugetlb"
}
func (s *HugetlbGroup) Apply(d *cgroupData) error {
_, err := d.join("hugetlb")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, hugetlb := range cgroup.Resources.HugetlbLimit {
if err := fscommon.WriteFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
for _, hugetlb := range r.HugetlbLimit {
if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}
@ -37,31 +28,30 @@ func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}
func (s *HugetlbGroup) Remove(d *cgroupData) error {
return removePath(d.path("hugetlb"))
}
func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
hugetlbStats := cgroups.HugetlbStats{}
for _, pageSize := range HugePageSizes {
usage := strings.Join([]string{"hugetlb", pageSize, "usage_in_bytes"}, ".")
for _, pageSize := range cgroups.HugePageSizes() {
usage := "hugetlb." + pageSize + ".usage_in_bytes"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", usage, err)
return err
}
hugetlbStats.Usage = value
maxUsage := strings.Join([]string{"hugetlb", pageSize, "max_usage_in_bytes"}, ".")
maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", maxUsage, err)
return err
}
hugetlbStats.MaxUsage = value
failcnt := strings.Join([]string{"hugetlb", pageSize, "failcnt"}, ".")
failcnt := "hugetlb." + pageSize + ".failcnt"
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", failcnt, err)
return err
}
hugetlbStats.Failcnt = value

View File

@ -1,5 +1,3 @@
// +build linux
package fs
import (
@ -18,7 +16,7 @@ const (
hugetlbFailcnt = "100\n"
)
var (
const (
usage = "hugetlb.%s.usage_in_bytes"
limit = "hugetlb.%s.limit_in_bytes"
maxUsage = "hugetlb.%s.max_usage_in_bytes"
@ -26,38 +24,38 @@ var (
)
func TestHugetlbSetHugetlb(t *testing.T) {
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
path := tempDir(t, "hugetlb")
const (
hugetlbBefore = 256
hugetlbAfter = 512
)
for _, pageSize := range HugePageSizes {
helper.writeFileContents(map[string]string{
for _, pageSize := range cgroups.HugePageSizes() {
writeFileContents(t, path, map[string]string{
fmt.Sprintf(limit, pageSize): strconv.Itoa(hugetlbBefore),
})
}
for _, pageSize := range HugePageSizes {
helper.CgroupData.config.Resources.HugetlbLimit = []*configs.HugepageLimit{
r := &configs.Resources{}
for _, pageSize := range cgroups.HugePageSizes() {
r.HugetlbLimit = []*configs.HugepageLimit{
{
Pagesize: pageSize,
Limit: hugetlbAfter,
},
}
hugetlb := &HugetlbGroup{}
if err := hugetlb.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := hugetlb.Set(path, r); err != nil {
t.Fatal(err)
}
}
for _, pageSize := range HugePageSizes {
for _, pageSize := range cgroups.HugePageSizes() {
limit := fmt.Sprintf(limit, pageSize)
value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, limit)
value, err := fscommon.GetCgroupParamUint(path, limit)
if err != nil {
t.Fatalf("Failed to parse %s - %s", limit, err)
t.Fatal(err)
}
if value != hugetlbAfter {
t.Fatalf("Set hugetlb.limit_in_bytes failed. Expected: %v, Got: %v", hugetlbAfter, value)
@ -66,10 +64,9 @@ func TestHugetlbSetHugetlb(t *testing.T) {
}
func TestHugetlbStats(t *testing.T) {
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
for _, pageSize := range HugePageSizes {
helper.writeFileContents(map[string]string{
path := tempDir(t, "hugetlb")
for _, pageSize := range cgroups.HugePageSizes() {
writeFileContents(t, path, map[string]string{
fmt.Sprintf(usage, pageSize): hugetlbUsageContents,
fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents,
fmt.Sprintf(failcnt, pageSize): hugetlbFailcnt,
@ -78,56 +75,50 @@ func TestHugetlbStats(t *testing.T) {
hugetlb := &HugetlbGroup{}
actualStats := *cgroups.NewStats()
err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
err := hugetlb.GetStats(path, &actualStats)
if err != nil {
t.Fatal(err)
}
expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100}
for _, pageSize := range HugePageSizes {
for _, pageSize := range cgroups.HugePageSizes() {
expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize])
}
}
func TestHugetlbStatsNoUsageFile(t *testing.T) {
t.Skip("Disabled unreliable test")
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
path := tempDir(t, "hugetlb")
writeFileContents(t, path, map[string]string{
maxUsage: hugetlbMaxUsageContents,
})
hugetlb := &HugetlbGroup{}
actualStats := *cgroups.NewStats()
err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
err := hugetlb.GetStats(path, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
t.Skip("Disabled unreliable test")
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
for _, pageSize := range HugePageSizes {
helper.writeFileContents(map[string]string{
path := tempDir(t, "hugetlb")
for _, pageSize := range cgroups.HugePageSizes() {
writeFileContents(t, path, map[string]string{
fmt.Sprintf(usage, pageSize): hugetlbUsageContents,
})
}
hugetlb := &HugetlbGroup{}
actualStats := *cgroups.NewStats()
err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
err := hugetlb.GetStats(path, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestHugetlbStatsBadUsageFile(t *testing.T) {
t.Skip("Disabled unreliable test")
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
for _, pageSize := range HugePageSizes {
helper.writeFileContents(map[string]string{
path := tempDir(t, "hugetlb")
for _, pageSize := range cgroups.HugePageSizes() {
writeFileContents(t, path, map[string]string{
fmt.Sprintf(usage, pageSize): "bad",
maxUsage: hugetlbMaxUsageContents,
})
@ -135,24 +126,22 @@ t.Skip("Disabled unreliable test")
hugetlb := &HugetlbGroup{}
actualStats := *cgroups.NewStats()
err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
err := hugetlb.GetStats(path, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestHugetlbStatsBadMaxUsageFile(t *testing.T) {
t.Skip("Disabled unreliable test")
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
path := tempDir(t, "hugetlb")
writeFileContents(t, path, map[string]string{
usage: hugetlbUsageContents,
maxUsage: "bad",
})
hugetlb := &HugetlbGroup{}
actualStats := *cgroups.NewStats()
err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
err := hugetlb.GetStats(path, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}

View File

@ -1,62 +0,0 @@
// +build linux,!nokmem
package fs
import (
"errors"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"syscall" // for Errno type only
"github.com/opencontainers/runc/libcontainer/cgroups"
"golang.org/x/sys/unix"
)
const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
func EnableKernelMemoryAccounting(path string) error {
// Ensure that kernel memory is available in this kernel build. If it
// isn't, we just ignore it because EnableKernelMemoryAccounting is
// automatically called for all memory limits.
if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
return nil
}
// We have to limit the kernel memory here as it won't be accounted at all
// until a limit is set on the cgroup and limit cannot be set once the
// cgroup has children, or if there are already tasks in the cgroup.
for _, i := range []int64{1, -1} {
if err := setKernelMemory(path, i); err != nil {
return err
}
}
return nil
}
func setKernelMemory(path string, kernelMemoryLimit int64) error {
if path == "" {
return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
}
if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
// We have specifically been asked to set a kmem limit. If the kernel
// doesn't support it we *must* error out.
return errors.New("kernel memory accounting not supported by this kernel")
}
if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
// Check if the error number returned by the syscall is "EBUSY"
// The EBUSY signal is returned on attempts to write to the
// memory.kmem.limit_in_bytes file if the cgroup has children or
// once tasks have been attached to the cgroup
if pathErr, ok := err.(*os.PathError); ok {
if errNo, ok := pathErr.Err.(syscall.Errno); ok {
if errNo == unix.EBUSY {
return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
}
}
}
return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
}
return nil
}

View File

@ -1,15 +0,0 @@
// +build linux,nokmem
package fs
import (
"errors"
)
func EnableKernelMemoryAccounting(path string) error {
return nil
}
func setKernelMemory(path string, kernelMemoryLimit int64) error {
return errors.New("kernel memory accounting disabled in this runc build")
}

View File

@ -1,15 +1,17 @@
// +build linux
package fs
import (
"bufio"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
@ -18,65 +20,66 @@ import (
const (
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
cgroupMemoryUsage = "memory.usage_in_bytes"
cgroupMemoryMaxUsage = "memory.max_usage_in_bytes"
)
type MemoryGroup struct {
}
type MemoryGroup struct{}
func (s *MemoryGroup) Name() string {
return "memory"
}
func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
path, err := d.path("memory")
if err != nil && !cgroups.IsNotFound(err) {
return err
} else if path == "" {
return nil
}
if memoryAssigned(d.config) {
if _, err := os.Stat(path); os.IsNotExist(err) {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// Only enable kernel memory accouting when this cgroup
// is created by libcontainer, otherwise we might get
// error when people use `cgroupsPath` to join an existed
// cgroup whose kernel memory is not initialized.
if err := EnableKernelMemoryAccounting(path); err != nil {
return err
}
}
}
defer func() {
if err != nil {
os.RemoveAll(path)
}
}()
// We need to join memory cgroup after set memory limits, because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
_, err = d.join("memory")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
func (s *MemoryGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
// If the memory update is set to -1 we should also
// set swap to -1, it means unlimited memory.
if cgroup.Resources.Memory == -1 {
func setMemory(path string, val int64) error {
if val == 0 {
return nil
}
err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10))
if !errors.Is(err, unix.EBUSY) {
return err
}
// EBUSY means the kernel can't set new limit as it's too low
// (lower than the current usage). Return more specific error.
usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage)
if err != nil {
return err
}
max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage)
if err != nil {
return err
}
return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max)
}
func setSwap(path string, val int64) error {
if val == 0 {
return nil
}
return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10))
}
func setMemoryAndSwap(path string, r *configs.Resources) error {
// If the memory update is set to -1 and the swap is not explicitly
// set, we should also set swap to -1, it means unlimited memory.
if r.Memory == -1 && r.MemorySwap == 0 {
// Only set swap if it's enabled in kernel
if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
cgroup.Resources.MemorySwap = -1
r.MemorySwap = -1
}
}
// When memory and swap memory are both set, we need to handle the cases
// for updating container.
if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 {
memoryUsage, err := getMemoryData(path, "")
if r.Memory != 0 && r.MemorySwap != 0 {
curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit)
if err != nil {
return err
}
@ -84,84 +87,61 @@ func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
// When update memory limit, we should adapt the write sequence
// for memory and swap memory, so it won't fail because the new
// value and the old value don't fit kernel's validation.
if cgroup.Resources.MemorySwap == -1 || memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) {
if err := setSwap(path, r.MemorySwap); err != nil {
return err
}
if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
} else {
if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
} else {
if cgroup.Resources.Memory != 0 {
if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemorySwap != 0 {
if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
if err := setMemory(path, r.Memory); err != nil {
return err
}
return nil
}
}
return nil
}
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
if err := setMemoryAndSwap(path, cgroup); err != nil {
if err := setMemory(path, r.Memory); err != nil {
return err
}
if err := setSwap(path, r.MemorySwap); err != nil {
return err
}
if cgroup.Resources.KernelMemory != 0 {
if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil {
return nil
}
func (s *MemoryGroup) Set(path string, r *configs.Resources) error {
if err := setMemoryAndSwap(path, r); err != nil {
return err
}
// ignore KernelMemory and KernelMemoryTCP
if r.MemoryReservation != 0 {
if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemoryReservation != 0 {
if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
if r.OomKillDisable {
if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if cgroup.Resources.KernelMemoryTCP != 0 {
if err := fscommon.WriteFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
return err
}
}
if cgroup.Resources.OomKillDisable {
if err := fscommon.WriteFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 {
return nil
} else if *cgroup.Resources.MemorySwappiness <= 100 {
if err := fscommon.WriteFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
} else if *r.MemorySwappiness <= 100 {
if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil {
return err
}
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness)
return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness)
}
return nil
}
func (s *MemoryGroup) Remove(d *cgroupData) error {
return removePath(d.path("memory"))
}
func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
// Set stats from memory.stat.
statsFile, err := os.Open(filepath.Join(path, "memory.stat"))
const file = "memory.stat"
statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
@ -172,9 +152,9 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err)
return &parseError{Path: path, File: file, Err: err}
}
stats.MemoryStats.Stats[t] = v
}
@ -201,25 +181,21 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
}
stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".")
value, err := fscommon.GetCgroupParamUint(path, useHierarchy)
value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy")
if err != nil {
return err
}
if value == 1 {
stats.MemoryStats.UseHierarchy = true
}
return nil
}
func memoryAssigned(cgroup *configs.Cgroup) bool {
return cgroup.Resources.Memory != 0 ||
cgroup.Resources.MemoryReservation != 0 ||
cgroup.Resources.MemorySwap > 0 ||
cgroup.Resources.KernelMemory > 0 ||
cgroup.Resources.KernelMemoryTCP > 0 ||
cgroup.Resources.OomKillDisable ||
(cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1)
pagesByNUMA, err := getPageUsageByNUMA(path)
if err != nil {
return err
}
stats.MemoryStats.PageUsageByNUMA = pagesByNUMA
return nil
}
func getMemoryData(path, name string) (cgroups.MemoryData, error) {
@ -227,45 +203,146 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
moduleName := "memory"
if name != "" {
moduleName = strings.Join([]string{"memory", name}, ".")
moduleName = "memory." + name
}
usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
var (
usage = moduleName + ".usage_in_bytes"
maxUsage = moduleName + ".max_usage_in_bytes"
failcnt = moduleName + ".failcnt"
limit = moduleName + ".limit_in_bytes"
)
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
if name != "" && os.IsNotExist(err) {
// Ignore ENOENT as swap and kmem controllers
// are optional in the kernel.
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err)
return cgroups.MemoryData{}, err
}
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", maxUsage, err)
return cgroups.MemoryData{}, err
}
memoryData.MaxUsage = value
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
return cgroups.MemoryData{}, err
}
memoryData.Failcnt = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
return cgroups.MemoryData{}, err
}
memoryData.Limit = value
return memoryData, nil
}
func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) {
const (
maxColumns = math.MaxUint8 + 1
file = "memory.numa_stat"
)
stats := cgroups.PageUsageByNUMA{}
fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if os.IsNotExist(err) {
return stats, nil
} else if err != nil {
return stats, err
}
defer fd.Close()
// File format is documented in linux/Documentation/cgroup-v1/memory.txt
// and it looks like this:
//
// total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
// file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
// anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
scanner := bufio.NewScanner(fd)
for scanner.Scan() {
var field *cgroups.PageStats
line := scanner.Text()
columns := strings.SplitN(line, " ", maxColumns)
for i, column := range columns {
byNode := strings.SplitN(column, "=", 2)
// Some custom kernels have non-standard fields, like
// numa_locality 0 0 0 0 0 0 0 0 0 0
// numa_exectime 0
if len(byNode) < 2 {
if i == 0 {
// Ignore/skip those.
break
} else {
// The first column was already validated,
// so be strict to the rest.
return stats, malformedLine(path, file, line)
}
}
key, val := byNode[0], byNode[1]
if i == 0 { // First column: key is name, val is total.
field = getNUMAField(&stats, key)
if field == nil { // unknown field (new kernel?)
break
}
field.Total, err = strconv.ParseUint(val, 0, 64)
if err != nil {
return stats, &parseError{Path: path, File: file, Err: err}
}
field.Nodes = map[uint8]uint64{}
} else { // Subsequent columns: key is N<id>, val is usage.
if len(key) < 2 || key[0] != 'N' {
// This is definitely an error.
return stats, malformedLine(path, file, line)
}
n, err := strconv.ParseUint(key[1:], 10, 8)
if err != nil {
return stats, &parseError{Path: path, File: file, Err: err}
}
usage, err := strconv.ParseUint(val, 10, 64)
if err != nil {
return stats, &parseError{Path: path, File: file, Err: err}
}
field.Nodes[uint8(n)] = usage
}
}
}
if err := scanner.Err(); err != nil {
return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err}
}
return stats, nil
}
func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats {
switch name {
case "total":
return &stats.Total
case "file":
return &stats.File
case "anon":
return &stats.Anon
case "unevictable":
return &stats.Unevictable
case "hierarchical_total":
return &stats.Hierarchical.Total
case "hierarchical_file":
return &stats.Hierarchical.File
case "hierarchical_anon":
return &stats.Hierarchical.Anon
case "hierarchical_unevictable":
return &stats.Hierarchical.Unevictable
}
return nil
}

View File

@ -1,5 +1,3 @@
// +build linux
package fs
import (
@ -8,6 +6,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
const (
@ -18,11 +17,29 @@ rss 1024`
memoryFailcnt = "100\n"
memoryLimitContents = "8192\n"
memoryUseHierarchyContents = "1\n"
memoryNUMAStatContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497
file=44428 N0=32614 N1=7335 N2=1982 N3=2497
anon=183 N0=17 N1=166 N2=0 N3=0
unevictable=0 N0=0 N1=0 N2=0 N3=0
hierarchical_total=768133 N0=509113 N1=138887 N2=20464 N3=99669
hierarchical_file=722017 N0=496516 N1=119997 N2=20181 N3=85323
hierarchical_anon=46096 N0=12597 N1=18890 N2=283 N3=14326
hierarchical_unevictable=20 N0=0 N1=0 N2=0 N3=20
`
memoryNUMAStatNoHierarchyContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497
file=44428 N0=32614 N1=7335 N2=1982 N3=2497
anon=183 N0=17 N1=166 N2=0 N3=0
unevictable=0 N0=0 N1=0 N2=0 N3=0
`
// Some custom kernels has extra fields that should be ignored
memoryNUMAStatExtraContents = `numa_locality 0 0 0 0 0 0 0 0 0 0
numa_exectime 0
whatever=100 N0=0
`
)
func TestMemorySetMemory(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
path := tempDir(t, "memory")
const (
memoryBefore = 314572800 // 300M
@ -31,29 +48,31 @@ func TestMemorySetMemory(t *testing.T) {
reservationAfter = 314572800 // 300M
)
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"memory.limit_in_bytes": strconv.Itoa(memoryBefore),
"memory.soft_limit_in_bytes": strconv.Itoa(reservationBefore),
})
helper.CgroupData.config.Resources.Memory = memoryAfter
helper.CgroupData.config.Resources.MemoryReservation = reservationAfter
r := &configs.Resources{
Memory: memoryAfter,
MemoryReservation: reservationAfter,
}
memory := &MemoryGroup{}
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := memory.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
t.Fatal(err)
}
if value != memoryAfter {
t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
}
value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.soft_limit_in_bytes")
value, err = fscommon.GetCgroupParamUint(path, "memory.soft_limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.soft_limit_in_bytes - %s", err)
t.Fatal(err)
}
if value != reservationAfter {
t.Fatal("Got the wrong value, set memory.soft_limit_in_bytes failed.")
@ -61,27 +80,28 @@ func TestMemorySetMemory(t *testing.T) {
}
func TestMemorySetMemoryswap(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
path := tempDir(t, "memory")
const (
memoryswapBefore = 314572800 // 300M
memoryswapAfter = 524288000 // 500M
)
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
})
helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
r := &configs.Resources{
MemorySwap: memoryswapAfter,
}
memory := &MemoryGroup{}
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := memory.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
value, err := fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
t.Fatal(err)
}
if value != memoryswapAfter {
t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
@ -89,8 +109,7 @@ func TestMemorySetMemoryswap(t *testing.T) {
}
func TestMemorySetMemoryLargerThanSwap(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
path := tempDir(t, "memory")
const (
memoryBefore = 314572800 // 300M
@ -99,7 +118,7 @@ func TestMemorySetMemoryLargerThanSwap(t *testing.T) {
memoryswapAfter = 838860800 // 800M
)
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"memory.limit_in_bytes": strconv.Itoa(memoryBefore),
"memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
// Set will call getMemoryData when memory and swap memory are
@ -109,23 +128,26 @@ func TestMemorySetMemoryLargerThanSwap(t *testing.T) {
"memory.failcnt": "0",
})
helper.CgroupData.config.Resources.Memory = memoryAfter
helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
r := &configs.Resources{
Memory: memoryAfter,
MemorySwap: memoryswapAfter,
}
memory := &MemoryGroup{}
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := memory.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
t.Fatal(err)
}
if value != memoryAfter {
t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
}
value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
value, err = fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
t.Fatal(err)
}
if value != memoryswapAfter {
t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
@ -133,8 +155,7 @@ func TestMemorySetMemoryLargerThanSwap(t *testing.T) {
}
func TestMemorySetSwapSmallerThanMemory(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
path := tempDir(t, "memory")
const (
memoryBefore = 629145600 // 600M
@ -143,115 +164,58 @@ func TestMemorySetSwapSmallerThanMemory(t *testing.T) {
memoryswapAfter = 524288000 // 500M
)
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"memory.limit_in_bytes": strconv.Itoa(memoryBefore),
"memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
// Set will call getMemoryData when memory and swap memory are
// both set, fake these fields so we don't get error.
"memory.usage_in_bytes": "0",
"memory.max_usage_in_bytes": "0",
"memory.failcnt": "0",
})
helper.CgroupData.config.Resources.Memory = memoryAfter
helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
r := &configs.Resources{
Memory: memoryAfter,
MemorySwap: memoryswapAfter,
}
memory := &MemoryGroup{}
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := memory.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
t.Fatal(err)
}
if value != memoryAfter {
t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
t.Fatalf("Got the wrong value (%d != %d), set memory.limit_in_bytes failed", value, memoryAfter)
}
value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
value, err = fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
t.Fatal(err)
}
if value != memoryswapAfter {
t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
}
}
func TestMemorySetKernelMemory(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
const (
kernelMemoryBefore = 314572800 // 300M
kernelMemoryAfter = 524288000 // 500M
)
helper.writeFileContents(map[string]string{
"memory.kmem.limit_in_bytes": strconv.Itoa(kernelMemoryBefore),
})
helper.CgroupData.config.Resources.KernelMemory = kernelMemoryAfter
memory := &MemoryGroup{}
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.kmem.limit_in_bytes - %s", err)
}
if value != kernelMemoryAfter {
t.Fatal("Got the wrong value, set memory.kmem.limit_in_bytes failed.")
}
}
func TestMemorySetKernelMemoryTCP(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
const (
kernelMemoryTCPBefore = 314572800 // 300M
kernelMemoryTCPAfter = 524288000 // 500M
)
helper.writeFileContents(map[string]string{
"memory.kmem.tcp.limit_in_bytes": strconv.Itoa(kernelMemoryTCPBefore),
})
helper.CgroupData.config.Resources.KernelMemoryTCP = kernelMemoryTCPAfter
memory := &MemoryGroup{}
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.tcp.limit_in_bytes")
if err != nil {
t.Fatalf("Failed to parse memory.kmem.tcp.limit_in_bytes - %s", err)
}
if value != kernelMemoryTCPAfter {
t.Fatal("Got the wrong value, set memory.kmem.tcp.limit_in_bytes failed.")
t.Fatalf("Got the wrong value (%d != %d), set memory.memsw.limit_in_bytes failed", value, memoryswapAfter)
}
}
func TestMemorySetMemorySwappinessDefault(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
path := tempDir(t, "memory")
swappinessBefore := 60 //default is 60
swappinessBefore := 60 // default is 60
swappinessAfter := uint64(0)
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"memory.swappiness": strconv.Itoa(swappinessBefore),
})
helper.CgroupData.config.Resources.MemorySwappiness = &swappinessAfter
r := &configs.Resources{
MemorySwappiness: &swappinessAfter,
}
memory := &MemoryGroup{}
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := memory.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.swappiness")
value, err := fscommon.GetCgroupParamUint(path, "memory.swappiness")
if err != nil {
t.Fatalf("Failed to parse memory.swappiness - %s", err)
t.Fatal(err)
}
if value != swappinessAfter {
t.Fatalf("Got the wrong value (%d), set memory.swappiness = %d failed.", value, swappinessAfter)
@ -259,9 +223,8 @@ func TestMemorySetMemorySwappinessDefault(t *testing.T) {
}
func TestMemoryStats(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
path := tempDir(t, "memory")
writeFileContents(t, path, map[string]string{
"memory.stat": memoryStatContents,
"memory.usage_in_bytes": memoryUsageContents,
"memory.limit_in_bytes": memoryLimitContents,
@ -276,22 +239,43 @@ func TestMemoryStats(t *testing.T) {
"memory.kmem.failcnt": memoryFailcnt,
"memory.kmem.limit_in_bytes": memoryLimitContents,
"memory.use_hierarchy": memoryUseHierarchyContents,
"memory.numa_stat": memoryNUMAStatContents + memoryNUMAStatExtraContents,
})
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
err := memory.GetStats(path, &actualStats)
if err != nil {
t.Fatal(err)
}
expectedStats := cgroups.MemoryStats{Cache: 512, Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, Stats: map[string]uint64{"cache": 512, "rss": 1024}, UseHierarchy: true}
expectedStats := cgroups.MemoryStats{
Cache: 512,
Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
Stats: map[string]uint64{"cache": 512, "rss": 1024},
UseHierarchy: true,
PageUsageByNUMA: cgroups.PageUsageByNUMA{
PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{
Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}},
File: cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}},
Anon: cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}},
Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}},
},
Hierarchical: cgroups.PageUsageByNUMAInner{
Total: cgroups.PageStats{Total: 768133, Nodes: map[uint8]uint64{0: 509113, 1: 138887, 2: 20464, 3: 99669}},
File: cgroups.PageStats{Total: 722017, Nodes: map[uint8]uint64{0: 496516, 1: 119997, 2: 20181, 3: 85323}},
Anon: cgroups.PageStats{Total: 46096, Nodes: map[uint8]uint64{0: 12597, 1: 18890, 2: 283, 3: 14326}},
Unevictable: cgroups.PageStats{Total: 20, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 20}},
},
},
}
expectMemoryStatEquals(t, expectedStats, actualStats.MemoryStats)
}
func TestMemoryStatsNoStatFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
path := tempDir(t, "memory")
writeFileContents(t, path, map[string]string{
"memory.usage_in_bytes": memoryUsageContents,
"memory.max_usage_in_bytes": memoryMaxUsageContents,
"memory.limit_in_bytes": memoryLimitContents,
@ -299,16 +283,15 @@ func TestMemoryStatsNoStatFile(t *testing.T) {
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
err := memory.GetStats(path, &actualStats)
if err != nil {
t.Fatal(err)
}
}
func TestMemoryStatsNoUsageFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
path := tempDir(t, "memory")
writeFileContents(t, path, map[string]string{
"memory.stat": memoryStatContents,
"memory.max_usage_in_bytes": memoryMaxUsageContents,
"memory.limit_in_bytes": memoryLimitContents,
@ -316,16 +299,15 @@ func TestMemoryStatsNoUsageFile(t *testing.T) {
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
err := memory.GetStats(path, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemoryStatsNoMaxUsageFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
path := tempDir(t, "memory")
writeFileContents(t, path, map[string]string{
"memory.stat": memoryStatContents,
"memory.usage_in_bytes": memoryUsageContents,
"memory.limit_in_bytes": memoryLimitContents,
@ -333,16 +315,15 @@ func TestMemoryStatsNoMaxUsageFile(t *testing.T) {
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
err := memory.GetStats(path, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemoryStatsNoLimitInBytesFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
path := tempDir(t, "memory")
writeFileContents(t, path, map[string]string{
"memory.stat": memoryStatContents,
"memory.usage_in_bytes": memoryUsageContents,
"memory.max_usage_in_bytes": memoryMaxUsageContents,
@ -350,16 +331,15 @@ func TestMemoryStatsNoLimitInBytesFile(t *testing.T) {
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
err := memory.GetStats(path, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemoryStatsBadStatFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
path := tempDir(t, "memory")
writeFileContents(t, path, map[string]string{
"memory.stat": "rss rss",
"memory.usage_in_bytes": memoryUsageContents,
"memory.max_usage_in_bytes": memoryMaxUsageContents,
@ -368,16 +348,15 @@ func TestMemoryStatsBadStatFile(t *testing.T) {
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
err := memory.GetStats(path, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemoryStatsBadUsageFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
path := tempDir(t, "memory")
writeFileContents(t, path, map[string]string{
"memory.stat": memoryStatContents,
"memory.usage_in_bytes": "bad",
"memory.max_usage_in_bytes": memoryMaxUsageContents,
@ -386,16 +365,15 @@ func TestMemoryStatsBadUsageFile(t *testing.T) {
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
err := memory.GetStats(path, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemoryStatsBadMaxUsageFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
path := tempDir(t, "memory")
writeFileContents(t, path, map[string]string{
"memory.stat": memoryStatContents,
"memory.usage_in_bytes": memoryUsageContents,
"memory.max_usage_in_bytes": "bad",
@ -404,16 +382,15 @@ func TestMemoryStatsBadMaxUsageFile(t *testing.T) {
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
err := memory.GetStats(path, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemoryStatsBadLimitInBytesFile(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
path := tempDir(t, "memory")
writeFileContents(t, path, map[string]string{
"memory.stat": memoryStatContents,
"memory.usage_in_bytes": memoryUsageContents,
"memory.max_usage_in_bytes": memoryMaxUsageContents,
@ -422,35 +399,108 @@ func TestMemoryStatsBadLimitInBytesFile(t *testing.T) {
memory := &MemoryGroup{}
actualStats := *cgroups.NewStats()
err := memory.GetStats(helper.CgroupPath, &actualStats)
err := memory.GetStats(path, &actualStats)
if err == nil {
t.Fatal("Expected failure")
}
}
func TestMemorySetOomControl(t *testing.T) {
helper := NewCgroupTestUtil("memory", t)
defer helper.cleanup()
path := tempDir(t, "memory")
const (
oomKillDisable = 1 // disable oom killer, default is 0
)
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"memory.oom_control": strconv.Itoa(oomKillDisable),
})
memory := &MemoryGroup{}
if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
r := &configs.Resources{}
if err := memory.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.oom_control")
value, err := fscommon.GetCgroupParamUint(path, "memory.oom_control")
if err != nil {
t.Fatalf("Failed to parse memory.oom_control - %s", err)
t.Fatal(err)
}
if value != oomKillDisable {
t.Fatalf("Got the wrong value, set memory.oom_control failed.")
}
}
func TestNoHierarchicalNumaStat(t *testing.T) {
path := tempDir(t, "memory")
writeFileContents(t, path, map[string]string{
"memory.numa_stat": memoryNUMAStatNoHierarchyContents + memoryNUMAStatExtraContents,
})
actualStats, err := getPageUsageByNUMA(path)
if err != nil {
t.Fatal(err)
}
pageUsageByNUMA := cgroups.PageUsageByNUMA{
PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{
Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}},
File: cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}},
Anon: cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}},
Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}},
},
Hierarchical: cgroups.PageUsageByNUMAInner{},
}
expectPageUsageByNUMAEquals(t, pageUsageByNUMA, actualStats)
}
func TestBadNumaStat(t *testing.T) {
memoryNUMAStatBadContents := []struct {
desc, contents string
}{
{
desc: "Nx where x is not a number",
contents: `total=44611 N0=44611,
file=44428 Nx=0
`,
}, {
desc: "Nx where x > 255",
contents: `total=44611 N333=444`,
}, {
desc: "Nx argument missing",
contents: `total=44611 N0=123 N1=`,
}, {
desc: "Nx argument is not a number",
contents: `total=44611 N0=123 N1=a`,
}, {
desc: "Missing = after Nx",
contents: `total=44611 N0=123 N1`,
}, {
desc: "No Nx at non-first position",
contents: `total=44611 N0=32631
file=44428 N0=32614
anon=183 N0=12 badone
`,
},
}
path := tempDir(t, "memory")
for _, c := range memoryNUMAStatBadContents {
writeFileContents(t, path, map[string]string{
"memory.numa_stat": c.contents,
})
_, err := getPageUsageByNUMA(path)
if err == nil {
t.Errorf("case %q: expected error, got nil", c.desc)
}
}
}
func TestWithoutNumaStat(t *testing.T) {
path := tempDir(t, "memory")
actualStats, err := getPageUsageByNUMA(path)
if err != nil {
t.Fatal(err)
}
expectPageUsageByNUMAEquals(t, cgroups.PageUsageByNUMA{}, actualStats)
}

View File

@ -1,5 +1,3 @@
// +build linux
package fs
import (
@ -16,22 +14,15 @@ func (s *NameGroup) Name() string {
return s.GroupName
}
func (s *NameGroup) Apply(d *cgroupData) error {
func (s *NameGroup) Apply(path string, _ *configs.Resources, pid int) error {
if s.Join {
// ignore errors if the named cgroup does not exist
d.join(s.GroupName)
// Ignore errors if the named cgroup does not exist.
_ = apply(path, pid)
}
return nil
}
func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}
func (s *NameGroup) Remove(d *cgroupData) error {
if s.Join {
removePath(d.path(s.GroupName))
}
func (s *NameGroup) Set(_ string, _ *configs.Resources) error {
return nil
}

View File

@ -1,33 +1,25 @@
// +build linux
package fs
import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NetClsGroup struct {
}
type NetClsGroup struct{}
func (s *NetClsGroup) Name() string {
return "net_cls"
}
func (s *NetClsGroup) Apply(d *cgroupData) error {
_, err := d.join("net_cls")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
func (s *NetClsGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.NetClsClassid != 0 {
if err := fscommon.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil {
func (s *NetClsGroup) Set(path string, r *configs.Resources) error {
if r.NetClsClassid != 0 {
if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil {
return err
}
}
@ -35,10 +27,6 @@ func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}
func (s *NetClsGroup) Remove(d *cgroupData) error {
return removePath(d.path("net_cls"))
}
func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View File

@ -1,5 +1,3 @@
// +build linux
package fs
import (
@ -7,6 +5,7 @@ import (
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
const (
@ -15,25 +14,26 @@ const (
)
func TestNetClsSetClassid(t *testing.T) {
helper := NewCgroupTestUtil("net_cls", t)
defer helper.cleanup()
path := tempDir(t, "net_cls")
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"net_cls.classid": strconv.FormatUint(classidBefore, 10),
})
helper.CgroupData.config.Resources.NetClsClassid = classidAfter
r := &configs.Resources{
NetClsClassid: classidAfter,
}
netcls := &NetClsGroup{}
if err := netcls.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := netcls.Set(path, r); err != nil {
t.Fatal(err)
}
// As we are in mock environment, we can't get correct value of classid from
// net_cls.classid.
// So. we just judge if we successfully write classid into file
value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "net_cls.classid")
value, err := fscommon.GetCgroupParamUint(path, "net_cls.classid")
if err != nil {
t.Fatalf("Failed to parse net_cls.classid - %s", err)
t.Fatal(err)
}
if value != classidAfter {
t.Fatal("Got the wrong value, set net_cls.classid failed.")

View File

@ -1,31 +1,23 @@
// +build linux
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NetPrioGroup struct {
}
type NetPrioGroup struct{}
func (s *NetPrioGroup) Name() string {
return "net_prio"
}
func (s *NetPrioGroup) Apply(d *cgroupData) error {
_, err := d.join("net_prio")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
func (s *NetPrioGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, prioMap := range cgroup.Resources.NetPrioIfpriomap {
if err := fscommon.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
func (s *NetPrioGroup) Set(path string, r *configs.Resources) error {
for _, prioMap := range r.NetPrioIfpriomap {
if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
return err
}
}
@ -33,10 +25,6 @@ func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}
func (s *NetPrioGroup) Remove(d *cgroupData) error {
return removePath(d.path("net_prio"))
}
func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View File

@ -1,5 +1,3 @@
// +build linux
package fs
import (
@ -10,28 +8,27 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
var (
prioMap = []*configs.IfPrioMap{
{
Interface: "test",
Priority: 5,
},
}
)
var prioMap = []*configs.IfPrioMap{
{
Interface: "test",
Priority: 5,
},
}
func TestNetPrioSetIfPrio(t *testing.T) {
helper := NewCgroupTestUtil("net_prio", t)
defer helper.cleanup()
path := tempDir(t, "net_prio")
helper.CgroupData.config.Resources.NetPrioIfpriomap = prioMap
r := &configs.Resources{
NetPrioIfpriomap: prioMap,
}
netPrio := &NetPrioGroup{}
if err := netPrio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := netPrio.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "net_prio.ifpriomap")
value, err := fscommon.GetCgroupParamString(path, "net_prio.ifpriomap")
if err != nil {
t.Fatalf("Failed to parse net_prio.ifpriomap - %s", err)
t.Fatal(err)
}
if !strings.Contains(value, "test 5") {
t.Fatal("Got the wrong value, set net_prio.ifpriomap failed.")

View File

@ -0,0 +1,186 @@
package fs
import (
"errors"
"os"
"path/filepath"
"sync"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
// The absolute path to the root of the cgroup hierarchies.
var (
cgroupRootLock sync.Mutex
cgroupRoot string
)
const defaultCgroupRoot = "/sys/fs/cgroup"
func initPaths(cg *configs.Cgroup) (map[string]string, error) {
root, err := rootPath()
if err != nil {
return nil, err
}
inner, err := innerPath(cg)
if err != nil {
return nil, err
}
paths := make(map[string]string)
for _, sys := range subsystems {
name := sys.Name()
path, err := subsysPath(root, inner, name)
if err != nil {
// The non-presence of the devices subsystem
// is considered fatal for security reasons.
if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") {
continue
}
return nil, err
}
paths[name] = path
}
return paths, nil
}
func tryDefaultCgroupRoot() string {
var st, pst unix.Stat_t
// (1) it should be a directory...
err := unix.Lstat(defaultCgroupRoot, &st)
if err != nil || st.Mode&unix.S_IFDIR == 0 {
return ""
}
// (2) ... and a mount point ...
err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst)
if err != nil {
return ""
}
if st.Dev == pst.Dev {
// parent dir has the same dev -- not a mount point
return ""
}
// (3) ... of 'tmpfs' fs type.
var fst unix.Statfs_t
err = unix.Statfs(defaultCgroupRoot, &fst)
if err != nil || fst.Type != unix.TMPFS_MAGIC {
return ""
}
// (4) it should have at least 1 entry ...
dir, err := os.Open(defaultCgroupRoot)
if err != nil {
return ""
}
names, err := dir.Readdirnames(1)
if err != nil {
return ""
}
if len(names) < 1 {
return ""
}
// ... which is a cgroup mount point.
err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst)
if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
return ""
}
return defaultCgroupRoot
}
// rootPath finds and returns path to the root of the cgroup hierarchies.
func rootPath() (string, error) {
cgroupRootLock.Lock()
defer cgroupRootLock.Unlock()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// fast path
cgroupRoot = tryDefaultCgroupRoot()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// slow path: parse mountinfo
mi, err := cgroups.GetCgroupMounts(false)
if err != nil {
return "", err
}
if len(mi) < 1 {
return "", errors.New("no cgroup mount found in mountinfo")
}
// Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"),
// use its parent directory.
root := filepath.Dir(mi[0].Mountpoint)
if _, err := os.Stat(root); err != nil {
return "", err
}
cgroupRoot = root
return cgroupRoot, nil
}
func innerPath(c *configs.Cgroup) (string, error) {
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return "", errors.New("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove CleanPath. Path safety is important! -- cyphar
innerPath := utils.CleanPath(c.Path)
if innerPath == "" {
cgParent := utils.CleanPath(c.Parent)
cgName := utils.CleanPath(c.Name)
innerPath = filepath.Join(cgParent, cgName)
}
return innerPath, nil
}
func subsysPath(root, inner, subsystem string) (string, error) {
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(inner) {
mnt, err := cgroups.FindCgroupMountpoint(root, subsystem)
// If we didn't mount the subsystem, there is no point we make the path.
if err != nil {
return "", err
}
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(root, filepath.Base(mnt), inner), nil
}
// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err
}
return filepath.Join(parentPath, inner), nil
}
func apply(path string, pid int) error {
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
return cgroups.WriteCgroupProc(path, pid)
}

View File

@ -0,0 +1,104 @@
package fs
import (
"path/filepath"
"strings"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
func TestInvalidCgroupPath(t *testing.T) {
if cgroups.IsCgroup2UnifiedMode() {
t.Skip("cgroup v2 is not supported")
}
root, err := rootPath()
if err != nil {
t.Fatalf("couldn't get cgroup root: %v", err)
}
testCases := []struct {
test string
path, name, parent string
}{
{
test: "invalid cgroup path",
path: "../../../../../../../../../../some/path",
},
{
test: "invalid absolute cgroup path",
path: "/../../../../../../../../../../some/path",
},
{
test: "invalid cgroup parent",
parent: "../../../../../../../../../../some/path",
name: "name",
},
{
test: "invalid absolute cgroup parent",
parent: "/../../../../../../../../../../some/path",
name: "name",
},
{
test: "invalid cgroup name",
parent: "parent",
name: "../../../../../../../../../../some/path",
},
{
test: "invalid absolute cgroup name",
parent: "parent",
name: "/../../../../../../../../../../some/path",
},
{
test: "invalid cgroup name and parent",
parent: "../../../../../../../../../../some/path",
name: "../../../../../../../../../../some/path",
},
{
test: "invalid absolute cgroup name and parent",
parent: "/../../../../../../../../../../some/path",
name: "/../../../../../../../../../../some/path",
},
}
for _, tc := range testCases {
t.Run(tc.test, func(t *testing.T) {
config := &configs.Cgroup{Path: tc.path, Name: tc.name, Parent: tc.parent}
inner, err := innerPath(config)
if err != nil {
t.Fatalf("couldn't get cgroup data: %v", err)
}
// Make sure the final inner path doesn't go outside the cgroup mountpoint.
if strings.HasPrefix(inner, "..") {
t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
}
// Double-check, using an actual cgroup.
deviceRoot := filepath.Join(root, "devices")
devicePath, err := subsysPath(root, inner, "devices")
if err != nil {
t.Fatalf("couldn't get cgroup path: %v", err)
}
if !strings.HasPrefix(devicePath, deviceRoot) {
t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
}
})
}
}
func TestTryDefaultCgroupRoot(t *testing.T) {
res := tryDefaultCgroupRoot()
exp := defaultCgroupRoot
if cgroups.IsCgroup2UnifiedMode() {
// checking that tryDefaultCgroupRoot does return ""
// in case /sys/fs/cgroup is not cgroup v1 root dir.
exp = ""
}
if res != exp {
t.Errorf("tryDefaultCgroupRoot: want %q, got %q", exp, res)
}
}

View File

@ -1,5 +1,3 @@
// +build linux
package fs
import (
@ -7,29 +5,20 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
type PerfEventGroup struct {
}
type PerfEventGroup struct{}
func (s *PerfEventGroup) Name() string {
return "perf_event"
}
func (s *PerfEventGroup) Apply(d *cgroupData) error {
// we just want to join this group even though we don't set anything
if _, err := d.join("perf_event"); err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
func (s *PerfEventGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *PerfEventGroup) Set(path string, cgroup *configs.Cgroup) error {
func (s *PerfEventGroup) Set(_ string, _ *configs.Resources) error {
return nil
}
func (s *PerfEventGroup) Remove(d *cgroupData) error {
return removePath(d.path("perf_event"))
}
func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View File

@ -1,10 +1,7 @@
// +build linux
package fs
import (
"fmt"
"path/filepath"
"math"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@ -12,31 +9,26 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
type PidsGroup struct {
}
type PidsGroup struct{}
func (s *PidsGroup) Name() string {
return "pids"
}
func (s *PidsGroup) Apply(d *cgroupData) error {
_, err := d.join("pids")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
func (s *PidsGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.PidsLimit != 0 {
func (s *PidsGroup) Set(path string, r *configs.Resources) error {
if r.PidsLimit != 0 {
// "max" is the fallback value.
limit := "max"
if cgroup.Resources.PidsLimit > 0 {
limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
if r.PidsLimit > 0 {
limit = strconv.FormatInt(r.PidsLimit, 10)
}
if err := fscommon.WriteFile(path, "pids.max", limit); err != nil {
if err := cgroups.WriteFile(path, "pids.max", limit); err != nil {
return err
}
}
@ -44,28 +36,24 @@ func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}
func (s *PidsGroup) Remove(d *cgroupData) error {
return removePath(d.path("pids"))
}
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
current, err := fscommon.GetCgroupParamUint(path, "pids.current")
if err != nil {
return fmt.Errorf("failed to parse pids.current - %s", err)
return err
}
maxString, err := fscommon.GetCgroupParamString(path, "pids.max")
max, err := fscommon.GetCgroupParamUint(path, "pids.max")
if err != nil {
return fmt.Errorf("failed to parse pids.max - %s", err)
return err
}
// Default if pids.max == "max" is 0 -- which represents "no limit".
var max uint64
if maxString != "max" {
max, err = fscommon.ParseUint(maxString, 10, 64)
if err != nil {
return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max"))
}
// If no limit is set, read from pids.max returns "max", which is
// converted to MaxUint64 by GetCgroupParamUint. Historically, we
// represent "no limit" for pids as 0, thus this conversion.
if max == math.MaxUint64 {
max = 0
}
stats.PidsStats.Current = current

View File

@ -1,5 +1,3 @@
// +build linux
package fs
import (
@ -8,6 +6,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
const (
@ -16,65 +15,64 @@ const (
)
func TestPidsSetMax(t *testing.T) {
helper := NewCgroupTestUtil("pids", t)
defer helper.cleanup()
path := tempDir(t, "pids")
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"pids.max": "max",
})
helper.CgroupData.config.Resources.PidsLimit = maxLimited
r := &configs.Resources{
PidsLimit: maxLimited,
}
pids := &PidsGroup{}
if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := pids.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "pids.max")
value, err := fscommon.GetCgroupParamUint(path, "pids.max")
if err != nil {
t.Fatalf("Failed to parse pids.max - %s", err)
t.Fatal(err)
}
if value != maxLimited {
t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value)
}
}
func TestPidsSetUnlimited(t *testing.T) {
helper := NewCgroupTestUtil("pids", t)
defer helper.cleanup()
path := tempDir(t, "pids")
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"pids.max": strconv.Itoa(maxLimited),
})
helper.CgroupData.config.Resources.PidsLimit = maxUnlimited
r := &configs.Resources{
PidsLimit: maxUnlimited,
}
pids := &PidsGroup{}
if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
if err := pids.Set(path, r); err != nil {
t.Fatal(err)
}
value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "pids.max")
value, err := fscommon.GetCgroupParamString(path, "pids.max")
if err != nil {
t.Fatalf("Failed to parse pids.max - %s", err)
t.Fatal(err)
}
if value != "max" {
t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value)
}
}
func TestPidsStats(t *testing.T) {
helper := NewCgroupTestUtil("pids", t)
defer helper.cleanup()
path := tempDir(t, "pids")
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"pids.current": strconv.Itoa(1337),
"pids.max": strconv.Itoa(maxLimited),
})
pids := &PidsGroup{}
stats := *cgroups.NewStats()
if err := pids.GetStats(helper.CgroupPath, &stats); err != nil {
if err := pids.GetStats(path, &stats); err != nil {
t.Fatal(err)
}
@ -88,17 +86,16 @@ func TestPidsStats(t *testing.T) {
}
func TestPidsStatsUnlimited(t *testing.T) {
helper := NewCgroupTestUtil("pids", t)
defer helper.cleanup()
path := tempDir(t, "pids")
helper.writeFileContents(map[string]string{
writeFileContents(t, path, map[string]string{
"pids.current": strconv.Itoa(4096),
"pids.max": "max",
})
pids := &PidsGroup{}
stats := *cgroups.NewStats()
if err := pids.GetStats(helper.CgroupPath, &stats); err != nil {
if err := pids.GetStats(path, &stats); err != nil {
t.Fatal(err)
}

View File

@ -0,0 +1,25 @@
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type RdmaGroup struct{}
func (s *RdmaGroup) Name() string {
return "rdma"
}
func (s *RdmaGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *RdmaGroup) Set(path string, r *configs.Resources) error {
return fscommon.RdmaSet(path, r)
}
func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error {
return fscommon.RdmaGetStats(path, stats)
}

Some files were not shown because too many files have changed in this diff Show More