Import Upstream version 1.1.12+ds1

This commit is contained in:
luoyaoming 2024-04-30 18:13:09 +08:00
parent a9cf6bd6d6
commit 68d1261cab
1005 changed files with 3396 additions and 289043 deletions

View File

@ -1,7 +1,8 @@
---
# We use Cirrus for Vagrant tests and native CentOS 7 and 8, because macOS
# instances of GHA are too slow and flaky, and Linux instances of GHA do not
# support KVM.
# We use Cirrus for CentOS (native) and Fedora (in Vagrant), because neither
# CentOS nor Fedora is available on GHA natively, so the only option is VM.
# In GHA, nested virtualization is only supported on macOS instances, which
# are slow and flaky.
# NOTE Cirrus execution environments lack a terminal, needed for
# some integration tests. So we use `ssh -tt` command to fake a terminal.
@ -24,25 +25,31 @@ task:
platform: linux
nested_virtualization: true
# CPU limit: `16 / NTASK`: see https://cirrus-ci.org/faq/#are-there-any-limits
cpu: 8
cpu: 4
# Memory limit: `4GB * NCPU`
memory: 32G
memory: 16G
host_info_script: |
uname -a
echo "-----"
# -----
cat /etc/os-release
echo "-----"
cat /proc/cpuinfo
echo "-----"
# -----
df -T
# -----
cat /proc/cpuinfo
install_libvirt_vagrant_script: |
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
sudo sed -i 's/^# deb-src/deb-src/' /etc/apt/sources.list
apt-get update
apt-get install -y libvirt-daemon libvirt-daemon-system vagrant vagrant-libvirt
apt-get install -y libvirt-daemon libvirt-daemon-system vagrant
systemctl enable --now libvirtd
apt-get build-dep -y vagrant ruby-libvirt
apt-get install -y --no-install-recommends libxslt-dev libxml2-dev libvirt-dev ruby-bundler ruby-dev zlib1g-dev
vagrant plugin install vagrant-libvirt
vagrant_cache:
fingerprint_script: uname -s ; cat Vagrantfile.$DISTRO
folder: /root/.vagrant.d
fingerprint_script: cat Vagrantfile.$DISTRO
folder: /root/.vagrant.d/boxes
vagrant_up_script: |
ln -sf Vagrantfile.$DISTRO Vagrantfile
# Retry if it fails (download.fedoraproject.org returns 404 sometimes)
@ -50,7 +57,9 @@ task:
mkdir -p -m 0700 /root/.ssh
vagrant ssh-config >> /root/.ssh/config
guest_info_script: |
ssh default 'sh -exc "uname -a && systemctl --version && df -T && cat /etc/os-release"'
ssh default 'sh -exc "uname -a && systemctl --version && df -T && cat /etc/os-release && go version"'
check_config_script: |
ssh default /vagrant/script/check-config.sh
unit_tests_script: |
ssh default 'sudo -i make -C /vagrant localunittest'
integration_systemd_script: |
@ -68,12 +77,14 @@ task:
env:
HOME: /root
CIRRUS_WORKING_DIR: /home/runc
GO_VERSION: "1.17.3"
BATS_VERSION: "v1.3.0"
GO_VERSION: "1.20"
BATS_VERSION: "v1.9.0"
RPMS: gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs
# yamllint disable rule:key-duplicates
matrix:
DISTRO: centos-7
DISTRO: centos-stream-8
DISTRO: centos-stream-9
name: ci / $DISTRO
@ -88,6 +99,8 @@ task:
case $DISTRO in
centos-7)
(cd /etc/yum.repos.d && curl -O https://copr.fedorainfracloud.org/coprs/adrian/criu-el7/repo/epel-7/adrian-criu-el7-epel-7.repo)
# EPEL is needed for jq and fuse-sshfs.
rpm -q epel-release || rpm -Uvh https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
# sysctl
echo "user.max_user_namespaces=15076" > /etc/sysctl.d/userns.conf
sysctl --system
@ -95,15 +108,32 @@ task:
centos-stream-8)
yum config-manager --set-enabled powertools # for glibc-static
;;
centos-stream-9)
dnf config-manager --set-enabled crb # for glibc-static
dnf -y install epel-release epel-next-release # for fuse-sshfs
# Delegate all cgroup v2 controllers to rootless user via --systemd-cgroup.
# The default (since systemd v252) is "pids memory cpu".
mkdir -p /etc/systemd/system/user@.service.d
printf "[Service]\nDelegate=yes\n" > /etc/systemd/system/user@.service.d/delegate.conf
systemctl daemon-reload
;;
esac
# Work around dnf mirror failures by retrying a few times.
for i in $(seq 0 2); do
sleep $i
yum install -y -q gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs && break
yum install -y $RPMS && break
done
[ $? -eq 0 ] # fail if yum failed
# Double check that all rpms were installed (yum from CentOS 7
# does not exit with an error if some packages were not found).
# Use --whatprovides since some packages are renamed.
rpm -q --whatprovides $RPMS
# install Go
curl -fsSL "https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz" | tar Cxz /usr/local
PREFIX="https://go.dev/dl/"
# Find out the latest minor release URL.
eval $(curl -fsSL "${PREFIX}?mode=json" | jq -r --arg Ver "$GO_VERSION" '.[] | select(.version | startswith("go\($Ver)")) | .files[] | select(.os == "linux" and .arch == "amd64" and .kind == "archive") | "filename=\"" + .filename + "\""')
curl -fsSL "$PREFIX$filename" | tar Cxz /usr/local
# install bats
cd /tmp
git clone https://github.com/bats-core/bats-core
@ -131,14 +161,18 @@ task:
systemctl restart sshd
host_info_script: |
uname -a
echo "-----"
cat /etc/os-release
echo "-----"
cat /proc/cpuinfo
echo "-----"
df -T
echo "-----"
# -----
/usr/local/go/bin/go version
# -----
systemctl --version
# -----
cat /etc/os-release
# -----
df -T
# -----
cat /proc/cpuinfo
check_config_script: |
/home/runc/script/check-config.sh
unit_tests_script: |
ssh -tt localhost "make -C /home/runc localunittest"
integration_systemd_script: |
@ -146,13 +180,19 @@ task:
integration_fs_script: |
ssh -tt localhost "make -C /home/runc localintegration"
integration_systemd_rootless_script: |
echo "SKIP: integration_systemd_rootless_script requires cgroup v2"
case $DISTRO in
centos-7|centos-stream-8)
echo "SKIP: integration_systemd_rootless_script requires cgroup v2"
;;
*)
ssh -tt localhost "make -C /home/runc localrootlessintegration RUNC_USE_SYSTEMD=yes"
esac
integration_fs_rootless_script: |
case $DISTRO in
centos-7)
echo "SKIP: FIXME: integration_fs_rootless_script is skipped because of EPERM on writing cgroup.procs"
;;
centos-stream-8)
*)
ssh -tt localhost "make -C /home/runc localrootlessintegration"
;;
esac

View File

@ -1,3 +1,3 @@
[codespell]
skip = ./vendor,./.git
ignore-words-list = clos,creat
skip = ./vendor,./.git,./go.sum
ignore-words-list = clos,mis

8
.editorconfig Normal file
View File

@ -0,0 +1,8 @@
# This file is used by shfmt. See https://EditorConfig.org
# This is a top-most EditorConfig file.
root = true
# Ignore the entire "vendor" directory.
[vendor/**]
ignore = true

View File

@ -21,13 +21,13 @@ jobs:
strategy:
fail-fast: false
matrix:
go-version: [1.16.x, 1.17.x]
go-version: [1.17.x, 1.20.x, 1.21.x]
rootless: ["rootless", ""]
race: ["-race", ""]
criu: [""]
include:
# Also test against latest criu-dev
- go-version: 1.17.x
- go-version: 1.20.x
rootless: ""
race: ""
criu: "criu-dev"
@ -35,7 +35,7 @@ jobs:
steps:
- name: checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: install deps
if: matrix.criu == ''
@ -43,7 +43,7 @@ jobs:
REPO: https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_20.04
run: |
# criu repo
curl -fSsl $REPO/Release.key | sudo apt-key add -
curl -fSsLl $REPO/Release.key | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/devel_tools_criu.gpg > /dev/null
echo "deb $REPO/ /" | sudo tee /etc/apt/sources.list.d/criu.list
sudo apt update
sudo apt install libseccomp-dev criu sshfs
@ -60,9 +60,8 @@ jobs:
rm -rf ~/criu
- name: install go ${{ matrix.go-version }}
uses: actions/setup-go@v2
uses: actions/setup-go@v4
with:
stable: '!contains(${{ matrix.go-version }}, "beta") && !contains(${{ matrix.go-version }}, "rc")'
go-version: ${{ matrix.go-version }}
- name: build
@ -71,7 +70,7 @@ jobs:
- name: install bats
uses: mig4/setup-bats@v1
with:
bats-version: 1.3.0
bats-version: 1.9.0
- name: unit test
if: matrix.rootless != 'rootless'
@ -105,7 +104,7 @@ jobs:
steps:
- name: checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: install deps
run: |
@ -120,10 +119,9 @@ jobs:
sudo apt -q install libseccomp-dev libseccomp-dev:i386 gcc-multilib criu
- name: install go
uses: actions/setup-go@v2
uses: actions/setup-go@v4
with:
go-version: 1.x # Latest stable
- name: unit test
# cgo is disabled by default when cross-compiling
run: sudo -E PATH="$PATH" -- make GOARCH=386 CGO_ENABLED=1 localunittest
run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest

View File

@ -7,41 +7,39 @@ on:
- master
- release-*
pull_request:
env:
GO_VERSION: 1.20.x
jobs:
keyring:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v3
- name: check runc.keyring
run: make validate-keyring
lint:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
with:
fetch-depth: 2
- uses: actions/setup-go@v4
with:
go-version: "${{ env.GO_VERSION }}"
cache: false # golangci-lint-action does its own caching
- name: install deps
run: |
sudo apt -q update
sudo apt -q install libseccomp-dev
- uses: golangci/golangci-lint-action@v2
- uses: golangci/golangci-lint-action@v3
with:
# must be specified without patch version
version: v1.42
lint-extra:
# Extra linters, only checking new code from pull requests.
if: github.event_name == 'pull_request'
runs-on: ubuntu-20.04
permissions:
contents: read
steps:
- uses: actions/checkout@v2
- name: install deps
version: v1.53
# Extra linters, only checking new code from a pull request.
- name: lint-extra
if: github.event_name == 'pull_request'
run: |
sudo apt -q update
sudo apt -q install libseccomp-dev
- uses: golangci/golangci-lint-action@v2
with:
only-new-issues: true
args: --config .golangci-extra.yml
# must be specified without patch version
version: v1.43
golangci-lint run --config .golangci-extra.yml --new-from-rev=HEAD~1 --out-format=github-actions
compile-buildtags:
runs-on: ubuntu-20.04
@ -49,18 +47,18 @@ jobs:
# Don't ignore C warnings. Note that the output of "go env CGO_CFLAGS" by default is "-g -O2", so we keep them.
CGO_CFLAGS: -g -O2 -Werror
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: install go
uses: actions/setup-go@v2
uses: actions/setup-go@v4
with:
go-version: 1.x # Latest stable
go-version: "${{ env.GO_VERSION }}"
- name: compile with no build tags
run: make BUILDTAGS=""
codespell:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: install deps
# Version of codespell bundled with Ubuntu is way old, so use pip.
run: pip install codespell
@ -70,35 +68,19 @@ jobs:
shfmt:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- name: vars
run: |
echo "VERSION=3.3.1" >> $GITHUB_ENV
echo "$(go env GOPATH)/bin" >> $GITHUB_PATH
- name: cache go mod and $GOCACHE
uses: actions/cache@v2
with:
path: |
~/go/pkg/mod
~/.cache/go-build
key: ${{ runner.os }}-shfmt-${{ env.VERSION }}
restore-keys: ${{ runner.os }}-shfmt-
- name: install shfmt
run: |
command -v shfmt || \
(cd ~ && GO111MODULE=on time go get mvdan.cc/sh/v3/cmd/shfmt@v$VERSION)
- uses: actions/checkout@v3
- name: shfmt
run: make shfmt
shellcheck:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: vars
run: |
echo 'VERSION=v0.7.2' >> $GITHUB_ENV
echo 'VERSION=v0.8.0' >> $GITHUB_ENV
echo 'BASEURL=https://github.com/koalaman/shellcheck/releases/download' >> $GITHUB_ENV
echo 'SHA256SUM=12ee2e0b90a3d1e9cae24ac9b2838be66b48573cb2c8e8f3c566b959df6f050c' >> $GITHUB_ENV
echo 'SHA256SUM=f4bce23c11c3919c1b20bcb0f206f6b44c44e26f2bc95f8aa708716095fa0651' >> $GITHUB_ENV
echo ~/bin >> $GITHUB_PATH
- name: install shellcheck
run: |
@ -108,27 +90,21 @@ jobs:
sha256sum ~/bin/shellcheck | grep -q $SHA256SUM
# make sure to remove the old version
sudo rm -f /usr/bin/shellcheck
- uses: lumaxis/shellcheck-problem-matchers@v1
- uses: lumaxis/shellcheck-problem-matchers@v2
- name: shellcheck
run: |
make shellcheck
- name: check-config.sh
run : ./script/check-config.sh
deps:
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: install go
uses: actions/setup-go@v2
uses: actions/setup-go@v4
with:
go-version: 1.x # Latest stable
- name: cache go mod and $GOCACHE
uses: actions/cache@v2
with:
path: |
~/go/pkg/mod
~/.cache/go-build
key: ${{ runner.os }}-go.sum-${{ hashFiles('**/go.sum') }}
restore-keys: ${{ runner.os }}-go.sum-
go-version: "${{ env.GO_VERSION }}"
- name: verify deps
run: make verify-dependencies
@ -151,12 +127,11 @@ jobs:
pattern: '^.{0,72}(\n.*)*$'
error: 'Subject too long (max 72)'
cfmt:
runs-on: ubuntu-20.04
steps:
- name: checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: install deps
@ -173,9 +148,13 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: check CHANGELOG.md
run: make verify-changelog
# We have to run this under Docker as Ubuntu (host) does not support all
# the architectures we want to compile test against, and Dockerfile uses
# Debian (which does).
@ -185,14 +164,12 @@ jobs:
# under Docker will emerge, it will be good to have a separate make
# runcimage job and share its result (the docker image) with whoever
# needs it.
- uses: satackey/action-docker-layer-caching@v0.0.11
continue-on-error: true
- name: build docker image
run: make runcimage
- name: make releaseall
run: make releaseall
- name: upload artifacts
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v3
with:
name: release-${{ github.run_id }}
path: release/*

View File

@ -1,5 +1,5 @@
# This is golangci-lint config file which is used to check new code in
# github PRs only (see lint-extra job in .github/workflows/validate.yml).
# github PRs only (see lint-extra in .github/workflows/validate.yml).
#
# For the default linter config, see .golangci.yml. This config should
# only enable additional linters not enabled in the default config.

View File

@ -1,21 +1,288 @@
# Changelog/
# Changelog
This file documents all notable changes made to this project since runc 1.0.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
## [Unreleased 1.1.z]
## [1.1.12] - 2024-01-31
> Now you're thinking with Portals™!
### Security
* Fix [CVE-2024-21626][cve-2024-21626], a container breakout attack that took
advantage of a file descriptor that was leaked internally within runc (but
never leaked to the container process). In addition to fixing the leak,
several strict hardening measures were added to ensure that future internal
leaks could not be used to break out in this manner again. Based on our
research, while no other container runtime had a similar leak, none had any
of the hardening steps we've introduced (and some runtimes would not check
for any file descriptors that a calling process may have leaked to them,
allowing for container breakouts due to basic user error).
[cve-2024-21626]: https://github.com/opencontainers/runc/security/advisories/GHSA-xr7r-f8xq-vfvv
## [1.1.11] - 2024-01-01
> Happy New Year!
### Fixed
* Fix several issues with userns path handling. (#4122, #4124, #4134, #4144)
### Changed
* Support memory.peak and memory.swap.peak in cgroups v2.
Add `swapOnlyUsage` in `MemoryStats`. This field reports swap-only usage.
For cgroupv1, `Usage` and `Failcnt` are set by subtracting memory usage
from memory+swap usage. For cgroupv2, `Usage`, `Limit`, and `MaxUsage`
are set. (#4000, #4010, #4131)
* build(deps): bump github.com/cyphar/filepath-securejoin. (#4140)
## [1.1.10] - 2023-10-31
> Śruba, przykręcona we śnie, nie zmieni sytuacji, jaka panuje na jawie.
### Added
* Support for `hugetlb.<pagesize>.rsvd` limiting and accounting. Fixes the
issue of postgres failing when hugepage limits are set. (#3859, #4077)
### Fixed
* Fixed permissions of newly created directories to not depend on the value
of umask in tmpcopyup feature implementation. (#3991, #4060)
* libcontainer: cgroup v1 GetStats now ignores missing `kmem.limit_in_bytes`
(fixes the compatibility with Linux kernel 6.1+). (#4028)
* Fix a semi-arbitrary cgroup write bug when given a malicious hugetlb
configuration. This issue is not a security issue because it requires a
malicious `config.json`, which is outside of our threat model. (#4103)
* Various CI fixes. (#4081, #4055)
## [1.1.9] - 2023-08-10
> There is a crack in everything. That's how the light gets in.
### Added
* Added go 1.21 to the CI matrix; other CI updates. (#3976, #3958)
### Fixed
* Fixed losing sticky bit on tmpfs (a regression in 1.1.8). (#3952, #3961)
* intelrdt: fixed ignoring ClosID on some systems. (#3550, #3978)
### Changed
* Sum `anon` and `file` from `memory.stat` for cgroupv2 root usage,
as the root does not have `memory.current` for cgroupv2.
This aligns cgroupv2 root usage more closely with cgroupv1 reporting.
Additionally, report root swap usage as sum of swap and memory usage,
aligned with v1 and existing non-root v2 reporting. (#3933)
## [1.1.8] - 2023-07-20
> 海纳百川 有容乃大
### Added
* Support riscv64. (#3905)
### Fixed
* init: do not print environment variable value. (#3879)
* libct: fix a race with systemd removal. (#3877)
* tests/int: increase num retries for oom tests. (#3891)
* man/runc: fixes. (#3892)
* Fix tmpfs mode opts when dir already exists. (#3916)
* docs/systemd: fix a broken link. (#3917)
* ci/cirrus: enable some rootless tests on cs9. (#3918)
* runc delete: call systemd's reset-failed. (#3932)
* libct/cg/sd/v1: do not update non-frozen cgroup after frozen failed. (#3921)
### Changed
* CI: bump Fedora, Vagrant, bats. (#3878)
* `.codespellrc`: update for 2.2.5. (#3909)
## [1.1.7] - 2023-04-26
> Ночевала тучка золотая на груди утеса-великана.
### Fixed
* When used with systemd v240+, systemd cgroup drivers no longer skip
`DeviceAllow` rules if the device does not exist (a regression introduced
in runc 1.1.3). This fix also reverts the workaround added in runc 1.1.5,
removing an extra warning emitted by runc run/start. (#3845, #3708, #3671)
### Added
* The source code now has a new file, `runc.keyring`, which contains the keys
used to sign runc releases. (#3838)
## [1.1.6] - 2023-04-11
> In this world nothing is certain but death and taxes.
### Compatibility
* This release can no longer be built from sources using Go 1.16. Using the
latest maintained Go 1.20.x or Go 1.19.x release is recommended.
Go 1.17 can still be used.
### Fixed
* systemd cgroup v1 and v2 drivers were deliberately ignoring `UnitExist` error
from systemd while trying to create a systemd unit, which in some scenarios
may result in a container not being added to the proper systemd unit and
cgroup. (#3780, #3806)
* systemd cgroup v2 driver was incorrectly translating cpuset range from spec's
`resources.cpu.cpus` to systemd unit property (`AllowedCPUs`) in case of more
than 8 CPUs, resulting in the wrong AllowedCPUs setting. (#3808)
* systemd cgroup v1 driver was prefixing container's cgroup path with the path
of PID 1 cgroup, resulting in inability to place PID 1 in a non-root cgroup.
(#3811)
* runc run/start may return "permission denied" error when starting a rootless
container when the file to be executed does not have executable bit set for
the user, not taking the `CAP_DAC_OVERRIDE` capability into account. This is
a regression in runc 1.1.4, as well as in Go 1.20 and 1.20.1. (#3715, #3817)
* cgroup v1 drivers are now aware of `misc` controller. (#3823)
* Various CI fixes and improvements, mostly to ensure Go 1.19.x and Go 1.20.x
compatibility.
## [1.1.5] - 2023-03-29
> 囚われた屈辱は
> 反撃の嚆矢だ
### Security
The following CVEs were fixed in this release:
* [CVE-2023-25809][] is a vulnerability involving rootless containers where
(under specific configurations), the container would have write access to the
`/sys/fs/cgroup/user.slice/...` cgroup hierarchy. No other hierarchies on the
host were affected. This vulnerability was discovered by Akihiro Suda.
* [CVE-2023-27561][] was a regression in our protections against tricky `/proc`
and `/sys` configurations (where the container mountpoint is a symlink)
causing us to be tricked into incorrectly configuring the container, which
effectively re-introduced [CVE-2019-19921][]. This regression was present
from v1.0.0-rc95 to v1.1.4 and was discovered by @Beuc. (#3785)
* [CVE-2023-28642][] is a different attack vector using the same regression
as in [CVE-2023-27561][]. This was reported by Lei Wang.
[CVE-2019-19921]: https://github.com/advisories/GHSA-fh74-hm69-rqjw
[CVE-2023-25809]: https://github.com/opencontainers/runc/security/advisories/GHSA-m8cg-xc2p-r3fc
[CVE-2023-27561]: https://github.com/advisories/GHSA-vpvm-3wq2-2wvm
[CVE-2023-28642]: https://github.com/opencontainers/runc/security/advisories/GHSA-g2j6-57v7-gm8c
### Fixed
* Fix the inability to use `/dev/null` when inside a container. (#3620)
* Fix changing the ownership of host's `/dev/null` caused by fd redirection
(a regression in 1.1.1). (#3674, #3731)
* Fix rare runc exec/enter unshare error on older kernels, including
CentOS < 7.7. (#3776)
* nsexec: Check for errors in `write_log()`. (#3721)
* Various CI fixes and updates. (#3618, #3630, #3640, #3729)
## [1.1.4] - 2022-08-24
> If you look for perfection, you'll never be content.
### Fixed
* Fix mounting via wrong proc fd.
When the user and mount namespaces are used, and the bind mount is followed by
the cgroup mount in the spec, the cgroup was mounted using the bind mount's
mount fd. (#3511)
* Switch `kill()` in `libcontainer/nsenter` to `sane_kill()`. (#3536)
* Fix "permission denied" error from `runc run` on `noexec` fs. (#3541)
* Fix failed exec after `systemctl daemon-reload`.
Due to a regression in v1.1.3, the `DeviceAllow=char-pts rwm` rule was no
longer added and was causing an error `open /dev/pts/0: operation not permitted: unknown`
when systemd was reloaded. (#3554)
* Various CI fixes. (#3538, #3558, #3562)
## [1.1.3] - 2022-06-09
> In the beginning there was nothing, which exploded.
### Fixed
* Our seccomp `-ENOSYS` stub now correctly handles multiplexed syscalls on
s390 and s390x. This solves the issue where syscalls the host kernel did not
support would return `-EPERM` despite the existence of the `-ENOSYS` stub
code (this was due to how s390x does syscall multiplexing). (#3478)
* Retry on dbus disconnect logic in libcontainer/cgroups/systemd now works as
intended; this fix does not affect runc binary itself but is important for
libcontainer users such as Kubernetes. (#3476)
* Inability to compile with recent clang due to an issue with duplicate
constants in libseccomp-golang. (#3477)
* When using systemd cgroup driver, skip adding device paths that don't exist,
to stop systemd from emitting warnings about those paths. (#3504)
* Socket activation was failing when more than 3 sockets were used. (#3494)
* Various CI fixes. (#3472, #3479)
### Added
* Allow bind-mounting /proc/sys/kernel/ns_last_pid inside a container. (#3493)
### Changed
* runc static binaries are now linked against libseccomp v2.5.4. (#3481)
## [1.1.2] - 2022-05-11
> I should think I'm going to be a perpetual student.
### Security
* A bug was found in runc where runc exec --cap executed processes with
non-empty inheritable Linux process capabilities, creating an atypical Linux
environment. For more information, see [GHSA-f3fp-gc8g-vw66][] and
CVE-2022-29162.
### Changed
* `runc spec` no longer sets any inheritable capabilities in the created
example OCI spec (`config.json`) file.
[GHSA-f3fp-gc8g-vw66]: https://github.com/opencontainers/runc/security/advisories/GHSA-f3fp-gc8g-vw66
## [1.1.1] - 2022-03-28
> Violence is the last refuge of the incompetent.
### Added
* CI is now also run on centos-stream-9. (#3436)
### Fixed
* `runc run/start` can now run a container with read-only `/dev` in OCI spec,
rather than error out. (#3355)
* `runc exec` now ensures that `--cgroup` argument is a sub-cgroup. (#3403)
* libcontainer systemd v2 manager no longer errors out if one of the files
listed in `/sys/kernel/cgroup/delegate` does not exist in container's cgroup.
(#3387, #3404)
* Loosen OCI spec validation to avoid bogus "Intel RDT is not supported" error.
(#3406)
* libcontainer/cgroups no longer panics in cgroup v1 managers if `stat`
of `/sys/fs/cgroup/unified` returns an error other than ENOENT. (#3435)
## [1.1.0] - 2022-01-14
> A plan depends as much upon execution as it does upon concept.
## Changed
### Changed
* libcontainer will now refuse to build without the nsenter package being
correctly compiled (specifically this requires CGO to be enabled). This
should avoid folks accidentally creating broken runc binaries (and
incorrectly importing our internal libraries into their projects). (#3331)
## [1.1.0-rc.1] - 2021-12-14
> He who controls the spice controls the universe.
@ -41,7 +308,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
binary etc.) and failures of the command being executed. (#3073)
* runc run: new `--keep` option to skip removal of exited containers' artefacts.
This might be useful to check the state (e.g. of cgroup controllers) after
the container hasexited. (#2817, #2825)
the container has exited. (#2817, #2825)
* seccomp: add support for `SCMP_ACT_KILL_PROCESS` and `SCMP_ACT_KILL_THREAD`
(the latter is just an alias for `SCMP_ACT_KILL`). (#3204)
* seccomp: add support for `SCMP_ACT_NOTIFY` (seccomp actions). This allows
@ -130,13 +397,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
* Fixed inability to start a container with read-write bind mount of a
read-only fuse host mount. (#3283, #3292)
* Fixed inability to start when read-only /dev in set in spec (#3276, #3277)
* Fixed inability to start when read-only /dev is set in spec. (#3276, #3277)
* Fixed not removing sub-cgroups upon container delete, when rootless cgroup v2
is used with older systemd. (#3226, #3297)
* Fixed returning error from GetStats when hugetlb is unsupported (which causes
excessive logging for Kubernetes). (#3233, #3295)
* Improved an error message when dbus-user-session is not installed and
rootless + cgroup2 + systemd are used (#3212)
rootless + cgroup2 + systemd are used. (#3212)
[GHSA-v95c-p5hm-xq8f]: https://github.com/opencontainers/runc/security/advisories/GHSA-v95c-p5hm-xq8f
@ -216,7 +483,7 @@ implementation (libcontainer) is *not* covered by this policy.
code, optimize the method for checking whether a cgroup is frozen. (#2955)
* cgroups/systemd: fixed "retry on dbus disconnect" logic introduced in rc94
* cgroups/systemd: fixed returning "unit already exists" error from a systemd
cgroup manager (regression in rc94) (#2997, #2996)
cgroup manager (regression in rc94). (#2997, #2996)
### Added
* cgroupv2: support SkipDevices with systemd driver. (#2958, #3019)
@ -225,7 +492,7 @@ implementation (libcontainer) is *not* covered by this policy.
(#3022)
### Changed
* cgroup/systemd: return, not ignore, stop unit error from Destroy (#2946)
* cgroup/systemd: return, not ignore, stop unit error from Destroy. (#2946)
* Fix all golangci-lint failures. (#2781, #2962)
* Make `runc --version` output sane even when built with `go get` or
otherwise outside of our build scripts. (#2962)
@ -244,5 +511,17 @@ implementation (libcontainer) is *not* covered by this policy.
[1.0.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.0.1
<!-- 1.1.z patch releases -->
[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.0...release-1.1
[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.12...release-1.1
[1.1.12]: https://github.com/opencontainers/runc/compare/v1.1.11...v1.1.12
[1.1.11]: https://github.com/opencontainers/runc/compare/v1.1.10...v1.1.11
[1.1.10]: https://github.com/opencontainers/runc/compare/v1.1.9...v1.1.10
[1.1.9]: https://github.com/opencontainers/runc/compare/v1.1.8...v1.1.9
[1.1.8]: https://github.com/opencontainers/runc/compare/v1.1.7...v1.1.8
[1.1.7]: https://github.com/opencontainers/runc/compare/v1.1.6...v1.1.7
[1.1.6]: https://github.com/opencontainers/runc/compare/v1.1.5...v1.1.6
[1.1.5]: https://github.com/opencontainers/runc/compare/v1.1.4...v1.1.5
[1.1.4]: https://github.com/opencontainers/runc/compare/v1.1.3...v1.1.4
[1.1.3]: https://github.com/opencontainers/runc/compare/v1.1.2...v1.1.3
[1.1.2]: https://github.com/opencontainers/runc/compare/v1.1.1...v1.1.2
[1.1.1]: https://github.com/opencontainers/runc/compare/v1.1.0...v1.1.1
[1.1.0-rc.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.1.0-rc.1

View File

@ -1,6 +1,6 @@
ARG GO_VERSION=1.17
ARG BATS_VERSION=v1.3.0
ARG LIBSECCOMP_VERSION=2.5.3
ARG GO_VERSION=1.20
ARG BATS_VERSION=v1.9.0
ARG LIBSECCOMP_VERSION=2.5.4
FROM golang:${GO_VERSION}-bullseye
ARG DEBIAN_FRONTEND=noninteractive
@ -9,19 +9,16 @@ ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debi
RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \
&& echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \
&& dpkg --add-architecture armel \
&& dpkg --add-architecture armhf \
&& dpkg --add-architecture arm64 \
&& dpkg --add-architecture ppc64el \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
build-essential \
criu \
crossbuild-essential-arm64 \
crossbuild-essential-armel \
crossbuild-essential-armhf \
crossbuild-essential-ppc64el \
crossbuild-essential-s390x \
gcc-aarch64-linux-gnu libc-dev-arm64-cross \
gcc-arm-linux-gnueabi libc-dev-armel-cross \
gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
gcc-s390x-linux-gnu libc-dev-s390x-cross \
gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
curl \
gawk \
gcc \
@ -54,11 +51,18 @@ RUN cd /tmp \
# install libseccomp
ARG LIBSECCOMP_VERSION
COPY script/* /tmp/script/
COPY script/seccomp.sh script/lib.sh /tmp/script/
RUN mkdir -p /opt/libseccomp \
&& /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le s390x
&& /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le riscv64 s390x
ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION
ENV LD_LIBRARY_PATH=/opt/libseccomp/lib
ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig
# Prevent the "fatal: detected dubious ownership in repository" git complaint during build.
RUN git config --global --add safe.directory /go/src/github.com/opencontainers/runc
WORKDIR /go/src/github.com/opencontainers/runc
# Fixup for cgroup v2.
COPY script/prepare-cgroup-v2.sh /
ENTRYPOINT [ "/prepare-cgroup-v2.sh" ]

View File

@ -10,23 +10,51 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
PROJECT := github.com/opencontainers/runc
BUILDTAGS ?= seccomp
COMMIT ?= $(shell git describe --dirty --long --always)
VERSION := $(shell cat ./VERSION)
LDFLAGS_COMMON := -X main.gitCommit=$(COMMIT) -X main.version=$(VERSION)
ifeq ($(shell $(GO) env GOOS),linux)
ifeq (,$(filter $(shell $(GO) env GOARCH),mips mipsle mips64 mips64le ppc64))
ifeq (,$(findstring -race,$(EXTRA_FLAGS)))
GO_BUILDMODE := "-buildmode=pie"
endif
GOARCH := $(shell $(GO) env GOARCH)
GO_BUILDMODE :=
# Enable dynamic PIE executables on supported platforms.
ifneq (,$(filter $(GOARCH),386 amd64 arm arm64 ppc64le riscv64 s390x))
ifeq (,$(findstring -race,$(EXTRA_FLAGS)))
GO_BUILDMODE := "-buildmode=pie"
endif
endif
GO_BUILD := $(GO) build -trimpath $(GO_BUILDMODE) $(EXTRA_FLAGS) -tags "$(BUILDTAGS)" \
-ldflags "-X main.gitCommit=$(COMMIT) -X main.version=$(VERSION) $(EXTRA_LDFLAGS)"
GO_BUILD_STATIC := CGO_ENABLED=1 $(GO) build -trimpath $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" \
-ldflags "-extldflags -static -X main.gitCommit=$(COMMIT) -X main.version=$(VERSION) $(EXTRA_LDFLAGS)"
GO_BUILD := $(GO) build -trimpath $(GO_BUILDMODE) \
$(EXTRA_FLAGS) -tags "$(BUILDTAGS)" \
-ldflags "$(LDFLAGS_COMMON) $(EXTRA_LDFLAGS)"
GO_BUILDMODE_STATIC :=
LDFLAGS_STATIC := -extldflags -static
# Enable static PIE executables on supported platforms.
# This (among other things) requires libc support (rcrt1.o), which seems
# to be available only for arm64 and amd64 (Debian Bullseye).
ifneq (,$(filter $(GOARCH),arm64 amd64))
ifeq (,$(findstring -race,$(EXTRA_FLAGS)))
GO_BUILDMODE_STATIC := -buildmode=pie
LDFLAGS_STATIC := -linkmode external -extldflags --static-pie
endif
endif
# Enable static PIE binaries on supported platforms.
GO_BUILD_STATIC := $(GO) build -trimpath $(GO_BUILDMODE_STATIC) \
$(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" \
-ldflags "$(LDFLAGS_COMMON) $(LDFLAGS_STATIC) $(EXTRA_LDFLAGS)"
GPG_KEYID ?= asarai@suse.de
# Some targets need cgo, which is disabled by default when cross compiling.
# Enable cgo explicitly for those.
# Both runc and libcontainer/integration need libcontainer/nsenter.
runc static localunittest: export CGO_ENABLED=1
# seccompagent needs libseccomp (when seccomp build tag is set).
ifneq (,$(filter $(BUILDTAGS),seccomp))
seccompagent: export CGO_ENABLED=1
endif
.DEFAULT: runc
runc:
@ -40,7 +68,7 @@ recvtty sd-helper seccompagent:
static:
$(GO_BUILD_STATIC) -o runc .
releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a s390x"
releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
releaseall: release
release: runcimage
@ -50,7 +78,7 @@ release: runcimage
$(RUNC_IMAGE) make localrelease
script/release_sign.sh -S $(GPG_KEYID) -r release/$(VERSION) -v $(VERSION)
localrelease:
localrelease: verify-changelog
script/release_build.sh -r release/$(VERSION) -v $(VERSION) $(RELEASE_ARGS)
dbuild: runcimage
@ -133,26 +161,39 @@ cfmt:
shellcheck:
shellcheck tests/integration/*.bats tests/integration/*.sh \
tests/integration/*.bash tests/*.sh \
script/release_*.sh script/seccomp.sh script/lib.sh
# TODO: add shellcheck for more sh files
man/*.sh script/*
# TODO: add shellcheck for more sh files (contrib/completions/bash/runc).
shfmt:
shfmt -ln bats -d -w tests/integration/*.bats
shfmt -ln bash -d -w man/*.sh script/* tests/*.sh tests/integration/*.bash
$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
--rm -v $(CURDIR):/src -w /src \
mvdan/shfmt:v3.5.1 -d -w .
localshfmt:
shfmt -d -w .
vendor:
$(GO) mod tidy
$(GO) mod vendor
$(GO) mod verify
verify-changelog:
# No space at EOL.
! grep -n '\s$$' CHANGELOG.md
# Period before issue/PR references.
! grep -n '[0-9a-zA-Z][^.] (#[1-9][0-9, #]*)$$' CHANGELOG.md
verify-dependencies: vendor
@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
&& echo "all vendor files are up to date."
validate-keyring:
script/keyring_validate.sh
.PHONY: runc all recvtty sd-helper seccompagent static releaseall release \
localrelease dbuild lint man runcimage \
test localtest unittest localunittest integration localintegration \
rootlessintegration localrootlessintegration shell install install-bash \
install-man clean cfmt shfmt shellcheck \
vendor verify-dependencies
install-man clean cfmt shfmt localshfmt shellcheck \
vendor verify-changelog verify-dependencies validate-keyring

View File

@ -1,10 +1,11 @@
# runc
[![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
[![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
[![Go Reference](https://pkg.go.dev/badge/github.com/opencontainers/runc.svg)](https://pkg.go.dev/github.com/opencontainers/runc)
[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588)
[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate)
[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci)
[![CirrusCI](https://api.cirrus-ci.com/github/opencontainers/runc.svg)](https://cirrus-ci.com/github/opencontainers/runc)
## Introduction
@ -14,6 +15,8 @@
You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.
All releases are signed by one of the keys listed in the [`runc.keyring` file in the root of this repository](runc.keyring).
## Security
The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
@ -23,7 +26,7 @@ A third party security audit was performed by Cure53, you can see the full repor
## Building
`runc` only supports Linux. It must be built with Go version 1.16 or higher.
`runc` only supports Linux. It must be built with Go version 1.17 or higher.
In order to enable seccomp support you will need to install `libseccomp` on your platform.
> e.g. `libseccomp-devel` for CentOS, or `libseccomp-dev` for Ubuntu

View File

@ -1 +1 @@
1.1.0
1.1.12

View File

@ -3,7 +3,7 @@
Vagrant.configure("2") do |config|
# Fedora box is used for testing cgroup v2 support
config.vm.box = "fedora/35-cloud-base"
config.vm.box = "fedora/38-cloud-base"
config.vm.provider :virtualbox do |v|
v.memory = 2048
v.cpus = 2
@ -29,6 +29,9 @@ EOF
done
dnf clean all
# Prevent the "fatal: unsafe repository" git complaint during build.
git config --global --add safe.directory /vagrant
# Add a user for rootless tests
useradd -u2000 -m -d/home/rootless -s/bin/bash rootless

View File

@ -12,7 +12,7 @@ fi
# exits when not running inside bats. We could work around that with hacks, but
# simply redefining update_config() seems clearer. We don't even really need to keep them in sync.
function update_config() {
jq "$1" "./config.json" | awk 'BEGIN{RS="";getline<"-";print>ARGV[1]}' "./config.json"
jq "$1" "./config.json" | awk 'BEGIN{RS="";getline<"-";print>ARGV[1]}' "./config.json"
}
update_config '.linux.seccomp = {

29
debian/changelog vendored
View File

@ -1,29 +0,0 @@
runc (1.1.0-ok3) yangtze; urgency=medium
* CVE-2022-29162 安全修复
-- chenxinquan <chenxinquan@kylinos.cn> Fri, 28 Jul 2023 16:16:46 +0800
runc (1.1.0-ok2) yangtze; urgency=medium
* yangfs215 CVE-2022-29162 runc: do not set inheritable capabilities
-- yangfengsheng <yangfs@whu.edu.cn> Tue, 18 Jul 2023 00:10:28 +0800
runc (1.1.0-ok1) yangtze; urgency=medium
* Merge new upstream version 1.1.0
-- Luoyaoming <luoyaoming@kylinos.cn> Fri, 30 Dec 2022 11:11:29 +0800
runc (1.0.0~rc10-ok2) yangtze; urgency=medium
* Update version.
-- zhouganqing <zhouganqing@kylinos.cn> Thu, 28 Jul 2022 16:49:00 +0800
runc (1.0.0~rc10-ok1) yangtze; urgency=medium
* Build for openKylin.
-- openKylinBot <openKylinBot@openkylin.com> Mon, 25 Apr 2022 22:03:04 +0800

17
debian/clean vendored
View File

@ -1,17 +0,0 @@
## Remove generated man pages:
man/man8/*
## Drop hanging test (introduced in 0.0.9).
## https://github.com/opencontainers/runc/issues/692
libcontainer/nsenter/nsenter_test.go
## Failing tests:
## Privileged tests:
### couldn't get cgroup root: mountpoint for cgroup not found
libcontainer/cgroups/fs/apply_raw_test.go
### FAIL: TestXattr (0.00s)
### xattr_test.go:26: Success
### xattr_test.go:30: failed
libcontainer/xattr/xattr_test.go

1
debian/compat vendored
View File

@ -1 +0,0 @@
10

43
debian/control vendored
View File

@ -1,43 +0,0 @@
Source: runc
Section: devel
Priority: optional
Maintainer: Openkylin Developers <packaging@lists.openkylin.top>
XSBC-Original-Maintainer: Debian Go Packaging Team <pkg-go-maintainers@lists.alioth.debian.org>
Uploaders: Alexandre Viau <aviau@debian.org>,
Dmitry Smirnov <onlyjob@debian.org>,
Tim Potter <tpot@hpe.com>
Build-Depends: debhelper (>= 11~),
dh-golang,
go-md2man,
golang-any,
libapparmor-dev,
libseccomp-dev,
pkg-config,
protobuf-compiler
Standards-Version: 4.1.4
Homepage: https://github.com/opencontainers/runc
Vcs-Git: https://salsa.debian.org/go-team/packages/runc.git
Vcs-Browser: https://salsa.debian.org/go-team/packages/runc
XS-Go-Import-Path: github.com/opencontainers/runc
Package: runc
Architecture: any
Depends: ${misc:Depends}, ${shlibs:Depends}
Breaks: docker.io (<= 1.13.1~ds1-0)
Built-Using: ${misc:Built-Using}
Description: Open Container Project - runtime
"runc" is a command line client for running applications packaged according
to the Open Container Format (OCF) and is a compliant implementation of
the Open Container Project specification.
Package: golang-github-opencontainers-runc-dev
Architecture: all
Depends: ${misc:Depends}
Description: Open Container Project - development files
"runc" is a command line client for running applications packaged according
to the Open Container Format (OCF) and is a compliant implementation of
the Open Container Project specification.
.
This package provides development files formerly known as
"github.com/docker/libcontainer".

82
debian/copyright vendored
View File

@ -1,82 +0,0 @@
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
Upstream-Name: runc
Source: https://github.com/opencontainers/runc
Files: *
Copyright: 2012-2015 Docker, Inc.
License: Apache-2.0
Files:
vendor/github.com/cyphar/filepath-securejoin/*
Copyright:
2014-2015 Docker Inc & Go Authors. All rights reserved.
2017 SUSE LLC. All rights reserved.
License: BSD-3-Clause~Google
Files: debian/*
Copyright:
2015 Alexandre Viau <alexandre@alexandreviau.net>
2015-2016 Dmitry Smirnov <onlyjob@debian.org>
License: GPL-3+
Files: debian/patches/*
Copyright: 2015 Dmitry Smirnov <onlyjob@debian.org>
License: GPL-3+ or Apache-2.0
Comment: patches can be licensed under the same terms as upstream.
License: Apache-2.0
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
.
http://www.apache.org/licenses/LICENSE-2.0
.
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
.
The complete text of the Apache version 2.0 license
can be found in "/usr/share/common-licenses/Apache-2.0".
License: GPL-3+
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
The complete text of the GNU General Public License version 3
can be found in "/usr/share/common-licenses/GPL-3".
License: BSD-3-Clause~Google
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
.
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

28
debian/gitlab-ci.yml vendored
View File

@ -1,28 +0,0 @@
# auto-generated, DO NOT MODIFY.
# The authoritative copy of this file lives at:
# https://salsa.debian.org/go-team/ci/blob/master/cmd/ci/gitlabciyml.go
# TODO: publish under debian-go-team/ci
image: stapelberg/ci2
test_the_archive:
artifacts:
paths:
- before-applying-commit.json
- after-applying-commit.json
script:
# Create an overlay to discard writes to /srv/gopath/src after the build:
- "rm -rf /cache/overlay/{upper,work}"
- "mkdir -p /cache/overlay/{upper,work}"
- "mount -t overlay overlay -o lowerdir=/srv/gopath/src,upperdir=/cache/overlay/upper,workdir=/cache/overlay/work /srv/gopath/src"
- "export GOPATH=/srv/gopath"
- "export GOCACHE=/cache/go"
# Build the world as-is:
- "ci-build -exemptions=/var/lib/ci-build/exemptions.json > before-applying-commit.json"
# Copy this package into the overlay:
- "GBP_CONF_FILES=:debian/gbp.conf gbp buildpackage --git-no-pristine-tar --git-ignore-branch --git-ignore-new --git-export-dir=/tmp/export --git-no-overlay --git-tarball-dir=/nonexistant --git-cleaner=/bin/true --git-builder='dpkg-buildpackage -S -d --no-sign'"
- "pgt-gopath -dsc /tmp/export/*.dsc"
# Rebuild the world:
- "ci-build -exemptions=/var/lib/ci-build/exemptions.json > after-applying-commit.json"
- "ci-diff before-applying-commit.json after-applying-commit.json"

View File

@ -1 +0,0 @@
usr/share/gocode/src

View File

@ -1,39 +0,0 @@
From: Dmitry Smirnov <onlyjob@debian.org>
Date: Thu, 28 Jul 2022 16:28:22 +0800
Subject: fix FTBFS on i686
src/github.com/opencontainers/runc/libcontainer/user/user_test.go:448:36: constant 2147483648 overflows int
Last-Update: 2018-06-16
Forwarded: https://github.com/opencontainers/runc/pull/1821
Bug-Upstream: https://github.com/opencontainers/runc/issues/941
---
libcontainer/user/user.go | 2 +-
libcontainer/user/user_test.go | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/libcontainer/user/user.go b/libcontainer/user/user.go
index 7b912bb..38caded 100644
--- a/libcontainer/user/user.go
+++ b/libcontainer/user/user.go
@@ -473,7 +473,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
return nil, fmt.Errorf("Unable to find group %s", ag)
}
// Ensure gid is inside gid range.
- if gid < minId || gid > maxId {
+ if gid < minId || gid >= maxId {
return nil, ErrRange
}
gidMap[gid] = struct{}{}
diff --git a/libcontainer/user/user_test.go b/libcontainer/user/user_test.go
index 24ee559..a4aabdc 100644
--- a/libcontainer/user/user_test.go
+++ b/libcontainer/user/user_test.go
@@ -445,7 +445,7 @@ this is just some garbage data
if utils.GetIntSize() > 4 {
tests = append(tests, foo{
// groups with too large id
- groups: []string{strconv.Itoa(1 << 31)},
+ groups: []string{strconv.Itoa( 1<<31 -1 )},
expected: nil,
hasError: true,
})

View File

@ -1,48 +0,0 @@
From: Dmitry Smirnov <onlyjob@debian.org>
Date: Thu, 28 Jul 2022 16:28:22 +0800
Subject: disabled unreliable tests due to random failures on [ppc64el,
s390x].
Last-Update: 2018-09-27
Forwarded: not-needed
Bug-Upstream: https://github.com/opencontainers/runc/issues/1822
---
libcontainer/cgroups/fs/hugetlb_test.go | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/libcontainer/cgroups/fs/hugetlb_test.go b/libcontainer/cgroups/fs/hugetlb_test.go
index 9ddacfe..9b60650 100644
--- a/libcontainer/cgroups/fs/hugetlb_test.go
+++ b/libcontainer/cgroups/fs/hugetlb_test.go
@@ -89,6 +89,7 @@ func TestHugetlbStats(t *testing.T) {
}
func TestHugetlbStatsNoUsageFile(t *testing.T) {
+t.Skip("Disabled unreliable test")
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{
@@ -104,6 +105,7 @@ func TestHugetlbStatsNoUsageFile(t *testing.T) {
}
func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
+t.Skip("Disabled unreliable test")
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
for _, pageSize := range HugePageSizes {
@@ -121,6 +123,7 @@ func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
}
func TestHugetlbStatsBadUsageFile(t *testing.T) {
+t.Skip("Disabled unreliable test")
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
for _, pageSize := range HugePageSizes {
@@ -139,6 +142,7 @@ func TestHugetlbStatsBadUsageFile(t *testing.T) {
}
func TestHugetlbStatsBadMaxUsageFile(t *testing.T) {
+t.Skip("Disabled unreliable test")
helper := NewCgroupTestUtil("hugetlb", t)
defer helper.cleanup()
helper.writeFileContents(map[string]string{

View File

@ -1,22 +0,0 @@
From: Dmitry Smirnov <onlyjob@debian.org>
Date: Thu, 28 Jul 2022 16:28:22 +0800
Subject: disable test (requires root)
Last-Update: 2018-06-15
Forwarded: not-needed
---
libcontainer/factory_linux_test.go | 1 +
1 file changed, 1 insertion(+)
diff --git a/libcontainer/factory_linux_test.go b/libcontainer/factory_linux_test.go
index 8d0ca8a..1dc0180 100644
--- a/libcontainer/factory_linux_test.go
+++ b/libcontainer/factory_linux_test.go
@@ -78,6 +78,7 @@ func TestFactoryNewIntelRdt(t *testing.T) {
}
func TestFactoryNewTmpfs(t *testing.T) {
+t.Skip("DM - skipping privileged test")
root, rerr := newTestRoot()
if rerr != nil {
t.Fatal(rerr)

26
debian/rules vendored
View File

@ -1,26 +0,0 @@
#!/usr/bin/make -f
# Debian build rules for the runc package: every target is handed to dh
# (see the catch-all rule below), with a few dh_auto_* steps overridden.
# Uncomment this to turn on verbose mode.
#export DH_VERBOSE=1
# Both variables are exported to the environment of the commands run below;
# DH_GOPKG is also used in this file to locate the package inside _build.
# NOTE(review): presumably DH_GOPKG is the Go import path and
# DH_GOLANG_INSTALL_EXTRA lists extra non-Go paths for dh-golang to install --
# confirm against the dh-golang documentation.
export DH_GOPKG := github.com/opencontainers/runc
export DH_GOLANG_INSTALL_EXTRA := libcontainer/seccomp/fixtures libcontainer/criurpc
# Build tags passed via -tags to both dh_auto_build and dh_auto_test.
TAGS=apparmor seccomp selinux ambient
# Catch-all: run the requested dh sequence with the golang build system,
# using _build as the build directory.
%:
dh $@ --buildsystem=golang --with=golang --builddirectory=_build
# Generate the man pages with man/md2man-all.sh, then run the regular
# configure step, then prune LICENSE files from the vendored docker tree.
override_dh_auto_configure:
cd man && ./md2man-all.sh
dh_auto_configure
## Remove extra license files:
$(RM) -v \
_build/src/$(DH_GOPKG)/vendor/github.com/docker/docker/*/*/LICENSE* \
;
# Build with the tag set defined in TAGS.
override_dh_auto_build:
dh_auto_build -- -tags "$(TAGS)"
# Run the tests with the same tags; DH_GOLANG_EXCLUDES is set to
# libcontainer/integration for this invocation only.
override_dh_auto_test:
DH_GOLANG_EXCLUDES="libcontainer/integration" \
dh_auto_test -- -tags "$(TAGS)"

2
debian/runc.docs vendored
View File

@ -1,2 +0,0 @@
NOTICE
README*

1
debian/runc.install vendored
View File

@ -1 +0,0 @@
usr/bin/* /usr/sbin/

View File

@ -1 +0,0 @@
runc: spelling-error-in-binary

View File

@ -1 +0,0 @@
man/man8/*.8

View File

@ -1 +0,0 @@
3.0 (native)

View File

@ -1,2 +0,0 @@
# Result of Files-Excluded:
source-contains-empty-directory vendor/*

View File

@ -1,34 +0,0 @@
#!/bin/bash
# Smoke test for an installed runc: builds a throwaway bundle whose rootfs
# contains only a busybox binary, runs `echo success` in it through runc, and
# succeeds only if the captured output is exactly "success".
# Abort on any failing command, unset variable, or failing pipeline stage.
set -Eeuo pipefail
# Trace every command as it runs.
set -x
# Fails early (via errexit) if runc cannot be executed.
runc --version
# Scratch directory holding the bundle; removed when the script exits.
tempDir="$(mktemp -d)"
trap 'rm -rf "$tempDir"' EXIT
# build up rootfs with busybox
busybox="$(which busybox)" # from busybox-static
mkdir "$tempDir/rootfs"
cp -a "$busybox" "$tempDir/rootfs/"
# rough "rootfs" smoke test (makes sure "busybox" is actually static)
chroot "$tempDir/rootfs" /busybox true
# make a config.json file for our "bundle"
runc spec --bundle "$tempDir"
# edit the default command to something we can actually run with our rootfs
# (with errexit on, each grep acts as an assertion: the first checks the text
# to be replaced is present, the last checks the replacement took effect)
grep '"sh"' "$tempDir/config.json"
sed -i 's@"sh"@"/busybox","echo","success"@g' "$tempDir/config.json"
grep '"/busybox","echo","success"' "$tempDir/config.json"
# and disable the TTY
# (same grep / sed / grep assert-edit-assert pattern as above)
grep '"terminal": true,' "$tempDir/config.json"
sed -i 's/"terminal": true,/"terminal": false,/g' "$tempDir/config.json"
grep '"terminal": false,' "$tempDir/config.json"
# run it and capture the output
# (the container id combines the shell PID and $RANDOM)
output="$(runc run --bundle "$tempDir" "test-$$-$RANDOM")"
# ensure the output was exactly what we expected
# (as the last command, this test's status becomes the script's exit status)
[ "$output" = 'success' ]

View File

@ -1,7 +0,0 @@
Tests: basic-smoke
Depends: busybox-static, @
Restrictions: allow-stderr, isolation-machine, needs-root
Test-Command: /usr/bin/dh_golang_autopkgtest
Depends: @, @builddeps@, dh-golang
Restrictions: allow-stderr, isolation-machine

9
debian/watch vendored
View File

@ -1,9 +0,0 @@
version=3
opts=\
repack,\
repacksuffix=+dfsg1,\
uversionmangle=s/-rc/~rc/,\
dversionmangle=s/[~+]dfsg\d*$// \
https://github.com/opencontainers/runc/releases \
.*archive/v?(\d\.\d\.\d.*)\.tar\.gz

View File

@ -123,8 +123,8 @@ The above will set the following properties:
* `TimeoutStopSec` to 2 minutes and 3 seconds;
* `CollectMode` to "inactive-or-failed".
The values must be in the gvariant format (for details, see
[gvariant documentation](https://developer.gnome.org/glib/stable/gvariant-text.html)).
The values must be in the gvariant text format, as described in
[gvariant documentation](https://docs.gtk.org/glib/gvariant-text.html).
To find out which type systemd expects for a particular parameter, please
consult systemd sources.

19
go.mod
View File

@ -1,26 +1,33 @@
module github.com/opencontainers/runc
go 1.16
go 1.17
require (
github.com/checkpoint-restore/go-criu/v5 v5.3.0
github.com/cilium/ebpf v0.7.0
github.com/containerd/console v1.0.3
github.com/coreos/go-systemd/v22 v22.3.2
github.com/cyphar/filepath-securejoin v0.2.3
github.com/cyphar/filepath-securejoin v0.2.4
github.com/docker/go-units v0.4.0
github.com/godbus/dbus/v5 v5.0.6
github.com/moby/sys/mountinfo v0.5.0
github.com/mrunalp/fileutils v0.5.0
github.com/mrunalp/fileutils v0.5.1
github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
github.com/opencontainers/selinux v1.10.0
github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921
github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646
github.com/sirupsen/logrus v1.8.1
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
// NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092
github.com/urfave/cli v1.22.1
github.com/vishvananda/netlink v1.1.0
golang.org/x/net v0.0.0-20201224014010-6772e930b67b
golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c
golang.org/x/net v0.8.0
golang.org/x/sys v0.6.0
google.golang.org/protobuf v1.27.1
)
require (
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d // indirect
github.com/russross/blackfriday/v2 v2.0.1 // indirect
github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df // indirect
)

48
go.sum
View File

@ -9,8 +9,8 @@ github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzA
github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY=
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI=
github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4=
github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg=
github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw=
@ -31,8 +31,8 @@ github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9ObI=
github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU=
github.com/mrunalp/fileutils v0.5.0 h1:NKzVxiH7eSk+OQ4M+ZYW1K6h27RUV3MI6NUTsHhU6Z4=
github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q=
github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 h1:3snG66yBm59tKhhSPQrQ/0bCrv1LQbKt40LnUPiUxdc=
github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU=
@ -41,8 +41,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921 h1:58EBmR2dMNL2n/FnbQewK3D14nXr0V9CObDSvMJLq+Y=
github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg=
github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646 h1:RpforrEYXWkmGwJHIGnLZ3tTWStkjVVstwzNGqxX2Ds=
github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg=
github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE=
@ -57,20 +57,48 @@ github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJ
github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE=
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k=
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU=
golang.org/x/net v0.0.0-20201224014010-6772e930b67b h1:iFwSg7t5GZmB/Q5TjiEAsdoLDrdJRC1RiF2WhuV29Qw=
golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c h1:DHcbWVXeY+0Y8HHKR+rbLwnoh2F4tNCY7rTiHJ30RmA=
golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=

View File

@ -1,6 +1,6 @@
# libcontainer
[![GoDoc](https://godoc.org/github.com/opencontainers/runc/libcontainer?status.svg)](https://godoc.org/github.com/opencontainers/runc/libcontainer)
[![Go Reference](https://pkg.go.dev/badge/github.com/opencontainers/runc/libcontainer.svg)](https://pkg.go.dev/github.com/opencontainers/runc/libcontainer)
Libcontainer provides a native Go implementation for creating containers
with namespaces, cgroups, capabilities, and filesystem access controls.

View File

@ -153,8 +153,7 @@ func TestDeviceFilter_Privileged(t *testing.T) {
Allow: true,
},
}
expected :=
`
expected := `
// load parameters into registers
0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
1: And32Imm dst: r2 imm: 65535

View File

@ -93,7 +93,7 @@ var (
)
// Loosely based on the BPF_F_REPLACE support check in
// <https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go>.
// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
//
// TODO: move this logic to cilium/ebpf
func haveBpfProgReplace() bool {

View File

@ -10,6 +10,7 @@ import (
"strings"
"sync"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@ -76,16 +77,16 @@ var (
// TestMode is set to true by unit tests that need "fake" cgroupfs.
TestMode bool
cgroupFd int = -1
prepOnce sync.Once
prepErr error
resolveFlags uint64
cgroupRootHandle *os.File
prepOnce sync.Once
prepErr error
resolveFlags uint64
)
func prepareOpenat2() error {
prepOnce.Do(func() {
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
Flags: unix.O_DIRECTORY | unix.O_PATH,
Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC,
})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
@ -96,15 +97,16 @@ func prepareOpenat2() error {
}
return
}
file := os.NewFile(uintptr(fd), cgroupfsDir)
var st unix.Statfs_t
if err = unix.Fstatfs(fd, &st); err != nil {
if err := unix.Fstatfs(int(file.Fd()), &st); err != nil {
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
logrus.Warnf("falling back to securejoin: %s", prepErr)
return
}
cgroupFd = fd
cgroupRootHandle = file
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
if st.Type == unix.CGROUP2_SUPER_MAGIC {
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
@ -122,7 +124,7 @@ func openFile(dir, file string, flags int) (*os.File, error) {
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
path := path.Join(dir, file)
path := path.Join(dir, utils.CleanPath(file))
if prepareOpenat2() != nil {
return openFallback(path, flags, mode)
}
@ -131,7 +133,7 @@ func openFile(dir, file string, flags int) (*os.File, error) {
return openFallback(path, flags, mode)
}
fd, err := unix.Openat2(cgroupFd, relPath,
fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath,
&unix.OpenHow{
Resolve: resolveFlags,
Flags: uint64(flags) | unix.O_CLOEXEC,
@ -139,20 +141,20 @@ func openFile(dir, file string, flags int) (*os.File, error) {
})
if err != nil {
err = &os.PathError{Op: "openat2", Path: path, Err: err}
// Check if cgroupFd is still opened to cgroupfsDir
// Check if cgroupRootHandle is still opened to cgroupfsDir
// (happens when this package is incorrectly used
// across the chroot/pivot_root/mntns boundary, or
// when /sys/fs/cgroup is remounted).
//
// TODO: if such usage will ever be common, amend this
// to reopen cgroupFd and retry openat2.
fdStr := strconv.Itoa(cgroupFd)
// to reopen cgroupRootHandle and retry openat2.
fdStr := strconv.Itoa(int(cgroupRootHandle.Fd()))
fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
if fdDest != cgroupfsDir {
// Wrap the error so it is clear that cgroupFd
// Wrap the error so it is clear that cgroupRootHandle
// is opened to an unexpected/wrong directory.
err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w",
fdStr, fdDest, cgroupfsDir, err)
err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w",
cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err)
}
return nil, err
}

View File

@ -58,8 +58,6 @@ func TestOpenat2(t *testing.T) {
{"/sys/fs/cgroup", "/cgroup.controllers"},
{"/sys/fs/cgroup/", "cgroup.controllers"},
{"/sys/fs/cgroup/", "/cgroup.controllers"},
{"/sys/fs/cgroup/user.slice", "cgroup.controllers"},
{"/sys/fs/cgroup/user.slice/", "/cgroup.controllers"},
{"/", "/sys/fs/cgroup/cgroup.controllers"},
{"/", "sys/fs/cgroup/cgroup.controllers"},
{"/sys/fs/cgroup/cgroup.controllers", ""},

View File

@ -28,6 +28,7 @@ var subsystems = []subsystem{
&FreezerGroup{},
&RdmaGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
&NameGroup{GroupName: "misc", Join: true},
}
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")

View File

@ -1,6 +1,8 @@
package fs
import (
"errors"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@ -19,8 +21,23 @@ func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error {
}
func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
const suffix = ".limit_in_bytes"
skipRsvd := false
for _, hugetlb := range r.HugetlbLimit {
if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
prefix := "hugetlb." + hugetlb.Pagesize
val := strconv.FormatUint(hugetlb.Limit, 10)
if err := cgroups.WriteFile(path, prefix+suffix, val); err != nil {
return err
}
if skipRsvd {
continue
}
if err := cgroups.WriteFile(path, prefix+".rsvd"+suffix, val); err != nil {
if errors.Is(err, os.ErrNotExist) {
skipRsvd = true
continue
}
return err
}
}
@ -32,24 +49,29 @@ func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
rsvd := ".rsvd"
hugetlbStats := cgroups.HugetlbStats{}
for _, pageSize := range cgroups.HugePageSizes() {
usage := "hugetlb." + pageSize + ".usage_in_bytes"
value, err := fscommon.GetCgroupParamUint(path, usage)
again:
prefix := "hugetlb." + pageSize + rsvd
value, err := fscommon.GetCgroupParamUint(path, prefix+".usage_in_bytes")
if err != nil {
if rsvd != "" && errors.Is(err, os.ErrNotExist) {
rsvd = ""
goto again
}
return err
}
hugetlbStats.Usage = value
maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
value, err = fscommon.GetCgroupParamUint(path, prefix+".max_usage_in_bytes")
if err != nil {
return err
}
hugetlbStats.MaxUsage = value
failcnt := "hugetlb." + pageSize + ".failcnt"
value, err = fscommon.GetCgroupParamUint(path, failcnt)
value, err = fscommon.GetCgroupParamUint(path, prefix+".failcnt")
if err != nil {
return err
}

View File

@ -21,6 +21,11 @@ const (
limit = "hugetlb.%s.limit_in_bytes"
maxUsage = "hugetlb.%s.max_usage_in_bytes"
failcnt = "hugetlb.%s.failcnt"
rsvdUsage = "hugetlb.%s.rsvd.usage_in_bytes"
rsvdLimit = "hugetlb.%s.rsvd.limit_in_bytes"
rsvdMaxUsage = "hugetlb.%s.rsvd.max_usage_in_bytes"
rsvdFailcnt = "hugetlb.%s.rsvd.failcnt"
)
func TestHugetlbSetHugetlb(t *testing.T) {
@ -52,13 +57,15 @@ func TestHugetlbSetHugetlb(t *testing.T) {
}
for _, pageSize := range cgroups.HugePageSizes() {
limit := fmt.Sprintf(limit, pageSize)
value, err := fscommon.GetCgroupParamUint(path, limit)
if err != nil {
t.Fatal(err)
}
if value != hugetlbAfter {
t.Fatalf("Set hugetlb.limit_in_bytes failed. Expected: %v, Got: %v", hugetlbAfter, value)
for _, f := range []string{limit, rsvdLimit} {
limit := fmt.Sprintf(f, pageSize)
value, err := fscommon.GetCgroupParamUint(path, limit)
if err != nil {
t.Fatal(err)
}
if value != hugetlbAfter {
t.Fatalf("Set %s failed. Expected: %v, Got: %v", limit, hugetlbAfter, value)
}
}
}
}
@ -85,6 +92,28 @@ func TestHugetlbStats(t *testing.T) {
}
}
// TestHugetlbRStatsRsvd populates a temporary hugetlb directory with only the
// "rsvd" variants of the usage, max-usage and failcnt files for every huge
// page size, then checks that GetStats succeeds and reports the expected
// values for each page size.
func TestHugetlbRStatsRsvd(t *testing.T) {
	path := tempDir(t, "hugetlb")
	for _, size := range cgroups.HugePageSizes() {
		files := map[string]string{
			fmt.Sprintf(rsvdUsage, size):    hugetlbUsageContents,
			fmt.Sprintf(rsvdMaxUsage, size): hugetlbMaxUsageContents,
			fmt.Sprintf(rsvdFailcnt, size):  hugetlbFailcnt,
		}
		writeFileContents(t, path, files)
	}

	group := &HugetlbGroup{}
	got := *cgroups.NewStats()
	if err := group.GetStats(path, &got); err != nil {
		t.Fatal(err)
	}

	want := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100}
	for _, size := range cgroups.HugePageSizes() {
		expectHugetlbStatEquals(t, want, got.HugetlbStats[size])
	}
}
func TestHugetlbStatsNoUsageFile(t *testing.T) {
path := tempDir(t, "hugetlb")
writeFileContents(t, path, map[string]string{

View File

@ -170,6 +170,10 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
return err
}
stats.MemoryStats.SwapUsage = swapUsage
stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{
Usage: swapUsage.Usage - memoryUsage.Usage,
Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt,
}
kernelUsage, err := getMemoryData(path, "kmem")
if err != nil {
return err
@ -234,6 +238,12 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
memoryData.Failcnt = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
if name == "kmem" && os.IsNotExist(err) {
// Ignore ENOENT as kmem.limit_in_bytes has
// been removed in newer kernels.
return memoryData, nil
}
return cgroups.MemoryData{}, err
}
memoryData.Limit = value

View File

@ -249,12 +249,13 @@ func TestMemoryStats(t *testing.T) {
t.Fatal(err)
}
expectedStats := cgroups.MemoryStats{
Cache: 512,
Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
Stats: map[string]uint64{"cache": 512, "rss": 1024},
UseHierarchy: true,
Cache: 512,
Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
SwapOnlyUsage: cgroups.MemoryData{Usage: 0, MaxUsage: 0, Failcnt: 0, Limit: 0},
KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
Stats: map[string]uint64{"cache": 512, "rss": 1024},
UseHierarchy: true,
PageUsageByNUMA: cgroups.PageUsageByNUMA{
PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{
Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}},

View File

@ -83,6 +83,7 @@ func tryDefaultCgroupRoot() string {
if err != nil {
return ""
}
defer dir.Close()
names, err := dir.Readdirnames(1)
if err != nil {
return ""

View File

@ -1,6 +1,8 @@
package fs2
import (
"errors"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@ -16,8 +18,22 @@ func setHugeTlb(dirPath string, r *configs.Resources) error {
if !isHugeTlbSet(r) {
return nil
}
const suffix = ".max"
skipRsvd := false
for _, hugetlb := range r.HugetlbLimit {
if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
prefix := "hugetlb." + hugetlb.Pagesize
val := strconv.FormatUint(hugetlb.Limit, 10)
if err := cgroups.WriteFile(dirPath, prefix+suffix, val); err != nil {
return err
}
if skipRsvd {
continue
}
if err := cgroups.WriteFile(dirPath, prefix+".rsvd"+suffix, val); err != nil {
if errors.Is(err, os.ErrNotExist) {
skipRsvd = true
continue
}
return err
}
}
@ -27,15 +43,21 @@ func setHugeTlb(dirPath string, r *configs.Resources) error {
func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
hugetlbStats := cgroups.HugetlbStats{}
rsvd := ".rsvd"
for _, pagesize := range cgroups.HugePageSizes() {
value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current")
again:
prefix := "hugetlb." + pagesize + rsvd
value, err := fscommon.GetCgroupParamUint(dirPath, prefix+".current")
if err != nil {
if rsvd != "" && errors.Is(err, os.ErrNotExist) {
rsvd = ""
goto again
}
return err
}
hugetlbStats.Usage = value
fileName := "hugetlb." + pagesize + ".events"
value, err = fscommon.GetValueByKey(dirPath, fileName, "max")
value, err = fscommon.GetValueByKey(dirPath, prefix+".events", "max")
if err != nil {
return err
}

View File

@ -100,17 +100,20 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
memoryUsage, err := getMemoryDataV2(dirPath, "")
if err != nil {
if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint {
// The root cgroup does not have memory.{current,max}
// so emulate those using data from /proc/meminfo.
return statsFromMeminfo(stats)
// The root cgroup does not have memory.{current,max,peak}
// so emulate those using data from /proc/meminfo and
// /sys/fs/cgroup/memory.stat
return rootStatsFromMeminfo(stats)
}
return err
}
stats.MemoryStats.Usage = memoryUsage
swapUsage, err := getMemoryDataV2(dirPath, "swap")
swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap")
if err != nil {
return err
}
stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage
swapUsage := swapOnlyUsage
// As cgroup v1 reports SwapUsage values as mem+swap combined,
// while in cgroup v2 swap values do not include memory,
// report combined mem+swap for v1 compatibility.
@ -118,6 +121,9 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
if swapUsage.Limit != math.MaxUint64 {
swapUsage.Limit += memoryUsage.Limit
}
// The `MaxUsage` of mem+swap cannot simply combine mem with
// swap. So set it to 0 for v1 compatibility.
swapUsage.MaxUsage = 0
stats.MemoryStats.SwapUsage = swapUsage
return nil
@ -132,6 +138,7 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
}
usage := moduleName + ".current"
limit := moduleName + ".max"
maxUsage := moduleName + ".peak"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
@ -151,10 +158,18 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
}
memoryData.Limit = value
// `memory.peak` since kernel 5.19
// `memory.swap.peak` since kernel 6.5
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil && !os.IsNotExist(err) {
return cgroups.MemoryData{}, err
}
memoryData.MaxUsage = value
return memoryData, nil
}
func statsFromMeminfo(stats *cgroups.Stats) error {
func rootStatsFromMeminfo(stats *cgroups.Stats) error {
const file = "/proc/meminfo"
f, err := os.Open(file)
if err != nil {
@ -166,14 +181,10 @@ func statsFromMeminfo(stats *cgroups.Stats) error {
var (
swap_free uint64
swap_total uint64
main_total uint64
main_free uint64
)
mem := map[string]*uint64{
"SwapFree": &swap_free,
"SwapTotal": &swap_total,
"MemTotal": &main_total,
"MemFree": &main_free,
}
found := 0
@ -206,11 +217,18 @@ func statsFromMeminfo(stats *cgroups.Stats) error {
return &parseError{Path: "", File: file, Err: err}
}
// cgroup v1 `usage_in_bytes` reports memory usage as the sum of
// - rss (NR_ANON_MAPPED)
// - cache (NR_FILE_PAGES)
// cgroup v1 reports SwapUsage values as mem+swap combined
// cgroup v2 reports rss and cache as anon and file.
// sum `anon` + `file` to report the same value as `usage_in_bytes` in v1.
// sum swap usage as combined mem+swap usage for consistency as well.
stats.MemoryStats.Usage.Usage = stats.MemoryStats.Stats["anon"] + stats.MemoryStats.Stats["file"]
stats.MemoryStats.Usage.Limit = math.MaxUint64
stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024
stats.MemoryStats.SwapUsage.Limit = math.MaxUint64
stats.MemoryStats.Usage.Usage = (main_total - main_free) * 1024
stats.MemoryStats.Usage.Limit = math.MaxUint64
stats.MemoryStats.SwapUsage.Usage += stats.MemoryStats.Usage.Usage
return nil
}

View File

@ -0,0 +1,155 @@
package fs2
import (
"os"
"path/filepath"
"strings"
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
// exampleMemoryStatData is sample cgroup v2 memory.stat content, one
// "key value" pair per line, which the tests in this file write into a fake
// cgroup directory.
const exampleMemoryStatData = `anon 790425600
file 6502666240
kernel_stack 7012352
pagetables 8867840
percpu 2445520
sock 40960
shmem 6721536
file_mapped 656187392
file_dirty 1122304
file_writeback 0
swapcached 10
anon_thp 438304768
file_thp 0
shmem_thp 0
inactive_anon 892223488
active_anon 2973696
inactive_file 5307346944
active_file 1179316224
unevictable 31477760
slab_reclaimable 348866240
slab_unreclaimable 10099808
slab 358966048
workingset_refault_anon 0
workingset_refault_file 0
workingset_activate_anon 0
workingset_activate_file 0
workingset_restore_anon 0
workingset_restore_file 0
workingset_nodereclaim 0
pgfault 103216687
pgmajfault 6879
pgrefill 0
pgscan 0
pgsteal 0
pgactivate 1110217
pgdeactivate 292
pglazyfree 267
pglazyfreed 0
thp_fault_alloc 57411
thp_collapse_alloc 443`
// TestStatMemoryPodCgroupNotFound checks that statMemory returns an error
// mentioning memory.current when the cgroup directory contains memory.stat
// but no memory.current file.
func TestStatMemoryPodCgroupNotFound(t *testing.T) {
	// We're using a fake cgroupfs.
	cgroups.TestMode = true
	fakeCgroupDir := t.TempDir()

	// Only write memory.stat to ensure pod cgroup usage
	// still reads memory.current.
	statPath := filepath.Join(fakeCgroupDir, "memory.stat")
	if err := os.WriteFile(statPath, []byte(exampleMemoryStatData), 0o644); err != nil {
		t.Fatal(err)
	}

	gotStats := cgroups.NewStats()

	// Use a fake root path to mismatch the file we wrote.
	// This triggers the non-root path which should fail to find memory.current.
	err := statMemory(fakeCgroupDir, gotStats)
	if err == nil {
		// Abort right away: the check below calls err.Error(), which would
		// panic on a nil error and hide the real failure.
		t.Fatal("expected error when statting memory for cgroupv2 root, but was nil")
	}

	if !strings.Contains(err.Error(), "memory.current: no such file or directory") {
		t.Errorf("expected error to contain 'memory.current: no such file or directory', but was %s", err.Error())
	}
}
// TestStatMemoryPodCgroup writes memory.stat, memory.current, memory.max and
// memory.peak into a fake cgroup directory and checks that statMemory reports
// the contents of the latter three as Usage, Limit and MaxUsage respectively.
func TestStatMemoryPodCgroup(t *testing.T) {
	// We're using a fake cgroupfs.
	cgroups.TestMode = true
	fakeCgroupDir := t.TempDir()

	// Files are created in this order; the first write failure aborts the test.
	fixtures := []struct{ name, content string }{
		{"memory.stat", exampleMemoryStatData},
		{"memory.current", "123456789"},
		{"memory.max", "999999999"},
		{"memory.peak", "987654321"},
	}
	for _, f := range fixtures {
		target := filepath.Join(fakeCgroupDir, f.name)
		if err := os.WriteFile(target, []byte(f.content), 0o644); err != nil {
			t.Fatal(err)
		}
	}

	gotStats := cgroups.NewStats()

	// The temporary directory stands in for a pod cgroup (not the real root).
	if err := statMemory(fakeCgroupDir, gotStats); err != nil {
		t.Errorf("expected no error when statting memory for cgroupv2 root, but got %#+v", err)
	}

	usage := gotStats.MemoryStats.Usage
	checks := []struct{ got, want uint64 }{
		{usage.Usage, 123456789},    // result should be "memory.current"
		{usage.Limit, 999999999},    // result should be "memory.max"
		{usage.MaxUsage, 987654321}, // result should be "memory.peak"
	}
	for _, c := range checks {
		if c.got != c.want {
			t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", c.got, c.want)
		}
	}
}
// TestRootStatsFromMeminfo seeds the "anon" and "file" memory stats, calls
// rootStatsFromMeminfo, and checks that the reported memory usage equals
// anon + file and that the swap usage and limit are not below the memory ones.
func TestRootStatsFromMeminfo(t *testing.T) {
	stats := &cgroups.Stats{}
	stats.MemoryStats.Stats = map[string]uint64{
		"anon": 790425600,
		"file": 6502666240,
	}

	if err := rootStatsFromMeminfo(stats); err != nil {
		t.Fatal(err)
	}

	mem := stats.MemoryStats.Usage
	swap := stats.MemoryStats.SwapUsage

	// result is anon + file
	const wantUsage uint64 = 7293091840
	if mem.Usage != wantUsage {
		t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %d\nexpected %d\n", mem.Usage, wantUsage)
	}

	// swap is adjusted to mem+swap
	if swap.Usage < mem.Usage {
		t.Errorf("swap usage %d should be at least mem usage %d", swap.Usage, mem.Usage)
	}
	if swap.Limit < mem.Limit {
		t.Errorf("swap limit %d should be at least mem limit %d", swap.Limit, mem.Limit)
	}
}

View File

@ -3,6 +3,7 @@ package manager
import (
"testing"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
)
@ -10,35 +11,45 @@ import (
// config.Resources is nil. While it does not make sense to use a
// manager with no resources, it should not result in a panic.
//
// This tests either v1 or v2 managers (both fs and systemd),
// depending on what cgroup version is available on the host.
// This tests either v1 or v2 fs cgroup manager, depending on which
// cgroup version is available.
func TestNilResources(t *testing.T) {
for _, sd := range []bool{false, true} {
cg := &configs.Cgroup{} // .Resources is nil
cg.Systemd = sd
mgr, err := New(cg)
if err != nil {
// Some managers require non-nil Resources during
// instantiation -- provide and retry. In such case
// we're mostly testing Set(nil) below.
cg.Resources = &configs.Resources{}
mgr, err = New(cg)
if err != nil {
t.Error(err)
continue
}
}
_ = mgr.Apply(-1)
_ = mgr.Set(nil)
_ = mgr.Freeze(configs.Thawed)
_ = mgr.Exists()
_, _ = mgr.GetAllPids()
_, _ = mgr.GetCgroups()
_, _ = mgr.GetFreezerState()
_ = mgr.Path("")
_ = mgr.GetPaths()
_, _ = mgr.GetStats()
_, _ = mgr.OOMKillCount()
_ = mgr.Destroy()
}
testNilResources(t, false)
}
// TestNilResourcesSystemd runs the same nil-Resources checks as
// TestNilResources, but against the systemd cgroup manager. It is skipped
// when systemd.IsRunningSystemd reports false.
func TestNilResourcesSystemd(t *testing.T) {
	if running := systemd.IsRunningSystemd(); !running {
		t.Skip("requires systemd")
	}

	testNilResources(t, true)
}
// testNilResources creates a cgroup manager for a config whose Resources is
// nil and calls every manager method on it. The test passes as long as none
// of the calls panics; their results are deliberately ignored.
//
// useSystemd is stored in configs.Cgroup.Systemd before the manager is
// created. The parameter is not called "systemd" so that it does not shadow
// the imported systemd package.
func testNilResources(t *testing.T, useSystemd bool) {
	// Report failures at the calling test's line.
	t.Helper()

	cg := &configs.Cgroup{} // .Resources is nil
	cg.Systemd = useSystemd
	mgr, err := New(cg)
	if err != nil {
		// Some managers require non-nil Resources during
		// instantiation -- provide and retry. In such case
		// we're mostly testing Set(nil) below.
		cg.Resources = &configs.Resources{}
		mgr, err = New(cg)
		if err != nil {
			t.Fatal(err)
		}
	}

	// Errors are ignored on purpose: only the absence of a panic matters.
	_ = mgr.Apply(-1)
	_ = mgr.Set(nil)
	_ = mgr.Freeze(configs.Thawed)
	_ = mgr.Exists()
	_, _ = mgr.GetAllPids()
	_, _ = mgr.GetCgroups()
	_, _ = mgr.GetFreezerState()
	_ = mgr.Path("")
	_ = mgr.GetPaths()
	_, _ = mgr.GetStats()
	_, _ = mgr.OOMKillCount()
	_ = mgr.Destroy()
}

View File

@ -78,6 +78,8 @@ type MemoryStats struct {
Usage MemoryData `json:"usage,omitempty"`
// usage of memory + swap
SwapUsage MemoryData `json:"swap_usage,omitempty"`
// usage of swap only
SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"`
// usage of kernel memory
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
// usage of kernel TCP memory

View File

@ -177,7 +177,7 @@ func allowAllDevices() []systemdDbus.Property {
// generateDeviceProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, error) {
func generateDeviceProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) {
if r.SkipDevices {
return nil, nil
}
@ -238,9 +238,10 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err
// trickery to convert things:
//
// * Concrete rules with non-wildcard major/minor numbers have to use
// /dev/{block,char} paths. This is slightly odd because it means
// that we cannot add whitelist rules for devices that don't exist,
// but there's not too much we can do about that.
// /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses
// stat(2) on such paths to look up device properties, meaning we
// cannot add whitelist rules for devices that don't exist. Since v240,
// device properties are parsed from the path string.
//
// However, path globbing is not supported for path-based rules so we
// need to handle wildcards in some other manner.
@ -288,6 +289,17 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err
case devices.CharDevice:
entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
}
if sdVer < 240 {
// Old systemd versions use stat(2) on path to find out device major:minor
// numbers and type. If the path doesn't exist, it will not add the rule,
// emitting a warning instead.
// Since all of this logic is best-effort anyway (we manually set these
// rules separately to systemd) we can safely skip entries that don't
// have a corresponding path.
if _, err := os.Stat(entry.Path); err != nil {
continue
}
}
}
deviceAllowList = append(deviceAllowList, entry)
}
@ -335,32 +347,55 @@ func isUnitExists(err error) bool {
return isDbusError(err, "org.freedesktop.systemd1.UnitExists")
}
func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property) error {
func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error {
statusChan := make(chan string, 1)
retry := true
retry:
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
_, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan)
return err
})
if err == nil {
timeout := time.NewTimer(30 * time.Second)
defer timeout.Stop()
select {
case s := <-statusChan:
close(statusChan)
// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
if s != "done" {
resetFailedUnit(cm, unitName)
return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
}
case <-timeout.C:
resetFailedUnit(cm, unitName)
return errors.New("Timeout waiting for systemd to create " + unitName)
if err != nil {
if !isUnitExists(err) {
return err
}
if ignoreExist {
// TODO: remove this hack.
// This is kubelet making sure a slice exists (see
// https://github.com/opencontainers/runc/pull/1124).
return nil
}
if retry {
// In case a unit with the same name exists, this may
// be a leftover failed unit. Reset it, so systemd can
// remove it, and retry once.
err = resetFailedUnit(cm, unitName)
if err != nil {
logrus.Warnf("unable to reset failed unit: %v", err)
}
retry = false
goto retry
}
} else if !isUnitExists(err) {
return err
}
timeout := time.NewTimer(30 * time.Second)
defer timeout.Stop()
select {
case s := <-statusChan:
close(statusChan)
// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
if s != "done" {
_ = resetFailedUnit(cm, unitName)
return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
}
case <-timeout.C:
_ = resetFailedUnit(cm, unitName)
return errors.New("Timeout waiting for systemd to create " + unitName)
}
return nil
}
@ -385,16 +420,17 @@ func stopUnit(cm *dbusConnManager, unitName string) error {
return errors.New("Timed out while waiting for systemd to remove " + unitName)
}
}
// In case of a failed unit, let systemd remove it.
_ = resetFailedUnit(cm, unitName)
return nil
}
func resetFailedUnit(cm *dbusConnManager, name string) {
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
func resetFailedUnit(cm *dbusConnManager, name string) error {
return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
return c.ResetFailedUnitContext(context.TODO(), name)
})
if err != nil {
logrus.Warnf("unable to reset failed unit: %v", err)
}
}
func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) {

View File

@ -51,5 +51,10 @@ func RangeToBits(str string) ([]byte, error) {
// do not allow empty values
return nil, errors.New("empty value")
}
// fit cpuset parsing order in systemd
for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 {
ret[l], ret[r] = ret[r], ret[l]
}
return ret, nil
}

View File

@ -22,13 +22,13 @@ func TestRangeToBits(t *testing.T) {
{in: "4-7", out: []byte{0xf0}},
{in: "0-7", out: []byte{0xff}},
{in: "0-15", out: []byte{0xff, 0xff}},
{in: "16", out: []byte{1, 0, 0}},
{in: "0-3,32-33", out: []byte{3, 0, 0, 0, 0x0f}},
{in: "16", out: []byte{0, 0, 1}},
{in: "0-3,32-33", out: []byte{0x0f, 0, 0, 0, 3}},
// extra spaces and tabs are ok
{in: "1, 2, 1-2", out: []byte{6}},
{in: " , 1 , 3 , 5-7, ", out: []byte{0xea}},
// somewhat large values
{in: "128-130,1", out: []byte{7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2}},
{in: "128-130,1", out: []byte{2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7}},
{in: "-", isErr: true},
{in: "1-", isErr: true},

View File

@ -2,6 +2,7 @@ package systemd
import (
"context"
"errors"
"fmt"
"sync"
@ -80,8 +81,6 @@ func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) {
}
}
var errDbusConnClosed = dbus.ErrClosed.Error()
// retryOnDisconnect calls op, and if the error it returns is about closed dbus
// connection, the connection is re-established and the op is retried. This helps
// with the situation when dbus is restarted and we have a stale connection.
@ -92,7 +91,10 @@ func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) er
return err
}
err = op(conn)
if !isDbusError(err, errDbusConnClosed) {
if err == nil {
return nil
}
if !errors.Is(err, dbus.ErrClosed) {
return err
}
d.resetConnection(conn)

View File

@ -127,7 +127,7 @@ func TestPodSkipDevicesUpdate(t *testing.T) {
// Create a "container" within the "pod" cgroup.
// This is not a real container, just a process in the cgroup.
cmd := exec.Command("bash", "-c", "while true; do echo > /dev/null; done")
cmd := exec.Command("sleep", "infinity")
cmd.Env = append(os.Environ(), "LANG=C")
var stderr bytes.Buffer
cmd.Stderr = &stderr
@ -183,6 +183,11 @@ func testSkipDevices(t *testing.T, skipDevices bool, expected []string) {
if os.Geteuid() != 0 {
t.Skip("Test requires root.")
}
// https://github.com/opencontainers/runc/issues/3743
centosVer, _ := exec.Command("rpm", "-q", "--qf", "%{version}", "centos-release").CombinedOutput()
if string(centosVer) == "7" {
t.Skip("Flaky on CentOS 7")
}
podConfig := &configs.Cgroup{
Parent: "system.slice",

View File

@ -71,12 +71,13 @@ var legacySubsystems = []subsystem{
&fs.NetClsGroup{},
&fs.NameGroup{GroupName: "name=systemd"},
&fs.RdmaGroup{},
&fs.NameGroup{GroupName: "misc"},
}
func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
deviceProperties, err := generateDeviceProperties(r)
deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm))
if err != nil {
return nil, err
}
@ -206,7 +207,7 @@ func (m *legacyManager) Apply(pid int) error {
properties = append(properties, c.SystemdProps...)
if err := startUnit(m.dbus, unitName, properties); err != nil {
if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
return err
}
@ -273,14 +274,7 @@ func getSubsystemPath(slice, unit, subsystem string) (string, error) {
return "", err
}
initPath, err := cgroups.GetInitCgroup(subsystem)
if err != nil {
return "", err
}
// if pid 1 is systemd 226 or later, it will be in init.scope, not the root
initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")
return filepath.Join(mountpoint, initPath, slice, unit), nil
return filepath.Join(mountpoint, slice, unit), nil
}
func (m *legacyManager) Freeze(state configs.FreezerState) error {
@ -423,6 +417,15 @@ func (m *legacyManager) Set(r *configs.Resources) error {
if err := m.doFreeze(configs.Frozen); err != nil {
// If freezer cgroup isn't supported, we just warn about it.
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
// Do not update the cgroup if freezing failed for any reason other than a missing freezer subsystem (#3803).
if !errors.Is(err, errSubsystemDoesNotExist) {
if needsThaw {
if thawErr := m.doFreeze(configs.Thawed); thawErr != nil {
logrus.Infof("thaw container after doFreeze failed: %v", thawErr)
}
}
return err
}
}
}
setErr := setUnitProperties(m.dbus, unitName, properties...)

View File

@ -2,6 +2,7 @@ package systemd
import (
"bufio"
"errors"
"fmt"
"math"
"os"
@ -181,7 +182,7 @@ func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
// aren't the end of the world, but it is a bit concerning. However
// it's unclear if systemd removes all eBPF programs attached when
// doing SetUnitProperties...
deviceProperties, err := generateDeviceProperties(r)
deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm))
if err != nil {
return nil, err
}
@ -283,7 +284,7 @@ func (m *unifiedManager) Apply(pid int) error {
properties = append(properties, c.SystemdProps...)
if err := startUnit(m.dbus, unitName, properties); err != nil {
if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err)
}
@ -292,6 +293,12 @@ func (m *unifiedManager) Apply(pid int) error {
}
if c.OwnerUID != nil {
// The directory itself must be chowned.
err := os.Chown(m.path, *c.OwnerUID, -1)
if err != nil {
return err
}
filesToChown, err := cgroupFilesToChown()
if err != nil {
return err
@ -299,7 +306,8 @@ func (m *unifiedManager) Apply(pid int) error {
for _, v := range filesToChown {
err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
if err != nil {
// Some files might not be present.
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
}
@ -312,21 +320,23 @@ func (m *unifiedManager) Apply(pid int) error {
// uid in /sys/kernel/cgroup/delegate. If the file is not present
// (Linux < 4.15), use the initial values mentioned in cgroups(7).
func cgroupFilesToChown() ([]string, error) {
filesToChown := []string{"."} // the directory itself must be chowned
const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"
f, err := os.Open(cgroupDelegateFile)
if err == nil {
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
filesToChown = append(filesToChown, scanner.Text())
}
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
}
} else {
filesToChown = append(filesToChown, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads")
if err != nil {
return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil
}
defer f.Close()
filesToChown := []string{}
scanner := bufio.NewScanner(f)
for scanner.Scan() {
filesToChown = append(filesToChown, scanner.Text())
}
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
}
return filesToChown, nil
}

View File

@ -55,12 +55,12 @@ func IsCgroup2HybridMode() bool {
var st unix.Statfs_t
err := unix.Statfs(hybridMountpoint, &st)
if err != nil {
if os.IsNotExist(err) {
// ignore the "not found" error
isHybrid = false
return
isHybrid = false
if !os.IsNotExist(err) {
// Report unexpected errors.
logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint)
}
panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
return
}
isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC
})
@ -162,8 +162,10 @@ func readProcsFile(dir string) ([]int, error) {
// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
// or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
// "cpu": "/user.slice/user-1000.slice"
// "pids": "/user.slice/user-1000.slice"
//
// "cpu": "/user.slice/user-1000.slice"
// "pids": "/user.slice/user-1000.slice"
//
// etc.
//
// Note that for cgroup v2 unified hierarchy, there are no per-controller

View File

@ -21,9 +21,9 @@ type Rlimit struct {
// IDMap represents UID/GID Mappings for User Namespaces.
type IDMap struct {
ContainerID int `json:"container_id"`
HostID int `json:"host_id"`
Size int `json:"size"`
ContainerID int64 `json:"container_id"`
HostID int64 `json:"host_id"`
Size int64 `json:"size"`
}
// Seccomp represents syscall restrictions

View File

@ -1,6 +1,10 @@
package configs
import "errors"
import (
"errors"
"fmt"
"math"
)
var (
errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.")
@ -16,11 +20,18 @@ func (c Config) HostUID(containerId int) (int, error) {
if c.UidMappings == nil {
return -1, errNoUIDMap
}
id, found := c.hostIDFromMapping(containerId, c.UidMappings)
id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings)
if !found {
return -1, errNoUserMap
}
return id, nil
// If we are a 32-bit binary running on a 64-bit system, it's possible
// the mapped user is too large to store in an int, which means we
// cannot do the mapping. We can't just return an int64, because
// os.Setuid() takes an int.
if id > math.MaxInt {
return -1, fmt.Errorf("mapping for uid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
}
return int(id), nil
}
// Return unchanged id.
return containerId, nil
@ -39,11 +50,18 @@ func (c Config) HostGID(containerId int) (int, error) {
if c.GidMappings == nil {
return -1, errNoGIDMap
}
id, found := c.hostIDFromMapping(containerId, c.GidMappings)
id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings)
if !found {
return -1, errNoGroupMap
}
return id, nil
// If we are a 32-bit binary running on a 64-bit system, it's possible
// the mapped group is too large to store in an int, which means we
// cannot do the mapping. We can't just return an int64, because
// os.Setgid() takes an int.
if id > math.MaxInt {
return -1, fmt.Errorf("mapping for gid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
}
return int(id), nil
}
// Return unchanged id.
return containerId, nil
@ -57,7 +75,7 @@ func (c Config) HostRootGID() (int, error) {
// Utility function that gets a host ID for a container ID from user namespace map
// if that ID is present in the map.
func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
func (c Config) hostIDFromMapping(containerID int64, uMap []IDMap) (int64, bool) {
for _, m := range uMap {
if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
hostID := m.HostID + (containerID - m.ContainerID)

View File

@ -28,25 +28,18 @@ func (v *ConfigValidator) rootlessEUID(config *configs.Config) error {
return nil
}
// hasIDMapping reports whether id lies inside the container-side range
// [ContainerID, ContainerID+Size) of at least one entry in mappings.
func hasIDMapping(id int, mappings []configs.IDMap) bool {
	for i := range mappings {
		first := mappings[i].ContainerID
		limit := first + mappings[i].Size
		// Skip entries whose range does not cover id.
		if id < first || id >= limit {
			continue
		}
		return true
	}
	return false
}
func rootlessEUIDMappings(config *configs.Config) error {
if !config.Namespaces.Contains(configs.NEWUSER) {
return errors.New("rootless container requires user namespaces")
}
if len(config.UidMappings) == 0 {
return errors.New("rootless containers requires at least one UID mapping")
}
if len(config.GidMappings) == 0 {
return errors.New("rootless containers requires at least one GID mapping")
// We only require mappings if we are not joining another userns.
if path := config.Namespaces.PathOf(configs.NEWUSER); path == "" {
if len(config.UidMappings) == 0 {
return errors.New("rootless containers requires at least one UID mapping")
}
if len(config.GidMappings) == 0 {
return errors.New("rootless containers requires at least one GID mapping")
}
}
return nil
}
@ -70,8 +63,8 @@ func rootlessEUIDMount(config *configs.Config) error {
// Ignore unknown mount options.
continue
}
if !hasIDMapping(uid, config.UidMappings) {
return errors.New("cannot specify uid= mount options for unmapped uid in rootless containers")
if _, err := config.HostUID(uid); err != nil {
return fmt.Errorf("cannot specify uid=%d mount option for rootless container: %w", uid, err)
}
}
@ -82,8 +75,8 @@ func rootlessEUIDMount(config *configs.Config) error {
// Ignore unknown mount options.
continue
}
if !hasIDMapping(gid, config.GidMappings) {
return errors.New("cannot specify gid= mount options for unmapped gid in rootless containers")
if _, err := config.HostGID(gid); err != nil {
return fmt.Errorf("cannot specify gid=%d mount option for rootless container: %w", gid, err)
}
}
}

View File

@ -109,11 +109,19 @@ func (v *ConfigValidator) security(config *configs.Config) error {
func (v *ConfigValidator) usernamespace(config *configs.Config) error {
if config.Namespaces.Contains(configs.NEWUSER) {
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
return errors.New("USER namespaces aren't enabled in the kernel")
return errors.New("user namespaces aren't enabled in the kernel")
}
hasPath := config.Namespaces.PathOf(configs.NEWUSER) != ""
hasMappings := config.UidMappings != nil || config.GidMappings != nil
if !hasPath && !hasMappings {
return errors.New("user namespaces enabled, but no namespace path to join nor mappings to apply specified")
}
// The hasPath && hasMappings validation case is handled in specconv --
// we cache the mappings in Config during specconv in the hasPath case,
// so we cannot do that validation here.
} else {
if config.UidMappings != nil || config.GidMappings != nil {
return errors.New("User namespace mappings specified, but USER namespace isn't enabled in the config")
return errors.New("user namespace mappings specified, but user namespace isn't enabled in the config")
}
}
return nil
@ -131,9 +139,8 @@ func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
// convertSysctlVariableToDotsSeparator can return sysctl variables in dots separator format.
// The '/' separator is also accepted in place of a '.'.
// Convert the sysctl variables to dots separator format for validation.
// More info:
// https://man7.org/linux/man-pages/man8/sysctl.8.html
// https://man7.org/linux/man-pages/man5/sysctl.d.5.html
// More info: sysctl(8), sysctl.d(5).
//
// For example:
// Input sysctl variable "net/ipv4/conf/eno2.100.rp_filter"
// will return the converted value "net.ipv4.conf.eno2/100.rp_filter"
@ -229,10 +236,6 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
func (v *ConfigValidator) intelrdt(config *configs.Config) error {
if config.IntelRdt != nil {
if !intelrdt.IsCATEnabled() && !intelrdt.IsMBAEnabled() {
return errors.New("intelRdt is specified in config, but Intel RDT is not supported or enabled")
}
if config.IntelRdt.ClosID == "." || config.IntelRdt.ClosID == ".." || strings.Contains(config.IntelRdt.ClosID, "/") {
return fmt.Errorf("invalid intelRdt.ClosID %q", config.IntelRdt.ClosID)
}

View File

@ -150,7 +150,7 @@ func TestValidateSecurityWithoutNEWNS(t *testing.T) {
}
}
func TestValidateUsernamespace(t *testing.T) {
func TestValidateUserNamespace(t *testing.T) {
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
t.Skip("Test requires userns.")
}
@ -161,6 +161,8 @@ func TestValidateUsernamespace(t *testing.T) {
{Type: configs.NEWUSER},
},
),
UidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}},
GidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}},
}
validator := New()
@ -170,11 +172,11 @@ func TestValidateUsernamespace(t *testing.T) {
}
}
func TestValidateUsernamespaceWithoutUserNS(t *testing.T) {
uidMap := configs.IDMap{ContainerID: 123}
func TestValidateUsernsMappingWithoutNamespace(t *testing.T) {
config := &configs.Config{
Rootfs: "/var",
UidMappings: []configs.IDMap{uidMap},
UidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}},
GidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}},
}
validator := New()

View File

@ -40,7 +40,7 @@ type linuxContainer struct {
root string
config *configs.Config
cgroupManager cgroups.Manager
intelRdtManager intelrdt.Manager
intelRdtManager *intelrdt.Manager
initPath string
initArgs []string
initProcess parentProcess
@ -146,19 +146,21 @@ func (c *linuxContainer) OCIState() (*specs.State, error) {
return c.currentOCIState()
}
func (c *linuxContainer) Processes() ([]int, error) {
var pids []int
status, err := c.currentStatus()
if err != nil {
return pids, err
// ignoreCgroupError filters out cgroup-related errors that can be ignored,
// because the container is stopped and its cgroup is gone.
func (c *linuxContainer) ignoreCgroupError(err error) error {
if err == nil {
return nil
}
// for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
if status == Stopped && !c.cgroupManager.Exists() {
return pids, nil
if errors.Is(err, os.ErrNotExist) && c.runType() == Stopped && !c.cgroupManager.Exists() {
return nil
}
return err
}
pids, err = c.cgroupManager.GetAllPids()
if err != nil {
func (c *linuxContainer) Processes() ([]int, error) {
pids, err := c.cgroupManager.GetAllPids()
if err = c.ignoreCgroupError(err); err != nil {
return nil, fmt.Errorf("unable to get all container pids: %w", err)
}
return pids, nil
@ -351,6 +353,15 @@ func (c *linuxContainer) start(process *Process) (retErr error) {
}()
}
// Before starting "runc init", mark all non-stdio open files as O_CLOEXEC
// to make sure we don't leak any files into "runc init". Any files to be
// passed to "runc init" through ExtraFiles will get dup2'd by the Go
// runtime and thus their O_CLOEXEC flag will be cleared. This is some
// additional protection against attacks like CVE-2024-21626, by making
// sure we never leak files to "runc init" we didn't intend to.
if err := utils.CloseExecFrom(3); err != nil {
return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
}
if err := parent.start(); err != nil {
return fmt.Errorf("unable to start container process: %w", err)
}
@ -382,11 +393,12 @@ func (c *linuxContainer) Signal(s os.Signal, all bool) error {
return err
}
if all {
// for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
if status == Stopped && !c.cgroupManager.Exists() {
// Avoid calling signalAllProcesses which may print
// a warning trying to freeze a non-existing cgroup.
return nil
}
return signalAllProcesses(c.cgroupManager, s)
return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, s))
}
// to avoid a PID reuse attack
if status == Running || status == Created || status == Paused {
@ -636,7 +648,11 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP
// cgroup v1: using the same path for all controllers.
// cgroup v2: the only possible way.
for k := range proc.cgroupPaths {
proc.cgroupPaths[k] = path.Join(proc.cgroupPaths[k], add)
subPath := path.Join(proc.cgroupPaths[k], add)
if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) {
return nil, fmt.Errorf("%s is not a sub cgroup path", add)
}
proc.cgroupPaths[k] = subPath
}
// cgroup v2: do not try to join init process's cgroup
// as a fallback (see (*setnsProcess).start).
@ -645,7 +661,11 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP
// Per-controller paths.
for ctrl, add := range p.SubCgroupPaths {
if val, ok := proc.cgroupPaths[ctrl]; ok {
proc.cgroupPaths[ctrl] = path.Join(val, add)
subPath := path.Join(val, add)
if !strings.HasPrefix(subPath, val) {
return nil, fmt.Errorf("%s is not a sub cgroup path", add)
}
proc.cgroupPaths[ctrl] = subPath
} else {
return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
}
@ -918,7 +938,7 @@ func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool {
}
func criuNsToKey(t configs.NamespaceType) string {
return "extRoot" + strings.Title(configs.NsName(t)) + "NS"
return "extRoot" + strings.Title(configs.NsName(t)) + "NS" //nolint:staticcheck // SA1019: strings.Title is deprecated
}
func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error {
@ -2257,7 +2277,7 @@ func ignoreTerminateErrors(err error) error {
func requiresRootOrMappingTool(c *configs.Config) bool {
gidMap := []configs.IDMap{
{ContainerID: 0, HostID: os.Getegid(), Size: 1},
{ContainerID: 0, HostID: int64(os.Getegid()), Size: 1},
}
return !reflect.DeepEqual(c.GidMappings, gidMap)
}

View File

@ -7,22 +7,15 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/system"
)
type mockCgroupManager struct {
pids []int
allPids []int
stats *cgroups.Stats
paths map[string]string
}
// mockIntelRdtManager is a test double for an Intel RDT manager that hands
// back preset values instead of touching the resctrl filesystem.
type mockIntelRdtManager struct {
// stats is the value handed back by GetStats.
stats *intelrdt.Stats
// path is the value handed back by GetPath.
path string
}
// GetPids returns the preset pids slice of the mock; it never fails.
func (m *mockCgroupManager) GetPids() ([]int, error) {
return m.pids, nil
}
@ -32,7 +25,7 @@ func (m *mockCgroupManager) GetAllPids() ([]int, error) {
}
func (m *mockCgroupManager) GetStats() (*cgroups.Stats, error) {
return m.stats, nil
return nil, nil
}
func (m *mockCgroupManager) Apply(pid int) error {
@ -76,30 +69,6 @@ func (m *mockCgroupManager) GetFreezerState() (configs.FreezerState, error) {
return configs.Thawed, nil
}
// Apply is a no-op stub: it ignores pid and always returns nil.
func (m *mockIntelRdtManager) Apply(pid int) error {
return nil
}
// GetStats returns the preset stats of the mock; it never fails.
func (m *mockIntelRdtManager) GetStats() (*intelrdt.Stats, error) {
return m.stats, nil
}
// Destroy is a no-op stub that always returns nil.
func (m *mockIntelRdtManager) Destroy() error {
return nil
}
// GetPath returns the preset path of the mock.
func (m *mockIntelRdtManager) GetPath() string {
return m.path
}
// Set is a no-op stub: it ignores the given config and always returns nil.
func (m *mockIntelRdtManager) Set(container *configs.Config) error {
return nil
}
// GetCgroups is a stub that returns a nil cgroup config and a nil error.
func (m *mockIntelRdtManager) GetCgroups() (*configs.Cgroup, error) {
return nil, nil
}
type mockProcess struct {
_pid int
started uint64
@ -173,61 +142,11 @@ func TestGetContainerPids(t *testing.T) {
}
}
// TestGetContainerStats builds a linuxContainer backed by mock cgroup and
// Intel RDT managers and checks that Stats() surfaces the preset values.
func TestGetContainerStats(t *testing.T) {
// Container wired to mocks carrying known memory usage and RDT schemata.
container := &linuxContainer{
id: "myid",
config: &configs.Config{},
cgroupManager: &mockCgroupManager{
pids: []int{1, 2, 3},
stats: &cgroups.Stats{
MemoryStats: cgroups.MemoryStats{
Usage: cgroups.MemoryData{
Usage: 1024,
},
},
},
},
intelRdtManager: &mockIntelRdtManager{
stats: &intelrdt.Stats{
L3CacheSchema: "L3:0=f;1=f0",
MemBwSchema: "MB:0=20;1=70",
},
},
}
stats, err := container.Stats()
if err != nil {
t.Fatal(err)
}
// The cgroup part must be present and carry the preset memory usage.
if stats.CgroupStats == nil {
t.Fatal("cgroup stats are nil")
}
if stats.CgroupStats.MemoryStats.Usage.Usage != 1024 {
t.Fatalf("expected memory usage 1024 but received %d", stats.CgroupStats.MemoryStats.Usage.Usage)
}
// The L3 cache schema is only checked when intelrdt reports CAT as enabled.
if intelrdt.IsCATEnabled() {
if stats.IntelRdtStats == nil {
t.Fatal("intel rdt stats are nil")
}
if stats.IntelRdtStats.L3CacheSchema != "L3:0=f;1=f0" {
t.Fatalf("expected L3CacheSchema L3:0=f;1=f0 but received %s", stats.IntelRdtStats.L3CacheSchema)
}
}
// The memory bandwidth schema is only checked when intelrdt reports MBA as enabled.
if intelrdt.IsMBAEnabled() {
if stats.IntelRdtStats == nil {
t.Fatal("intel rdt stats are nil")
}
if stats.IntelRdtStats.MemBwSchema != "MB:0=20;1=70" {
t.Fatalf("expected MemBwSchema MB:0=20;1=70 but received %s", stats.IntelRdtStats.MemBwSchema)
}
}
}
func TestGetContainerState(t *testing.T) {
var (
pid = os.Getpid()
expectedMemoryPath = "/sys/fs/cgroup/memory/myid"
expectedNetworkPath = fmt.Sprintf("/proc/%d/ns/net", pid)
expectedIntelRdtPath = "/sys/fs/resctrl/myid"
pid = os.Getpid()
expectedMemoryPath = "/sys/fs/cgroup/memory/myid"
expectedNetworkPath = fmt.Sprintf("/proc/%d/ns/net", pid)
)
container := &linuxContainer{
id: "myid",
@ -248,24 +167,10 @@ func TestGetContainerState(t *testing.T) {
},
cgroupManager: &mockCgroupManager{
pids: []int{1, 2, 3},
stats: &cgroups.Stats{
MemoryStats: cgroups.MemoryStats{
Usage: cgroups.MemoryData{
Usage: 1024,
},
},
},
paths: map[string]string{
"memory": expectedMemoryPath,
},
},
intelRdtManager: &mockIntelRdtManager{
stats: &intelrdt.Stats{
L3CacheSchema: "L3:0=f0;1=f",
MemBwSchema: "MB:0=70;1=20",
},
path: expectedIntelRdtPath,
},
}
container.state = &createdState{c: container}
state, err := container.State()
@ -285,15 +190,6 @@ func TestGetContainerState(t *testing.T) {
if memPath := paths["memory"]; memPath != expectedMemoryPath {
t.Fatalf("expected memory path %q but received %q", expectedMemoryPath, memPath)
}
if intelrdt.IsCATEnabled() || intelrdt.IsMBAEnabled() {
intelRdtPath := state.IntelRdtPath
if intelRdtPath == "" {
t.Fatal("intel rdt path should not be empty")
}
if intelRdtPath != expectedIntelRdtPath {
t.Fatalf("expected intel rdt path %q but received %q", expectedIntelRdtPath, intelRdtPath)
}
}
for _, ns := range container.config.Namespaces {
path := state.NamespacePaths[ns.Type]
if path == "" {

View File

@ -1,39 +0,0 @@
//go:build !go1.17
// +build !go1.17
package devices
import "io/fs"
// The following code is adapted from go1.17.1/src/io/fs/readdir.go
// to compensate for the lack of fs.FileInfoToDirEntry in Go 1.16.
// dirInfo adapts an fs.FileInfo so that it satisfies fs.DirEntry.
type dirInfo struct {
	fileInfo fs.FileInfo
}

// IsDir reports whether the wrapped FileInfo describes a directory.
func (d dirInfo) IsDir() bool {
	isDir := d.fileInfo.IsDir()
	return isDir
}

// Type returns the type bits of the wrapped FileInfo's mode.
func (d dirInfo) Type() fs.FileMode {
	mode := d.fileInfo.Mode()
	return mode.Type()
}

// Info returns the wrapped FileInfo itself; it never fails.
func (d dirInfo) Info() (fs.FileInfo, error) {
	return d.fileInfo, nil
}

// Name returns the base name reported by the wrapped FileInfo.
func (d dirInfo) Name() string {
	name := d.fileInfo.Name()
	return name
}

// fileInfoToDirEntry wraps info in a DirEntry that answers every query from
// info. If info is nil, fileInfoToDirEntry returns nil.
func fileInfoToDirEntry(info fs.FileInfo) fs.DirEntry {
	if info != nil {
		return dirInfo{fileInfo: info}
	}
	return nil
}

View File

@ -1,8 +0,0 @@
//go:build go1.17
// +build go1.17
package devices
import "io/fs"
// fileInfoToDirEntry is an alias for fs.FileInfoToDirEntry, used by this
// build (see the go1.17 build constraint at the top of the file).
var fileInfoToDirEntry = fs.FileInfoToDirEntry

View File

@ -64,7 +64,7 @@ func TestHostDevicesIoutilReadDirDeepFailure(t *testing.T) {
t.Fatalf("Unexpected error %v", err)
}
return []fs.DirEntry{fileInfoToDirEntry(fi)}, nil
return []fs.DirEntry{fs.FileInfoToDirEntry(fi)}, nil
}
defer cleanupTest()

View File

@ -0,0 +1,17 @@
//go:build !go1.20
// +build !go1.20
package libcontainer
import "golang.org/x/sys/unix"
// eaccess checks path for execute permission (X_OK) by calling
// unix.Faccessat with AT_EACCESS, resolving path relative to the current
// working directory (AT_FDCWD). It returns whatever error Faccessat reports.
func eaccess(path string) error {
// This check is similar to access(2) with X_OK except for
// setuid/setgid binaries where it checks against the effective
// (rather than real) uid and gid. It is not needed in go 1.20
// and beyond and will be removed later.
// Relies on code added in https://go-review.googlesource.com/c/sys/+/468877
// and older CLs linked from there.
return unix.Faccessat(unix.AT_FDCWD, path, unix.X_OK, unix.AT_EACCESS)
}

View File

@ -0,0 +1,10 @@
//go:build go1.20
package libcontainer
// eaccess is a no-op in this build (go1.20 build constraint): it ignores
// path and always returns nil.
func eaccess(path string) error {
// Not needed in Go 1.20+ as the functionality is already in there
// (added by https://go.dev/cl/416115, https://go.dev/cl/414824,
// and fixed in Go 1.20.2 by https://go.dev/cl/469956).
return nil
}

View File

@ -48,20 +48,6 @@ func InitArgs(args ...string) func(*LinuxFactory) error {
}
}
// IntelRdtFs is an option func that configures a LinuxFactory to create
// containers managed through the Intel RDT "resource control" filesystem
// (e.g., L3 cache, memory bandwidth). When neither CAT nor MBA is enabled,
// the factory's NewIntelRdtManager is cleared instead. It always returns nil.
func IntelRdtFs(l *LinuxFactory) error {
	if intelrdt.IsCATEnabled() || intelrdt.IsMBAEnabled() {
		// At least one RDT feature is available: install the constructor.
		l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager {
			return intelrdt.NewManager(config, id, path)
		}
		return nil
	}
	l.NewIntelRdtManager = nil
	return nil
}
// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
func TmpfsRoot(l *LinuxFactory) error {
mounted, err := mountinfo.Mounted(l.Root)
@ -136,9 +122,6 @@ type LinuxFactory struct {
// Validator provides validation to container configurations.
Validator validate.Validator
// NewIntelRdtManager returns an initialized Intel RDT manager for a single container.
NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager
}
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
@ -179,6 +162,12 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
return nil, fmt.Errorf("unable to get cgroup PIDs: %w", err)
}
if len(pids) != 0 {
if config.Cgroups.Systemd {
// systemd cgroup driver can't add a pid to an
// existing systemd unit and will return an
// error anyway, so let's error out early.
return nil, fmt.Errorf("container's cgroup is not empty: %d process(es) found", len(pids))
}
// TODO: return an error.
logrus.Warnf("container's cgroup is not empty: %d process(es) found", len(pids))
logrus.Warn("DEPRECATED: running container in a non-empty cgroup won't be supported in runc 1.2; https://github.com/opencontainers/runc/issues/3132")
@ -202,18 +191,16 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
return nil, err
}
c := &linuxContainer{
id: id,
root: containerRoot,
config: config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
newuidmapPath: l.NewuidmapPath,
newgidmapPath: l.NewgidmapPath,
cgroupManager: cm,
}
if l.NewIntelRdtManager != nil {
c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
id: id,
root: containerRoot,
config: config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
newuidmapPath: l.NewuidmapPath,
newgidmapPath: l.NewgidmapPath,
cgroupManager: cm,
intelRdtManager: intelrdt.NewManager(config, id, ""),
}
c.state = &stoppedState{c: c}
return c, nil
@ -255,12 +242,10 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
newuidmapPath: l.NewuidmapPath,
newgidmapPath: l.NewgidmapPath,
cgroupManager: cm,
intelRdtManager: intelrdt.NewManager(&state.Config, id, state.IntelRdtPath),
root: containerRoot,
created: state.Created,
}
if l.NewIntelRdtManager != nil {
c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath)
}
c.state = &loadedState{c: c}
if err := c.refreshState(); err != nil {
return nil, err
@ -338,7 +323,11 @@ func (l *LinuxFactory) StartInitialization() (err error) {
defer func() {
if e := recover(); e != nil {
err = fmt.Errorf("panic from initialization: %w, %v", e, string(debug.Stack()))
if ee, ok := e.(error); ok {
err = fmt.Errorf("panic from initialization: %w, %s", ee, debug.Stack())
} else {
err = fmt.Errorf("panic from initialization: %v, %s", e, debug.Stack())
}
}
}()

View File

@ -37,28 +37,6 @@ func TestFactoryNew(t *testing.T) {
}
}
// TestFactoryNewIntelRdt creates a factory with the IntelRdtFs option and
// checks its concrete type, root directory and reported type string.
func TestFactoryNewIntelRdt(t *testing.T) {
root := t.TempDir()
factory, err := New(root, IntelRdtFs)
if err != nil {
t.Fatal(err)
}
if factory == nil {
t.Fatal("factory should not be nil")
}
// The returned factory must be the Linux implementation.
lfactory, ok := factory.(*LinuxFactory)
if !ok {
t.Fatal("expected linux factory returned on linux based systems")
}
// The root passed to New must be stored unchanged.
if lfactory.Root != root {
t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root)
}
if factory.Type() != "libcontainer" {
t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer")
}
}
func TestFactoryNewTmpfs(t *testing.T) {
root := t.TempDir()
factory, err := New(root, TmpfsRoot)
@ -157,7 +135,7 @@ func TestFactoryLoadContainer(t *testing.T) {
if err := marshal(filepath.Join(root, id, stateFilename), expectedState); err != nil {
t.Fatal(err)
}
factory, err := New(root, IntelRdtFs)
factory, err := New(root)
if err != nil {
t.Fatal(err)
}

View File

@ -8,7 +8,7 @@ import (
"io"
"net"
"os"
"strconv"
"path/filepath"
"strings"
"unsafe"
@ -117,17 +117,17 @@ func populateProcessEnvironment(env []string) error {
for _, pair := range env {
p := strings.SplitN(pair, "=", 2)
if len(p) < 2 {
return fmt.Errorf("invalid environment variable: %q", pair)
return errors.New("invalid environment variable: missing '='")
}
name, val := p[0], p[1]
if name == "" {
return fmt.Errorf("environment variable name can't be empty: %q", pair)
return errors.New("invalid environment variable: name cannot be empty")
}
if strings.IndexByte(name, 0) >= 0 {
return fmt.Errorf("environment variable name can't contain null(\\x00): %q", pair)
return fmt.Errorf("invalid environment variable %q: name contains nul byte (\\x00)", name)
}
if strings.IndexByte(val, 0) >= 0 {
return fmt.Errorf("environment variable value can't contain null(\\x00): %q", pair)
return fmt.Errorf("invalid environment variable %q: value contains nul byte (\\x00)", name)
}
if err := os.Setenv(name, val); err != nil {
return err
@ -136,6 +136,32 @@ func populateProcessEnvironment(env []string) error {
return nil
}
// verifyCwd ensures that the current directory is actually inside the mount
// namespace root of the current process.
func verifyCwd() error {
	// getcwd(2) on Linux notices when the cwd lies outside the root of the
	// current mount namespace and prefixes the result with "(unreachable)".
	// glibc's getcwd(3) and Go's Getwd() both turn that case into ENOENT
	// instead of handing back a non-absolute path, so inspecting the result
	// of Getwd is enough to spot an invalid cwd. See getcwd(3) for details,
	// and CVE-2024-21626 for the security issue that motivated this check.
	//
	// unix.Getwd() is used rather than os.Getwd() because the latter has a
	// $PWD workaround that does stat(.), which can fail when the current
	// directory is inaccessible to the container process.
	cwd, err := unix.Getwd()
	switch {
	case errors.Is(err, unix.ENOENT):
		return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
	case err != nil:
		return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
	case !filepath.IsAbs(cwd):
		// We shouldn't ever hit this, but check just in case.
		return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", cwd)
	}
	return nil
}
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaked file descriptors
// before executing the command inside the namespace
@ -194,6 +220,10 @@ func finalizeNamespace(config *initConfig) error {
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
}
}
// Make sure our final working directory is inside the container.
if err := verifyCwd(); err != nil {
return err
}
if err := system.ClearKeepCaps(); err != nil {
return fmt.Errorf("unable to clear keep caps: %w", err)
}
@ -406,40 +436,37 @@ func fixStdioPermissions(u *user.ExecUser) error {
if err := unix.Stat("/dev/null", &null); err != nil {
return &os.PathError{Op: "stat", Path: "/dev/null", Err: err}
}
for _, fd := range []uintptr{
os.Stdin.Fd(),
os.Stderr.Fd(),
os.Stdout.Fd(),
} {
for _, file := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
var s unix.Stat_t
if err := unix.Fstat(int(fd), &s); err != nil {
return &os.PathError{Op: "fstat", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
if err := unix.Fstat(int(file.Fd()), &s); err != nil {
return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
}
// Skip chown of /dev/null if it was used as one of the STDIO fds.
if s.Rdev == null.Rdev {
// Skip chown if uid is already the one we want or any of the STDIO descriptors
// were redirected to /dev/null.
if int(s.Uid) == u.Uid || s.Rdev == null.Rdev {
continue
}
// We only change the uid owner (as it is possible for the mount to
// We only change the uid (as it is possible for the mount to
// prefer a different gid, and there's no reason for us to change it).
// The reason why we don't just leave the default uid=X mount setup is
// that users expect to be able to actually use their console. Without
// this code, you couldn't effectively run as a non-root user inside a
// container and also have a console set up.
if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
if err := file.Chown(u.Uid, int(s.Gid)); err != nil {
// If we've hit an EINVAL then s.Gid isn't mapped in the user
// namespace. If we've hit an EPERM then the inode's current owner
// is not mapped in our user namespace (in particular,
// privileged_wrt_inode_uidgid() has failed). In either case, we
// are in a configuration where it's better for us to just not
// touch the stdio rather than bail at this point.
// privileged_wrt_inode_uidgid() has failed). Read-only
// /dev can result in EROFS error. In any case, it's
// better for us to just not touch the stdio rather
// than bail at this point.
// nolint:errorlint // unix errors are bare
if err == unix.EINVAL || err == unix.EPERM {
if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
continue
}
return &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
return err
}
}
return nil

View File

@ -6,6 +6,7 @@ import (
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"testing"
@ -61,6 +62,12 @@ func testCheckpoint(t *testing.T, userns bool) {
t.Skipf("criu binary not found: %v", err)
}
// Workaround for https://github.com/opencontainers/runc/issues/3532.
out, err := exec.Command("rpm", "-q", "criu").CombinedOutput()
if err == nil && regexp.MustCompile(`^criu-3\.17-[123]\.el9`).Match(out) {
t.Skip("Test requires criu >= 3.17-4 on CentOS Stream 9.")
}
config := newTemplateConfig(t, &tParam{userns: userns})
factory, err := libcontainer.New(t.TempDir())
ok(t, err)

View File

@ -18,6 +18,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
@ -40,13 +41,7 @@ func testExecPS(t *testing.T, userns bool) {
}
config := newTemplateConfig(t, &tParam{userns: userns})
buffers, exitCode, err := runContainer(t, config, "ps", "-o", "pid,user,comm")
if err != nil {
t.Fatalf("%s: %s", buffers, err)
}
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}
buffers := runContainerOk(t, config, "ps", "-o", "pid,user,comm")
lines := strings.Split(buffers.Stdout.String(), "\n")
if len(lines) < 2 {
t.Fatalf("more than one process running for output %q", buffers.Stdout.String())
@ -67,12 +62,7 @@ func TestIPCPrivate(t *testing.T) {
ok(t, err)
config := newTemplateConfig(t, nil)
buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/ipc")
ok(t, err)
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}
buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/ipc")
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
t.Fatalf("ipc link should be private to the container but equals host %q %q", actual, l)
@ -89,12 +79,7 @@ func TestIPCHost(t *testing.T) {
config := newTemplateConfig(t, nil)
config.Namespaces.Remove(configs.NEWIPC)
buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/ipc")
ok(t, err)
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}
buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/ipc")
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
t.Fatalf("ipc link not equal to host link %q %q", actual, l)
@ -111,13 +96,7 @@ func TestIPCJoinPath(t *testing.T) {
config := newTemplateConfig(t, nil)
config.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipc")
buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/ipc")
ok(t, err)
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}
buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/ipc")
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
t.Fatalf("ipc link not equal to host link %q %q", actual, l)
@ -163,8 +142,7 @@ func testRlimit(t *testing.T, userns bool) {
Cur: 1024,
}))
out, _, err := runContainer(t, config, "/bin/sh", "-c", "ulimit -n")
ok(t, err)
out := runContainerOk(t, config, "/bin/sh", "-c", "ulimit -n")
if limit := strings.TrimSpace(out.Stdout.String()); limit != "1025" {
t.Fatalf("expected rlimit to be 1025, got %s", limit)
}
@ -537,7 +515,7 @@ func testCpuShares(t *testing.T, systemd bool) {
config.Cgroups.Resources.CpuShares = 1
if _, _, err := runContainer(t, config, "ps"); err == nil {
t.Fatalf("runContainer should failed with invalid CpuShares")
t.Fatal("runContainer should fail with invalid CpuShares")
}
}
@ -560,30 +538,20 @@ func testPids(t *testing.T, systemd bool) {
config := newTemplateConfig(t, &tParam{systemd: systemd})
config.Cgroups.Resources.PidsLimit = -1
// Running multiple processes.
_, ret, err := runContainer(t, config, "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true")
ok(t, err)
if ret != 0 {
t.Fatalf("expected fork() to succeed with no pids limit")
}
// Running multiple processes, expecting it to succeed with no pids limit.
_ = runContainerOk(t, config, "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true")
// Enforce a permissive limit. This needs to be fairly hand-wavey due to the
// issues with running Go binaries with pids restrictions (see below).
config.Cgroups.Resources.PidsLimit = 64
_, ret, err = runContainer(t, config, "/bin/sh", "-c", `
_ = runContainerOk(t, config, "/bin/sh", "-c", `
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true`)
ok(t, err)
if ret != 0 {
t.Fatalf("expected fork() to succeed with permissive pids limit")
}
// Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause this
// to fail reliability.
// Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause
// this to fail reliably.
config.Cgroups.Resources.PidsLimit = 64
out, _, err := runContainer(t, config, "/bin/sh", "-c", `
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
@ -933,13 +901,8 @@ func TestMountCgroupRO(t *testing.T) {
return
}
config := newTemplateConfig(t, nil)
buffers, exitCode, err := runContainer(t, config, "mount")
if err != nil {
t.Fatalf("%s: %s", buffers, err)
}
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}
buffers := runContainerOk(t, config, "mount")
mountInfo := buffers.Stdout.String()
lines := strings.Split(mountInfo, "\n")
for _, l := range lines {
@ -980,13 +943,8 @@ func TestMountCgroupRW(t *testing.T) {
}
}
buffers, exitCode, err := runContainer(t, config, "mount")
if err != nil {
t.Fatalf("%s: %s", buffers, err)
}
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}
buffers := runContainerOk(t, config, "mount")
mountInfo := buffers.Stdout.String()
lines := strings.Split(mountInfo, "\n")
for _, l := range lines {
@ -1197,11 +1155,7 @@ func TestSTDIOPermissions(t *testing.T) {
}
config := newTemplateConfig(t, nil)
buffers, exitCode, err := runContainer(t, config, "sh", "-c", "echo hi > /dev/stderr")
ok(t, err)
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}
buffers := runContainerOk(t, config, "sh", "-c", "echo hi > /dev/stderr")
if actual := strings.Trim(buffers.Stderr.String(), "\n"); actual != "hi" {
t.Fatalf("stderr should equal be equal %q %q", actual, "hi")
@ -1444,12 +1398,7 @@ func TestPIDHost(t *testing.T) {
config := newTemplateConfig(t, nil)
config.Namespaces.Remove(configs.NEWPID)
buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/pid")
ok(t, err)
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}
buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/pid")
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
t.Fatalf("ipc link not equal to host link %q %q", actual, l)
@ -1640,6 +1589,11 @@ func TestInitJoinNetworkAndUser(t *testing.T) {
config2 := newTemplateConfig(t, &tParam{userns: true})
config2.Namespaces.Add(configs.NEWNET, netns1)
config2.Namespaces.Add(configs.NEWUSER, userns1)
// Emulate specconv.setupUserNamespace().
uidMap, gidMap, err := userns.GetUserNamespaceMappings(userns1)
ok(t, err)
config2.UidMappings = uidMap
config2.GidMappings = gidMap
config2.Cgroups.Path = "integration/test2"
container2, err := newContainer(t, config2)
ok(t, err)
@ -1738,12 +1692,7 @@ func TestCGROUPPrivate(t *testing.T) {
config := newTemplateConfig(t, nil)
config.Namespaces.Add(configs.NEWCGROUP, "")
buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/cgroup")
ok(t, err)
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}
buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/cgroup")
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
t.Fatalf("cgroup link should be private to the container but equals host %q %q", actual, l)
@ -1762,12 +1711,7 @@ func TestCGROUPHost(t *testing.T) {
ok(t, err)
config := newTemplateConfig(t, nil)
buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/cgroup")
ok(t, err)
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}
buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/cgroup")
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
t.Fatalf("cgroup link not equal to host link %q %q", actual, l)
@ -1790,6 +1734,16 @@ func testFdLeaks(t *testing.T, systemd bool) {
return
}
config := newTemplateConfig(t, &tParam{systemd: systemd})
// Run a container once to exclude file descriptors that are only
// opened once during the process lifetime by the library and are
// never closed. Those are not considered leaks.
//
// Examples of such open-once file descriptors are:
// - /sys/fs/cgroup dirfd opened by prepareOpenat2 in libct/cgroups;
// - dbus connection opened by getConnection in libct/cgroups/systemd.
_ = runContainerOk(t, config, "true")
pfd, err := os.Open("/proc/self/fd")
ok(t, err)
defer pfd.Close()
@ -1798,13 +1752,7 @@ func testFdLeaks(t *testing.T, systemd bool) {
_, err = pfd.Seek(0, 0)
ok(t, err)
config := newTemplateConfig(t, &tParam{systemd: systemd})
buffers, exitCode, err := runContainer(t, config, "true")
ok(t, err)
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
}
_ = runContainerOk(t, config, "true")
fds1, err := pfd.Readdirnames(0)
ok(t, err)
@ -1815,7 +1763,6 @@ func testFdLeaks(t *testing.T, systemd bool) {
// Show the extra opened files.
excludedPaths := []string{
"/sys/fs/cgroup", // opened once, see prepareOpenat2
"anon_inode:bpf-prog", // FIXME: see https://github.com/opencontainers/runc/issues/2366#issuecomment-776411392
}

View File

@ -13,7 +13,7 @@ import (
libseccomp "github.com/seccomp/libseccomp-golang"
)
func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
func TestSeccompDenySyslogWithErrno(t *testing.T) {
if testing.Short() {
return
}
@ -25,7 +25,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
DefaultAction: configs.Allow,
Syscalls: []*configs.Syscall{
{
Name: "getcwd",
Name: "syslog",
Action: configs.Errno,
ErrnoRet: &errnoRet,
},
@ -39,7 +39,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
buffers := newStdBuffers()
pwd := &libcontainer.Process{
Cwd: "/",
Args: []string{"pwd"},
Args: []string{"dmesg"},
Env: standardEnvironment,
Stdin: buffers.Stdin,
Stdout: buffers.Stdout,
@ -65,17 +65,17 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
}
if exitCode == 0 {
t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode)
}
expected := "pwd: getcwd: No such process"
expected := "dmesg: klogctl: No such process"
actual := strings.Trim(buffers.Stderr.String(), "\n")
if actual != expected {
t.Fatalf("Expected output %s but got %s\n", expected, actual)
}
}
func TestSeccompDenyGetcwd(t *testing.T) {
func TestSeccompDenySyslog(t *testing.T) {
if testing.Short() {
return
}
@ -85,7 +85,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
DefaultAction: configs.Allow,
Syscalls: []*configs.Syscall{
{
Name: "getcwd",
Name: "syslog",
Action: configs.Errno,
},
},
@ -98,7 +98,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
buffers := newStdBuffers()
pwd := &libcontainer.Process{
Cwd: "/",
Args: []string{"pwd"},
Args: []string{"dmesg"},
Env: standardEnvironment,
Stdin: buffers.Stdin,
Stdout: buffers.Stdout,
@ -124,10 +124,10 @@ func TestSeccompDenyGetcwd(t *testing.T) {
}
if exitCode == 0 {
t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode)
}
expected := "pwd: getcwd: Operation not permitted"
expected := "dmesg: klogctl: Operation not permitted"
actual := strings.Trim(buffers.Stderr.String(), "\n")
if actual != expected {
t.Fatalf("Expected output %s but got %s\n", expected, actual)
@ -282,13 +282,7 @@ func TestSeccompPermitWriteMultipleConditions(t *testing.T) {
},
}
buffers, exitCode, err := runContainer(t, config, "ls", "/")
if err != nil {
t.Fatalf("%s: %s", buffers, err)
}
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers)
}
buffers := runContainerOk(t, config, "ls", "/")
// We don't need to verify the actual thing printed
// Just that something was written to stdout
if len(buffers.Stdout.String()) == 0 {
@ -375,13 +369,7 @@ func TestSeccompMultipleConditionSameArgDeniesStdout(t *testing.T) {
},
}
buffers, exitCode, err := runContainer(t, config, "ls", "/")
if err != nil {
t.Fatalf("%s: %s", buffers, err)
}
if exitCode != 0 {
t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers)
}
buffers := runContainerOk(t, config, "ls", "/")
// Verify that nothing was printed
if len(buffers.Stdout.String()) != 0 {
t.Fatalf("Something was written to stdout, write call succeeded!\n")

View File

@ -216,6 +216,22 @@ func runContainer(t *testing.T, config *configs.Config, args ...string) (buffers
return
}
// runContainerOk runs a container via runContainer and fails the test
// immediately unless the run completed without error and with exit code 0.
// On success it returns the captured standard streams.
func runContainerOk(t *testing.T, config *configs.Config, args ...string) *stdBuffers {
	// Report failures at the caller's line rather than inside this helper.
	t.Helper()

	out, code, err := runContainer(t, config, args...)
	switch {
	case err != nil:
		t.Fatalf("%s: %s", out, err)
	case code != 0:
		t.Fatalf("exit code not 0. code %d stderr %q", code, out.Stderr)
	}

	return out
}
// destroyContainer calls Destroy on the given container and deliberately
// discards the returned error.
func destroyContainer(container libcontainer.Container) {
_ = container.Destroy()
}

View File

@ -1,11 +1,9 @@
package intelrdt
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strconv"
@ -13,6 +11,8 @@ import (
"sync"
"github.com/moby/sys/mountinfo"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
@ -145,34 +145,31 @@ import (
* }
*/
type Manager interface {
// Applies Intel RDT configuration to the process with the specified pid
Apply(pid int) error
// Returns statistics for Intel RDT
GetStats() (*Stats, error)
// Destroys the Intel RDT container-specific 'container_id' group
Destroy() error
// Returns Intel RDT path to save in a state file and to be able to
// restore the object later
GetPath() string
// Set Intel RDT "resource control" filesystem as configured.
Set(container *configs.Config) error
}
// This implements interface Manager
type intelRdtManager struct {
type Manager struct {
mu sync.Mutex
config *configs.Config
id string
path string
}
func NewManager(config *configs.Config, id string, path string) Manager {
return &intelRdtManager{
// NewManager creates a Manager for the container described by config.
//
// It returns nil when the config carries no IntelRdt section, or when Root
// reports an error, meaning Intel RDT is not available on this system.
func NewManager(config *configs.Config, id string, path string) *Manager {
	if config.IntelRdt != nil {
		// Only probe the resctrl filesystem when the config asks for RDT.
		if _, err := Root(); err == nil {
			return newManager(config, id, path)
		}
	}
	return nil
}
// newManager is the same as NewManager, except it does not check if the feature
// is actually available. Used by unit tests that mock intelrdt paths.
func newManager(config *configs.Config, id string, path string) *Manager {
return &Manager{
config: config,
id: id,
path: path,
@ -188,71 +185,52 @@ var (
catEnabled bool
// The flag to indicate if Intel RDT/MBA is enabled
mbaEnabled bool
// The flag to indicate if Intel RDT/MBA Software Controller is enabled
mbaScEnabled bool
// For Intel RDT initialization
initOnce sync.Once
errNotFound = errors.New("Intel RDT resctrl mount point not found")
errNotFound = errors.New("Intel RDT not available")
)
// Check if Intel RDT sub-features are enabled in featuresInit()
func featuresInit() {
initOnce.Do(func() {
// 1. Check if hardware and kernel support Intel RDT sub-features
flagsSet, err := parseCpuInfoFile("/proc/cpuinfo")
if err != nil {
return
}
// 2. Check if Intel RDT "resource control" filesystem is available.
// 1. Check if Intel RDT "resource control" filesystem is available.
// The user guarantees to mount the filesystem.
root, err := Root()
if err != nil {
return
}
// 3. Double check if Intel RDT sub-features are available in
// "resource control" filesystem. Intel RDT sub-features can be
// 2. Check if Intel RDT sub-features are available in "resource
// control" filesystem. Intel RDT sub-features can be
// selectively disabled or enabled by kernel command line
// (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel
if flagsSet.CAT {
if _, err := os.Stat(filepath.Join(root, "info", "L3")); err == nil {
catEnabled = true
}
if _, err := os.Stat(filepath.Join(root, "info", "L3")); err == nil {
catEnabled = true
}
if mbaScEnabled {
// We confirm MBA Software Controller is enabled in step 2,
// MBA should be enabled because MBA Software Controller
// depends on MBA
if _, err := os.Stat(filepath.Join(root, "info", "MB")); err == nil {
mbaEnabled = true
} else if flagsSet.MBA {
if _, err := os.Stat(filepath.Join(root, "info", "MB")); err == nil {
mbaEnabled = true
}
}
if flagsSet.MBMTotal || flagsSet.MBMLocal || flagsSet.CMT {
if _, err := os.Stat(filepath.Join(root, "info", "L3_MON")); err != nil {
return
}
enabledMonFeatures, err = getMonFeatures(root)
if err != nil {
return
}
if enabledMonFeatures.mbmTotalBytes || enabledMonFeatures.mbmLocalBytes {
mbmEnabled = true
}
if enabledMonFeatures.llcOccupancy {
cmtEnabled = true
}
if _, err := os.Stat(filepath.Join(root, "info", "L3_MON")); err != nil {
return
}
enabledMonFeatures, err = getMonFeatures(root)
if err != nil {
return
}
if enabledMonFeatures.mbmTotalBytes || enabledMonFeatures.mbmLocalBytes {
mbmEnabled = true
}
if enabledMonFeatures.llcOccupancy {
cmtEnabled = true
}
})
}
// Return the mount point path of Intel RDT "resource control" filesystem
func findIntelRdtMountpointDir(f io.Reader) (string, error) {
mi, err := mountinfo.GetMountsFromReader(f, func(m *mountinfo.Info) (bool, bool) {
// findIntelRdtMountpointDir returns the mount point of the Intel RDT "resource control" filesystem.
func findIntelRdtMountpointDir() (string, error) {
mi, err := mountinfo.GetMounts(func(m *mountinfo.Info) (bool, bool) {
// similar to mountinfo.FSTypeFilter but stops after the first match
if m.FSType == "resctrl" {
return false, true // don't skip, stop
@ -266,97 +244,45 @@ func findIntelRdtMountpointDir(f io.Reader) (string, error) {
return "", errNotFound
}
// Check if MBA Software Controller is enabled through mount option "-o mba_MBps"
if strings.Contains(","+mi[0].VFSOptions+",", ",mba_MBps,") {
mbaScEnabled = true
}
return mi[0].Mountpoint, nil
}
// For Root() use only.
var (
intelRdtRoot string
rootMu sync.Mutex
intelRdtRoot string
intelRdtRootErr error
rootOnce sync.Once
)
// The kernel creates this (empty) directory if resctrl is supported by the
// hardware and kernel. The user is responsible for mounting the resctrl
// filesystem, and they could mount it somewhere else if they wanted to.
const defaultResctrlMountpoint = "/sys/fs/resctrl"
// Root returns the Intel RDT "resource control" filesystem mount point.
func Root() (string, error) {
rootMu.Lock()
defer rootMu.Unlock()
if intelRdtRoot != "" {
return intelRdtRoot, nil
}
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", err
}
root, err := findIntelRdtMountpointDir(f)
f.Close()
if err != nil {
return "", err
}
if _, err := os.Stat(root); err != nil {
return "", err
}
intelRdtRoot = root
return intelRdtRoot, nil
}
type cpuInfoFlags struct {
CAT bool // Cache Allocation Technology
MBA bool // Memory Bandwidth Allocation
// Memory Bandwidth Monitoring related.
MBMTotal bool
MBMLocal bool
CMT bool // Cache Monitoring Technology
}
func parseCpuInfoFile(path string) (cpuInfoFlags, error) {
infoFlags := cpuInfoFlags{}
f, err := os.Open(path)
if err != nil {
return infoFlags, err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
line := s.Text()
// Search "cat_l3" and "mba" flags in first "flags" line
if strings.HasPrefix(line, "flags") {
flags := strings.Split(line, " ")
// "cat_l3" flag for CAT and "mba" flag for MBA
for _, flag := range flags {
switch flag {
case "cat_l3":
infoFlags.CAT = true
case "mba":
infoFlags.MBA = true
case "cqm_mbm_total":
infoFlags.MBMTotal = true
case "cqm_mbm_local":
infoFlags.MBMLocal = true
case "cqm_occup_llc":
infoFlags.CMT = true
}
rootOnce.Do(func() {
// Does this system support resctrl?
var statfs unix.Statfs_t
if err := unix.Statfs(defaultResctrlMountpoint, &statfs); err != nil {
if errors.Is(err, unix.ENOENT) {
err = errNotFound
}
return infoFlags, nil
intelRdtRootErr = err
return
}
}
if err := s.Err(); err != nil {
return infoFlags, err
}
return infoFlags, nil
// Has the resctrl fs been mounted to the default mount point?
if statfs.Type == unix.RDTGROUP_SUPER_MAGIC {
intelRdtRoot = defaultResctrlMountpoint
return
}
// The resctrl fs could have been mounted somewhere nonstandard.
intelRdtRoot, intelRdtRootErr = findIntelRdtMountpointDir()
})
return intelRdtRoot, intelRdtRootErr
}
// Gets a single uint64 value from the specified file.
@ -502,14 +428,8 @@ func IsMBAEnabled() bool {
return mbaEnabled
}
// IsMBAScEnabled reports whether the Intel RDT/MBA Software Controller is
// enabled. It calls featuresInit before reading the package-level
// mbaScEnabled flag.
func IsMBAScEnabled() bool {
featuresInit()
return mbaScEnabled
}
// Get the path of the clos group in "resource control" filesystem that the container belongs to
func (m *intelRdtManager) getIntelRdtPath() (string, error) {
func (m *Manager) getIntelRdtPath() (string, error) {
rootPath, err := Root()
if err != nil {
return "", err
@ -524,7 +444,7 @@ func (m *intelRdtManager) getIntelRdtPath() (string, error) {
}
// Applies Intel RDT configuration to the process with the specified pid
func (m *intelRdtManager) Apply(pid int) (err error) {
func (m *Manager) Apply(pid int) (err error) {
// If intelRdt is not specified in config, we do nothing
if m.config.IntelRdt == nil {
return nil
@ -559,11 +479,11 @@ func (m *intelRdtManager) Apply(pid int) (err error) {
}
// Destroys the Intel RDT container-specific 'container_id' group
func (m *intelRdtManager) Destroy() error {
func (m *Manager) Destroy() error {
// Don't remove resctrl group if closid has been explicitly specified. The
// group is likely externally managed, i.e. by some other entity than us.
// There are probably other containers/tasks sharing the same group.
if m.config.IntelRdt == nil || m.config.IntelRdt.ClosID == "" {
if m.config.IntelRdt != nil && m.config.IntelRdt.ClosID == "" {
m.mu.Lock()
defer m.mu.Unlock()
if err := os.RemoveAll(m.GetPath()); err != nil {
@ -576,7 +496,7 @@ func (m *intelRdtManager) Destroy() error {
// Returns Intel RDT path to save in a state file and to be able to
// restore the object later
func (m *intelRdtManager) GetPath() string {
func (m *Manager) GetPath() string {
if m.path == "" {
m.path, _ = m.getIntelRdtPath()
}
@ -584,7 +504,7 @@ func (m *intelRdtManager) GetPath() string {
}
// Returns statistics for Intel RDT
func (m *intelRdtManager) GetStats() (*Stats, error) {
func (m *Manager) GetStats() (*Stats, error) {
// If intelRdt is not specified in config
if m.config.IntelRdt == nil {
return nil, nil
@ -670,7 +590,7 @@ func (m *intelRdtManager) GetStats() (*Stats, error) {
}
// Set Intel RDT "resource control" filesystem as configured.
func (m *intelRdtManager) Set(container *configs.Config) error {
func (m *Manager) Set(container *configs.Config) error {
// About L3 cache schema:
// It has allocation bitmasks/values for L3 cache on each socket,
// which contains L3 cache id and capacity bitmask (CBM).

View File

@ -1,8 +1,6 @@
package intelrdt
import (
"errors"
"io"
"os"
"path/filepath"
"strings"
@ -22,7 +20,7 @@ func TestIntelRdtSetL3CacheSchema(t *testing.T) {
})
helper.config.IntelRdt.L3CacheSchema = l3CacheSchemeAfter
intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
intelrdt := newManager(helper.config, "", helper.IntelRdtPath)
if err := intelrdt.Set(helper.config); err != nil {
t.Fatal(err)
}
@ -52,7 +50,7 @@ func TestIntelRdtSetMemBwSchema(t *testing.T) {
})
helper.config.IntelRdt.MemBwSchema = memBwSchemeAfter
intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
intelrdt := newManager(helper.config, "", helper.IntelRdtPath)
if err := intelrdt.Set(helper.config); err != nil {
t.Fatal(err)
}
@ -82,7 +80,7 @@ func TestIntelRdtSetMemBwScSchema(t *testing.T) {
})
helper.config.IntelRdt.MemBwSchema = memBwScSchemeAfter
intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
intelrdt := newManager(helper.config, "", helper.IntelRdtPath)
if err := intelrdt.Set(helper.config); err != nil {
t.Fatal(err)
}
@ -105,7 +103,7 @@ func TestApply(t *testing.T) {
const closID = "test-clos"
helper.config.IntelRdt.ClosID = closID
intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
intelrdt := newManager(helper.config, "", helper.IntelRdtPath)
if err := intelrdt.Apply(1234); err == nil {
t.Fatal("unexpected success when applying pid")
}
@ -114,7 +112,7 @@ func TestApply(t *testing.T) {
}
// Dir should be created if some schema has been specified
intelrdt.(*intelRdtManager).config.IntelRdt.L3CacheSchema = "L3:0=f"
intelrdt.config.IntelRdt.L3CacheSchema = "L3:0=f"
if err := intelrdt.Apply(1235); err != nil {
t.Fatalf("Apply() failed: %v", err)
}
@ -127,141 +125,3 @@ func TestApply(t *testing.T) {
t.Fatalf("unexpected tasks file, expected '1235', got %q", pids)
}
}
const (
mountinfoValid = `18 40 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw
19 40 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
20 40 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=131927256k,nr_inodes=32981814,mode=755
21 18 0:17 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
22 20 0:19 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw
23 20 0:12 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,gid=5,mode=620,ptmxmode=000
24 40 0:20 / /run rw,nosuid,nodev shared:22 - tmpfs tmpfs rw,mode=755
25 18 0:21 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,mode=755
26 25 0:22 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
27 18 0:23 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw
28 25 0:24 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,perf_event
29 25 0:25 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuacct,cpu
30 25 0:26 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,memory
31 25 0:27 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,devices
32 25 0:28 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,hugetlb
33 25 0:29 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,blkio
34 25 0:30 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,pids
35 25 0:31 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset
36 25 0:32 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,freezer
37 25 0:33 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,net_prio,net_cls
38 18 0:34 / /sys/kernel/config rw,relatime shared:21 - configfs configfs rw
40 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
16 18 0:6 / /sys/kernel/debug rw,relatime shared:23 - debugfs debugfs rw
41 18 0:16 / /sys/fs/resctrl rw,relatime shared:24 - resctrl resctrl rw
42 20 0:36 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw
43 19 0:37 / /proc/sys/fs/binfmt_misc rw,relatime shared:26 - autofs systemd-1 rw,fd=32,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=35492
44 20 0:15 / /dev/mqueue rw,relatime shared:27 - mqueue mqueue rw
45 40 8:1 / /boot rw,relatime shared:28 - ext4 /dev/sda1 rw,stripe=4,data=ordered
46 40 253:1 / /home rw,relatime shared:29 - ext4 /dev/mapper/vvhg-vvhg rw,data=ordered
47 40 0:38 / /var/lib/nfs/rpc_pipefs rw,relatime shared:30 - rpc_pipefs sunrpc rw
125 24 0:20 /mesos/containers /run/mesos/containers rw,nosuid shared:22 - tmpfs tmpfs rw,mode=755
123 40 253:0 /var/lib/docker/containers /var/lib/docker/containers rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
129 40 253:0 /var/lib/docker/overlay2 /var/lib/docker/overlay2 rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
119 24 0:39 / /run/user/1009 rw,nosuid,nodev,relatime shared:100 - tmpfs tmpfs rw,size=26387788k,mode=700,uid=1009,gid=1009`
mountinfoMbaSc = `18 40 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw
19 40 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
20 40 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=131927256k,nr_inodes=32981814,mode=755
21 18 0:17 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
22 20 0:19 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw
23 20 0:12 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,gid=5,mode=620,ptmxmode=000
24 40 0:20 / /run rw,nosuid,nodev shared:22 - tmpfs tmpfs rw,mode=755
25 18 0:21 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,mode=755
26 25 0:22 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
27 18 0:23 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw
28 25 0:24 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,perf_event
29 25 0:25 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuacct,cpu
30 25 0:26 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,memory
31 25 0:27 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,devices
32 25 0:28 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,hugetlb
33 25 0:29 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,blkio
34 25 0:30 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,pids
35 25 0:31 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset
36 25 0:32 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,freezer
37 25 0:33 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,net_prio,net_cls
38 18 0:34 / /sys/kernel/config rw,relatime shared:21 - configfs configfs rw
40 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
16 18 0:6 / /sys/kernel/debug rw,relatime shared:23 - debugfs debugfs rw
41 18 0:16 / /sys/fs/resctrl rw,relatime shared:24 - resctrl resctrl rw,mba_MBps
42 20 0:36 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw
43 19 0:37 / /proc/sys/fs/binfmt_misc rw,relatime shared:26 - autofs systemd-1 rw,fd=32,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=35492
44 20 0:15 / /dev/mqueue rw,relatime shared:27 - mqueue mqueue rw
45 40 8:1 / /boot rw,relatime shared:28 - ext4 /dev/sda1 rw,stripe=4,data=ordered
46 40 253:1 / /home rw,relatime shared:29 - ext4 /dev/mapper/vvhg-vvhg rw,data=ordered
47 40 0:38 / /var/lib/nfs/rpc_pipefs rw,relatime shared:30 - rpc_pipefs sunrpc rw
125 24 0:20 /mesos/containers /run/mesos/containers rw,nosuid shared:22 - tmpfs tmpfs rw,mode=755
123 40 253:0 /var/lib/docker/containers /var/lib/docker/containers rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
129 40 253:0 /var/lib/docker/overlay2 /var/lib/docker/overlay2 rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
119 24 0:39 / /run/user/1009 rw,nosuid,nodev,relatime shared:100 - tmpfs tmpfs rw,size=26387788k,mode=700,uid=1009,gid=1009`
)
// TestFindIntelRdtMountpointDir is a table-driven test that feeds mountinfo
// text to findIntelRdtMountpointDir and checks the returned mount point, the
// error, and the package-level mbaScEnabled flag.
func TestFindIntelRdtMountpointDir(t *testing.T) {
testCases := []struct {
name string
input io.Reader
// isNotFoundError expects the error to match errNotFound.
isNotFoundError bool
// isError expects any non-nil error.
isError bool
// mbaScEnabled is the expected value of the package-level flag after the call.
mbaScEnabled bool
mountpoint string
}{
{
name: "Valid mountinfo with MBA Software Controller disabled",
input: strings.NewReader(mountinfoValid),
mountpoint: "/sys/fs/resctrl",
},
{
name: "Valid mountinfo with MBA Software Controller enabled",
input: strings.NewReader(mountinfoMbaSc),
mbaScEnabled: true,
mountpoint: "/sys/fs/resctrl",
},
{
name: "Empty mountinfo",
input: strings.NewReader(""),
isNotFoundError: true,
},
{
name: "Broken mountinfo",
input: strings.NewReader("baa"),
isError: true,
},
}

for _, tc := range testCases {
// Give the closure below its own copy of the loop variable.
tc := tc
t.Run(tc.name, func(t *testing.T) {
// Reset the package-level flag so a previous case cannot leak into this one.
mbaScEnabled = false
mp, err := findIntelRdtMountpointDir(tc.input)
if tc.isNotFoundError {
if !errors.Is(err, errNotFound) {
t.Errorf("expected errNotFound error, got %+v", err)
}
return
}
if tc.isError {
if err == nil {
t.Error("expected error, got nil")
}
return
}
if err != nil {
t.Errorf("expected nil, got %+v", err)
return
}
// no errors, check the results
if tc.mbaScEnabled != mbaScEnabled {
t.Errorf("expected mbaScEnabled=%v, got %v",
tc.mbaScEnabled, mbaScEnabled)
}
if tc.mountpoint != mp {
t.Errorf("expected mountpoint=%q, got %q",
tc.mountpoint, mp)
}
})
}
}

View File

@ -26,7 +26,12 @@ func NewIntelRdtTestUtil(t *testing.T) *intelRdtTestUtil {
config := &configs.Config{
IntelRdt: &configs.IntelRdt{},
}
// Assign fake intelRdtRoot value, returned by Root().
intelRdtRoot = t.TempDir()
// Make sure Root() won't even try to parse mountinfo.
rootOnce.Do(func() {})
testIntelRdtPath := filepath.Join(intelRdtRoot, "resctrl")
// Ensure the full mock Intel RDT "resource control" filesystem path exists

View File

@ -1,6 +1,7 @@
package libcontainer
import (
"io/fs"
"strconv"
"golang.org/x/sys/unix"
@ -81,3 +82,20 @@ func unmount(target string, flags int) error {
}
return nil
}
// syscallMode converts Go's portable fs.FileMode into the mode bits expected
// by the underlying syscalls: the permission bits plus the setuid, setgid and
// sticky bits when the corresponding Go mode flags are set.
//
// Copied from https://cs.opensource.google/go/go/+/refs/tags/go1.20.7:src/os/file_posix.go;l=61-75
func syscallMode(i fs.FileMode) (o uint32) {
	// Go mode flags paired with their syscall-level counterparts.
	// Go's ModeTemporary has no mapping (plan9 only).
	special := [...]struct {
		goBit  fs.FileMode
		sysBit uint32
	}{
		{fs.ModeSetuid, unix.S_ISUID},
		{fs.ModeSetgid, unix.S_ISGID},
		{fs.ModeSticky, unix.S_ISVTX},
	}

	o = uint32(i.Perm())
	for _, b := range special {
		if i&b.goBit != 0 {
			o |= b.sysBit
		}
	}
	return o
}

View File

@ -151,7 +151,7 @@ static int is_self_cloned(void)
* Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
* this, because you cannot write to a sealed memfd no matter what (so
* sharing it isn't a bad thing -- and an admin could bind-mount a sealed
* memfd to /usr/bin/runc to allow re-use).
* memfd to /usr/bin/runc to allow reuse).
*/
ret = fcntl(fd, F_GET_SEALS);
if (ret >= 0) {

View File

@ -168,15 +168,17 @@ static void write_log(int level, const char *format, ...)
message = escape_json_string(message);
if (current_stage == STAGE_SETUP)
if (current_stage == STAGE_SETUP) {
stage = strdup("nsexec");
else
if (stage == NULL)
goto out;
} else {
ret = asprintf(&stage, "nsexec-%d", current_stage);
if (ret < 0) {
stage = NULL;
goto out;
if (ret < 0) {
stage = NULL;
goto out;
}
}
ret = asprintf(&json, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n",
level_str[level], stage, getpid(), message);
if (ret < 0) {
@ -416,11 +418,9 @@ static int getenv_int(const char *name)
if (val == endptr || *endptr != '\0')
bail("unable to parse %s=%s", name, val);
/*
* Sanity check: this must be a small non-negative number.
* Practically, we pass two fds (3 and 4) and a log level,
* for which the maximum is 6 (TRACE).
* */
if (ret < 0 || ret > TRACE)
* Sanity check: this must be a non-negative number.
*/
if (ret < 0)
bail("bad value for %s=%s (%d)", name, val, ret);
return ret;
@ -832,6 +832,25 @@ void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mount
bail("failed to close container mount namespace fd %d", container_mntns_fd);
}
/*
 * Log and perform unshare(2) with the given flags. On success this returns
 * normally; if the call ultimately fails, bail() is invoked with a message
 * that names the namespaces described by msg.
 */
void try_unshare(int flags, const char *msg)
{
	int attempts_left = 5;

	write_log(DEBUG, "unshare %s", msg);

	/*
	 * Kernels prior to v4.3 may return EINVAL on unshare when another process
	 * reads runc's /proc/$PID/status or /proc/$PID/maps. To work around this,
	 * retry on EINVAL a few times; any other error is fatal immediately.
	 */
	while (attempts_left > 0) {
		if (unshare(flags) == 0)
			return;
		if (errno != EINVAL)
			break;
		attempts_left--;
	}

	bail("failed to unshare %s", msg);
}
void nsexec(void)
{
int pipenum;
@ -1070,7 +1089,7 @@ void nsexec(void)
s = SYNC_MOUNTSOURCES_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(stage1_pid, SIGKILL);
sane_kill(stage1_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)");
}
break;
@ -1170,9 +1189,7 @@ void nsexec(void)
* problem.
*/
if (config.cloneflags & CLONE_NEWUSER) {
write_log(DEBUG, "unshare user namespace");
if (unshare(CLONE_NEWUSER) < 0)
bail("failed to unshare user namespace");
try_unshare(CLONE_NEWUSER, "user namespace");
config.cloneflags &= ~CLONE_NEWUSER;
/*
@ -1224,15 +1241,13 @@ void nsexec(void)
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
bail("failed to unshare remaining namespaces (except cgroupns)");
try_unshare(config.cloneflags & ~CLONE_NEWCGROUP, "remaining namespaces (except cgroupns)");
/* Ask our parent to send the mount sources fds. */
if (config.mountsources) {
s = SYNC_MOUNTSOURCES_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(stage2_pid, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(SYNC_MOUNTSOURCES_PLS)");
}
@ -1241,11 +1256,11 @@ void nsexec(void)
/* Parent has finished sending the mount sources fds. */
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(stage2_pid, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: read(SYNC_MOUNTSOURCES_ACK)");
}
if (s != SYNC_MOUNTSOURCES_ACK) {
kill(stage2_pid, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
}
}
@ -1344,8 +1359,7 @@ void nsexec(void)
}
if (config.cloneflags & CLONE_NEWCGROUP) {
if (unshare(CLONE_NEWCGROUP) < 0)
bail("failed to unshare cgroup namespace");
try_unshare(CLONE_NEWCGROUP, "cgroup namespace");
}
write_log(DEBUG, "signal completion to stage-0");

View File

@ -39,13 +39,9 @@ type parentProcess interface {
// startTime returns the process start time.
startTime() (uint64, error)
signal(os.Signal) error
externalDescriptors() []string
setExternalDescriptors(fds []string)
forwardChildLogs() chan error
}
@ -303,7 +299,7 @@ type initProcess struct {
logFilePair filePair
config *initConfig
manager cgroups.Manager
intelRdtManager intelrdt.Manager
intelRdtManager *intelrdt.Manager
container *linuxContainer
fds []string
process *Process

View File

@ -80,6 +80,8 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds []int) (err
// Therefore, we can access mountFds[i] without any concerns.
if mountFds != nil && mountFds[i] != -1 {
mountConfig.fd = &mountFds[i]
} else {
mountConfig.fd = nil
}
if err := mountToRootfs(m, mountConfig); err != nil {
@ -327,26 +329,41 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
if err := os.MkdirAll(dest, 0o755); err != nil {
return err
}
return utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
if err := mount(m.Source, m.Destination, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
if errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY) {
src := fs2.UnifiedMountpoint
if c.cgroupns && c.cgroup2Path != "" {
// Emulate cgroupns by bind-mounting
// the container cgroup path rather than
// the whole /sys/fs/cgroup.
src = c.cgroup2Path
}
err = mount(src, m.Destination, procfd, "", uintptr(m.Flags)|unix.MS_BIND, "")
if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
err = nil
}
}
return err
}
return nil
err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
return mount(m.Source, m.Destination, procfd, "cgroup2", uintptr(m.Flags), m.Data)
})
if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) {
return err
}
// When we are in UserNS but CgroupNS is not unshared, we cannot mount
// cgroup2 (#2158), so fall back to bind mount.
bindM := &configs.Mount{
Device: "bind",
Source: fs2.UnifiedMountpoint,
Destination: m.Destination,
Flags: unix.MS_BIND | m.Flags,
PropagationFlags: m.PropagationFlags,
}
if c.cgroupns && c.cgroup2Path != "" {
// Emulate cgroupns by bind-mounting the container cgroup path
// rather than the whole /sys/fs/cgroup.
bindM.Source = c.cgroup2Path
}
// mountToRootfs() handles remounting for MS_RDONLY.
// No need to set c.fd here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate().
err = mountToRootfs(bindM, c)
if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
// ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed
// outside the userns+mountns.
//
// Mask `/sys/fs/cgroup` to ensure it is read-only, even when `/sys` is mounted
// with `rbind,ro` (`runc spec --rootless` produces `rbind,ro` for `/sys`).
err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
return maskPath(procfd, c.label)
})
}
return err
}
func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
@ -396,6 +413,35 @@ func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
func mountToRootfs(m *configs.Mount, c *mountConfig) error {
rootfs := c.root
// procfs and sysfs are special because we need to ensure they are actually
// mounted on a specific path in a container without any funny business.
switch m.Device {
case "proc", "sysfs":
// If the destination already exists and is not a directory, we bail
// out. This is to avoid mounting through a symlink or similar -- which
// has been a "fun" attack scenario in the past.
// TODO: This won't be necessary once we switch to libpathrs and we can
// stop all of these symlink-exchange attacks.
dest := filepath.Clean(m.Destination)
if !strings.HasPrefix(dest, rootfs) {
// Do not use securejoin as it resolves symlinks.
dest = filepath.Join(rootfs, dest)
}
if fi, err := os.Lstat(dest); err != nil {
if !os.IsNotExist(err) {
return err
}
} else if !fi.IsDir() {
return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device)
}
if err := os.MkdirAll(dest, 0o755); err != nil {
return err
}
// Selinux kernels do not support labeling of /proc or /sys.
return mountPropagate(m, rootfs, "", nil)
}
mountLabel := c.label
mountFd := c.fd
dest, err := securejoin.SecureJoin(rootfs, m.Destination)
@ -404,24 +450,6 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
}
switch m.Device {
case "proc", "sysfs":
// If the destination already exists and is not a directory, we bail
// out This is to avoid mounting through a symlink or similar -- which
// has been a "fun" attack scenario in the past.
// TODO: This won't be necessary once we switch to libpathrs and we can
// stop all of these symlink-exchange attacks.
if fi, err := os.Lstat(dest); err != nil {
if !os.IsNotExist(err) {
return err
}
} else if fi.Mode()&os.ModeDir == 0 {
return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device)
}
if err := os.MkdirAll(dest, 0o755); err != nil {
return err
}
// Selinux kernels do not support labeling of /proc or /sys
return mountPropagate(m, rootfs, "", nil)
case "mqueue":
if err := os.MkdirAll(dest, 0o755); err != nil {
return err
@ -431,11 +459,16 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
}
return label.SetFileLabel(dest, mountLabel)
case "tmpfs":
stat, err := os.Stat(dest)
if err != nil {
if stat, err := os.Stat(dest); err != nil {
if err := os.MkdirAll(dest, 0o755); err != nil {
return err
}
} else {
dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode()))
if m.Data != "" {
dt = dt + "," + m.Data
}
m.Data = dt
}
if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
@ -444,16 +477,7 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
err = mountPropagate(m, rootfs, mountLabel, nil)
}
if err != nil {
return err
}
if stat != nil {
if err = os.Chmod(dest, stat.Mode()); err != nil {
return err
}
}
return nil
return err
case "bind":
if err := prepareBindMount(m, rootfs, mountFd); err != nil {
return err
@ -577,6 +601,7 @@ func checkProcMount(rootfs, dest, source string) error {
"/proc/loadavg",
"/proc/slabinfo",
"/proc/net/dev",
"/proc/sys/kernel/ns_last_pid",
}
for _, valid := range validProcMounts {
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)

View File

@ -38,6 +38,14 @@ func TestCheckMountDestFalsePositive(t *testing.T) {
}
}
// TestCheckMountDestNsLastPid checks that checkProcMount does not reject a
// mount whose destination is /proc/sys/kernel/ns_last_pid under the rootfs.
func TestCheckMountDestNsLastPid(t *testing.T) {
	const (
		rootfs = "/rootfs"
		target = rootfs + "/proc/sys/kernel/ns_last_pid"
	)
	if err := checkProcMount(rootfs, target, "/proc"); err != nil {
		t.Fatal("/proc/sys/kernel/ns_last_pid should not return an error")
	}
}
func TestNeedsSetupDev(t *testing.T) {
config := &configs.Config{
Mounts: []*configs.Mount{

View File

@ -29,13 +29,15 @@ func KnownOperators() []string {
}
var actions = map[string]configs.Action{
"SCMP_ACT_KILL": configs.Kill,
"SCMP_ACT_ERRNO": configs.Errno,
"SCMP_ACT_TRAP": configs.Trap,
"SCMP_ACT_ALLOW": configs.Allow,
"SCMP_ACT_TRACE": configs.Trace,
"SCMP_ACT_LOG": configs.Log,
"SCMP_ACT_NOTIFY": configs.Notify,
"SCMP_ACT_KILL": configs.Kill,
"SCMP_ACT_ERRNO": configs.Errno,
"SCMP_ACT_TRAP": configs.Trap,
"SCMP_ACT_ALLOW": configs.Allow,
"SCMP_ACT_TRACE": configs.Trace,
"SCMP_ACT_LOG": configs.Log,
"SCMP_ACT_NOTIFY": configs.Notify,
"SCMP_ACT_KILL_THREAD": configs.KillThread,
"SCMP_ACT_KILL_PROCESS": configs.KillProcess,
}
// KnownActions returns the list of the known actions.
@ -64,6 +66,7 @@ var archs = map[string]string{
"SCMP_ARCH_PPC": "ppc",
"SCMP_ARCH_PPC64": "ppc64",
"SCMP_ARCH_PPC64LE": "ppc64le",
"SCMP_ARCH_RISCV64": "riscv64",
"SCMP_ARCH_S390": "s390",
"SCMP_ARCH_S390X": "s390x",
}

View File

@ -48,6 +48,13 @@ const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;
#endif
const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER;
#ifndef AUDIT_ARCH_RISCV64
#ifndef EM_RISCV
#define EM_RISCV 243
#endif
#define AUDIT_ARCH_RISCV64 (EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
#endif
// We use the AUDIT_ARCH_* values because those are the ones used by the kernel
// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.
@ -67,11 +74,17 @@ const uint32_t C_AUDIT_ARCH_PPC64 = AUDIT_ARCH_PPC64;
const uint32_t C_AUDIT_ARCH_PPC64LE = AUDIT_ARCH_PPC64LE;
const uint32_t C_AUDIT_ARCH_S390 = AUDIT_ARCH_S390;
const uint32_t C_AUDIT_ARCH_S390X = AUDIT_ARCH_S390X;
const uint32_t C_AUDIT_ARCH_RISCV64 = AUDIT_ARCH_RISCV64;
*/
import "C"
var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)
// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
// syscalls will end up with this syscall number, so we need to explicitly
// return -ENOSYS for this syscall on those architectures.
const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0
func isAllowAction(action configs.Action) bool {
switch action {
// Trace is considered an "allow" action because a good tracer should
@ -197,6 +210,8 @@ func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
return nativeArch(C.C_AUDIT_ARCH_S390), nil
case libseccomp.ArchS390X:
return nativeArch(C.C_AUDIT_ARCH_S390X), nil
case libseccomp.ArchRISCV64:
return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil
default:
return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
}
@ -305,7 +320,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// directly from the arch code so we need to do it here. Sadly we can't
// share this code between architecture branches.
section := []bpf.Instruction{
// load [0]
// load [0] (syscall number)
bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4.
}
@ -314,10 +329,37 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// No syscalls found for this arch -- skip it and move on.
continue
case 1:
// Get the only syscall in the map.
var sysno libseccomp.ScmpSyscall
for _, no := range maxSyscalls {
// Get the only syscall and scmpArch in the map.
var (
scmpArch libseccomp.ScmpArch
sysno libseccomp.ScmpSyscall
)
for arch, no := range maxSyscalls {
sysno = no
scmpArch = arch
}
switch scmpArch {
// Return -ENOSYS for setup(2) on s390(x). This syscall is used for
// multiplexing "large syscall number" syscalls, but if the syscall
// number is not known to the kernel then the syscall number is
// left unchanged (and because it is sysno=0, you'll end up with
// EPERM for syscalls the kernel doesn't know about).
//
// The actual setup(2) syscall is never used by userspace anymore
// (and hasn't existed for decades) outside of this multiplexing
// scheme so returning -ENOSYS is fine.
case libseccomp.ArchS390, libseccomp.ArchS390X:
section = append(section, []bpf.Instruction{
// jne [setup=0],1
bpf.JumpIf{
Cond: bpf.JumpNotEqual,
Val: uint32(s390xMultiplexSyscall),
SkipTrue: 1,
},
// ret [ENOSYS]
bpf.RetConstant{Val: retErrnoEnosys},
}...)
}
// The simplest case just boils down to a single jgt instruction,
@ -349,12 +391,6 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// If we're on x86 we need to add a check for x32 and if we're in
// the wrong mode we jump over the section.
if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
// Grab the only architecture in the map.
var scmpArch libseccomp.ScmpArch
for arch := range maxSyscalls {
scmpArch = arch
}
// Generate a prefix to check the mode.
switch scmpArch {
case libseccomp.ArchAMD64:
@ -512,7 +548,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
// Prepend the load instruction for the architecture.
programTail = append([]bpf.Instruction{
// load [4]
// load [4] (architecture)
bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4.
}, programTail...)

View File

@ -213,6 +213,19 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
})
}
// If we're on s390(x) make sure you get -ENOSYS for the "setup"
// syscall (this is done to work around an issue with s390x's
// syscall multiplexing which results in unknown syscalls being a
// setup(2) invocation).
switch scmpArch {
case libseccomp.ArchS390, libseccomp.ArchS390X:
syscallTests = append(syscallTests, syscallTest{
sysno: s390xMultiplexSyscall,
syscall: "setup",
expected: retErrnoEnosys,
})
}
// Test syscalls in the explicit list.
for _, test := range syscallTests {
// Override the expected value in the two special cases.
@ -282,7 +295,7 @@ func TestDisassembleHugeFilterDoesNotHang(t *testing.T) {
}
for i := 1; i < 10000; i++ {
if err := hugeFilter.AddRule(libseccomp.ScmpSyscall(i), libseccomp.ActKill); err != nil {
if err := hugeFilter.AddRule(libseccomp.ScmpSyscall(i), libseccomp.ActKillThread); err != nil {
t.Fatalf("failed to add rule to filter %d: %v", i, err)
}
}

View File

@ -113,8 +113,8 @@ func InitSeccomp(config *configs.Seccomp) (int, error) {
// Convert Libcontainer Action to Libseccomp ScmpAction
func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error) {
switch act {
case configs.Kill:
return libseccomp.ActKill, nil
case configs.Kill, configs.KillThread:
return libseccomp.ActKillThread, nil
case configs.Errno:
if errnoRet != nil {
return libseccomp.ActErrno.SetReturnCode(int16(*errnoRet)), nil
@ -133,8 +133,6 @@ func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error
return libseccomp.ActLog, nil
case configs.Notify:
return libseccomp.ActNotify, nil
case configs.KillThread:
return libseccomp.ActKillThread, nil
case configs.KillProcess:
return libseccomp.ActKillProcess, nil
default:

View File

@ -4,6 +4,7 @@ import (
"errors"
"fmt"
"os"
"os/exec"
"strconv"
"github.com/opencontainers/selinux/go-selinux"
@ -14,6 +15,7 @@ import (
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
// linuxSetnsInit performs the container's initialization for running a new process
@ -82,6 +84,21 @@ func (l *linuxSetnsInit) Init() error {
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
// Check for the arg before waiting to make sure it exists and it is
// returned as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// exec.LookPath in Go < 1.20 might return no error for an executable
// residing on a file system mounted with noexec flag, so perform this
// extra check now while we can still return a proper error.
// TODO: remove this once go < 1.20 is not supported.
if err := eaccess(name); err != nil {
return &os.PathError{Op: "eaccess", Path: name, Err: err}
}
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
@ -101,5 +118,23 @@ func (l *linuxSetnsInit) Init() error {
return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
// Close all file descriptors we are not passing to the container. This is
// necessary because the execve target could use internal runc fds as the
// execve path, potentially giving access to binary files from the host
// (which can then be opened by container processes, leading to container
// escapes). Note that because this operation will close any open file
// descriptors that are referenced by (*os.File) handles from underneath
// the Go runtime, we must not do any file operations after this point
// (otherwise the (*os.File) finaliser could close the wrong file). See
// CVE-2024-21626 for more information as to why this protection is
// necessary.
//
// This is not needed for runc-dmz, because the extra execve(2) step means
// that all O_CLOEXEC file descriptors have already been closed and thus
// the second execve(2) from runc-dmz cannot access internal file
// descriptors from runc.
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
return err
}
return system.Exec(name, l.config.Args[0:], os.Environ())
}

View File

@ -18,6 +18,7 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/userns"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
@ -176,18 +177,19 @@ func KnownMountOptions() []string {
// AllowedDevices is the set of devices which are automatically included for
// all containers.
//
// XXX (cyphar)
// This behaviour is at the very least "questionable" (if not outright
// wrong) according to the runtime-spec.
// # XXX (cyphar)
//
// Yes, we have to include certain devices other than the ones the user
// specifies, but several devices listed here are not part of the spec
// (including "mknod for any device"?!). In addition, these rules are
// appended to the user-provided set which means that users *cannot disable
// this behaviour*.
// This behaviour is at the very least "questionable" (if not outright
// wrong) according to the runtime-spec.
//
// ... unfortunately I'm too scared to change this now because who knows how
// many people depend on this (incorrect and arguably insecure) behaviour.
// Yes, we have to include certain devices other than the ones the user
// specifies, but several devices listed here are not part of the spec
// (including "mknod for any device"?!). In addition, these rules are
// appended to the user-provided set which means that users *cannot disable
// this behaviour*.
//
// ... unfortunately I'm too scared to change this now because who knows how
// many people depend on this (incorrect and arguably insecure) behaviour.
var AllowedDevices = []*devices.Device{
// allow mknod for any device
{
@ -925,9 +927,9 @@ next:
func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
create := func(m specs.LinuxIDMapping) configs.IDMap {
return configs.IDMap{
HostID: int(m.HostID),
ContainerID: int(m.ContainerID),
Size: int(m.Size),
HostID: int64(m.HostID),
ContainerID: int64(m.ContainerID),
Size: int64(m.Size),
}
}
if spec.Linux != nil {
@ -938,6 +940,40 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
config.GidMappings = append(config.GidMappings, create(m))
}
}
if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" {
// Cache the current userns mappings in our configuration, so that we
// can calculate uid and gid mappings within runc. These mappings are
// never used for configuring the container if the path is set.
uidMap, gidMap, err := userns.GetUserNamespaceMappings(path)
if err != nil {
return fmt.Errorf("failed to cache mappings for userns: %w", err)
}
// We cannot allow uid or gid mappings to be set if we are also asked
// to join a userns.
if config.UidMappings != nil || config.GidMappings != nil {
// FIXME: It turns out that containerd and CRIO pass both a userns
// path and the mappings of the namespace in the same config.json.
// Such a configuration is technically not valid, but we used to
// require mappings be specified, and thus users worked around our
// bug -- so we can't regress it at the moment. But we also don't
// want to produce broken behaviour if the mapping doesn't match
// the userns. So (for now) we output a warning if the actual
// userns mappings match the configuration, otherwise we return an
// error.
if !userns.IsSameMapping(uidMap, config.UidMappings) ||
!userns.IsSameMapping(gidMap, config.GidMappings) {
return errors.New("user namespaces enabled, but both namespace path and non-matching mapping specified -- you may only provide one")
}
logrus.Warnf("config.json has both a userns path to join and a matching userns mapping specified -- you may only provide one. Future versions of runc may return an error with this configuration, please report a bug on <https://github.com/opencontainers/runc> if you see this warning and cannot update your configuration.")
}
config.UidMappings = uidMap
config.GidMappings = gidMap
logrus.WithFields(logrus.Fields{
"uid_map": uidMap,
"gid_map": gidMap,
}).Debugf("config uses path-based userns configuration -- current uid and gid mappings cached")
}
rootUID, err := config.HostRootUID()
if err != nil {
return err

View File

@ -234,6 +234,14 @@ func TestSetupSeccomp(t *testing.T) {
Names: []string{"mknod"},
Action: "SCMP_ACT_NOTIFY",
},
{
Names: []string{"rmdir"},
Action: "SCMP_ACT_KILL_THREAD",
},
{
Names: []string{"mkdir"},
Action: "SCMP_ACT_KILL_PROCESS",
},
},
}
seccomp, err := SetupSeccomp(conf)
@ -263,9 +271,8 @@ func TestSetupSeccomp(t *testing.T) {
calls := seccomp.Syscalls
callsLength := len(calls)
if callsLength != 8 {
t.Errorf("Expected 8 syscalls, got :%d", callsLength)
if len(calls) != len(conf.Syscalls) {
t.Error("Mismatched number of syscalls")
}
for _, call := range calls {
@ -317,6 +324,14 @@ func TestSetupSeccomp(t *testing.T) {
if call.Action != configs.Notify {
t.Errorf("Wrong conversion for the %s syscall action", call.Name)
}
case "rmdir":
if call.Action != configs.KillThread {
t.Errorf("Wrong conversion for the %s syscall action", call.Name)
}
case "mkdir":
if call.Action != configs.KillProcess {
t.Errorf("Wrong conversion for the %s syscall action", call.Name)
}
default:
t.Errorf("Unexpected syscall %s found", call.Name)
}
@ -595,6 +610,40 @@ func TestDupNamespaces(t *testing.T) {
}
}
// TestUserNamespaceMappingAndPath checks that a spec which asks to join an
// existing user namespace by path while also specifying uid/gid mappings that
// do not match that namespace is rejected by CreateLibcontainerConfig.
func TestUserNamespaceMappingAndPath(t *testing.T) {
	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
		t.Skip("Test requires userns.")
	}

	spec := &specs.Spec{
		Root: &specs.Root{
			Path: "rootfs",
		},
		Linux: &specs.Linux{
			UIDMappings: []specs.LinuxIDMapping{
				{ContainerID: 0, HostID: 1000, Size: 1000},
			},
			GIDMappings: []specs.LinuxIDMapping{
				{ContainerID: 0, HostID: 2000, Size: 1000},
			},
			Namespaces: []specs.LinuxNamespace{
				{
					Type: "user",
					Path: "/proc/1/ns/user",
				},
			},
		},
	}

	_, err := CreateLibcontainerConfig(&CreateOpts{
		Spec: spec,
	})
	// Guard against a nil error first: calling err.Error() on nil would panic
	// and hide the real failure (the invalid config being accepted).
	if err == nil {
		t.Fatal("user namespace with path and non-matching mapping should be forbidden, got nil error")
	}
	if !strings.Contains(err.Error(), "both namespace path and non-matching mapping specified") {
		t.Errorf("user namespace with path and non-matching mapping should be forbidden, got error %v", err)
	}
}
func TestNonZeroEUIDCompatibleSpecconvValidate(t *testing.T) {
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
t.Skip("Test requires userns.")

View File

@ -17,6 +17,7 @@ import (
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
type linuxStandardInit struct {
@ -198,6 +199,14 @@ func (l *linuxStandardInit) Init() error {
if err != nil {
return err
}
// exec.LookPath in Go < 1.20 might return no error for an executable
// residing on a file system mounted with noexec flag, so perform this
// extra check now while we can still return a proper error.
// TODO: remove this once go < 1.20 is not supported.
if err := eaccess(name); err != nil {
return &os.PathError{Op: "eaccess", Path: name, Err: err}
}
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles). However, this needs to be done
@ -250,5 +259,23 @@ func (l *linuxStandardInit) Init() error {
return err
}
// Close all file descriptors we are not passing to the container. This is
// necessary because the execve target could use internal runc fds as the
// execve path, potentially giving access to binary files from the host
// (which can then be opened by container processes, leading to container
// escapes). Note that because this operation will close any open file
// descriptors that are referenced by (*os.File) handles from underneath
// the Go runtime, we must not do any file operations after this point
// (otherwise the (*os.File) finaliser could close the wrong file). See
// CVE-2024-21626 for more information as to why this protection is
// necessary.
//
// This is not needed for runc-dmz, because the extra execve(2) step means
// that all O_CLOEXEC file descriptors have already been closed and thus
// the second execve(2) from runc-dmz cannot access internal file
// descriptors from runc.
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
return err
}
return system.Exec(name, l.config.Args[0:], os.Environ())
}

View File

@ -15,16 +15,16 @@ type syncType string
// during container setup. They come in pairs (with procError being a generic
// response which is followed by an &initError).
//
// [ child ] <-> [ parent ]
// [ child ] <-> [ parent ]
//
// procHooks --> [run hooks]
// <-- procResume
// procHooks --> [run hooks]
// <-- procResume
//
// procReady --> [final setup]
// <-- procRun
// procReady --> [final setup]
// <-- procRun
//
// procSeccomp --> [pick up seccomp fd with pidfd_getfd()]
// <-- procSeccompDone
// procSeccomp --> [pick up seccomp fd with pidfd_getfd()]
// <-- procSeccompDone
const (
procError syncType = "procError"
procReady syncType = "procReady"

View File

@ -201,7 +201,7 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
if err != nil {
// We should return no error if EOF is reached
// without a match.
if err == io.EOF { //nolint:errorlint // comparison with io.EOF is legit, https://github.com/polyfloyd/go-errorlint/pull/12
if err == io.EOF {
err = nil
}
return out, err
@ -280,13 +280,13 @@ func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath
// found in any entry in passwd and group respectively.
//
// Examples of valid user specifications are:
// * ""
// * "user"
// * "uid"
// * "user:group"
// * "uid:gid
// * "user:gid"
// * "uid:group"
// - ""
// - "user"
// - "uid"
// - "user:group"
// - "uid:gid"
// - "user:gid"
// - "uid:group"
//
// It should be noted that if you specify a numeric user or group id, they will
// not be evaluated as usernames (only the metadata will be filled). So attempting

View File

@ -0,0 +1,79 @@
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <stdarg.h>
#include <stdlib.h>
/*
* All of the code here is run inside an async-signal-safe context, so we need
* to be careful to not call any functions that could cause issues. In theory,
* since we are a Go program, there are fewer restrictions, but in practice
* it's better to be safe than sorry.
*
* The only exception is exit, which we need to call to make sure we don't
* return into runc.
*/
/*
 * Write a printf-style formatted message to pipefd, then terminate the
 * current process with exit status 1. This function does not return.
 */
void bail(int pipefd, const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vdprintf(pipefd, fmt, ap);
	va_end(ap);

	exit(1);
}
/*
 * Fork a child which joins the user namespace referenced by userns_path and
 * then copies the contents of path to outfd.
 *
 * In the parent this returns the result of fork(): the child's pid, or a
 * negative value if fork() failed. The child never returns: it calls exit(0)
 * once the whole file has been copied, or bail() (which reports the failure
 * on errfd and exits with a non-zero status) on any error.
 */
int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd)
{
	char buffer[4096] = { 0 };

	pid_t child = fork();
	if (child != 0)
		return child;
	/* in child */

	/* Join the target userns. */
	int nsfd = open(userns_path, O_RDONLY);
	if (nsfd < 0)
		bail(errfd, "open userns path %s failed: %m", userns_path);

	int err = setns(nsfd, CLONE_NEWUSER);
	if (err < 0)
		bail(errfd, "setns %s failed: %m", userns_path);

	close(nsfd);

	/* Pipe the requested file contents. */
	int fd = open(path, O_RDONLY);
	if (fd < 0)
		bail(errfd, "open %s in userns %s failed: %m", path, userns_path);

	int nread, ntotal = 0;
	while ((nread = read(fd, buffer, sizeof(buffer))) != 0) {
		if (nread < 0)
			bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal);
		ntotal += nread;

		int nwritten = 0;
		while (nwritten < nread) {
			/*
			 * write(2) may be short; resume from the first byte that has
			 * not been written yet rather than from the start of the
			 * buffer, otherwise already-sent data would be duplicated.
			 */
			int n = write(outfd, buffer + nwritten, nread - nwritten);
			if (n < 0)
				bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m",
				     nread - nwritten, path, nwritten);
			nwritten += n;
		}
		if (nread != nwritten)
			bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten);
	}

	close(fd);
	close(outfd);
	close(errfd);

	/* We must exit here, otherwise we would return into a forked runc. */
	exit(0);
}

View File

@ -0,0 +1,186 @@
//go:build linux
package userns
import (
"bufio"
"bytes"
"fmt"
"io"
"os"
"unsafe"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
)
/*
#include <stdlib.h>
extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd);
*/
import "C"
// parseIdmapData parses data line by line, expecting each line to hold three
// whitespace-separated integers: container ID, host ID and size. It returns
// one configs.IDMap per line, in input order, or an error naming the first
// line that does not match that format.
func parseIdmapData(data []byte) (ms []configs.IDMap, err error) {
	lines := bufio.NewScanner(bytes.NewReader(data))
	for lines.Scan() {
		var (
			entry configs.IDMap
			text  = lines.Text()
		)
		_, scanErr := fmt.Sscanf(text, "%d %d %d", &entry.ContainerID, &entry.HostID, &entry.Size)
		if scanErr != nil {
			return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", text, scanErr)
		}
		ms = append(ms, entry)
	}
	// Scan() also stops on read/tokenizing errors, so surface those too.
	if scanErr := lines.Err(); scanErr != nil {
		return nil, fmt.Errorf("parsing id map failed: %w", scanErr)
	}
	return ms, nil
}
// spawnUserNamespaceCat does something equivalent to
// nsenter --user=<nsPath> cat <path>, but more efficiently. It returns the
// contents of the requested file as read from within the user namespace.
//
// The work is delegated to the C helper spawn_userns_cat, which forks a child
// that writes the file contents to one pipe and any failure message to a
// second pipe. Once the child has been spawned it is always waited for before
// returning, including when reading from either pipe fails, so that no
// unreaped child process is left behind.
func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) {
	rdr, wtr, err := os.Pipe()
	if err != nil {
		return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err)
	}
	defer rdr.Close()
	defer wtr.Close()

	errRdr, errWtr, err := os.Pipe()
	if err != nil {
		return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err)
	}
	defer errRdr.Close()
	defer errWtr.Close()

	cNsPath := C.CString(nsPath)
	defer C.free(unsafe.Pointer(cNsPath))
	cPath := C.CString(path)
	defer C.free(unsafe.Pointer(cPath))

	childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd()))

	if childPid < 0 {
		return nil, fmt.Errorf("failed to spawn fork for userns")
	} else if childPid == 0 {
		// this should never happen
		panic("runc executing inside fork child -- unsafe state!")
	}

	// We are in the parent -- close the write end of the pipe before reading,
	// otherwise ReadAll would never see EOF.
	wtr.Close()
	output, outErr := io.ReadAll(rdr)
	rdr.Close()

	// Ditto for the error pipe. This is drained even if the read above failed
	// so that the child can finish reporting and exit.
	errWtr.Close()
	errOutput, errPipeErr := io.ReadAll(errRdr)
	errRdr.Close()
	errOutput = bytes.TrimSpace(errOutput)

	// Clean up the child. This is done before inspecting any read error so
	// that every return path below has already reaped the process.
	var (
		state   *os.ProcessState
		waitErr error
	)
	child, findErr := os.FindProcess(int(childPid))
	if findErr == nil {
		state, waitErr = child.Wait()
	}

	// Report failures in the order the individual steps were attempted.
	switch {
	case outErr != nil:
		return nil, fmt.Errorf("reading from userns spawn failed: %w", outErr)
	case errPipeErr != nil:
		return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", errPipeErr)
	case findErr != nil:
		return nil, fmt.Errorf("could not find userns spawn process: %w", findErr)
	case waitErr != nil:
		return nil, fmt.Errorf("failed to wait for userns spawn process: %w", waitErr)
	}

	if !state.Success() {
		errStr := string(errOutput)
		if errStr == "" {
			errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode())
		}
		return nil, fmt.Errorf("userns spawn: %s", errStr)
	}
	if len(errOutput) > 0 {
		// We can just ignore weird output in the error pipe if the process
		// didn't bail(), but for completeness output for debugging.
		logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput))
	}

	// The subprocess succeeded, return whatever it wrote to the pipe.
	return output, nil
}
// GetUserNamespaceMappings returns the uid and gid mappings of the user
// namespace referenced by nsPath, parsed from its uid_map and gid_map files.
//
// When nsPath has exactly the form /proc/<pid>/ns/user, the maps are read
// directly from /proc/<pid>/{uid,gid}_map. If that form does not match, or the
// direct read fails, a helper process joins the namespace and reads
// /proc/self/{uid,gid}_map from inside it instead.
func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) {
	var (
		pid      int
		trailing rune
	)

	// nsPath is usually of the form /proc/<pid>/ns/user, which means that we
	// already have a pid that is part of the user namespace and thus we can
	// just use the pid to read from /proc/<pid>/*id_map.
	//
	// Note that Sscanf doesn't consume the whole input, so we check for any
	// trailing data with %c. That way, we can be sure the pattern matched
	// /proc/$pid/ns/user _exactly_ iff exactly one item (the pid) was scanned.
	matched, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &trailing)
	fastPathOK := matched == 1 && pid > 0

	targets := []struct {
		name  string
		idMap *[]configs.IDMap
	}{
		{"uid_map", &uidMap},
		{"gid_map", &gidMap},
	}

	for _, target := range targets {
		var raw []byte

		if fastPathOK {
			procPath := fmt.Sprintf("/proc/%d/%s", pid, target.name)
			if contents, readErr := os.ReadFile(procPath); readErr == nil {
				raw = contents
			} else {
				// Do not error out here -- we need to try the slow path if the
				// fast path failed.
				logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", target.name, nsPath, readErr)
			}
		} else {
			logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", target.name, nsPath)
		}

		if raw == nil {
			// We have to actually join the namespace if we cannot take the
			// fast path. The path is resolved with respect to the child
			// process, so just use /proc/self.
			contents, spawnErr := spawnUserNamespaceCat(nsPath, "/proc/self/"+target.name)
			if spawnErr != nil {
				return nil, nil, spawnErr
			}
			raw = contents
		}

		parsed, parseErr := parseIdmapData(raw)
		if parseErr != nil {
			return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", target.name, nsPath, parseErr)
		}
		// Store through the pointer so the matching named result is filled in.
		*target.idMap = parsed
	}

	return uidMap, gidMap, nil
}
// IsSameMapping reports whether the two id mappings are identical, comparing
// them entry by entry in order. Mappings that cover the same ids but list
// their entries in a different order, or split an entry into several, are
// therefore considered different.
func IsSameMapping(a, b []configs.IDMap) bool {
	if len(a) != len(b) {
		return false
	}
	for i, entry := range a {
		if entry != b[i] {
			return false
		}
	}
	return true
}

Some files were not shown because too many files have changed in this diff Show More