mirror of https://gitee.com/openkylin/runc.git
Import Upstream version 1.1.12+ds1
This commit is contained in:
parent
a9cf6bd6d6
commit
68d1261cab
92
.cirrus.yml
92
.cirrus.yml
|
@ -1,7 +1,8 @@
|
|||
---
|
||||
# We use Cirrus for Vagrant tests and native CentOS 7 and 8, because macOS
|
||||
# instances of GHA are too slow and flaky, and Linux instances of GHA do not
|
||||
# support KVM.
|
||||
# We use Cirrus for CentOS (native) and Fedora (in Vagrant), because neither
|
||||
# CentOS nor Fedora is available on GHA natively, so the only option is VM.
|
||||
# In GHA, nested virtualization is only supported on macOS instances, which
|
||||
# are slow and flaky.
|
||||
|
||||
# NOTE Cirrus execution environments lack a terminal, needed for
|
||||
# some integration tests. So we use `ssh -tt` command to fake a terminal.
|
||||
|
@ -24,25 +25,31 @@ task:
|
|||
platform: linux
|
||||
nested_virtualization: true
|
||||
# CPU limit: `16 / NTASK`: see https://cirrus-ci.org/faq/#are-there-any-limits
|
||||
cpu: 8
|
||||
cpu: 4
|
||||
# Memory limit: `4GB * NCPU`
|
||||
memory: 32G
|
||||
memory: 16G
|
||||
|
||||
host_info_script: |
|
||||
uname -a
|
||||
echo "-----"
|
||||
# -----
|
||||
cat /etc/os-release
|
||||
echo "-----"
|
||||
cat /proc/cpuinfo
|
||||
echo "-----"
|
||||
# -----
|
||||
df -T
|
||||
# -----
|
||||
cat /proc/cpuinfo
|
||||
install_libvirt_vagrant_script: |
|
||||
curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg
|
||||
echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
|
||||
sudo sed -i 's/^# deb-src/deb-src/' /etc/apt/sources.list
|
||||
apt-get update
|
||||
apt-get install -y libvirt-daemon libvirt-daemon-system vagrant vagrant-libvirt
|
||||
apt-get install -y libvirt-daemon libvirt-daemon-system vagrant
|
||||
systemctl enable --now libvirtd
|
||||
apt-get build-dep -y vagrant ruby-libvirt
|
||||
apt-get install -y --no-install-recommends libxslt-dev libxml2-dev libvirt-dev ruby-bundler ruby-dev zlib1g-dev
|
||||
vagrant plugin install vagrant-libvirt
|
||||
vagrant_cache:
|
||||
fingerprint_script: uname -s ; cat Vagrantfile.$DISTRO
|
||||
folder: /root/.vagrant.d
|
||||
fingerprint_script: cat Vagrantfile.$DISTRO
|
||||
folder: /root/.vagrant.d/boxes
|
||||
vagrant_up_script: |
|
||||
ln -sf Vagrantfile.$DISTRO Vagrantfile
|
||||
# Retry if it fails (download.fedoraproject.org returns 404 sometimes)
|
||||
|
@ -50,7 +57,9 @@ task:
|
|||
mkdir -p -m 0700 /root/.ssh
|
||||
vagrant ssh-config >> /root/.ssh/config
|
||||
guest_info_script: |
|
||||
ssh default 'sh -exc "uname -a && systemctl --version && df -T && cat /etc/os-release"'
|
||||
ssh default 'sh -exc "uname -a && systemctl --version && df -T && cat /etc/os-release && go version"'
|
||||
check_config_script: |
|
||||
ssh default /vagrant/script/check-config.sh
|
||||
unit_tests_script: |
|
||||
ssh default 'sudo -i make -C /vagrant localunittest'
|
||||
integration_systemd_script: |
|
||||
|
@ -68,12 +77,14 @@ task:
|
|||
env:
|
||||
HOME: /root
|
||||
CIRRUS_WORKING_DIR: /home/runc
|
||||
GO_VERSION: "1.17.3"
|
||||
BATS_VERSION: "v1.3.0"
|
||||
GO_VERSION: "1.20"
|
||||
BATS_VERSION: "v1.9.0"
|
||||
RPMS: gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs
|
||||
# yamllint disable rule:key-duplicates
|
||||
matrix:
|
||||
DISTRO: centos-7
|
||||
DISTRO: centos-stream-8
|
||||
DISTRO: centos-stream-9
|
||||
|
||||
name: ci / $DISTRO
|
||||
|
||||
|
@ -88,6 +99,8 @@ task:
|
|||
case $DISTRO in
|
||||
centos-7)
|
||||
(cd /etc/yum.repos.d && curl -O https://copr.fedorainfracloud.org/coprs/adrian/criu-el7/repo/epel-7/adrian-criu-el7-epel-7.repo)
|
||||
# EPEL is needed for jq and fuse-sshfs.
|
||||
rpm -q epel-release || rpm -Uvh https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
|
||||
# sysctl
|
||||
echo "user.max_user_namespaces=15076" > /etc/sysctl.d/userns.conf
|
||||
sysctl --system
|
||||
|
@ -95,15 +108,32 @@ task:
|
|||
centos-stream-8)
|
||||
yum config-manager --set-enabled powertools # for glibc-static
|
||||
;;
|
||||
centos-stream-9)
|
||||
dnf config-manager --set-enabled crb # for glibc-static
|
||||
dnf -y install epel-release epel-next-release # for fuse-sshfs
|
||||
# Delegate all cgroup v2 controllers to rootless user via --systemd-cgroup.
|
||||
# The default (since systemd v252) is "pids memory cpu".
|
||||
mkdir -p /etc/systemd/system/user@.service.d
|
||||
printf "[Service]\nDelegate=yes\n" > /etc/systemd/system/user@.service.d/delegate.conf
|
||||
systemctl daemon-reload
|
||||
;;
|
||||
esac
|
||||
# Work around dnf mirror failures by retrying a few times.
|
||||
for i in $(seq 0 2); do
|
||||
sleep $i
|
||||
yum install -y -q gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs && break
|
||||
yum install -y $RPMS && break
|
||||
done
|
||||
[ $? -eq 0 ] # fail if yum failed
|
||||
|
||||
# Double check that all rpms were installed (yum from CentOS 7
|
||||
# does not exit with an error if some packages were not found).
|
||||
# Use --whatprovides since some packages are renamed.
|
||||
rpm -q --whatprovides $RPMS
|
||||
# install Go
|
||||
curl -fsSL "https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz" | tar Cxz /usr/local
|
||||
PREFIX="https://go.dev/dl/"
|
||||
# Find out the latest minor release URL.
|
||||
eval $(curl -fsSL "${PREFIX}?mode=json" | jq -r --arg Ver "$GO_VERSION" '.[] | select(.version | startswith("go\($Ver)")) | .files[] | select(.os == "linux" and .arch == "amd64" and .kind == "archive") | "filename=\"" + .filename + "\""')
|
||||
curl -fsSL "$PREFIX$filename" | tar Cxz /usr/local
|
||||
# install bats
|
||||
cd /tmp
|
||||
git clone https://github.com/bats-core/bats-core
|
||||
|
@ -131,14 +161,18 @@ task:
|
|||
systemctl restart sshd
|
||||
host_info_script: |
|
||||
uname -a
|
||||
echo "-----"
|
||||
cat /etc/os-release
|
||||
echo "-----"
|
||||
cat /proc/cpuinfo
|
||||
echo "-----"
|
||||
df -T
|
||||
echo "-----"
|
||||
# -----
|
||||
/usr/local/go/bin/go version
|
||||
# -----
|
||||
systemctl --version
|
||||
# -----
|
||||
cat /etc/os-release
|
||||
# -----
|
||||
df -T
|
||||
# -----
|
||||
cat /proc/cpuinfo
|
||||
check_config_script: |
|
||||
/home/runc/script/check-config.sh
|
||||
unit_tests_script: |
|
||||
ssh -tt localhost "make -C /home/runc localunittest"
|
||||
integration_systemd_script: |
|
||||
|
@ -146,13 +180,19 @@ task:
|
|||
integration_fs_script: |
|
||||
ssh -tt localhost "make -C /home/runc localintegration"
|
||||
integration_systemd_rootless_script: |
|
||||
echo "SKIP: integration_systemd_rootless_script requires cgroup v2"
|
||||
case $DISTRO in
|
||||
centos-7|centos-stream-8)
|
||||
echo "SKIP: integration_systemd_rootless_script requires cgroup v2"
|
||||
;;
|
||||
*)
|
||||
ssh -tt localhost "make -C /home/runc localrootlessintegration RUNC_USE_SYSTEMD=yes"
|
||||
esac
|
||||
integration_fs_rootless_script: |
|
||||
case $DISTRO in
|
||||
centos-7)
|
||||
echo "SKIP: FIXME: integration_fs_rootless_script is skipped because of EPERM on writing cgroup.procs"
|
||||
;;
|
||||
centos-stream-8)
|
||||
*)
|
||||
ssh -tt localhost "make -C /home/runc localrootlessintegration"
|
||||
;;
|
||||
esac
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
[codespell]
|
||||
skip = ./vendor,./.git
|
||||
ignore-words-list = clos,creat
|
||||
skip = ./vendor,./.git,./go.sum
|
||||
ignore-words-list = clos,mis
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
# This file is used by shfmt. See https://EditorConfig.org
|
||||
|
||||
# This is a top-most EditorConfig file.
|
||||
root = true
|
||||
|
||||
# Ignore the entire "vendor" directory.
|
||||
[vendor/**]
|
||||
ignore = true
|
|
@ -21,13 +21,13 @@ jobs:
|
|||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
go-version: [1.16.x, 1.17.x]
|
||||
go-version: [1.17.x, 1.20.x, 1.21.x]
|
||||
rootless: ["rootless", ""]
|
||||
race: ["-race", ""]
|
||||
criu: [""]
|
||||
include:
|
||||
# Also test against latest criu-dev
|
||||
- go-version: 1.17.x
|
||||
- go-version: 1.20.x
|
||||
rootless: ""
|
||||
race: ""
|
||||
criu: "criu-dev"
|
||||
|
@ -35,7 +35,7 @@ jobs:
|
|||
steps:
|
||||
|
||||
- name: checkout
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: install deps
|
||||
if: matrix.criu == ''
|
||||
|
@ -43,7 +43,7 @@ jobs:
|
|||
REPO: https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_20.04
|
||||
run: |
|
||||
# criu repo
|
||||
curl -fSsl $REPO/Release.key | sudo apt-key add -
|
||||
curl -fSsLl $REPO/Release.key | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/devel_tools_criu.gpg > /dev/null
|
||||
echo "deb $REPO/ /" | sudo tee /etc/apt/sources.list.d/criu.list
|
||||
sudo apt update
|
||||
sudo apt install libseccomp-dev criu sshfs
|
||||
|
@ -60,9 +60,8 @@ jobs:
|
|||
rm -rf ~/criu
|
||||
|
||||
- name: install go ${{ matrix.go-version }}
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
stable: '!contains(${{ matrix.go-version }}, "beta") && !contains(${{ matrix.go-version }}, "rc")'
|
||||
go-version: ${{ matrix.go-version }}
|
||||
|
||||
- name: build
|
||||
|
@ -71,7 +70,7 @@ jobs:
|
|||
- name: install bats
|
||||
uses: mig4/setup-bats@v1
|
||||
with:
|
||||
bats-version: 1.3.0
|
||||
bats-version: 1.9.0
|
||||
|
||||
- name: unit test
|
||||
if: matrix.rootless != 'rootless'
|
||||
|
@ -105,7 +104,7 @@ jobs:
|
|||
steps:
|
||||
|
||||
- name: checkout
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: install deps
|
||||
run: |
|
||||
|
@ -120,10 +119,9 @@ jobs:
|
|||
sudo apt -q install libseccomp-dev libseccomp-dev:i386 gcc-multilib criu
|
||||
|
||||
- name: install go
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.x # Latest stable
|
||||
|
||||
- name: unit test
|
||||
# cgo is disabled by default when cross-compiling
|
||||
run: sudo -E PATH="$PATH" -- make GOARCH=386 CGO_ENABLED=1 localunittest
|
||||
run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest
|
||||
|
|
|
@ -7,41 +7,39 @@ on:
|
|||
- master
|
||||
- release-*
|
||||
pull_request:
|
||||
env:
|
||||
GO_VERSION: 1.20.x
|
||||
|
||||
jobs:
|
||||
keyring:
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: check runc.keyring
|
||||
run: make validate-keyring
|
||||
|
||||
lint:
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
- uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: "${{ env.GO_VERSION }}"
|
||||
cache: false # golangci-lint-action does its own caching
|
||||
- name: install deps
|
||||
run: |
|
||||
sudo apt -q update
|
||||
sudo apt -q install libseccomp-dev
|
||||
- uses: golangci/golangci-lint-action@v2
|
||||
- uses: golangci/golangci-lint-action@v3
|
||||
with:
|
||||
# must be specified without patch version
|
||||
version: v1.42
|
||||
|
||||
lint-extra:
|
||||
# Extra linters, only checking new code from pull requests.
|
||||
if: github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-20.04
|
||||
permissions:
|
||||
contents: read
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: install deps
|
||||
version: v1.53
|
||||
# Extra linters, only checking new code from a pull request.
|
||||
- name: lint-extra
|
||||
if: github.event_name == 'pull_request'
|
||||
run: |
|
||||
sudo apt -q update
|
||||
sudo apt -q install libseccomp-dev
|
||||
- uses: golangci/golangci-lint-action@v2
|
||||
with:
|
||||
only-new-issues: true
|
||||
args: --config .golangci-extra.yml
|
||||
# must be specified without patch version
|
||||
version: v1.43
|
||||
|
||||
golangci-lint run --config .golangci-extra.yml --new-from-rev=HEAD~1 --out-format=github-actions
|
||||
|
||||
compile-buildtags:
|
||||
runs-on: ubuntu-20.04
|
||||
|
@ -49,18 +47,18 @@ jobs:
|
|||
# Don't ignore C warnings. Note that the output of "go env CGO_CFLAGS" by default is "-g -O2", so we keep them.
|
||||
CGO_CFLAGS: -g -O2 -Werror
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
- name: install go
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.x # Latest stable
|
||||
go-version: "${{ env.GO_VERSION }}"
|
||||
- name: compile with no build tags
|
||||
run: make BUILDTAGS=""
|
||||
|
||||
codespell:
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
- name: install deps
|
||||
# Version of codespell bundled with Ubuntu is way old, so use pip.
|
||||
run: pip install codespell
|
||||
|
@ -70,35 +68,19 @@ jobs:
|
|||
shfmt:
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: vars
|
||||
run: |
|
||||
echo "VERSION=3.3.1" >> $GITHUB_ENV
|
||||
echo "$(go env GOPATH)/bin" >> $GITHUB_PATH
|
||||
- name: cache go mod and $GOCACHE
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: |
|
||||
~/go/pkg/mod
|
||||
~/.cache/go-build
|
||||
key: ${{ runner.os }}-shfmt-${{ env.VERSION }}
|
||||
restore-keys: ${{ runner.os }}-shfmt-
|
||||
- name: install shfmt
|
||||
run: |
|
||||
command -v shfmt || \
|
||||
(cd ~ && GO111MODULE=on time go get mvdan.cc/sh/v3/cmd/shfmt@v$VERSION)
|
||||
- uses: actions/checkout@v3
|
||||
- name: shfmt
|
||||
run: make shfmt
|
||||
|
||||
shellcheck:
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
- name: vars
|
||||
run: |
|
||||
echo 'VERSION=v0.7.2' >> $GITHUB_ENV
|
||||
echo 'VERSION=v0.8.0' >> $GITHUB_ENV
|
||||
echo 'BASEURL=https://github.com/koalaman/shellcheck/releases/download' >> $GITHUB_ENV
|
||||
echo 'SHA256SUM=12ee2e0b90a3d1e9cae24ac9b2838be66b48573cb2c8e8f3c566b959df6f050c' >> $GITHUB_ENV
|
||||
echo 'SHA256SUM=f4bce23c11c3919c1b20bcb0f206f6b44c44e26f2bc95f8aa708716095fa0651' >> $GITHUB_ENV
|
||||
echo ~/bin >> $GITHUB_PATH
|
||||
- name: install shellcheck
|
||||
run: |
|
||||
|
@ -108,27 +90,21 @@ jobs:
|
|||
sha256sum ~/bin/shellcheck | grep -q $SHA256SUM
|
||||
# make sure to remove the old version
|
||||
sudo rm -f /usr/bin/shellcheck
|
||||
- uses: lumaxis/shellcheck-problem-matchers@v1
|
||||
- uses: lumaxis/shellcheck-problem-matchers@v2
|
||||
- name: shellcheck
|
||||
run: |
|
||||
make shellcheck
|
||||
- name: check-config.sh
|
||||
run : ./script/check-config.sh
|
||||
|
||||
deps:
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- uses: actions/checkout@v3
|
||||
- name: install go
|
||||
uses: actions/setup-go@v2
|
||||
uses: actions/setup-go@v4
|
||||
with:
|
||||
go-version: 1.x # Latest stable
|
||||
- name: cache go mod and $GOCACHE
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: |
|
||||
~/go/pkg/mod
|
||||
~/.cache/go-build
|
||||
key: ${{ runner.os }}-go.sum-${{ hashFiles('**/go.sum') }}
|
||||
restore-keys: ${{ runner.os }}-go.sum-
|
||||
go-version: "${{ env.GO_VERSION }}"
|
||||
- name: verify deps
|
||||
run: make verify-dependencies
|
||||
|
||||
|
@ -151,12 +127,11 @@ jobs:
|
|||
pattern: '^.{0,72}(\n.*)*$'
|
||||
error: 'Subject too long (max 72)'
|
||||
|
||||
|
||||
cfmt:
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- name: checkout
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: install deps
|
||||
|
@ -173,9 +148,13 @@ jobs:
|
|||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- name: checkout
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: check CHANGELOG.md
|
||||
run: make verify-changelog
|
||||
|
||||
# We have to run this under Docker as Ubuntu (host) does not support all
|
||||
# the architectures we want to compile test against, and Dockerfile uses
|
||||
# Debian (which does).
|
||||
|
@ -185,14 +164,12 @@ jobs:
|
|||
# under Docker will emerge, it will be good to have a separate make
|
||||
# runcimage job and share its result (the docker image) with whoever
|
||||
# needs it.
|
||||
- uses: satackey/action-docker-layer-caching@v0.0.11
|
||||
continue-on-error: true
|
||||
- name: build docker image
|
||||
run: make runcimage
|
||||
- name: make releaseall
|
||||
run: make releaseall
|
||||
- name: upload artifacts
|
||||
uses: actions/upload-artifact@v2
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
name: release-${{ github.run_id }}
|
||||
path: release/*
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# This is golangci-lint config file which is used to check new code in
|
||||
# github PRs only (see lint-extra job in .github/workflows/validate.yml).
|
||||
# github PRs only (see lint-extra in .github/workflows/validate.yml).
|
||||
#
|
||||
# For the default linter config, see .golangci.yml. This config should
|
||||
# only enable additional linters not enabled in the default config.
|
||||
|
|
297
CHANGELOG.md
297
CHANGELOG.md
|
@ -1,21 +1,288 @@
|
|||
# Changelog/
|
||||
# Changelog
|
||||
This file documents all notable changes made to this project since runc 1.0.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
## [Unreleased 1.1.z]
|
||||
|
||||
## [1.1.12] - 2024-01-31
|
||||
|
||||
> Now you're thinking with Portals™!
|
||||
|
||||
### Security
|
||||
|
||||
* Fix [CVE-2024-21626][cve-2024-21626], a container breakout attack that took
|
||||
advantage of a file descriptor that was leaked internally within runc (but
|
||||
never leaked to the container process). In addition to fixing the leak,
|
||||
several strict hardening measures were added to ensure that future internal
|
||||
leaks could not be used to break out in this manner again. Based on our
|
||||
research, while no other container runtime had a similar leak, none had any
|
||||
of the hardening steps we've introduced (and some runtimes would not check
|
||||
for any file descriptors that a calling process may have leaked to them,
|
||||
allowing for container breakouts due to basic user error).
|
||||
|
||||
[cve-2024-21626]: https://github.com/opencontainers/runc/security/advisories/GHSA-xr7r-f8xq-vfvv
|
||||
|
||||
## [1.1.11] - 2024-01-01
|
||||
|
||||
> Happy New Year!
|
||||
|
||||
### Fixed
|
||||
|
||||
* Fix several issues with userns path handling. (#4122, #4124, #4134, #4144)
|
||||
|
||||
### Changed
|
||||
|
||||
* Support memory.peak and memory.swap.peak in cgroups v2.
|
||||
Add `swapOnlyUsage` in `MemoryStats`. This field reports swap-only usage.
|
||||
For cgroupv1, `Usage` and `Failcnt` are set by subtracting memory usage
|
||||
from memory+swap usage. For cgroupv2, `Usage`, `Limit`, and `MaxUsage`
|
||||
are set. (#4000, #4010, #4131)
|
||||
* build(deps): bump github.com/cyphar/filepath-securejoin. (#4140)
|
||||
|
||||
## [1.1.10] - 2023-10-31
|
||||
|
||||
> Śruba, przykręcona we śnie, nie zmieni sytuacji, jaka panuje na jawie.
|
||||
|
||||
### Added
|
||||
|
||||
* Support for `hugetlb.<pagesize>.rsvd` limiting and accounting. Fixes the
|
||||
issue of postgres failing when hugepage limits are set. (#3859, #4077)
|
||||
|
||||
### Fixed
|
||||
|
||||
* Fixed permissions of newly created directories to not depend on the value
|
||||
of umask in tmpcopyup feature implementation. (#3991, #4060)
|
||||
* libcontainer: cgroup v1 GetStats now ignores missing `kmem.limit_in_bytes`
|
||||
(fixes the compatibility with Linux kernel 6.1+). (#4028)
|
||||
* Fix a semi-arbitrary cgroup write bug when given a malicious hugetlb
|
||||
configuration. This issue is not a security issue because it requires a
|
||||
malicious `config.json`, which is outside of our threat model. (#4103)
|
||||
* Various CI fixes. (#4081, #4055)
|
||||
|
||||
## [1.1.9] - 2023-08-10
|
||||
|
||||
> There is a crack in everything. That's how the light gets in.
|
||||
|
||||
### Added
|
||||
|
||||
* Added go 1.21 to the CI matrix; other CI updates. (#3976, #3958)
|
||||
|
||||
### Fixed
|
||||
|
||||
* Fixed losing sticky bit on tmpfs (a regression in 1.1.8). (#3952, #3961)
|
||||
* intelrdt: fixed ignoring ClosID on some systems. (#3550, #3978)
|
||||
|
||||
### Changed
|
||||
|
||||
* Sum `anon` and `file` from `memory.stat` for cgroupv2 root usage,
|
||||
as the root does not have `memory.current` for cgroupv2.
|
||||
This aligns cgroupv2 root usage more closely with cgroupv1 reporting.
|
||||
Additionally, report root swap usage as sum of swap and memory usage,
|
||||
aligned with v1 and existing non-root v2 reporting. (#3933)
|
||||
|
||||
## [1.1.8] - 2023-07-20
|
||||
|
||||
> 海纳百川 有容乃大
|
||||
|
||||
### Added
|
||||
|
||||
* Support riscv64. (#3905)
|
||||
|
||||
### Fixed
|
||||
|
||||
* init: do not print environment variable value. (#3879)
|
||||
* libct: fix a race with systemd removal. (#3877)
|
||||
* tests/int: increase num retries for oom tests. (#3891)
|
||||
* man/runc: fixes. (#3892)
|
||||
* Fix tmpfs mode opts when dir already exists. (#3916)
|
||||
* docs/systemd: fix a broken link. (#3917)
|
||||
* ci/cirrus: enable some rootless tests on cs9. (#3918)
|
||||
* runc delete: call systemd's reset-failed. (#3932)
|
||||
* libct/cg/sd/v1: do not update non-frozen cgroup after frozen failed. (#3921)
|
||||
|
||||
### Changed
|
||||
|
||||
* CI: bump Fedora, Vagrant, bats. (#3878)
|
||||
* `.codespellrc`: update for 2.2.5. (#3909)
|
||||
|
||||
## [1.1.7] - 2023-04-26
|
||||
|
||||
> Ночевала тучка золотая на груди утеса-великана.
|
||||
|
||||
### Fixed
|
||||
|
||||
* When used with systemd v240+, systemd cgroup drivers no longer skip
|
||||
`DeviceAllow` rules if the device does not exist (a regression introduced
|
||||
in runc 1.1.3). This fix also reverts the workaround added in runc 1.1.5,
|
||||
removing an extra warning emitted by runc run/start. (#3845, #3708, #3671)
|
||||
|
||||
### Added
|
||||
|
||||
* The source code now has a new file, `runc.keyring`, which contains the keys
|
||||
used to sign runc releases. (#3838)
|
||||
|
||||
## [1.1.6] - 2023-04-11
|
||||
|
||||
> In this world nothing is certain but death and taxes.
|
||||
|
||||
### Compatibility
|
||||
|
||||
* This release can no longer be built from sources using Go 1.16. Using the
|
||||
latest maintained Go 1.20.x or Go 1.19.x release is recommended.
|
||||
Go 1.17 can still be used.
|
||||
|
||||
### Fixed
|
||||
|
||||
* systemd cgroup v1 and v2 drivers were deliberately ignoring `UnitExist` error
|
||||
from systemd while trying to create a systemd unit, which in some scenarios
|
||||
may result in a container not being added to the proper systemd unit and
|
||||
cgroup. (#3780, #3806)
|
||||
* systemd cgroup v2 driver was incorrectly translating cpuset range from spec's
|
||||
`resources.cpu.cpus` to systemd unit property (`AllowedCPUs`) in case of more
|
||||
than 8 CPUs, resulting in the wrong AllowedCPUs setting. (#3808)
|
||||
* systemd cgroup v1 driver was prefixing container's cgroup path with the path
|
||||
of PID 1 cgroup, resulting in inability to place PID 1 in a non-root cgroup.
|
||||
(#3811)
|
||||
* runc run/start may return "permission denied" error when starting a rootless
|
||||
container when the file to be executed does not have executable bit set for
|
||||
the user, not taking the `CAP_DAC_OVERRIDE` capability into account. This is
|
||||
a regression in runc 1.1.4, as well as in Go 1.20 and 1.20.1. (#3715, #3817)
|
||||
* cgroup v1 drivers are now aware of `misc` controller. (#3823)
|
||||
* Various CI fixes and improvements, mostly to ensure Go 1.19.x and Go 1.20.x
|
||||
compatibility.
|
||||
|
||||
## [1.1.5] - 2023-03-29
|
||||
|
||||
> 囚われた屈辱は
|
||||
> 反撃の嚆矢だ
|
||||
|
||||
### Security
|
||||
|
||||
The following CVEs were fixed in this release:
|
||||
|
||||
* [CVE-2023-25809][] is a vulnerability involving rootless containers where
|
||||
(under specific configurations), the container would have write access to the
|
||||
`/sys/fs/cgroup/user.slice/...` cgroup hierarchy. No other hierarchies on the
|
||||
host were affected. This vulnerability was discovered by Akihiro Suda.
|
||||
|
||||
* [CVE-2023-27561][] was a regression in our protections against tricky `/proc`
|
||||
and `/sys` configurations (where the container mountpoint is a symlink)
|
||||
causing us to be tricked into incorrectly configuring the container, which
|
||||
effectively re-introduced [CVE-2019-19921][]. This regression was present
|
||||
from v1.0.0-rc95 to v1.1.4 and was discovered by @Beuc. (#3785)
|
||||
|
||||
* [CVE-2023-28642][] is a different attack vector using the same regression
|
||||
as in [CVE-2023-27561][]. This was reported by Lei Wang.
|
||||
|
||||
[CVE-2019-19921]: https://github.com/advisories/GHSA-fh74-hm69-rqjw
|
||||
[CVE-2023-25809]: https://github.com/opencontainers/runc/security/advisories/GHSA-m8cg-xc2p-r3fc
|
||||
[CVE-2023-27561]: https://github.com/advisories/GHSA-vpvm-3wq2-2wvm
|
||||
[CVE-2023-28642]: https://github.com/opencontainers/runc/security/advisories/GHSA-g2j6-57v7-gm8c
|
||||
|
||||
### Fixed
|
||||
|
||||
* Fix the inability to use `/dev/null` when inside a container. (#3620)
|
||||
* Fix changing the ownership of host's `/dev/null` caused by fd redirection
|
||||
(a regression in 1.1.1). (#3674, #3731)
|
||||
* Fix rare runc exec/enter unshare error on older kernels, including
|
||||
CentOS < 7.7. (#3776)
|
||||
* nsexec: Check for errors in `write_log()`. (#3721)
|
||||
* Various CI fixes and updates. (#3618, #3630, #3640, #3729)
|
||||
|
||||
## [1.1.4] - 2022-08-24
|
||||
|
||||
> If you look for perfection, you'll never be content.
|
||||
|
||||
### Fixed
|
||||
|
||||
* Fix mounting via wrong proc fd.
|
||||
When the user and mount namespaces are used, and the bind mount is followed by
|
||||
the cgroup mount in the spec, the cgroup was mounted using the bind mount's
|
||||
mount fd. (#3511)
|
||||
* Switch `kill()` in `libcontainer/nsenter` to `sane_kill()`. (#3536)
|
||||
* Fix "permission denied" error from `runc run` on `noexec` fs. (#3541)
|
||||
* Fix failed exec after `systemctl daemon-reload`.
|
||||
Due to a regression in v1.1.3, the `DeviceAllow=char-pts rwm` rule was no
|
||||
longer added and was causing an error `open /dev/pts/0: operation not permitted: unknown`
|
||||
when systemd was reloaded. (#3554)
|
||||
* Various CI fixes. (#3538, #3558, #3562)
|
||||
|
||||
## [1.1.3] - 2022-06-09
|
||||
|
||||
> In the beginning there was nothing, which exploded.
|
||||
|
||||
### Fixed
|
||||
* Our seccomp `-ENOSYS` stub now correctly handles multiplexed syscalls on
|
||||
s390 and s390x. This solves the issue where syscalls the host kernel did not
|
||||
support would return `-EPERM` despite the existence of the `-ENOSYS` stub
|
||||
code (this was due to how s390x does syscall multiplexing). (#3478)
|
||||
* Retry on dbus disconnect logic in libcontainer/cgroups/systemd now works as
|
||||
intended; this fix does not affect runc binary itself but is important for
|
||||
libcontainer users such as Kubernetes. (#3476)
|
||||
* Inability to compile with recent clang due to an issue with duplicate
|
||||
constants in libseccomp-golang. (#3477)
|
||||
* When using systemd cgroup driver, skip adding device paths that don't exist,
|
||||
to stop systemd from emitting warnings about those paths. (#3504)
|
||||
* Socket activation was failing when more than 3 sockets were used. (#3494)
|
||||
* Various CI fixes. (#3472, #3479)
|
||||
|
||||
### Added
|
||||
* Allow bind mounting /proc/sys/kernel/ns_last_pid inside the container. (#3493)
|
||||
|
||||
### Changed
|
||||
* runc static binaries are now linked against libseccomp v2.5.4. (#3481)
|
||||
|
||||
|
||||
## [1.1.2] - 2022-05-11
|
||||
|
||||
> I should think I'm going to be a perpetual student.
|
||||
|
||||
### Security
|
||||
* A bug was found in runc where runc exec --cap executed processes with
|
||||
non-empty inheritable Linux process capabilities, creating an atypical Linux
|
||||
environment. For more information, see [GHSA-f3fp-gc8g-vw66][] and
|
||||
CVE-2022-29162.
|
||||
|
||||
### Changed
|
||||
* `runc spec` no longer sets any inheritable capabilities in the created
|
||||
example OCI spec (`config.json`) file.
|
||||
|
||||
[GHSA-f3fp-gc8g-vw66]: https://github.com/opencontainers/runc/security/advisories/GHSA-f3fp-gc8g-vw66
|
||||
|
||||
|
||||
## [1.1.1] - 2022-03-28
|
||||
|
||||
> Violence is the last refuge of the incompetent.
|
||||
|
||||
### Added
|
||||
* CI is now also run on centos-stream-9. (#3436)
|
||||
|
||||
### Fixed
|
||||
* `runc run/start` can now run a container with read-only `/dev` in OCI spec,
|
||||
rather than error out. (#3355)
|
||||
* `runc exec` now ensures that `--cgroup` argument is a sub-cgroup. (#3403)
|
||||
* libcontainer systemd v2 manager no longer errors out if one of the files
|
||||
listed in `/sys/kernel/cgroup/delegate` does not exist in container's cgroup.
|
||||
(#3387, #3404)
|
||||
* Loosen OCI spec validation to avoid bogus "Intel RDT is not supported" error.
|
||||
(#3406)
|
||||
* libcontainer/cgroups no longer panics in cgroup v1 managers if `stat`
|
||||
of `/sys/fs/cgroup/unified` returns an error other than ENOENT. (#3435)
|
||||
|
||||
|
||||
## [1.1.0] - 2022-01-14
|
||||
|
||||
> A plan depends as much upon execution as it does upon concept.
|
||||
|
||||
## Changed
|
||||
### Changed
|
||||
* libcontainer will now refuse to build without the nsenter package being
|
||||
correctly compiled (specifically this requires CGO to be enabled). This
|
||||
should avoid folks accidentally creating broken runc binaries (and
|
||||
incorrectly importing our internal libraries into their projects). (#3331)
|
||||
|
||||
|
||||
## [1.1.0-rc.1] - 2021-12-14
|
||||
|
||||
> He who controls the spice controls the universe.
|
||||
|
@ -41,7 +308,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
binary etc.) and failures of the command being executed. (#3073)
|
||||
* runc run: new `--keep` option to skip removal of exited containers' artefacts.
|
||||
This might be useful to check the state (e.g. of cgroup controllers) after
|
||||
the container hasexited. (#2817, #2825)
|
||||
the container has exited. (#2817, #2825)
|
||||
* seccomp: add support for `SCMP_ACT_KILL_PROCESS` and `SCMP_ACT_KILL_THREAD`
|
||||
(the latter is just an alias for `SCMP_ACT_KILL`). (#3204)
|
||||
* seccomp: add support for `SCMP_ACT_NOTIFY` (seccomp actions). This allows
|
||||
|
@ -130,13 +397,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||
### Fixed
|
||||
* Fixed inability to start a container with read-write bind mount of a
|
||||
read-only fuse host mount. (#3283, #3292)
|
||||
* Fixed inability to start when read-only /dev in set in spec (#3276, #3277)
|
||||
* Fixed inability to start when read-only /dev is set in spec. (#3276, #3277)
|
||||
* Fixed not removing sub-cgroups upon container delete, when rootless cgroup v2
|
||||
is used with older systemd. (#3226, #3297)
|
||||
* Fixed returning error from GetStats when hugetlb is unsupported (which causes
|
||||
excessive logging for Kubernetes). (#3233, #3295)
|
||||
* Improved an error message when dbus-user-session is not installed and
|
||||
rootless + cgroup2 + systemd are used (#3212)
|
||||
rootless + cgroup2 + systemd are used. (#3212)
|
||||
|
||||
[GHSA-v95c-p5hm-xq8f]: https://github.com/opencontainers/runc/security/advisories/GHSA-v95c-p5hm-xq8f
|
||||
|
||||
|
@ -216,7 +483,7 @@ implementation (libcontainer) is *not* covered by this policy.
|
|||
code, optimize the method for checking whether a cgroup is frozen. (#2955)
|
||||
* cgroups/systemd: fixed "retry on dbus disconnect" logic introduced in rc94
|
||||
* cgroups/systemd: fixed returning "unit already exists" error from a systemd
|
||||
cgroup manager (regression in rc94) (#2997, #2996)
|
||||
cgroup manager (regression in rc94). (#2997, #2996)
|
||||
|
||||
### Added
|
||||
* cgroupv2: support SkipDevices with systemd driver. (#2958, #3019)
|
||||
|
@ -225,7 +492,7 @@ implementation (libcontainer) is *not* covered by this policy.
|
|||
(#3022)
|
||||
|
||||
### Changed
|
||||
* cgroup/systemd: return, not ignore, stop unit error from Destroy (#2946)
|
||||
* cgroup/systemd: return, not ignore, stop unit error from Destroy. (#2946)
|
||||
* Fix all golangci-lint failures. (#2781, #2962)
|
||||
* Make `runc --version` output sane even when built with `go get` or
|
||||
otherwise outside of our build scripts. (#2962)
|
||||
|
@ -244,5 +511,17 @@ implementation (libcontainer) is *not* covered by this policy.
|
|||
[1.0.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.0.1
|
||||
|
||||
<!-- 1.1.z patch releases -->
|
||||
[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.0...release-1.1
|
||||
[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.12...release-1.1
|
||||
[1.1.12]: https://github.com/opencontainers/runc/compare/v1.1.11...v1.1.12
|
||||
[1.1.11]: https://github.com/opencontainers/runc/compare/v1.1.10...v1.1.11
|
||||
[1.1.10]: https://github.com/opencontainers/runc/compare/v1.1.9...v1.1.10
|
||||
[1.1.9]: https://github.com/opencontainers/runc/compare/v1.1.8...v1.1.9
|
||||
[1.1.8]: https://github.com/opencontainers/runc/compare/v1.1.7...v1.1.8
|
||||
[1.1.7]: https://github.com/opencontainers/runc/compare/v1.1.6...v1.1.7
|
||||
[1.1.6]: https://github.com/opencontainers/runc/compare/v1.1.5...v1.1.6
|
||||
[1.1.5]: https://github.com/opencontainers/runc/compare/v1.1.4...v1.1.5
|
||||
[1.1.4]: https://github.com/opencontainers/runc/compare/v1.1.3...v1.1.4
|
||||
[1.1.3]: https://github.com/opencontainers/runc/compare/v1.1.2...v1.1.3
|
||||
[1.1.2]: https://github.com/opencontainers/runc/compare/v1.1.1...v1.1.2
|
||||
[1.1.1]: https://github.com/opencontainers/runc/compare/v1.1.0...v1.1.1
|
||||
[1.1.0-rc.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.1.0-rc.1
|
||||
|
|
32
Dockerfile
32
Dockerfile
|
@ -1,6 +1,6 @@
|
|||
ARG GO_VERSION=1.17
|
||||
ARG BATS_VERSION=v1.3.0
|
||||
ARG LIBSECCOMP_VERSION=2.5.3
|
||||
ARG GO_VERSION=1.20
|
||||
ARG BATS_VERSION=v1.9.0
|
||||
ARG LIBSECCOMP_VERSION=2.5.4
|
||||
|
||||
FROM golang:${GO_VERSION}-bullseye
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
|
@ -9,19 +9,16 @@ ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debi
|
|||
RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
|
||||
wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \
|
||||
&& echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \
|
||||
&& dpkg --add-architecture armel \
|
||||
&& dpkg --add-architecture armhf \
|
||||
&& dpkg --add-architecture arm64 \
|
||||
&& dpkg --add-architecture ppc64el \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
criu \
|
||||
crossbuild-essential-arm64 \
|
||||
crossbuild-essential-armel \
|
||||
crossbuild-essential-armhf \
|
||||
crossbuild-essential-ppc64el \
|
||||
crossbuild-essential-s390x \
|
||||
gcc-aarch64-linux-gnu libc-dev-arm64-cross \
|
||||
gcc-arm-linux-gnueabi libc-dev-armel-cross \
|
||||
gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
|
||||
gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
|
||||
gcc-s390x-linux-gnu libc-dev-s390x-cross \
|
||||
gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
|
||||
curl \
|
||||
gawk \
|
||||
gcc \
|
||||
|
@ -54,11 +51,18 @@ RUN cd /tmp \
|
|||
|
||||
# install libseccomp
|
||||
ARG LIBSECCOMP_VERSION
|
||||
COPY script/* /tmp/script/
|
||||
COPY script/seccomp.sh script/lib.sh /tmp/script/
|
||||
RUN mkdir -p /opt/libseccomp \
|
||||
&& /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le s390x
|
||||
&& /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le riscv64 s390x
|
||||
ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION
|
||||
ENV LD_LIBRARY_PATH=/opt/libseccomp/lib
|
||||
ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig
|
||||
|
||||
# Prevent the "fatal: detected dubious ownership in repository" git complaint during build.
|
||||
RUN git config --global --add safe.directory /go/src/github.com/opencontainers/runc
|
||||
|
||||
WORKDIR /go/src/github.com/opencontainers/runc
|
||||
|
||||
# Fixup for cgroup v2.
|
||||
COPY script/prepare-cgroup-v2.sh /
|
||||
ENTRYPOINT [ "/prepare-cgroup-v2.sh" ]
|
||||
|
|
75
Makefile
75
Makefile
|
@ -10,23 +10,51 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
|
|||
RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
|
||||
PROJECT := github.com/opencontainers/runc
|
||||
BUILDTAGS ?= seccomp
|
||||
|
||||
COMMIT ?= $(shell git describe --dirty --long --always)
|
||||
VERSION := $(shell cat ./VERSION)
|
||||
LDFLAGS_COMMON := -X main.gitCommit=$(COMMIT) -X main.version=$(VERSION)
|
||||
|
||||
ifeq ($(shell $(GO) env GOOS),linux)
|
||||
ifeq (,$(filter $(shell $(GO) env GOARCH),mips mipsle mips64 mips64le ppc64))
|
||||
ifeq (,$(findstring -race,$(EXTRA_FLAGS)))
|
||||
GO_BUILDMODE := "-buildmode=pie"
|
||||
endif
|
||||
GOARCH := $(shell $(GO) env GOARCH)
|
||||
|
||||
GO_BUILDMODE :=
|
||||
# Enable dynamic PIE executables on supported platforms.
|
||||
ifneq (,$(filter $(GOARCH),386 amd64 arm arm64 ppc64le riscv64 s390x))
|
||||
ifeq (,$(findstring -race,$(EXTRA_FLAGS)))
|
||||
GO_BUILDMODE := "-buildmode=pie"
|
||||
endif
|
||||
endif
|
||||
GO_BUILD := $(GO) build -trimpath $(GO_BUILDMODE) $(EXTRA_FLAGS) -tags "$(BUILDTAGS)" \
|
||||
-ldflags "-X main.gitCommit=$(COMMIT) -X main.version=$(VERSION) $(EXTRA_LDFLAGS)"
|
||||
GO_BUILD_STATIC := CGO_ENABLED=1 $(GO) build -trimpath $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" \
|
||||
-ldflags "-extldflags -static -X main.gitCommit=$(COMMIT) -X main.version=$(VERSION) $(EXTRA_LDFLAGS)"
|
||||
GO_BUILD := $(GO) build -trimpath $(GO_BUILDMODE) \
|
||||
$(EXTRA_FLAGS) -tags "$(BUILDTAGS)" \
|
||||
-ldflags "$(LDFLAGS_COMMON) $(EXTRA_LDFLAGS)"
|
||||
|
||||
GO_BUILDMODE_STATIC :=
|
||||
LDFLAGS_STATIC := -extldflags -static
|
||||
# Enable static PIE executables on supported platforms.
|
||||
# This (among the other things) requires libc support (rcrt1.o), which seems
|
||||
# to be available only for arm64 and amd64 (Debian Bullseye).
|
||||
ifneq (,$(filter $(GOARCH),arm64 amd64))
|
||||
ifeq (,$(findstring -race,$(EXTRA_FLAGS)))
|
||||
GO_BUILDMODE_STATIC := -buildmode=pie
|
||||
LDFLAGS_STATIC := -linkmode external -extldflags --static-pie
|
||||
endif
|
||||
endif
|
||||
# Enable static PIE binaries on supported platforms.
|
||||
GO_BUILD_STATIC := $(GO) build -trimpath $(GO_BUILDMODE_STATIC) \
|
||||
$(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" \
|
||||
-ldflags "$(LDFLAGS_COMMON) $(LDFLAGS_STATIC) $(EXTRA_LDFLAGS)"
|
||||
|
||||
GPG_KEYID ?= asarai@suse.de
|
||||
|
||||
# Some targets need cgo, which is disabled by default when cross compiling.
|
||||
# Enable cgo explicitly for those.
|
||||
# Both runc and libcontainer/integration need libcontainer/nsenter.
|
||||
runc static localunittest: export CGO_ENABLED=1
|
||||
# seccompagent needs libseccomp (when seccomp build tag is set).
|
||||
ifneq (,$(filter $(BUILDTAGS),seccomp))
|
||||
seccompagent: export CGO_ENABLED=1
|
||||
endif
|
||||
|
||||
.DEFAULT: runc
|
||||
|
||||
runc:
|
||||
|
@ -40,7 +68,7 @@ recvtty sd-helper seccompagent:
|
|||
static:
|
||||
$(GO_BUILD_STATIC) -o runc .
|
||||
|
||||
releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a s390x"
|
||||
releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
|
||||
releaseall: release
|
||||
|
||||
release: runcimage
|
||||
|
@ -50,7 +78,7 @@ release: runcimage
|
|||
$(RUNC_IMAGE) make localrelease
|
||||
script/release_sign.sh -S $(GPG_KEYID) -r release/$(VERSION) -v $(VERSION)
|
||||
|
||||
localrelease:
|
||||
localrelease: verify-changelog
|
||||
script/release_build.sh -r release/$(VERSION) -v $(VERSION) $(RELEASE_ARGS)
|
||||
|
||||
dbuild: runcimage
|
||||
|
@ -133,26 +161,39 @@ cfmt:
|
|||
shellcheck:
|
||||
shellcheck tests/integration/*.bats tests/integration/*.sh \
|
||||
tests/integration/*.bash tests/*.sh \
|
||||
script/release_*.sh script/seccomp.sh script/lib.sh
|
||||
# TODO: add shellcheck for more sh files
|
||||
man/*.sh script/*
|
||||
# TODO: add shellcheck for more sh files (contrib/completions/bash/runc).
|
||||
|
||||
shfmt:
|
||||
shfmt -ln bats -d -w tests/integration/*.bats
|
||||
shfmt -ln bash -d -w man/*.sh script/* tests/*.sh tests/integration/*.bash
|
||||
$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
|
||||
--rm -v $(CURDIR):/src -w /src \
|
||||
mvdan/shfmt:v3.5.1 -d -w .
|
||||
|
||||
localshfmt:
|
||||
shfmt -d -w .
|
||||
|
||||
vendor:
|
||||
$(GO) mod tidy
|
||||
$(GO) mod vendor
|
||||
$(GO) mod verify
|
||||
|
||||
verify-changelog:
|
||||
# No space at EOL.
|
||||
! grep -n '\s$$' CHANGELOG.md
|
||||
# Period before issue/PR references.
|
||||
! grep -n '[0-9a-zA-Z][^.] (#[1-9][0-9, #]*)$$' CHANGELOG.md
|
||||
|
||||
verify-dependencies: vendor
|
||||
@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
|
||||
|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
|
||||
&& echo "all vendor files are up to date."
|
||||
|
||||
validate-keyring:
|
||||
script/keyring_validate.sh
|
||||
|
||||
.PHONY: runc all recvtty sd-helper seccompagent static releaseall release \
|
||||
localrelease dbuild lint man runcimage \
|
||||
test localtest unittest localunittest integration localintegration \
|
||||
rootlessintegration localrootlessintegration shell install install-bash \
|
||||
install-man clean cfmt shfmt shellcheck \
|
||||
vendor verify-dependencies
|
||||
install-man clean cfmt shfmt localshfmt shellcheck \
|
||||
vendor verify-changelog verify-dependencies validate-keyring
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
# runc
|
||||
|
||||
[![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
|
||||
[![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
|
||||
[![Go Reference](https://pkg.go.dev/badge/github.com/opencontainers/runc.svg)](https://pkg.go.dev/github.com/opencontainers/runc)
|
||||
[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588)
|
||||
[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate)
|
||||
[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci)
|
||||
[![CirrusCI](https://api.cirrus-ci.com/github/opencontainers/runc.svg)](https://cirrus-ci.com/github/opencontainers/runc)
|
||||
|
||||
## Introduction
|
||||
|
||||
|
@ -14,6 +15,8 @@
|
|||
|
||||
You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.
|
||||
|
||||
All releases are signed by one of the keys listed in the [`runc.keyring` file in the root of this repository](runc.keyring).
|
||||
|
||||
## Security
|
||||
|
||||
The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
|
||||
|
@ -23,7 +26,7 @@ A third party security audit was performed by Cure53, you can see the full repor
|
|||
|
||||
## Building
|
||||
|
||||
`runc` only supports Linux. It must be built with Go version 1.16 or higher.
|
||||
`runc` only supports Linux. It must be built with Go version 1.17 or higher.
|
||||
|
||||
In order to enable seccomp support you will need to install `libseccomp` on your platform.
|
||||
> e.g. `libseccomp-devel` for CentOS, or `libseccomp-dev` for Ubuntu
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
Vagrant.configure("2") do |config|
|
||||
# Fedora box is used for testing cgroup v2 support
|
||||
config.vm.box = "fedora/35-cloud-base"
|
||||
config.vm.box = "fedora/38-cloud-base"
|
||||
config.vm.provider :virtualbox do |v|
|
||||
v.memory = 2048
|
||||
v.cpus = 2
|
||||
|
@ -29,6 +29,9 @@ EOF
|
|||
done
|
||||
dnf clean all
|
||||
|
||||
# Prevent the "fatal: unsafe repository" git complaint during build.
|
||||
git config --global --add safe.directory /vagrant
|
||||
|
||||
# Add a user for rootless tests
|
||||
useradd -u2000 -m -d/home/rootless -s/bin/bash rootless
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ fi
|
|||
# exits when not running inside bats. We could resort to hacks, but simply redefining
|
||||
# update_config() seems clearer. We don't even really need to keep them in sync.
|
||||
function update_config() {
|
||||
jq "$1" "./config.json" | awk 'BEGIN{RS="";getline<"-";print>ARGV[1]}' "./config.json"
|
||||
jq "$1" "./config.json" | awk 'BEGIN{RS="";getline<"-";print>ARGV[1]}' "./config.json"
|
||||
}
|
||||
|
||||
update_config '.linux.seccomp = {
|
||||
|
|
|
@ -1,29 +0,0 @@
|
|||
runc (1.1.0-ok3) yangtze; urgency=medium
|
||||
|
||||
* CVE-2022-29162 安全修复
|
||||
|
||||
-- chenxinquan <chenxinquan@kylinos.cn> Fri, 28 Jul 2023 16:16:46 +0800
|
||||
|
||||
runc (1.1.0-ok2) yangtze; urgency=medium
|
||||
|
||||
* yangfs215 CVE-2022-29162 runc: do not set inheritable capabilities
|
||||
|
||||
-- yangfengsheng <yangfs@whu.edu.cn> Tue, 18 Jul 2023 00:10:28 +0800
|
||||
|
||||
runc (1.1.0-ok1) yangtze; urgency=medium
|
||||
|
||||
* Merge new upstream version 1.1.0
|
||||
|
||||
-- Luoyaoming <luoyaoming@kylinos.cn> Fri, 30 Dec 2022 11:11:29 +0800
|
||||
|
||||
runc (1.0.0~rc10-ok2) yangtze; urgency=medium
|
||||
|
||||
* Update version.
|
||||
|
||||
-- zhouganqing <zhouganqing@kylinos.cn> Thu, 28 Jul 2022 16:49:00 +0800
|
||||
|
||||
runc (1.0.0~rc10-ok1) yangtze; urgency=medium
|
||||
|
||||
* Build for openKylin.
|
||||
|
||||
-- openKylinBot <openKylinBot@openkylin.com> Mon, 25 Apr 2022 22:03:04 +0800
|
|
@ -1,17 +0,0 @@
|
|||
## Remove generated man pages:
|
||||
man/man8/*
|
||||
|
||||
## Drop hanging test (introduced in 0.0.9).
|
||||
## https://github.com/opencontainers/runc/issues/692
|
||||
libcontainer/nsenter/nsenter_test.go
|
||||
|
||||
## Failing tests:
|
||||
|
||||
## Privileged tests:
|
||||
### couldn't get cgroup root: mountpoint for cgroup not found
|
||||
libcontainer/cgroups/fs/apply_raw_test.go
|
||||
|
||||
### FAIL: TestXattr (0.00s)
|
||||
### xattr_test.go:26: Success
|
||||
### xattr_test.go:30: failed
|
||||
libcontainer/xattr/xattr_test.go
|
|
@ -1 +0,0 @@
|
|||
10
|
|
@ -1,43 +0,0 @@
|
|||
Source: runc
|
||||
Section: devel
|
||||
Priority: optional
|
||||
Maintainer: Openkylin Developers <packaging@lists.openkylin.top>
|
||||
XSBC-Original-Maintainer: Debian Go Packaging Team <pkg-go-maintainers@lists.alioth.debian.org>
|
||||
Uploaders: Alexandre Viau <aviau@debian.org>,
|
||||
Dmitry Smirnov <onlyjob@debian.org>,
|
||||
Tim Potter <tpot@hpe.com>
|
||||
Build-Depends: debhelper (>= 11~),
|
||||
dh-golang,
|
||||
go-md2man,
|
||||
golang-any,
|
||||
libapparmor-dev,
|
||||
libseccomp-dev,
|
||||
pkg-config,
|
||||
protobuf-compiler
|
||||
Standards-Version: 4.1.4
|
||||
Homepage: https://github.com/opencontainers/runc
|
||||
Vcs-Git: https://salsa.debian.org/go-team/packages/runc.git
|
||||
Vcs-Browser: https://salsa.debian.org/go-team/packages/runc
|
||||
XS-Go-Import-Path: github.com/opencontainers/runc
|
||||
|
||||
Package: runc
|
||||
Architecture: any
|
||||
Depends: ${misc:Depends}, ${shlibs:Depends}
|
||||
Breaks: docker.io (<= 1.13.1~ds1-0)
|
||||
Built-Using: ${misc:Built-Using}
|
||||
Description: Open Container Project - runtime
|
||||
"runc" is a command line client for running applications packaged according
|
||||
to the Open Container Format (OCF) and is a compliant implementation of
|
||||
the Open Container Project specification.
|
||||
|
||||
Package: golang-github-opencontainers-runc-dev
|
||||
Architecture: all
|
||||
Depends: ${misc:Depends}
|
||||
Description: Open Container Project - development files
|
||||
"runc" is a command line client for running applications packaged according
|
||||
to the Open Container Format (OCF) and is a compliant implementation of
|
||||
the Open Container Project specification.
|
||||
.
|
||||
This package provides development files formerly known as
|
||||
"github.com/docker/libcontainer".
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
|
||||
Upstream-Name: runc
|
||||
Source: https://github.com/opencontainers/runc
|
||||
|
||||
Files: *
|
||||
Copyright: 2012-2015 Docker, Inc.
|
||||
License: Apache-2.0
|
||||
|
||||
Files:
|
||||
vendor/github.com/cyphar/filepath-securejoin/*
|
||||
Copyright:
|
||||
2014-2015 Docker Inc & Go Authors. All rights reserved.
|
||||
2017 SUSE LLC. All rights reserved.
|
||||
License: BSD-3-Clause~Google
|
||||
|
||||
Files: debian/*
|
||||
Copyright:
|
||||
2015 Alexandre Viau <alexandre@alexandreviau.net>
|
||||
2015-2016 Dmitry Smirnov <onlyjob@debian.org>
|
||||
License: GPL-3+
|
||||
|
||||
Files: debian/patches/*
|
||||
Copyright: 2015 Dmitry Smirnov <onlyjob@debian.org>
|
||||
License: GPL-3+ or Apache-2.0
|
||||
Comment: patches can be licensed under the same terms as upstream.
|
||||
|
||||
License: Apache-2.0
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
.
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
.
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
.
|
||||
The complete text of the Apache version 2.0 license
|
||||
can be found in "/usr/share/common-licenses/Apache-2.0".
|
||||
|
||||
License: GPL-3+
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
․
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
․
|
||||
The complete text of the GNU General Public License version 3
|
||||
can be found in "/usr/share/common-licenses/GPL-3".
|
||||
|
||||
License: BSD-3-Clause~Google
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
.
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of Google Inc. nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@ -1,28 +0,0 @@
|
|||
|
||||
# auto-generated, DO NOT MODIFY.
|
||||
# The authoritative copy of this file lives at:
|
||||
# https://salsa.debian.org/go-team/ci/blob/master/cmd/ci/gitlabciyml.go
|
||||
|
||||
# TODO: publish under debian-go-team/ci
|
||||
image: stapelberg/ci2
|
||||
|
||||
test_the_archive:
|
||||
artifacts:
|
||||
paths:
|
||||
- before-applying-commit.json
|
||||
- after-applying-commit.json
|
||||
script:
|
||||
# Create an overlay to discard writes to /srv/gopath/src after the build:
|
||||
- "rm -rf /cache/overlay/{upper,work}"
|
||||
- "mkdir -p /cache/overlay/{upper,work}"
|
||||
- "mount -t overlay overlay -o lowerdir=/srv/gopath/src,upperdir=/cache/overlay/upper,workdir=/cache/overlay/work /srv/gopath/src"
|
||||
- "export GOPATH=/srv/gopath"
|
||||
- "export GOCACHE=/cache/go"
|
||||
# Build the world as-is:
|
||||
- "ci-build -exemptions=/var/lib/ci-build/exemptions.json > before-applying-commit.json"
|
||||
# Copy this package into the overlay:
|
||||
- "GBP_CONF_FILES=:debian/gbp.conf gbp buildpackage --git-no-pristine-tar --git-ignore-branch --git-ignore-new --git-export-dir=/tmp/export --git-no-overlay --git-tarball-dir=/nonexistant --git-cleaner=/bin/true --git-builder='dpkg-buildpackage -S -d --no-sign'"
|
||||
- "pgt-gopath -dsc /tmp/export/*.dsc"
|
||||
# Rebuild the world:
|
||||
- "ci-build -exemptions=/var/lib/ci-build/exemptions.json > after-applying-commit.json"
|
||||
- "ci-diff before-applying-commit.json after-applying-commit.json"
|
|
@ -1 +0,0 @@
|
|||
usr/share/gocode/src
|
|
@ -1,39 +0,0 @@
|
|||
From: Dmitry Smirnov <onlyjob@debian.org>
|
||||
Date: Thu, 28 Jul 2022 16:28:22 +0800
|
||||
Subject: fix FTBFS on i686
|
||||
|
||||
src/github.com/opencontainers/runc/libcontainer/user/user_test.go:448:36: constant 2147483648 overflows int
|
||||
Last-Update: 2018-06-16
|
||||
Forwarded: https://github.com/opencontainers/runc/pull/1821
|
||||
Bug-Upstream: https://github.com/opencontainers/runc/issues/941
|
||||
---
|
||||
libcontainer/user/user.go | 2 +-
|
||||
libcontainer/user/user_test.go | 2 +-
|
||||
2 files changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/libcontainer/user/user.go b/libcontainer/user/user.go
|
||||
index 7b912bb..38caded 100644
|
||||
--- a/libcontainer/user/user.go
|
||||
+++ b/libcontainer/user/user.go
|
||||
@@ -473,7 +473,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
|
||||
return nil, fmt.Errorf("Unable to find group %s", ag)
|
||||
}
|
||||
// Ensure gid is inside gid range.
|
||||
- if gid < minId || gid > maxId {
|
||||
+ if gid < minId || gid >= maxId {
|
||||
return nil, ErrRange
|
||||
}
|
||||
gidMap[gid] = struct{}{}
|
||||
diff --git a/libcontainer/user/user_test.go b/libcontainer/user/user_test.go
|
||||
index 24ee559..a4aabdc 100644
|
||||
--- a/libcontainer/user/user_test.go
|
||||
+++ b/libcontainer/user/user_test.go
|
||||
@@ -445,7 +445,7 @@ this is just some garbage data
|
||||
if utils.GetIntSize() > 4 {
|
||||
tests = append(tests, foo{
|
||||
// groups with too large id
|
||||
- groups: []string{strconv.Itoa(1 << 31)},
|
||||
+ groups: []string{strconv.Itoa( 1<<31 -1 )},
|
||||
expected: nil,
|
||||
hasError: true,
|
||||
})
|
|
@ -1,48 +0,0 @@
|
|||
From: Dmitry Smirnov <onlyjob@debian.org>
|
||||
Date: Thu, 28 Jul 2022 16:28:22 +0800
|
||||
Subject: disabled unreliable tests due to random failures on [ppc64el,
|
||||
s390x].
|
||||
|
||||
Last-Update: 2018-09-27
|
||||
Forwarded: not-needed
|
||||
Bug-Upstream: https://github.com/opencontainers/runc/issues/1822
|
||||
---
|
||||
libcontainer/cgroups/fs/hugetlb_test.go | 4 ++++
|
||||
1 file changed, 4 insertions(+)
|
||||
|
||||
diff --git a/libcontainer/cgroups/fs/hugetlb_test.go b/libcontainer/cgroups/fs/hugetlb_test.go
|
||||
index 9ddacfe..9b60650 100644
|
||||
--- a/libcontainer/cgroups/fs/hugetlb_test.go
|
||||
+++ b/libcontainer/cgroups/fs/hugetlb_test.go
|
||||
@@ -89,6 +89,7 @@ func TestHugetlbStats(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestHugetlbStatsNoUsageFile(t *testing.T) {
|
||||
+t.Skip("Disabled unreliable test")
|
||||
helper := NewCgroupTestUtil("hugetlb", t)
|
||||
defer helper.cleanup()
|
||||
helper.writeFileContents(map[string]string{
|
||||
@@ -104,6 +105,7 @@ func TestHugetlbStatsNoUsageFile(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
|
||||
+t.Skip("Disabled unreliable test")
|
||||
helper := NewCgroupTestUtil("hugetlb", t)
|
||||
defer helper.cleanup()
|
||||
for _, pageSize := range HugePageSizes {
|
||||
@@ -121,6 +123,7 @@ func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestHugetlbStatsBadUsageFile(t *testing.T) {
|
||||
+t.Skip("Disabled unreliable test")
|
||||
helper := NewCgroupTestUtil("hugetlb", t)
|
||||
defer helper.cleanup()
|
||||
for _, pageSize := range HugePageSizes {
|
||||
@@ -139,6 +142,7 @@ func TestHugetlbStatsBadUsageFile(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestHugetlbStatsBadMaxUsageFile(t *testing.T) {
|
||||
+t.Skip("Disabled unreliable test")
|
||||
helper := NewCgroupTestUtil("hugetlb", t)
|
||||
defer helper.cleanup()
|
||||
helper.writeFileContents(map[string]string{
|
|
@ -1,22 +0,0 @@
|
|||
From: Dmitry Smirnov <onlyjob@debian.org>
|
||||
Date: Thu, 28 Jul 2022 16:28:22 +0800
|
||||
Subject: disable test (requires root)
|
||||
|
||||
Last-Update: 2018-06-15
|
||||
Forwarded: not-needed
|
||||
---
|
||||
libcontainer/factory_linux_test.go | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/libcontainer/factory_linux_test.go b/libcontainer/factory_linux_test.go
|
||||
index 8d0ca8a..1dc0180 100644
|
||||
--- a/libcontainer/factory_linux_test.go
|
||||
+++ b/libcontainer/factory_linux_test.go
|
||||
@@ -78,6 +78,7 @@ func TestFactoryNewIntelRdt(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestFactoryNewTmpfs(t *testing.T) {
|
||||
+t.Skip("DM - skipping privileged test")
|
||||
root, rerr := newTestRoot()
|
||||
if rerr != nil {
|
||||
t.Fatal(rerr)
|
|
@ -1,26 +0,0 @@
|
|||
#!/usr/bin/make -f
|
||||
|
||||
# Uncomment this to turn on verbose mode.
|
||||
#export DH_VERBOSE=1
|
||||
|
||||
export DH_GOPKG := github.com/opencontainers/runc
|
||||
export DH_GOLANG_INSTALL_EXTRA := libcontainer/seccomp/fixtures libcontainer/criurpc
|
||||
TAGS=apparmor seccomp selinux ambient
|
||||
|
||||
%:
|
||||
dh $@ --buildsystem=golang --with=golang --builddirectory=_build
|
||||
|
||||
override_dh_auto_configure:
|
||||
cd man && ./md2man-all.sh
|
||||
dh_auto_configure
|
||||
## Remove extra license files:
|
||||
$(RM) -v \
|
||||
_build/src/$(DH_GOPKG)/vendor/github.com/docker/docker/*/*/LICENSE* \
|
||||
;
|
||||
|
||||
override_dh_auto_build:
|
||||
dh_auto_build -- -tags "$(TAGS)"
|
||||
|
||||
override_dh_auto_test:
|
||||
DH_GOLANG_EXCLUDES="libcontainer/integration" \
|
||||
dh_auto_test -- -tags "$(TAGS)"
|
|
@ -1,2 +0,0 @@
|
|||
NOTICE
|
||||
README*
|
|
@ -1 +0,0 @@
|
|||
usr/bin/* /usr/sbin/
|
|
@ -1 +0,0 @@
|
|||
runc: spelling-error-in-binary
|
|
@ -1 +0,0 @@
|
|||
man/man8/*.8
|
|
@ -1 +0,0 @@
|
|||
3.0 (native)
|
|
@ -1,2 +0,0 @@
|
|||
# Result of Files-Excluded:
|
||||
source-contains-empty-directory vendor/*
|
|
@ -1,34 +0,0 @@
|
|||
#!/bin/bash
|
||||
set -Eeuo pipefail
|
||||
set -x
|
||||
|
||||
runc --version
|
||||
|
||||
tempDir="$(mktemp -d)"
|
||||
trap 'rm -rf "$tempDir"' EXIT
|
||||
|
||||
# build up rootfs with busybox
|
||||
busybox="$(which busybox)" # from busybox-static
|
||||
mkdir "$tempDir/rootfs"
|
||||
cp -a "$busybox" "$tempDir/rootfs/"
|
||||
|
||||
# rough "rootfs" smoke test (makes sure "busybox" is actually static)
|
||||
chroot "$tempDir/rootfs" /busybox true
|
||||
|
||||
# make a config.json file for our "bundle"
|
||||
runc spec --bundle "$tempDir"
|
||||
|
||||
# edit the default command to something we can actually run with our rootfs
|
||||
grep '"sh"' "$tempDir/config.json"
|
||||
sed -i 's@"sh"@"/busybox","echo","success"@g' "$tempDir/config.json"
|
||||
grep '"/busybox","echo","success"' "$tempDir/config.json"
|
||||
# and disable the TTY
|
||||
grep '"terminal": true,' "$tempDir/config.json"
|
||||
sed -i 's/"terminal": true,/"terminal": false,/g' "$tempDir/config.json"
|
||||
grep '"terminal": false,' "$tempDir/config.json"
|
||||
|
||||
# run it and capture the output
|
||||
output="$(runc run --bundle "$tempDir" "test-$$-$RANDOM")"
|
||||
|
||||
# ensure the output was exactly what we expected
|
||||
[ "$output" = 'success' ]
|
|
@ -1,7 +0,0 @@
|
|||
Tests: basic-smoke
|
||||
Depends: busybox-static, @
|
||||
Restrictions: allow-stderr, isolation-machine, needs-root
|
||||
|
||||
Test-Command: /usr/bin/dh_golang_autopkgtest
|
||||
Depends: @, @builddeps@, dh-golang
|
||||
Restrictions: allow-stderr, isolation-machine
|
|
@ -1,9 +0,0 @@
|
|||
version=3
|
||||
|
||||
opts=\
|
||||
repack,\
|
||||
repacksuffix=+dfsg1,\
|
||||
uversionmangle=s/-rc/~rc/,\
|
||||
dversionmangle=s/[~+]dfsg\d*$// \
|
||||
https://github.com/opencontainers/runc/releases \
|
||||
.*archive/v?(\d\.\d\.\d.*)\.tar\.gz
|
|
@ -123,8 +123,8 @@ The above will set the following properties:
|
|||
* `TimeoutStopSec` to 2 minutes and 3 seconds;
|
||||
* `CollectMode` to "inactive-or-failed".
|
||||
|
||||
The values must be in the gvariant format (for details, see
|
||||
[gvariant documentation](https://developer.gnome.org/glib/stable/gvariant-text.html)).
|
||||
The values must be in the gvariant text format, as described in
|
||||
[gvariant documentation](https://docs.gtk.org/glib/gvariant-text.html).
|
||||
|
||||
To find out which type systemd expects for a particular parameter, please
|
||||
consult systemd sources.
|
||||
|
|
19
go.mod
19
go.mod
|
@ -1,26 +1,33 @@
|
|||
module github.com/opencontainers/runc
|
||||
|
||||
go 1.16
|
||||
go 1.17
|
||||
|
||||
require (
|
||||
github.com/checkpoint-restore/go-criu/v5 v5.3.0
|
||||
github.com/cilium/ebpf v0.7.0
|
||||
github.com/containerd/console v1.0.3
|
||||
github.com/coreos/go-systemd/v22 v22.3.2
|
||||
github.com/cyphar/filepath-securejoin v0.2.3
|
||||
github.com/cyphar/filepath-securejoin v0.2.4
|
||||
github.com/docker/go-units v0.4.0
|
||||
github.com/godbus/dbus/v5 v5.0.6
|
||||
github.com/moby/sys/mountinfo v0.5.0
|
||||
github.com/mrunalp/fileutils v0.5.0
|
||||
github.com/mrunalp/fileutils v0.5.1
|
||||
github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
|
||||
github.com/opencontainers/selinux v1.10.0
|
||||
github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921
|
||||
github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646
|
||||
github.com/sirupsen/logrus v1.8.1
|
||||
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
|
||||
// NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092
|
||||
github.com/urfave/cli v1.22.1
|
||||
github.com/vishvananda/netlink v1.1.0
|
||||
golang.org/x/net v0.0.0-20201224014010-6772e930b67b
|
||||
golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c
|
||||
golang.org/x/net v0.8.0
|
||||
golang.org/x/sys v0.6.0
|
||||
google.golang.org/protobuf v1.27.1
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d // indirect
|
||||
github.com/russross/blackfriday/v2 v2.0.1 // indirect
|
||||
github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect
|
||||
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df // indirect
|
||||
)
|
||||
|
|
48
go.sum
48
go.sum
|
@ -9,8 +9,8 @@ github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzA
|
|||
github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY=
|
||||
github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
|
||||
github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI=
|
||||
github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4=
|
||||
github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg=
|
||||
github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw=
|
||||
|
@ -31,8 +31,8 @@ github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
|
|||
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
||||
github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9ObI=
|
||||
github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU=
|
||||
github.com/mrunalp/fileutils v0.5.0 h1:NKzVxiH7eSk+OQ4M+ZYW1K6h27RUV3MI6NUTsHhU6Z4=
|
||||
github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
|
||||
github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q=
|
||||
github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
|
||||
github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 h1:3snG66yBm59tKhhSPQrQ/0bCrv1LQbKt40LnUPiUxdc=
|
||||
github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
|
||||
github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU=
|
||||
|
@ -41,8 +41,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
|
|||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
|
||||
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
|
||||
github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921 h1:58EBmR2dMNL2n/FnbQewK3D14nXr0V9CObDSvMJLq+Y=
|
||||
github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg=
|
||||
github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646 h1:RpforrEYXWkmGwJHIGnLZ3tTWStkjVVstwzNGqxX2Ds=
|
||||
github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg=
|
||||
github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
|
||||
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
|
||||
github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE=
|
||||
|
@ -57,20 +57,48 @@ github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJ
|
|||
github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE=
|
||||
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k=
|
||||
github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU=
|
||||
golang.org/x/net v0.0.0-20201224014010-6772e930b67b h1:iFwSg7t5GZmB/Q5TjiEAsdoLDrdJRC1RiF2WhuV29Qw=
|
||||
golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
|
||||
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
|
||||
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
|
||||
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
|
||||
golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
|
||||
golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c h1:DHcbWVXeY+0Y8HHKR+rbLwnoh2F4tNCY7rTiHJ30RmA=
|
||||
golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
|
||||
golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
|
||||
golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
|
||||
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# libcontainer
|
||||
|
||||
[![GoDoc](https://godoc.org/github.com/opencontainers/runc/libcontainer?status.svg)](https://godoc.org/github.com/opencontainers/runc/libcontainer)
|
||||
[![Go Reference](https://pkg.go.dev/badge/github.com/opencontainers/runc/libcontainer.svg)](https://pkg.go.dev/github.com/opencontainers/runc/libcontainer)
|
||||
|
||||
Libcontainer provides a native Go implementation for creating containers
|
||||
with namespaces, cgroups, capabilities, and filesystem access controls.
|
||||
|
|
|
@ -153,8 +153,7 @@ func TestDeviceFilter_Privileged(t *testing.T) {
|
|||
Allow: true,
|
||||
},
|
||||
}
|
||||
expected :=
|
||||
`
|
||||
expected := `
|
||||
// load parameters into registers
|
||||
0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
|
||||
1: And32Imm dst: r2 imm: 65535
|
||||
|
|
|
@ -93,7 +93,7 @@ var (
|
|||
)
|
||||
|
||||
// Loosely based on the BPF_F_REPLACE support check in
|
||||
// <https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go>.
|
||||
// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
|
||||
//
|
||||
// TODO: move this logic to cilium/ebpf
|
||||
func haveBpfProgReplace() bool {
|
||||
|
|
|
@ -10,6 +10,7 @@ import (
|
|||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/utils"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
@ -76,16 +77,16 @@ var (
|
|||
// TestMode is set to true by unit tests that need "fake" cgroupfs.
|
||||
TestMode bool
|
||||
|
||||
cgroupFd int = -1
|
||||
prepOnce sync.Once
|
||||
prepErr error
|
||||
resolveFlags uint64
|
||||
cgroupRootHandle *os.File
|
||||
prepOnce sync.Once
|
||||
prepErr error
|
||||
resolveFlags uint64
|
||||
)
|
||||
|
||||
func prepareOpenat2() error {
|
||||
prepOnce.Do(func() {
|
||||
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
|
||||
Flags: unix.O_DIRECTORY | unix.O_PATH,
|
||||
Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC,
|
||||
})
|
||||
if err != nil {
|
||||
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
|
||||
|
@ -96,15 +97,16 @@ func prepareOpenat2() error {
|
|||
}
|
||||
return
|
||||
}
|
||||
file := os.NewFile(uintptr(fd), cgroupfsDir)
|
||||
|
||||
var st unix.Statfs_t
|
||||
if err = unix.Fstatfs(fd, &st); err != nil {
|
||||
if err := unix.Fstatfs(int(file.Fd()), &st); err != nil {
|
||||
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
|
||||
logrus.Warnf("falling back to securejoin: %s", prepErr)
|
||||
return
|
||||
}
|
||||
|
||||
cgroupFd = fd
|
||||
|
||||
cgroupRootHandle = file
|
||||
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
|
||||
if st.Type == unix.CGROUP2_SUPER_MAGIC {
|
||||
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
|
||||
|
@ -122,7 +124,7 @@ func openFile(dir, file string, flags int) (*os.File, error) {
|
|||
flags |= os.O_TRUNC | os.O_CREATE
|
||||
mode = 0o600
|
||||
}
|
||||
path := path.Join(dir, file)
|
||||
path := path.Join(dir, utils.CleanPath(file))
|
||||
if prepareOpenat2() != nil {
|
||||
return openFallback(path, flags, mode)
|
||||
}
|
||||
|
@ -131,7 +133,7 @@ func openFile(dir, file string, flags int) (*os.File, error) {
|
|||
return openFallback(path, flags, mode)
|
||||
}
|
||||
|
||||
fd, err := unix.Openat2(cgroupFd, relPath,
|
||||
fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath,
|
||||
&unix.OpenHow{
|
||||
Resolve: resolveFlags,
|
||||
Flags: uint64(flags) | unix.O_CLOEXEC,
|
||||
|
@ -139,20 +141,20 @@ func openFile(dir, file string, flags int) (*os.File, error) {
|
|||
})
|
||||
if err != nil {
|
||||
err = &os.PathError{Op: "openat2", Path: path, Err: err}
|
||||
// Check if cgroupFd is still opened to cgroupfsDir
|
||||
// Check if cgroupRootHandle is still opened to cgroupfsDir
|
||||
// (happens when this package is incorrectly used
|
||||
// across the chroot/pivot_root/mntns boundary, or
|
||||
// when /sys/fs/cgroup is remounted).
|
||||
//
|
||||
// TODO: if such usage will ever be common, amend this
|
||||
// to reopen cgroupFd and retry openat2.
|
||||
fdStr := strconv.Itoa(cgroupFd)
|
||||
// to reopen cgroupRootHandle and retry openat2.
|
||||
fdStr := strconv.Itoa(int(cgroupRootHandle.Fd()))
|
||||
fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
|
||||
if fdDest != cgroupfsDir {
|
||||
// Wrap the error so it is clear that cgroupFd
|
||||
// Wrap the error so it is clear that cgroupRootHandle
|
||||
// is opened to an unexpected/wrong directory.
|
||||
err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w",
|
||||
fdStr, fdDest, cgroupfsDir, err)
|
||||
err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w",
|
||||
cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err)
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
|
|
@ -58,8 +58,6 @@ func TestOpenat2(t *testing.T) {
|
|||
{"/sys/fs/cgroup", "/cgroup.controllers"},
|
||||
{"/sys/fs/cgroup/", "cgroup.controllers"},
|
||||
{"/sys/fs/cgroup/", "/cgroup.controllers"},
|
||||
{"/sys/fs/cgroup/user.slice", "cgroup.controllers"},
|
||||
{"/sys/fs/cgroup/user.slice/", "/cgroup.controllers"},
|
||||
{"/", "/sys/fs/cgroup/cgroup.controllers"},
|
||||
{"/", "sys/fs/cgroup/cgroup.controllers"},
|
||||
{"/sys/fs/cgroup/cgroup.controllers", ""},
|
||||
|
|
|
@ -28,6 +28,7 @@ var subsystems = []subsystem{
|
|||
&FreezerGroup{},
|
||||
&RdmaGroup{},
|
||||
&NameGroup{GroupName: "name=systemd", Join: true},
|
||||
&NameGroup{GroupName: "misc", Join: true},
|
||||
}
|
||||
|
||||
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
package fs
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||
|
@ -19,8 +21,23 @@ func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error {
|
|||
}
|
||||
|
||||
func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
|
||||
const suffix = ".limit_in_bytes"
|
||||
skipRsvd := false
|
||||
|
||||
for _, hugetlb := range r.HugetlbLimit {
|
||||
if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
|
||||
prefix := "hugetlb." + hugetlb.Pagesize
|
||||
val := strconv.FormatUint(hugetlb.Limit, 10)
|
||||
if err := cgroups.WriteFile(path, prefix+suffix, val); err != nil {
|
||||
return err
|
||||
}
|
||||
if skipRsvd {
|
||||
continue
|
||||
}
|
||||
if err := cgroups.WriteFile(path, prefix+".rsvd"+suffix, val); err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
skipRsvd = true
|
||||
continue
|
||||
}
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
@ -32,24 +49,29 @@ func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
|
|||
if !cgroups.PathExists(path) {
|
||||
return nil
|
||||
}
|
||||
rsvd := ".rsvd"
|
||||
hugetlbStats := cgroups.HugetlbStats{}
|
||||
for _, pageSize := range cgroups.HugePageSizes() {
|
||||
usage := "hugetlb." + pageSize + ".usage_in_bytes"
|
||||
value, err := fscommon.GetCgroupParamUint(path, usage)
|
||||
again:
|
||||
prefix := "hugetlb." + pageSize + rsvd
|
||||
|
||||
value, err := fscommon.GetCgroupParamUint(path, prefix+".usage_in_bytes")
|
||||
if err != nil {
|
||||
if rsvd != "" && errors.Is(err, os.ErrNotExist) {
|
||||
rsvd = ""
|
||||
goto again
|
||||
}
|
||||
return err
|
||||
}
|
||||
hugetlbStats.Usage = value
|
||||
|
||||
maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
|
||||
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
|
||||
value, err = fscommon.GetCgroupParamUint(path, prefix+".max_usage_in_bytes")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
hugetlbStats.MaxUsage = value
|
||||
|
||||
failcnt := "hugetlb." + pageSize + ".failcnt"
|
||||
value, err = fscommon.GetCgroupParamUint(path, failcnt)
|
||||
value, err = fscommon.GetCgroupParamUint(path, prefix+".failcnt")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -21,6 +21,11 @@ const (
|
|||
limit = "hugetlb.%s.limit_in_bytes"
|
||||
maxUsage = "hugetlb.%s.max_usage_in_bytes"
|
||||
failcnt = "hugetlb.%s.failcnt"
|
||||
|
||||
rsvdUsage = "hugetlb.%s.rsvd.usage_in_bytes"
|
||||
rsvdLimit = "hugetlb.%s.rsvd.limit_in_bytes"
|
||||
rsvdMaxUsage = "hugetlb.%s.rsvd.max_usage_in_bytes"
|
||||
rsvdFailcnt = "hugetlb.%s.rsvd.failcnt"
|
||||
)
|
||||
|
||||
func TestHugetlbSetHugetlb(t *testing.T) {
|
||||
|
@ -52,13 +57,15 @@ func TestHugetlbSetHugetlb(t *testing.T) {
|
|||
}
|
||||
|
||||
for _, pageSize := range cgroups.HugePageSizes() {
|
||||
limit := fmt.Sprintf(limit, pageSize)
|
||||
value, err := fscommon.GetCgroupParamUint(path, limit)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if value != hugetlbAfter {
|
||||
t.Fatalf("Set hugetlb.limit_in_bytes failed. Expected: %v, Got: %v", hugetlbAfter, value)
|
||||
for _, f := range []string{limit, rsvdLimit} {
|
||||
limit := fmt.Sprintf(f, pageSize)
|
||||
value, err := fscommon.GetCgroupParamUint(path, limit)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if value != hugetlbAfter {
|
||||
t.Fatalf("Set %s failed. Expected: %v, Got: %v", limit, hugetlbAfter, value)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -85,6 +92,28 @@ func TestHugetlbStats(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestHugetlbRStatsRsvd(t *testing.T) {
|
||||
path := tempDir(t, "hugetlb")
|
||||
for _, pageSize := range cgroups.HugePageSizes() {
|
||||
writeFileContents(t, path, map[string]string{
|
||||
fmt.Sprintf(rsvdUsage, pageSize): hugetlbUsageContents,
|
||||
fmt.Sprintf(rsvdMaxUsage, pageSize): hugetlbMaxUsageContents,
|
||||
fmt.Sprintf(rsvdFailcnt, pageSize): hugetlbFailcnt,
|
||||
})
|
||||
}
|
||||
|
||||
hugetlb := &HugetlbGroup{}
|
||||
actualStats := *cgroups.NewStats()
|
||||
err := hugetlb.GetStats(path, &actualStats)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100}
|
||||
for _, pageSize := range cgroups.HugePageSizes() {
|
||||
expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize])
|
||||
}
|
||||
}
|
||||
|
||||
func TestHugetlbStatsNoUsageFile(t *testing.T) {
|
||||
path := tempDir(t, "hugetlb")
|
||||
writeFileContents(t, path, map[string]string{
|
||||
|
|
|
@ -170,6 +170,10 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
|
|||
return err
|
||||
}
|
||||
stats.MemoryStats.SwapUsage = swapUsage
|
||||
stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{
|
||||
Usage: swapUsage.Usage - memoryUsage.Usage,
|
||||
Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt,
|
||||
}
|
||||
kernelUsage, err := getMemoryData(path, "kmem")
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -234,6 +238,12 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
|
|||
memoryData.Failcnt = value
|
||||
value, err = fscommon.GetCgroupParamUint(path, limit)
|
||||
if err != nil {
|
||||
if name == "kmem" && os.IsNotExist(err) {
|
||||
// Ignore ENOENT as kmem.limit_in_bytes has
|
||||
// been removed in newer kernels.
|
||||
return memoryData, nil
|
||||
}
|
||||
|
||||
return cgroups.MemoryData{}, err
|
||||
}
|
||||
memoryData.Limit = value
|
||||
|
|
|
@ -249,12 +249,13 @@ func TestMemoryStats(t *testing.T) {
|
|||
t.Fatal(err)
|
||||
}
|
||||
expectedStats := cgroups.MemoryStats{
|
||||
Cache: 512,
|
||||
Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
|
||||
SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
|
||||
KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
|
||||
Stats: map[string]uint64{"cache": 512, "rss": 1024},
|
||||
UseHierarchy: true,
|
||||
Cache: 512,
|
||||
Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
|
||||
SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
|
||||
SwapOnlyUsage: cgroups.MemoryData{Usage: 0, MaxUsage: 0, Failcnt: 0, Limit: 0},
|
||||
KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
|
||||
Stats: map[string]uint64{"cache": 512, "rss": 1024},
|
||||
UseHierarchy: true,
|
||||
PageUsageByNUMA: cgroups.PageUsageByNUMA{
|
||||
PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{
|
||||
Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}},
|
||||
|
|
|
@ -83,6 +83,7 @@ func tryDefaultCgroupRoot() string {
|
|||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
defer dir.Close()
|
||||
names, err := dir.Readdirnames(1)
|
||||
if err != nil {
|
||||
return ""
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
package fs2
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||
|
@ -16,8 +18,22 @@ func setHugeTlb(dirPath string, r *configs.Resources) error {
|
|||
if !isHugeTlbSet(r) {
|
||||
return nil
|
||||
}
|
||||
const suffix = ".max"
|
||||
skipRsvd := false
|
||||
for _, hugetlb := range r.HugetlbLimit {
|
||||
if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
|
||||
prefix := "hugetlb." + hugetlb.Pagesize
|
||||
val := strconv.FormatUint(hugetlb.Limit, 10)
|
||||
if err := cgroups.WriteFile(dirPath, prefix+suffix, val); err != nil {
|
||||
return err
|
||||
}
|
||||
if skipRsvd {
|
||||
continue
|
||||
}
|
||||
if err := cgroups.WriteFile(dirPath, prefix+".rsvd"+suffix, val); err != nil {
|
||||
if errors.Is(err, os.ErrNotExist) {
|
||||
skipRsvd = true
|
||||
continue
|
||||
}
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
@ -27,15 +43,21 @@ func setHugeTlb(dirPath string, r *configs.Resources) error {
|
|||
|
||||
func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
|
||||
hugetlbStats := cgroups.HugetlbStats{}
|
||||
rsvd := ".rsvd"
|
||||
for _, pagesize := range cgroups.HugePageSizes() {
|
||||
value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current")
|
||||
again:
|
||||
prefix := "hugetlb." + pagesize + rsvd
|
||||
value, err := fscommon.GetCgroupParamUint(dirPath, prefix+".current")
|
||||
if err != nil {
|
||||
if rsvd != "" && errors.Is(err, os.ErrNotExist) {
|
||||
rsvd = ""
|
||||
goto again
|
||||
}
|
||||
return err
|
||||
}
|
||||
hugetlbStats.Usage = value
|
||||
|
||||
fileName := "hugetlb." + pagesize + ".events"
|
||||
value, err = fscommon.GetValueByKey(dirPath, fileName, "max")
|
||||
value, err = fscommon.GetValueByKey(dirPath, prefix+".events", "max")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -100,17 +100,20 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
|
|||
memoryUsage, err := getMemoryDataV2(dirPath, "")
|
||||
if err != nil {
|
||||
if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint {
|
||||
// The root cgroup does not have memory.{current,max}
|
||||
// so emulate those using data from /proc/meminfo.
|
||||
return statsFromMeminfo(stats)
|
||||
// The root cgroup does not have memory.{current,max,peak}
|
||||
// so emulate those using data from /proc/meminfo and
|
||||
// /sys/fs/cgroup/memory.stat
|
||||
return rootStatsFromMeminfo(stats)
|
||||
}
|
||||
return err
|
||||
}
|
||||
stats.MemoryStats.Usage = memoryUsage
|
||||
swapUsage, err := getMemoryDataV2(dirPath, "swap")
|
||||
swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage
|
||||
swapUsage := swapOnlyUsage
|
||||
// As cgroup v1 reports SwapUsage values as mem+swap combined,
|
||||
// while in cgroup v2 swap values do not include memory,
|
||||
// report combined mem+swap for v1 compatibility.
|
||||
|
@ -118,6 +121,9 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
|
|||
if swapUsage.Limit != math.MaxUint64 {
|
||||
swapUsage.Limit += memoryUsage.Limit
|
||||
}
|
||||
// The `MaxUsage` of mem+swap cannot simply combine mem with
|
||||
// swap. So set it to 0 for v1 compatibility.
|
||||
swapUsage.MaxUsage = 0
|
||||
stats.MemoryStats.SwapUsage = swapUsage
|
||||
|
||||
return nil
|
||||
|
@ -132,6 +138,7 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
|
|||
}
|
||||
usage := moduleName + ".current"
|
||||
limit := moduleName + ".max"
|
||||
maxUsage := moduleName + ".peak"
|
||||
|
||||
value, err := fscommon.GetCgroupParamUint(path, usage)
|
||||
if err != nil {
|
||||
|
@ -151,10 +158,18 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
|
|||
}
|
||||
memoryData.Limit = value
|
||||
|
||||
// `memory.peak` since kernel 5.19
|
||||
// `memory.swap.peak` since kernel 6.5
|
||||
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return cgroups.MemoryData{}, err
|
||||
}
|
||||
memoryData.MaxUsage = value
|
||||
|
||||
return memoryData, nil
|
||||
}
|
||||
|
||||
func statsFromMeminfo(stats *cgroups.Stats) error {
|
||||
func rootStatsFromMeminfo(stats *cgroups.Stats) error {
|
||||
const file = "/proc/meminfo"
|
||||
f, err := os.Open(file)
|
||||
if err != nil {
|
||||
|
@ -166,14 +181,10 @@ func statsFromMeminfo(stats *cgroups.Stats) error {
|
|||
var (
|
||||
swap_free uint64
|
||||
swap_total uint64
|
||||
main_total uint64
|
||||
main_free uint64
|
||||
)
|
||||
mem := map[string]*uint64{
|
||||
"SwapFree": &swap_free,
|
||||
"SwapTotal": &swap_total,
|
||||
"MemTotal": &main_total,
|
||||
"MemFree": &main_free,
|
||||
}
|
||||
|
||||
found := 0
|
||||
|
@ -206,11 +217,18 @@ func statsFromMeminfo(stats *cgroups.Stats) error {
|
|||
return &parseError{Path: "", File: file, Err: err}
|
||||
}
|
||||
|
||||
// cgroup v1 `usage_in_bytes` reports memory usage as the sum of
|
||||
// - rss (NR_ANON_MAPPED)
|
||||
// - cache (NR_FILE_PAGES)
|
||||
// cgroup v1 reports SwapUsage values as mem+swap combined
|
||||
// cgroup v2 reports rss and cache as anon and file.
|
||||
// sum `anon` + `file` to report the same value as `usage_in_bytes` in v1.
|
||||
// sum swap usage as combined mem+swap usage for consistency as well.
|
||||
stats.MemoryStats.Usage.Usage = stats.MemoryStats.Stats["anon"] + stats.MemoryStats.Stats["file"]
|
||||
stats.MemoryStats.Usage.Limit = math.MaxUint64
|
||||
stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024
|
||||
stats.MemoryStats.SwapUsage.Limit = math.MaxUint64
|
||||
|
||||
stats.MemoryStats.Usage.Usage = (main_total - main_free) * 1024
|
||||
stats.MemoryStats.Usage.Limit = math.MaxUint64
|
||||
stats.MemoryStats.SwapUsage.Usage += stats.MemoryStats.Usage.Usage
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -0,0 +1,155 @@
|
|||
package fs2
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||
)
|
||||
|
||||
const exampleMemoryStatData = `anon 790425600
|
||||
file 6502666240
|
||||
kernel_stack 7012352
|
||||
pagetables 8867840
|
||||
percpu 2445520
|
||||
sock 40960
|
||||
shmem 6721536
|
||||
file_mapped 656187392
|
||||
file_dirty 1122304
|
||||
file_writeback 0
|
||||
swapcached 10
|
||||
anon_thp 438304768
|
||||
file_thp 0
|
||||
shmem_thp 0
|
||||
inactive_anon 892223488
|
||||
active_anon 2973696
|
||||
inactive_file 5307346944
|
||||
active_file 1179316224
|
||||
unevictable 31477760
|
||||
slab_reclaimable 348866240
|
||||
slab_unreclaimable 10099808
|
||||
slab 358966048
|
||||
workingset_refault_anon 0
|
||||
workingset_refault_file 0
|
||||
workingset_activate_anon 0
|
||||
workingset_activate_file 0
|
||||
workingset_restore_anon 0
|
||||
workingset_restore_file 0
|
||||
workingset_nodereclaim 0
|
||||
pgfault 103216687
|
||||
pgmajfault 6879
|
||||
pgrefill 0
|
||||
pgscan 0
|
||||
pgsteal 0
|
||||
pgactivate 1110217
|
||||
pgdeactivate 292
|
||||
pglazyfree 267
|
||||
pglazyfreed 0
|
||||
thp_fault_alloc 57411
|
||||
thp_collapse_alloc 443`
|
||||
|
||||
func TestStatMemoryPodCgroupNotFound(t *testing.T) {
|
||||
// We're using a fake cgroupfs.
|
||||
cgroups.TestMode = true
|
||||
fakeCgroupDir := t.TempDir()
|
||||
|
||||
// only write memory.stat to ensure pod cgroup usage
|
||||
// still reads memory.current.
|
||||
statPath := filepath.Join(fakeCgroupDir, "memory.stat")
|
||||
if err := os.WriteFile(statPath, []byte(exampleMemoryStatData), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
gotStats := cgroups.NewStats()
|
||||
|
||||
// use a fake root path to mismatch the file we wrote.
|
||||
// this triggers the non-root path which should fail to find memory.current.
|
||||
err := statMemory(fakeCgroupDir, gotStats)
|
||||
if err == nil {
|
||||
t.Errorf("expected error when statting memory for cgroupv2 root, but was nil")
|
||||
}
|
||||
|
||||
if !strings.Contains(err.Error(), "memory.current: no such file or directory") {
|
||||
t.Errorf("expected error to contain 'memory.current: no such file or directory', but was %s", err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
func TestStatMemoryPodCgroup(t *testing.T) {
|
||||
// We're using a fake cgroupfs.
|
||||
cgroups.TestMode = true
|
||||
fakeCgroupDir := t.TempDir()
|
||||
|
||||
statPath := filepath.Join(fakeCgroupDir, "memory.stat")
|
||||
if err := os.WriteFile(statPath, []byte(exampleMemoryStatData), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.current"), []byte("123456789"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.max"), []byte("999999999"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.peak"), []byte("987654321"), 0o644); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
gotStats := cgroups.NewStats()
|
||||
|
||||
// use a fake root path to trigger the pod cgroup lookup.
|
||||
err := statMemory(fakeCgroupDir, gotStats)
|
||||
if err != nil {
|
||||
t.Errorf("expected no error when statting memory for cgroupv2 root, but got %#+v", err)
|
||||
}
|
||||
|
||||
// result should be "memory.current"
|
||||
var expectedUsageBytes uint64 = 123456789
|
||||
if gotStats.MemoryStats.Usage.Usage != expectedUsageBytes {
|
||||
t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Usage, expectedUsageBytes)
|
||||
}
|
||||
|
||||
// result should be "memory.max"
|
||||
var expectedLimitBytes uint64 = 999999999
|
||||
if gotStats.MemoryStats.Usage.Limit != expectedLimitBytes {
|
||||
t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Limit, expectedLimitBytes)
|
||||
}
|
||||
|
||||
// result should be "memory.peak"
|
||||
var expectedMaxUsageBytes uint64 = 987654321
|
||||
if gotStats.MemoryStats.Usage.MaxUsage != expectedMaxUsageBytes {
|
||||
t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.MaxUsage, expectedMaxUsageBytes)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRootStatsFromMeminfo(t *testing.T) {
|
||||
stats := &cgroups.Stats{
|
||||
MemoryStats: cgroups.MemoryStats{
|
||||
Stats: map[string]uint64{
|
||||
"anon": 790425600,
|
||||
"file": 6502666240,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
if err := rootStatsFromMeminfo(stats); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// result is anon + file
|
||||
var expectedUsageBytes uint64 = 7293091840
|
||||
if stats.MemoryStats.Usage.Usage != expectedUsageBytes {
|
||||
t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %d\nexpected %d\n", stats.MemoryStats.Usage.Usage, expectedUsageBytes)
|
||||
}
|
||||
|
||||
// swap is adjusted to mem+swap
|
||||
if stats.MemoryStats.SwapUsage.Usage < stats.MemoryStats.Usage.Usage {
|
||||
t.Errorf("swap usage %d should be at least mem usage %d", stats.MemoryStats.SwapUsage.Usage, stats.MemoryStats.Usage.Usage)
|
||||
}
|
||||
if stats.MemoryStats.SwapUsage.Limit < stats.MemoryStats.Usage.Limit {
|
||||
t.Errorf("swap limit %d should be at least mem limit %d", stats.MemoryStats.SwapUsage.Limit, stats.MemoryStats.Usage.Limit)
|
||||
}
|
||||
}
|
|
@ -3,6 +3,7 @@ package manager
|
|||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
)
|
||||
|
||||
|
@ -10,35 +11,45 @@ import (
|
|||
// config.Resources is nil. While it does not make sense to use a
|
||||
// manager with no resources, it should not result in a panic.
|
||||
//
|
||||
// This tests either v1 or v2 managers (both fs and systemd),
|
||||
// depending on what cgroup version is available on the host.
|
||||
// This tests either v1 or v2 fs cgroup manager, depending on which
|
||||
// cgroup version is available.
|
||||
func TestNilResources(t *testing.T) {
|
||||
for _, sd := range []bool{false, true} {
|
||||
cg := &configs.Cgroup{} // .Resources is nil
|
||||
cg.Systemd = sd
|
||||
mgr, err := New(cg)
|
||||
if err != nil {
|
||||
// Some managers require non-nil Resources during
|
||||
// instantiation -- provide and retry. In such case
|
||||
// we're mostly testing Set(nil) below.
|
||||
cg.Resources = &configs.Resources{}
|
||||
mgr, err = New(cg)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
continue
|
||||
}
|
||||
}
|
||||
_ = mgr.Apply(-1)
|
||||
_ = mgr.Set(nil)
|
||||
_ = mgr.Freeze(configs.Thawed)
|
||||
_ = mgr.Exists()
|
||||
_, _ = mgr.GetAllPids()
|
||||
_, _ = mgr.GetCgroups()
|
||||
_, _ = mgr.GetFreezerState()
|
||||
_ = mgr.Path("")
|
||||
_ = mgr.GetPaths()
|
||||
_, _ = mgr.GetStats()
|
||||
_, _ = mgr.OOMKillCount()
|
||||
_ = mgr.Destroy()
|
||||
}
|
||||
testNilResources(t, false)
|
||||
}
|
||||
|
||||
// TestNilResourcesSystemd is the same as TestNilResources,
|
||||
// only checking the systemd cgroup manager.
|
||||
func TestNilResourcesSystemd(t *testing.T) {
|
||||
if !systemd.IsRunningSystemd() {
|
||||
t.Skip("requires systemd")
|
||||
}
|
||||
testNilResources(t, true)
|
||||
}
|
||||
|
||||
func testNilResources(t *testing.T, systemd bool) {
|
||||
cg := &configs.Cgroup{} // .Resources is nil
|
||||
cg.Systemd = systemd
|
||||
mgr, err := New(cg)
|
||||
if err != nil {
|
||||
// Some managers require non-nil Resources during
|
||||
// instantiation -- provide and retry. In such case
|
||||
// we're mostly testing Set(nil) below.
|
||||
cg.Resources = &configs.Resources{}
|
||||
mgr, err = New(cg)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
_ = mgr.Apply(-1)
|
||||
_ = mgr.Set(nil)
|
||||
_ = mgr.Freeze(configs.Thawed)
|
||||
_ = mgr.Exists()
|
||||
_, _ = mgr.GetAllPids()
|
||||
_, _ = mgr.GetCgroups()
|
||||
_, _ = mgr.GetFreezerState()
|
||||
_ = mgr.Path("")
|
||||
_ = mgr.GetPaths()
|
||||
_, _ = mgr.GetStats()
|
||||
_, _ = mgr.OOMKillCount()
|
||||
_ = mgr.Destroy()
|
||||
}
|
||||
|
|
|
@ -78,6 +78,8 @@ type MemoryStats struct {
|
|||
Usage MemoryData `json:"usage,omitempty"`
|
||||
// usage of memory + swap
|
||||
SwapUsage MemoryData `json:"swap_usage,omitempty"`
|
||||
// usage of swap only
|
||||
SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"`
|
||||
// usage of kernel memory
|
||||
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
|
||||
// usage of kernel TCP memory
|
||||
|
|
|
@ -177,7 +177,7 @@ func allowAllDevices() []systemdDbus.Property {
|
|||
|
||||
// generateDeviceProperties takes the configured device rules and generates a
|
||||
// corresponding set of systemd properties to configure the devices correctly.
|
||||
func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, error) {
|
||||
func generateDeviceProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) {
|
||||
if r.SkipDevices {
|
||||
return nil, nil
|
||||
}
|
||||
|
@ -238,9 +238,10 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err
|
|||
// trickery to convert things:
|
||||
//
|
||||
// * Concrete rules with non-wildcard major/minor numbers have to use
|
||||
// /dev/{block,char} paths. This is slightly odd because it means
|
||||
// that we cannot add whitelist rules for devices that don't exist,
|
||||
// but there's not too much we can do about that.
|
||||
// /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses
|
||||
// stat(2) on such paths to look up device properties, meaning we
|
||||
// cannot add whitelist rules for devices that don't exist. Since v240,
|
||||
// device properties are parsed from the path string.
|
||||
//
|
||||
// However, path globbing is not supported for path-based rules so we
|
||||
// need to handle wildcards in some other manner.
|
||||
|
@ -288,6 +289,17 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err
|
|||
case devices.CharDevice:
|
||||
entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
|
||||
}
|
||||
if sdVer < 240 {
|
||||
// Old systemd versions use stat(2) on path to find out device major:minor
|
||||
// numbers and type. If the path doesn't exist, it will not add the rule,
|
||||
// emitting a warning instead.
|
||||
// Since all of this logic is best-effort anyway (we manually set these
|
||||
// rules separately to systemd) we can safely skip entries that don't
|
||||
// have a corresponding path.
|
||||
if _, err := os.Stat(entry.Path); err != nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
}
|
||||
deviceAllowList = append(deviceAllowList, entry)
|
||||
}
|
||||
|
@ -335,32 +347,55 @@ func isUnitExists(err error) bool {
|
|||
return isDbusError(err, "org.freedesktop.systemd1.UnitExists")
|
||||
}
|
||||
|
||||
func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property) error {
|
||||
func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error {
|
||||
statusChan := make(chan string, 1)
|
||||
retry := true
|
||||
|
||||
retry:
|
||||
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
|
||||
_, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan)
|
||||
return err
|
||||
})
|
||||
if err == nil {
|
||||
timeout := time.NewTimer(30 * time.Second)
|
||||
defer timeout.Stop()
|
||||
|
||||
select {
|
||||
case s := <-statusChan:
|
||||
close(statusChan)
|
||||
// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
|
||||
if s != "done" {
|
||||
resetFailedUnit(cm, unitName)
|
||||
return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
|
||||
}
|
||||
case <-timeout.C:
|
||||
resetFailedUnit(cm, unitName)
|
||||
return errors.New("Timeout waiting for systemd to create " + unitName)
|
||||
if err != nil {
|
||||
if !isUnitExists(err) {
|
||||
return err
|
||||
}
|
||||
if ignoreExist {
|
||||
// TODO: remove this hack.
|
||||
// This is kubelet making sure a slice exists (see
|
||||
// https://github.com/opencontainers/runc/pull/1124).
|
||||
return nil
|
||||
}
|
||||
if retry {
|
||||
// In case a unit with the same name exists, this may
|
||||
// be a leftover failed unit. Reset it, so systemd can
|
||||
// remove it, and retry once.
|
||||
err = resetFailedUnit(cm, unitName)
|
||||
if err != nil {
|
||||
logrus.Warnf("unable to reset failed unit: %v", err)
|
||||
}
|
||||
retry = false
|
||||
goto retry
|
||||
}
|
||||
} else if !isUnitExists(err) {
|
||||
return err
|
||||
}
|
||||
|
||||
timeout := time.NewTimer(30 * time.Second)
|
||||
defer timeout.Stop()
|
||||
|
||||
select {
|
||||
case s := <-statusChan:
|
||||
close(statusChan)
|
||||
// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
|
||||
if s != "done" {
|
||||
_ = resetFailedUnit(cm, unitName)
|
||||
return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
|
||||
}
|
||||
case <-timeout.C:
|
||||
_ = resetFailedUnit(cm, unitName)
|
||||
return errors.New("Timeout waiting for systemd to create " + unitName)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -385,16 +420,17 @@ func stopUnit(cm *dbusConnManager, unitName string) error {
|
|||
return errors.New("Timed out while waiting for systemd to remove " + unitName)
|
||||
}
|
||||
}
|
||||
|
||||
// In case of a failed unit, let systemd remove it.
|
||||
_ = resetFailedUnit(cm, unitName)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func resetFailedUnit(cm *dbusConnManager, name string) {
|
||||
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
|
||||
func resetFailedUnit(cm *dbusConnManager, name string) error {
|
||||
return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
|
||||
return c.ResetFailedUnitContext(context.TODO(), name)
|
||||
})
|
||||
if err != nil {
|
||||
logrus.Warnf("unable to reset failed unit: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) {
|
||||
|
|
|
@ -51,5 +51,10 @@ func RangeToBits(str string) ([]byte, error) {
|
|||
// do not allow empty values
|
||||
return nil, errors.New("empty value")
|
||||
}
|
||||
|
||||
// fit cpuset parsing order in systemd
|
||||
for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 {
|
||||
ret[l], ret[r] = ret[r], ret[l]
|
||||
}
|
||||
return ret, nil
|
||||
}
|
||||
|
|
|
@ -22,13 +22,13 @@ func TestRangeToBits(t *testing.T) {
|
|||
{in: "4-7", out: []byte{0xf0}},
|
||||
{in: "0-7", out: []byte{0xff}},
|
||||
{in: "0-15", out: []byte{0xff, 0xff}},
|
||||
{in: "16", out: []byte{1, 0, 0}},
|
||||
{in: "0-3,32-33", out: []byte{3, 0, 0, 0, 0x0f}},
|
||||
{in: "16", out: []byte{0, 0, 1}},
|
||||
{in: "0-3,32-33", out: []byte{0x0f, 0, 0, 0, 3}},
|
||||
// extra spaces and tabs are ok
|
||||
{in: "1, 2, 1-2", out: []byte{6}},
|
||||
{in: " , 1 , 3 , 5-7, ", out: []byte{0xea}},
|
||||
// somewhat large values
|
||||
{in: "128-130,1", out: []byte{7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2}},
|
||||
{in: "128-130,1", out: []byte{2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7}},
|
||||
|
||||
{in: "-", isErr: true},
|
||||
{in: "1-", isErr: true},
|
||||
|
|
|
@ -2,6 +2,7 @@ package systemd
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
|
@ -80,8 +81,6 @@ func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) {
|
|||
}
|
||||
}
|
||||
|
||||
var errDbusConnClosed = dbus.ErrClosed.Error()
|
||||
|
||||
// retryOnDisconnect calls op, and if the error it returns is about closed dbus
|
||||
// connection, the connection is re-established and the op is retried. This helps
|
||||
// with the situation when dbus is restarted and we have a stale connection.
|
||||
|
@ -92,7 +91,10 @@ func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) er
|
|||
return err
|
||||
}
|
||||
err = op(conn)
|
||||
if !isDbusError(err, errDbusConnClosed) {
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
if !errors.Is(err, dbus.ErrClosed) {
|
||||
return err
|
||||
}
|
||||
d.resetConnection(conn)
|
||||
|
|
|
@ -127,7 +127,7 @@ func TestPodSkipDevicesUpdate(t *testing.T) {
|
|||
|
||||
// Create a "container" within the "pod" cgroup.
|
||||
// This is not a real container, just a process in the cgroup.
|
||||
cmd := exec.Command("bash", "-c", "while true; do echo > /dev/null; done")
|
||||
cmd := exec.Command("sleep", "infinity")
|
||||
cmd.Env = append(os.Environ(), "LANG=C")
|
||||
var stderr bytes.Buffer
|
||||
cmd.Stderr = &stderr
|
||||
|
@ -183,6 +183,11 @@ func testSkipDevices(t *testing.T, skipDevices bool, expected []string) {
|
|||
if os.Geteuid() != 0 {
|
||||
t.Skip("Test requires root.")
|
||||
}
|
||||
// https://github.com/opencontainers/runc/issues/3743
|
||||
centosVer, _ := exec.Command("rpm", "-q", "--qf", "%{version}", "centos-release").CombinedOutput()
|
||||
if string(centosVer) == "7" {
|
||||
t.Skip("Flaky on CentOS 7")
|
||||
}
|
||||
|
||||
podConfig := &configs.Cgroup{
|
||||
Parent: "system.slice",
|
||||
|
|
|
@ -71,12 +71,13 @@ var legacySubsystems = []subsystem{
|
|||
&fs.NetClsGroup{},
|
||||
&fs.NameGroup{GroupName: "name=systemd"},
|
||||
&fs.RdmaGroup{},
|
||||
&fs.NameGroup{GroupName: "misc"},
|
||||
}
|
||||
|
||||
func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
|
||||
var properties []systemdDbus.Property
|
||||
|
||||
deviceProperties, err := generateDeviceProperties(r)
|
||||
deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -206,7 +207,7 @@ func (m *legacyManager) Apply(pid int) error {
|
|||
|
||||
properties = append(properties, c.SystemdProps...)
|
||||
|
||||
if err := startUnit(m.dbus, unitName, properties); err != nil {
|
||||
if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -273,14 +274,7 @@ func getSubsystemPath(slice, unit, subsystem string) (string, error) {
|
|||
return "", err
|
||||
}
|
||||
|
||||
initPath, err := cgroups.GetInitCgroup(subsystem)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// if pid 1 is systemd 226 or later, it will be in init.scope, not the root
|
||||
initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")
|
||||
|
||||
return filepath.Join(mountpoint, initPath, slice, unit), nil
|
||||
return filepath.Join(mountpoint, slice, unit), nil
|
||||
}
|
||||
|
||||
func (m *legacyManager) Freeze(state configs.FreezerState) error {
|
||||
|
@ -423,6 +417,15 @@ func (m *legacyManager) Set(r *configs.Resources) error {
|
|||
if err := m.doFreeze(configs.Frozen); err != nil {
|
||||
// If freezer cgroup isn't supported, we just warn about it.
|
||||
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
|
||||
// Skip updating the cgroup if freezing failed. #3803
|
||||
if !errors.Is(err, errSubsystemDoesNotExist) {
|
||||
if needsThaw {
|
||||
if thawErr := m.doFreeze(configs.Thawed); thawErr != nil {
|
||||
logrus.Infof("thaw container after doFreeze failed: %v", thawErr)
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
setErr := setUnitProperties(m.dbus, unitName, properties...)
|
||||
|
|
|
@ -2,6 +2,7 @@ package systemd
|
|||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
|
@ -181,7 +182,7 @@ func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
|
|||
// aren't the end of the world, but it is a bit concerning. However
|
||||
// it's unclear if systemd removes all eBPF programs attached when
|
||||
// doing SetUnitProperties...
|
||||
deviceProperties, err := generateDeviceProperties(r)
|
||||
deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
@ -283,7 +284,7 @@ func (m *unifiedManager) Apply(pid int) error {
|
|||
|
||||
properties = append(properties, c.SystemdProps...)
|
||||
|
||||
if err := startUnit(m.dbus, unitName, properties); err != nil {
|
||||
if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
|
||||
return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err)
|
||||
}
|
||||
|
||||
|
@ -292,6 +293,12 @@ func (m *unifiedManager) Apply(pid int) error {
|
|||
}
|
||||
|
||||
if c.OwnerUID != nil {
|
||||
// The directory itself must be chowned.
|
||||
err := os.Chown(m.path, *c.OwnerUID, -1)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
filesToChown, err := cgroupFilesToChown()
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -299,7 +306,8 @@ func (m *unifiedManager) Apply(pid int) error {
|
|||
|
||||
for _, v := range filesToChown {
|
||||
err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
|
||||
if err != nil {
|
||||
// Some files might not be present.
|
||||
if err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
@ -312,21 +320,23 @@ func (m *unifiedManager) Apply(pid int) error {
|
|||
// uid in /sys/kernel/cgroup/delegate. If the file is not present
|
||||
// (Linux < 4.15), use the initial values mentioned in cgroups(7).
|
||||
func cgroupFilesToChown() ([]string, error) {
|
||||
filesToChown := []string{"."} // the directory itself must be chowned
|
||||
const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"
|
||||
|
||||
f, err := os.Open(cgroupDelegateFile)
|
||||
if err == nil {
|
||||
defer f.Close()
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
filesToChown = append(filesToChown, scanner.Text())
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
|
||||
}
|
||||
} else {
|
||||
filesToChown = append(filesToChown, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads")
|
||||
if err != nil {
|
||||
return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
filesToChown := []string{}
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
filesToChown = append(filesToChown, scanner.Text())
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
|
||||
}
|
||||
|
||||
return filesToChown, nil
|
||||
}
|
||||
|
||||
|
|
|
@ -55,12 +55,12 @@ func IsCgroup2HybridMode() bool {
|
|||
var st unix.Statfs_t
|
||||
err := unix.Statfs(hybridMountpoint, &st)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
// ignore the "not found" error
|
||||
isHybrid = false
|
||||
return
|
||||
isHybrid = false
|
||||
if !os.IsNotExist(err) {
|
||||
// Report unexpected errors.
|
||||
logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint)
|
||||
}
|
||||
panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
|
||||
return
|
||||
}
|
||||
isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC
|
||||
})
|
||||
|
@ -162,8 +162,10 @@ func readProcsFile(dir string) ([]int, error) {
|
|||
|
||||
// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
|
||||
// or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
|
||||
// "cpu": "/user.slice/user-1000.slice"
|
||||
// "pids": "/user.slice/user-1000.slice"
|
||||
//
|
||||
// "cpu": "/user.slice/user-1000.slice"
|
||||
// "pids": "/user.slice/user-1000.slice"
|
||||
//
|
||||
// etc.
|
||||
//
|
||||
// Note that for cgroup v2 unified hierarchy, there are no per-controller
|
||||
|
|
|
@ -21,9 +21,9 @@ type Rlimit struct {
|
|||
|
||||
// IDMap represents UID/GID Mappings for User Namespaces.
|
||||
type IDMap struct {
|
||||
ContainerID int `json:"container_id"`
|
||||
HostID int `json:"host_id"`
|
||||
Size int `json:"size"`
|
||||
ContainerID int64 `json:"container_id"`
|
||||
HostID int64 `json:"host_id"`
|
||||
Size int64 `json:"size"`
|
||||
}
|
||||
|
||||
// Seccomp represents syscall restrictions
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
package configs
|
||||
|
||||
import "errors"
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
)
|
||||
|
||||
var (
|
||||
errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.")
|
||||
|
@ -16,11 +20,18 @@ func (c Config) HostUID(containerId int) (int, error) {
|
|||
if c.UidMappings == nil {
|
||||
return -1, errNoUIDMap
|
||||
}
|
||||
id, found := c.hostIDFromMapping(containerId, c.UidMappings)
|
||||
id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings)
|
||||
if !found {
|
||||
return -1, errNoUserMap
|
||||
}
|
||||
return id, nil
|
||||
// If we are a 32-bit binary running on a 64-bit system, it's possible
|
||||
// the mapped user is too large to store in an int, which means we
|
||||
// cannot do the mapping. We can't just return an int64, because
|
||||
// os.Setuid() takes an int.
|
||||
if id > math.MaxInt {
|
||||
return -1, fmt.Errorf("mapping for uid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
|
||||
}
|
||||
return int(id), nil
|
||||
}
|
||||
// Return unchanged id.
|
||||
return containerId, nil
|
||||
|
@ -39,11 +50,18 @@ func (c Config) HostGID(containerId int) (int, error) {
|
|||
if c.GidMappings == nil {
|
||||
return -1, errNoGIDMap
|
||||
}
|
||||
id, found := c.hostIDFromMapping(containerId, c.GidMappings)
|
||||
id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings)
|
||||
if !found {
|
||||
return -1, errNoGroupMap
|
||||
}
|
||||
return id, nil
|
||||
// If we are a 32-bit binary running on a 64-bit system, it's possible
|
||||
// the mapped user is too large to store in an int, which means we
|
||||
// cannot do the mapping. We can't just return an int64, because
|
||||
// os.Setgid() takes an int.
|
||||
if id > math.MaxInt {
|
||||
return -1, fmt.Errorf("mapping for gid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
|
||||
}
|
||||
return int(id), nil
|
||||
}
|
||||
// Return unchanged id.
|
||||
return containerId, nil
|
||||
|
@ -57,7 +75,7 @@ func (c Config) HostRootGID() (int, error) {
|
|||
|
||||
// Utility function that gets a host ID for a container ID from user namespace map
|
||||
// if that ID is present in the map.
|
||||
func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
|
||||
func (c Config) hostIDFromMapping(containerID int64, uMap []IDMap) (int64, bool) {
|
||||
for _, m := range uMap {
|
||||
if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
|
||||
hostID := m.HostID + (containerID - m.ContainerID)
|
||||
|
|
|
@ -28,25 +28,18 @@ func (v *ConfigValidator) rootlessEUID(config *configs.Config) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func hasIDMapping(id int, mappings []configs.IDMap) bool {
|
||||
for _, m := range mappings {
|
||||
if id >= m.ContainerID && id < m.ContainerID+m.Size {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func rootlessEUIDMappings(config *configs.Config) error {
|
||||
if !config.Namespaces.Contains(configs.NEWUSER) {
|
||||
return errors.New("rootless container requires user namespaces")
|
||||
}
|
||||
|
||||
if len(config.UidMappings) == 0 {
|
||||
return errors.New("rootless containers requires at least one UID mapping")
|
||||
}
|
||||
if len(config.GidMappings) == 0 {
|
||||
return errors.New("rootless containers requires at least one GID mapping")
|
||||
// We only require mappings if we are not joining another userns.
|
||||
if path := config.Namespaces.PathOf(configs.NEWUSER); path == "" {
|
||||
if len(config.UidMappings) == 0 {
|
||||
return errors.New("rootless containers requires at least one UID mapping")
|
||||
}
|
||||
if len(config.GidMappings) == 0 {
|
||||
return errors.New("rootless containers requires at least one GID mapping")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
@ -70,8 +63,8 @@ func rootlessEUIDMount(config *configs.Config) error {
|
|||
// Ignore unknown mount options.
|
||||
continue
|
||||
}
|
||||
if !hasIDMapping(uid, config.UidMappings) {
|
||||
return errors.New("cannot specify uid= mount options for unmapped uid in rootless containers")
|
||||
if _, err := config.HostUID(uid); err != nil {
|
||||
return fmt.Errorf("cannot specify uid=%d mount option for rootless container: %w", uid, err)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -82,8 +75,8 @@ func rootlessEUIDMount(config *configs.Config) error {
|
|||
// Ignore unknown mount options.
|
||||
continue
|
||||
}
|
||||
if !hasIDMapping(gid, config.GidMappings) {
|
||||
return errors.New("cannot specify gid= mount options for unmapped gid in rootless containers")
|
||||
if _, err := config.HostGID(gid); err != nil {
|
||||
return fmt.Errorf("cannot specify gid=%d mount option for rootless container: %w", gid, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -109,11 +109,19 @@ func (v *ConfigValidator) security(config *configs.Config) error {
|
|||
func (v *ConfigValidator) usernamespace(config *configs.Config) error {
|
||||
if config.Namespaces.Contains(configs.NEWUSER) {
|
||||
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
|
||||
return errors.New("USER namespaces aren't enabled in the kernel")
|
||||
return errors.New("user namespaces aren't enabled in the kernel")
|
||||
}
|
||||
hasPath := config.Namespaces.PathOf(configs.NEWUSER) != ""
|
||||
hasMappings := config.UidMappings != nil || config.GidMappings != nil
|
||||
if !hasPath && !hasMappings {
|
||||
return errors.New("user namespaces enabled, but no namespace path to join nor mappings to apply specified")
|
||||
}
|
||||
// The hasPath && hasMappings validation case is handled in specconv --
|
||||
// we cache the mappings in Config during specconv in the hasPath case,
|
||||
// so we cannot do that validation here.
|
||||
} else {
|
||||
if config.UidMappings != nil || config.GidMappings != nil {
|
||||
return errors.New("User namespace mappings specified, but USER namespace isn't enabled in the config")
|
||||
return errors.New("user namespace mappings specified, but user namespace isn't enabled in the config")
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
@ -131,9 +139,8 @@ func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
|
|||
// convertSysctlVariableToDotsSeparator can return sysctl variables in dots separator format.
|
||||
// The '/' separator is also accepted in place of a '.'.
|
||||
// Convert the sysctl variables to dots separator format for validation.
|
||||
// More info:
|
||||
// https://man7.org/linux/man-pages/man8/sysctl.8.html
|
||||
// https://man7.org/linux/man-pages/man5/sysctl.d.5.html
|
||||
// More info: sysctl(8), sysctl.d(5).
|
||||
//
|
||||
// For example:
|
||||
// Input sysctl variable "net/ipv4/conf/eno2.100.rp_filter"
|
||||
// will return the converted value "net.ipv4.conf.eno2/100.rp_filter"
|
||||
|
@ -229,10 +236,6 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
|
|||
|
||||
func (v *ConfigValidator) intelrdt(config *configs.Config) error {
|
||||
if config.IntelRdt != nil {
|
||||
if !intelrdt.IsCATEnabled() && !intelrdt.IsMBAEnabled() {
|
||||
return errors.New("intelRdt is specified in config, but Intel RDT is not supported or enabled")
|
||||
}
|
||||
|
||||
if config.IntelRdt.ClosID == "." || config.IntelRdt.ClosID == ".." || strings.Contains(config.IntelRdt.ClosID, "/") {
|
||||
return fmt.Errorf("invalid intelRdt.ClosID %q", config.IntelRdt.ClosID)
|
||||
}
|
||||
|
|
|
@ -150,7 +150,7 @@ func TestValidateSecurityWithoutNEWNS(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestValidateUsernamespace(t *testing.T) {
|
||||
func TestValidateUserNamespace(t *testing.T) {
|
||||
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
|
||||
t.Skip("Test requires userns.")
|
||||
}
|
||||
|
@ -161,6 +161,8 @@ func TestValidateUsernamespace(t *testing.T) {
|
|||
{Type: configs.NEWUSER},
|
||||
},
|
||||
),
|
||||
UidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}},
|
||||
GidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}},
|
||||
}
|
||||
|
||||
validator := New()
|
||||
|
@ -170,11 +172,11 @@ func TestValidateUsernamespace(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestValidateUsernamespaceWithoutUserNS(t *testing.T) {
|
||||
uidMap := configs.IDMap{ContainerID: 123}
|
||||
func TestValidateUsernsMappingWithoutNamespace(t *testing.T) {
|
||||
config := &configs.Config{
|
||||
Rootfs: "/var",
|
||||
UidMappings: []configs.IDMap{uidMap},
|
||||
UidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}},
|
||||
GidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}},
|
||||
}
|
||||
|
||||
validator := New()
|
||||
|
|
|
@ -40,7 +40,7 @@ type linuxContainer struct {
|
|||
root string
|
||||
config *configs.Config
|
||||
cgroupManager cgroups.Manager
|
||||
intelRdtManager intelrdt.Manager
|
||||
intelRdtManager *intelrdt.Manager
|
||||
initPath string
|
||||
initArgs []string
|
||||
initProcess parentProcess
|
||||
|
@ -146,19 +146,21 @@ func (c *linuxContainer) OCIState() (*specs.State, error) {
|
|||
return c.currentOCIState()
|
||||
}
|
||||
|
||||
func (c *linuxContainer) Processes() ([]int, error) {
|
||||
var pids []int
|
||||
status, err := c.currentStatus()
|
||||
if err != nil {
|
||||
return pids, err
|
||||
// ignoreCgroupError filters out cgroup-related errors that can be ignored,
|
||||
// because the container is stopped and its cgroup is gone.
|
||||
func (c *linuxContainer) ignoreCgroupError(err error) error {
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
// for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
|
||||
if status == Stopped && !c.cgroupManager.Exists() {
|
||||
return pids, nil
|
||||
if errors.Is(err, os.ErrNotExist) && c.runType() == Stopped && !c.cgroupManager.Exists() {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
pids, err = c.cgroupManager.GetAllPids()
|
||||
if err != nil {
|
||||
func (c *linuxContainer) Processes() ([]int, error) {
|
||||
pids, err := c.cgroupManager.GetAllPids()
|
||||
if err = c.ignoreCgroupError(err); err != nil {
|
||||
return nil, fmt.Errorf("unable to get all container pids: %w", err)
|
||||
}
|
||||
return pids, nil
|
||||
|
@ -351,6 +353,15 @@ func (c *linuxContainer) start(process *Process) (retErr error) {
|
|||
}()
|
||||
}
|
||||
|
||||
// Before starting "runc init", mark all non-stdio open files as O_CLOEXEC
|
||||
// to make sure we don't leak any files into "runc init". Any files to be
|
||||
// passed to "runc init" through ExtraFiles will get dup2'd by the Go
|
||||
// runtime and thus their O_CLOEXEC flag will be cleared. This is some
|
||||
// additional protection against attacks like CVE-2024-21626, by making
|
||||
// sure we never leak files to "runc init" we didn't intend to.
|
||||
if err := utils.CloseExecFrom(3); err != nil {
|
||||
return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
|
||||
}
|
||||
if err := parent.start(); err != nil {
|
||||
return fmt.Errorf("unable to start container process: %w", err)
|
||||
}
|
||||
|
@ -382,11 +393,12 @@ func (c *linuxContainer) Signal(s os.Signal, all bool) error {
|
|||
return err
|
||||
}
|
||||
if all {
|
||||
// for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
|
||||
if status == Stopped && !c.cgroupManager.Exists() {
|
||||
// Avoid calling signalAllProcesses which may print
|
||||
// a warning trying to freeze a non-existing cgroup.
|
||||
return nil
|
||||
}
|
||||
return signalAllProcesses(c.cgroupManager, s)
|
||||
return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, s))
|
||||
}
|
||||
// to avoid a PID reuse attack
|
||||
if status == Running || status == Created || status == Paused {
|
||||
|
@ -636,7 +648,11 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP
|
|||
// cgroup v1: using the same path for all controllers.
|
||||
// cgroup v2: the only possible way.
|
||||
for k := range proc.cgroupPaths {
|
||||
proc.cgroupPaths[k] = path.Join(proc.cgroupPaths[k], add)
|
||||
subPath := path.Join(proc.cgroupPaths[k], add)
|
||||
if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) {
|
||||
return nil, fmt.Errorf("%s is not a sub cgroup path", add)
|
||||
}
|
||||
proc.cgroupPaths[k] = subPath
|
||||
}
|
||||
// cgroup v2: do not try to join init process's cgroup
|
||||
// as a fallback (see (*setnsProcess).start).
|
||||
|
@ -645,7 +661,11 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP
|
|||
// Per-controller paths.
|
||||
for ctrl, add := range p.SubCgroupPaths {
|
||||
if val, ok := proc.cgroupPaths[ctrl]; ok {
|
||||
proc.cgroupPaths[ctrl] = path.Join(val, add)
|
||||
subPath := path.Join(val, add)
|
||||
if !strings.HasPrefix(subPath, val) {
|
||||
return nil, fmt.Errorf("%s is not a sub cgroup path", add)
|
||||
}
|
||||
proc.cgroupPaths[ctrl] = subPath
|
||||
} else {
|
||||
return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
|
||||
}
|
||||
|
@ -918,7 +938,7 @@ func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool {
|
|||
}
|
||||
|
||||
func criuNsToKey(t configs.NamespaceType) string {
|
||||
return "extRoot" + strings.Title(configs.NsName(t)) + "NS"
|
||||
return "extRoot" + strings.Title(configs.NsName(t)) + "NS" //nolint:staticcheck // SA1019: strings.Title is deprecated
|
||||
}
|
||||
|
||||
func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error {
|
||||
|
@ -2257,7 +2277,7 @@ func ignoreTerminateErrors(err error) error {
|
|||
|
||||
func requiresRootOrMappingTool(c *configs.Config) bool {
|
||||
gidMap := []configs.IDMap{
|
||||
{ContainerID: 0, HostID: os.Getegid(), Size: 1},
|
||||
{ContainerID: 0, HostID: int64(os.Getegid()), Size: 1},
|
||||
}
|
||||
return !reflect.DeepEqual(c.GidMappings, gidMap)
|
||||
}
|
||||
|
|
|
@ -7,22 +7,15 @@ import (
|
|||
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/opencontainers/runc/libcontainer/intelrdt"
|
||||
"github.com/opencontainers/runc/libcontainer/system"
|
||||
)
|
||||
|
||||
type mockCgroupManager struct {
|
||||
pids []int
|
||||
allPids []int
|
||||
stats *cgroups.Stats
|
||||
paths map[string]string
|
||||
}
|
||||
|
||||
type mockIntelRdtManager struct {
|
||||
stats *intelrdt.Stats
|
||||
path string
|
||||
}
|
||||
|
||||
func (m *mockCgroupManager) GetPids() ([]int, error) {
|
||||
return m.pids, nil
|
||||
}
|
||||
|
@ -32,7 +25,7 @@ func (m *mockCgroupManager) GetAllPids() ([]int, error) {
|
|||
}
|
||||
|
||||
func (m *mockCgroupManager) GetStats() (*cgroups.Stats, error) {
|
||||
return m.stats, nil
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (m *mockCgroupManager) Apply(pid int) error {
|
||||
|
@ -76,30 +69,6 @@ func (m *mockCgroupManager) GetFreezerState() (configs.FreezerState, error) {
|
|||
return configs.Thawed, nil
|
||||
}
|
||||
|
||||
func (m *mockIntelRdtManager) Apply(pid int) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockIntelRdtManager) GetStats() (*intelrdt.Stats, error) {
|
||||
return m.stats, nil
|
||||
}
|
||||
|
||||
func (m *mockIntelRdtManager) Destroy() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockIntelRdtManager) GetPath() string {
|
||||
return m.path
|
||||
}
|
||||
|
||||
func (m *mockIntelRdtManager) Set(container *configs.Config) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockIntelRdtManager) GetCgroups() (*configs.Cgroup, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
type mockProcess struct {
|
||||
_pid int
|
||||
started uint64
|
||||
|
@ -173,61 +142,11 @@ func TestGetContainerPids(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestGetContainerStats(t *testing.T) {
|
||||
container := &linuxContainer{
|
||||
id: "myid",
|
||||
config: &configs.Config{},
|
||||
cgroupManager: &mockCgroupManager{
|
||||
pids: []int{1, 2, 3},
|
||||
stats: &cgroups.Stats{
|
||||
MemoryStats: cgroups.MemoryStats{
|
||||
Usage: cgroups.MemoryData{
|
||||
Usage: 1024,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
intelRdtManager: &mockIntelRdtManager{
|
||||
stats: &intelrdt.Stats{
|
||||
L3CacheSchema: "L3:0=f;1=f0",
|
||||
MemBwSchema: "MB:0=20;1=70",
|
||||
},
|
||||
},
|
||||
}
|
||||
stats, err := container.Stats()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if stats.CgroupStats == nil {
|
||||
t.Fatal("cgroup stats are nil")
|
||||
}
|
||||
if stats.CgroupStats.MemoryStats.Usage.Usage != 1024 {
|
||||
t.Fatalf("expected memory usage 1024 but received %d", stats.CgroupStats.MemoryStats.Usage.Usage)
|
||||
}
|
||||
if intelrdt.IsCATEnabled() {
|
||||
if stats.IntelRdtStats == nil {
|
||||
t.Fatal("intel rdt stats are nil")
|
||||
}
|
||||
if stats.IntelRdtStats.L3CacheSchema != "L3:0=f;1=f0" {
|
||||
t.Fatalf("expected L3CacheSchema L3:0=f;1=f0 but received %s", stats.IntelRdtStats.L3CacheSchema)
|
||||
}
|
||||
}
|
||||
if intelrdt.IsMBAEnabled() {
|
||||
if stats.IntelRdtStats == nil {
|
||||
t.Fatal("intel rdt stats are nil")
|
||||
}
|
||||
if stats.IntelRdtStats.MemBwSchema != "MB:0=20;1=70" {
|
||||
t.Fatalf("expected MemBwSchema MB:0=20;1=70 but received %s", stats.IntelRdtStats.MemBwSchema)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetContainerState(t *testing.T) {
|
||||
var (
|
||||
pid = os.Getpid()
|
||||
expectedMemoryPath = "/sys/fs/cgroup/memory/myid"
|
||||
expectedNetworkPath = fmt.Sprintf("/proc/%d/ns/net", pid)
|
||||
expectedIntelRdtPath = "/sys/fs/resctrl/myid"
|
||||
pid = os.Getpid()
|
||||
expectedMemoryPath = "/sys/fs/cgroup/memory/myid"
|
||||
expectedNetworkPath = fmt.Sprintf("/proc/%d/ns/net", pid)
|
||||
)
|
||||
container := &linuxContainer{
|
||||
id: "myid",
|
||||
|
@ -248,24 +167,10 @@ func TestGetContainerState(t *testing.T) {
|
|||
},
|
||||
cgroupManager: &mockCgroupManager{
|
||||
pids: []int{1, 2, 3},
|
||||
stats: &cgroups.Stats{
|
||||
MemoryStats: cgroups.MemoryStats{
|
||||
Usage: cgroups.MemoryData{
|
||||
Usage: 1024,
|
||||
},
|
||||
},
|
||||
},
|
||||
paths: map[string]string{
|
||||
"memory": expectedMemoryPath,
|
||||
},
|
||||
},
|
||||
intelRdtManager: &mockIntelRdtManager{
|
||||
stats: &intelrdt.Stats{
|
||||
L3CacheSchema: "L3:0=f0;1=f",
|
||||
MemBwSchema: "MB:0=70;1=20",
|
||||
},
|
||||
path: expectedIntelRdtPath,
|
||||
},
|
||||
}
|
||||
container.state = &createdState{c: container}
|
||||
state, err := container.State()
|
||||
|
@ -285,15 +190,6 @@ func TestGetContainerState(t *testing.T) {
|
|||
if memPath := paths["memory"]; memPath != expectedMemoryPath {
|
||||
t.Fatalf("expected memory path %q but received %q", expectedMemoryPath, memPath)
|
||||
}
|
||||
if intelrdt.IsCATEnabled() || intelrdt.IsMBAEnabled() {
|
||||
intelRdtPath := state.IntelRdtPath
|
||||
if intelRdtPath == "" {
|
||||
t.Fatal("intel rdt path should not be empty")
|
||||
}
|
||||
if intelRdtPath != expectedIntelRdtPath {
|
||||
t.Fatalf("expected intel rdt path %q but received %q", expectedIntelRdtPath, intelRdtPath)
|
||||
}
|
||||
}
|
||||
for _, ns := range container.config.Namespaces {
|
||||
path := state.NamespacePaths[ns.Type]
|
||||
if path == "" {
|
||||
|
|
|
@ -1,39 +0,0 @@
|
|||
//go:build !go1.17
|
||||
// +build !go1.17
|
||||
|
||||
package devices
|
||||
|
||||
import "io/fs"
|
||||
|
||||
// The following code is adapted from go1.17.1/src/io/fs/readdir.go
|
||||
// to compensate for the lack of fs.FileInfoToDirEntry in Go 1.16.
|
||||
|
||||
// dirInfo is a DirEntry based on a FileInfo: every method simply forwards
// to the wrapped fs.FileInfo.
type dirInfo struct {
	fileInfo fs.FileInfo
}

// IsDir reports whether the wrapped FileInfo describes a directory.
func (di dirInfo) IsDir() bool {
	return di.fileInfo.IsDir()
}

// Type returns the type bits of the wrapped FileInfo's mode.
func (di dirInfo) Type() fs.FileMode {
	return di.fileInfo.Mode().Type()
}

// Info returns the wrapped FileInfo itself; the error is always nil.
func (di dirInfo) Info() (fs.FileInfo, error) {
	return di.fileInfo, nil
}

// Name returns the name reported by the wrapped FileInfo.
func (di dirInfo) Name() string {
	return di.fileInfo.Name()
}

// fileInfoToDirEntry returns a DirEntry that returns information from info.
// If info is nil, fileInfoToDirEntry returns nil.
func fileInfoToDirEntry(info fs.FileInfo) fs.DirEntry {
	if info == nil {
		// Return an untyped nil so callers can compare the result against nil.
		return nil
	}
	return dirInfo{fileInfo: info}
}
|
|
@ -1,8 +0,0 @@
|
|||
//go:build go1.17
|
||||
// +build go1.17
|
||||
|
||||
package devices
|
||||
|
||||
import "io/fs"
|
||||
|
||||
var fileInfoToDirEntry = fs.FileInfoToDirEntry
|
|
@ -64,7 +64,7 @@ func TestHostDevicesIoutilReadDirDeepFailure(t *testing.T) {
|
|||
t.Fatalf("Unexpected error %v", err)
|
||||
}
|
||||
|
||||
return []fs.DirEntry{fileInfoToDirEntry(fi)}, nil
|
||||
return []fs.DirEntry{fs.FileInfoToDirEntry(fi)}, nil
|
||||
}
|
||||
defer cleanupTest()
|
||||
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
//go:build !go1.20
|
||||
// +build !go1.20
|
||||
|
||||
package libcontainer
|
||||
|
||||
import "golang.org/x/sys/unix"
|
||||
|
||||
func eaccess(path string) error {
|
||||
// This check is similar to access(2) with X_OK except for
|
||||
// setuid/setgid binaries where it checks against the effective
|
||||
// (rather than real) uid and gid. It is not needed in go 1.20
|
||||
// and beyond and will be removed later.
|
||||
|
||||
// Relies on code added in https://go-review.googlesource.com/c/sys/+/468877
|
||||
// and older CLs linked from there.
|
||||
return unix.Faccessat(unix.AT_FDCWD, path, unix.X_OK, unix.AT_EACCESS)
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
//go:build go1.20
|
||||
|
||||
package libcontainer
|
||||
|
||||
// eaccess is the Go 1.20+ variant of the executable-permission check. It is
// a deliberate no-op: path is ignored and nil is always returned.
func eaccess(path string) error {
	// Not needed in Go 1.20+ as the functionality is already in there
	// (added by https://go.dev/cl/416115, https://go.dev/cl/414824,
	// and fixed in Go 1.20.2 by https://go.dev/cl/469956).
	return nil
}
|
|
@ -48,20 +48,6 @@ func InitArgs(args ...string) func(*LinuxFactory) error {
|
|||
}
|
||||
}
|
||||
|
||||
// IntelRdtfs is an options func to configure a LinuxFactory to return
|
||||
// containers that use the Intel RDT "resource control" filesystem to
|
||||
// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth).
|
||||
func IntelRdtFs(l *LinuxFactory) error {
|
||||
if !intelrdt.IsCATEnabled() && !intelrdt.IsMBAEnabled() {
|
||||
l.NewIntelRdtManager = nil
|
||||
} else {
|
||||
l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager {
|
||||
return intelrdt.NewManager(config, id, path)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
|
||||
func TmpfsRoot(l *LinuxFactory) error {
|
||||
mounted, err := mountinfo.Mounted(l.Root)
|
||||
|
@ -136,9 +122,6 @@ type LinuxFactory struct {
|
|||
|
||||
// Validator provides validation to container configurations.
|
||||
Validator validate.Validator
|
||||
|
||||
// NewIntelRdtManager returns an initialized Intel RDT manager for a single container.
|
||||
NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager
|
||||
}
|
||||
|
||||
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
|
||||
|
@ -179,6 +162,12 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
|
|||
return nil, fmt.Errorf("unable to get cgroup PIDs: %w", err)
|
||||
}
|
||||
if len(pids) != 0 {
|
||||
if config.Cgroups.Systemd {
|
||||
// systemd cgroup driver can't add a pid to an
|
||||
// existing systemd unit and will return an
|
||||
// error anyway, so let's error out early.
|
||||
return nil, fmt.Errorf("container's cgroup is not empty: %d process(es) found", len(pids))
|
||||
}
|
||||
// TODO: return an error.
|
||||
logrus.Warnf("container's cgroup is not empty: %d process(es) found", len(pids))
|
||||
logrus.Warn("DEPRECATED: running container in a non-empty cgroup won't be supported in runc 1.2; https://github.com/opencontainers/runc/issues/3132")
|
||||
|
@ -202,18 +191,16 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
|
|||
return nil, err
|
||||
}
|
||||
c := &linuxContainer{
|
||||
id: id,
|
||||
root: containerRoot,
|
||||
config: config,
|
||||
initPath: l.InitPath,
|
||||
initArgs: l.InitArgs,
|
||||
criuPath: l.CriuPath,
|
||||
newuidmapPath: l.NewuidmapPath,
|
||||
newgidmapPath: l.NewgidmapPath,
|
||||
cgroupManager: cm,
|
||||
}
|
||||
if l.NewIntelRdtManager != nil {
|
||||
c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
|
||||
id: id,
|
||||
root: containerRoot,
|
||||
config: config,
|
||||
initPath: l.InitPath,
|
||||
initArgs: l.InitArgs,
|
||||
criuPath: l.CriuPath,
|
||||
newuidmapPath: l.NewuidmapPath,
|
||||
newgidmapPath: l.NewgidmapPath,
|
||||
cgroupManager: cm,
|
||||
intelRdtManager: intelrdt.NewManager(config, id, ""),
|
||||
}
|
||||
c.state = &stoppedState{c: c}
|
||||
return c, nil
|
||||
|
@ -255,12 +242,10 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
|
|||
newuidmapPath: l.NewuidmapPath,
|
||||
newgidmapPath: l.NewgidmapPath,
|
||||
cgroupManager: cm,
|
||||
intelRdtManager: intelrdt.NewManager(&state.Config, id, state.IntelRdtPath),
|
||||
root: containerRoot,
|
||||
created: state.Created,
|
||||
}
|
||||
if l.NewIntelRdtManager != nil {
|
||||
c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath)
|
||||
}
|
||||
c.state = &loadedState{c: c}
|
||||
if err := c.refreshState(); err != nil {
|
||||
return nil, err
|
||||
|
@ -338,7 +323,11 @@ func (l *LinuxFactory) StartInitialization() (err error) {
|
|||
|
||||
defer func() {
|
||||
if e := recover(); e != nil {
|
||||
err = fmt.Errorf("panic from initialization: %w, %v", e, string(debug.Stack()))
|
||||
if ee, ok := e.(error); ok {
|
||||
err = fmt.Errorf("panic from initialization: %w, %s", ee, debug.Stack())
|
||||
} else {
|
||||
err = fmt.Errorf("panic from initialization: %v, %s", e, debug.Stack())
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
||||
|
|
|
@ -37,28 +37,6 @@ func TestFactoryNew(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestFactoryNewIntelRdt(t *testing.T) {
|
||||
root := t.TempDir()
|
||||
factory, err := New(root, IntelRdtFs)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if factory == nil {
|
||||
t.Fatal("factory should not be nil")
|
||||
}
|
||||
lfactory, ok := factory.(*LinuxFactory)
|
||||
if !ok {
|
||||
t.Fatal("expected linux factory returned on linux based systems")
|
||||
}
|
||||
if lfactory.Root != root {
|
||||
t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root)
|
||||
}
|
||||
|
||||
if factory.Type() != "libcontainer" {
|
||||
t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer")
|
||||
}
|
||||
}
|
||||
|
||||
func TestFactoryNewTmpfs(t *testing.T) {
|
||||
root := t.TempDir()
|
||||
factory, err := New(root, TmpfsRoot)
|
||||
|
@ -157,7 +135,7 @@ func TestFactoryLoadContainer(t *testing.T) {
|
|||
if err := marshal(filepath.Join(root, id, stateFilename), expectedState); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
factory, err := New(root, IntelRdtFs)
|
||||
factory, err := New(root)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ import (
|
|||
"io"
|
||||
"net"
|
||||
"os"
|
||||
"strconv"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"unsafe"
|
||||
|
||||
|
@ -117,17 +117,17 @@ func populateProcessEnvironment(env []string) error {
|
|||
for _, pair := range env {
|
||||
p := strings.SplitN(pair, "=", 2)
|
||||
if len(p) < 2 {
|
||||
return fmt.Errorf("invalid environment variable: %q", pair)
|
||||
return errors.New("invalid environment variable: missing '='")
|
||||
}
|
||||
name, val := p[0], p[1]
|
||||
if name == "" {
|
||||
return fmt.Errorf("environment variable name can't be empty: %q", pair)
|
||||
return errors.New("invalid environment variable: name cannot be empty")
|
||||
}
|
||||
if strings.IndexByte(name, 0) >= 0 {
|
||||
return fmt.Errorf("environment variable name can't contain null(\\x00): %q", pair)
|
||||
return fmt.Errorf("invalid environment variable %q: name contains nul byte (\\x00)", name)
|
||||
}
|
||||
if strings.IndexByte(val, 0) >= 0 {
|
||||
return fmt.Errorf("environment variable value can't contain null(\\x00): %q", pair)
|
||||
return fmt.Errorf("invalid environment variable %q: value contains nul byte (\\x00)", name)
|
||||
}
|
||||
if err := os.Setenv(name, val); err != nil {
|
||||
return err
|
||||
|
@ -136,6 +136,32 @@ func populateProcessEnvironment(env []string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// verifyCwd ensures that the current directory is actually inside the mount
|
||||
// namespace root of the current process.
|
||||
func verifyCwd() error {
|
||||
// getcwd(2) on Linux detects if cwd is outside of the rootfs of the
|
||||
// current mount namespace root, and in that case prefixes "(unreachable)"
|
||||
// to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
|
||||
// when this happens and return ENOENT rather than returning a non-absolute
|
||||
// path. In both cases we can therefore easily detect if we have an invalid
|
||||
// cwd by checking the return value of getcwd(3). See getcwd(3) for more
|
||||
// details, and CVE-2024-21626 for the security issue that motivated this
|
||||
// check.
|
||||
//
|
||||
// We have to use unix.Getwd() here because os.Getwd() has a workaround for
|
||||
// $PWD which involves doing stat(.), which can fail if the current
|
||||
// directory is inaccessible to the container process.
|
||||
if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) {
|
||||
return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
|
||||
} else if err != nil {
|
||||
return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
|
||||
} else if !filepath.IsAbs(wd) {
|
||||
// We shouldn't ever hit this, but check just in case.
|
||||
return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// finalizeNamespace drops the caps, sets the correct user
|
||||
// and working dir, and closes any leaked file descriptors
|
||||
// before executing the command inside the namespace
|
||||
|
@ -194,6 +220,10 @@ func finalizeNamespace(config *initConfig) error {
|
|||
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
|
||||
}
|
||||
}
|
||||
// Make sure our final working directory is inside the container.
|
||||
if err := verifyCwd(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := system.ClearKeepCaps(); err != nil {
|
||||
return fmt.Errorf("unable to clear keep caps: %w", err)
|
||||
}
|
||||
|
@ -406,40 +436,37 @@ func fixStdioPermissions(u *user.ExecUser) error {
|
|||
if err := unix.Stat("/dev/null", &null); err != nil {
|
||||
return &os.PathError{Op: "stat", Path: "/dev/null", Err: err}
|
||||
}
|
||||
for _, fd := range []uintptr{
|
||||
os.Stdin.Fd(),
|
||||
os.Stderr.Fd(),
|
||||
os.Stdout.Fd(),
|
||||
} {
|
||||
for _, file := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
|
||||
var s unix.Stat_t
|
||||
if err := unix.Fstat(int(fd), &s); err != nil {
|
||||
return &os.PathError{Op: "fstat", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
|
||||
if err := unix.Fstat(int(file.Fd()), &s); err != nil {
|
||||
return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
|
||||
}
|
||||
|
||||
// Skip chown of /dev/null if it was used as one of the STDIO fds.
|
||||
if s.Rdev == null.Rdev {
|
||||
// Skip chown if uid is already the one we want or any of the STDIO descriptors
|
||||
// were redirected to /dev/null.
|
||||
if int(s.Uid) == u.Uid || s.Rdev == null.Rdev {
|
||||
continue
|
||||
}
|
||||
|
||||
// We only change the uid owner (as it is possible for the mount to
|
||||
// We only change the uid (as it is possible for the mount to
|
||||
// prefer a different gid, and there's no reason for us to change it).
|
||||
// The reason why we don't just leave the default uid=X mount setup is
|
||||
// that users expect to be able to actually use their console. Without
|
||||
// this code, you couldn't effectively run as a non-root user inside a
|
||||
// container and also have a console set up.
|
||||
if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
|
||||
if err := file.Chown(u.Uid, int(s.Gid)); err != nil {
|
||||
// If we've hit an EINVAL then s.Gid isn't mapped in the user
|
||||
// namespace. If we've hit an EPERM then the inode's current owner
|
||||
// is not mapped in our user namespace (in particular,
|
||||
// privileged_wrt_inode_uidgid() has failed). In either case, we
|
||||
// are in a configuration where it's better for us to just not
|
||||
// touch the stdio rather than bail at this point.
|
||||
// privileged_wrt_inode_uidgid() has failed). Read-only
|
||||
// /dev can result in EROFS error. In any case, it's
|
||||
// better for us to just not touch the stdio rather
|
||||
// than bail at this point.
|
||||
|
||||
// nolint:errorlint // unix errors are bare
|
||||
if err == unix.EINVAL || err == unix.EPERM {
|
||||
if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
|
||||
continue
|
||||
}
|
||||
return &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
|
|
@ -6,6 +6,7 @@ import (
|
|||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
|
@ -61,6 +62,12 @@ func testCheckpoint(t *testing.T, userns bool) {
|
|||
t.Skipf("criu binary not found: %v", err)
|
||||
}
|
||||
|
||||
// Workaround for https://github.com/opencontainers/runc/issues/3532.
|
||||
out, err := exec.Command("rpm", "-q", "criu").CombinedOutput()
|
||||
if err == nil && regexp.MustCompile(`^criu-3\.17-[123]\.el9`).Match(out) {
|
||||
t.Skip("Test requires criu >= 3.17-4 on CentOS Stream 9.")
|
||||
}
|
||||
|
||||
config := newTemplateConfig(t, &tParam{userns: userns})
|
||||
factory, err := libcontainer.New(t.TempDir())
|
||||
ok(t, err)
|
||||
|
|
|
@ -18,6 +18,7 @@ import (
|
|||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/opencontainers/runc/libcontainer/userns"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
|
@ -40,13 +41,7 @@ func testExecPS(t *testing.T, userns bool) {
|
|||
}
|
||||
config := newTemplateConfig(t, &tParam{userns: userns})
|
||||
|
||||
buffers, exitCode, err := runContainer(t, config, "ps", "-o", "pid,user,comm")
|
||||
if err != nil {
|
||||
t.Fatalf("%s: %s", buffers, err)
|
||||
}
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
|
||||
}
|
||||
buffers := runContainerOk(t, config, "ps", "-o", "pid,user,comm")
|
||||
lines := strings.Split(buffers.Stdout.String(), "\n")
|
||||
if len(lines) < 2 {
|
||||
t.Fatalf("more than one process running for output %q", buffers.Stdout.String())
|
||||
|
@ -67,12 +62,7 @@ func TestIPCPrivate(t *testing.T) {
|
|||
ok(t, err)
|
||||
|
||||
config := newTemplateConfig(t, nil)
|
||||
buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/ipc")
|
||||
ok(t, err)
|
||||
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
|
||||
}
|
||||
buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/ipc")
|
||||
|
||||
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
|
||||
t.Fatalf("ipc link should be private to the container but equals host %q %q", actual, l)
|
||||
|
@ -89,12 +79,7 @@ func TestIPCHost(t *testing.T) {
|
|||
|
||||
config := newTemplateConfig(t, nil)
|
||||
config.Namespaces.Remove(configs.NEWIPC)
|
||||
buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/ipc")
|
||||
ok(t, err)
|
||||
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
|
||||
}
|
||||
buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/ipc")
|
||||
|
||||
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
|
||||
t.Fatalf("ipc link not equal to host link %q %q", actual, l)
|
||||
|
@ -111,13 +96,7 @@ func TestIPCJoinPath(t *testing.T) {
|
|||
|
||||
config := newTemplateConfig(t, nil)
|
||||
config.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipc")
|
||||
|
||||
buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/ipc")
|
||||
ok(t, err)
|
||||
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
|
||||
}
|
||||
buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/ipc")
|
||||
|
||||
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
|
||||
t.Fatalf("ipc link not equal to host link %q %q", actual, l)
|
||||
|
@ -163,8 +142,7 @@ func testRlimit(t *testing.T, userns bool) {
|
|||
Cur: 1024,
|
||||
}))
|
||||
|
||||
out, _, err := runContainer(t, config, "/bin/sh", "-c", "ulimit -n")
|
||||
ok(t, err)
|
||||
out := runContainerOk(t, config, "/bin/sh", "-c", "ulimit -n")
|
||||
if limit := strings.TrimSpace(out.Stdout.String()); limit != "1025" {
|
||||
t.Fatalf("expected rlimit to be 1025, got %s", limit)
|
||||
}
|
||||
|
@ -537,7 +515,7 @@ func testCpuShares(t *testing.T, systemd bool) {
|
|||
config.Cgroups.Resources.CpuShares = 1
|
||||
|
||||
if _, _, err := runContainer(t, config, "ps"); err == nil {
|
||||
t.Fatalf("runContainer should failed with invalid CpuShares")
|
||||
t.Fatal("runContainer should fail with invalid CpuShares")
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -560,30 +538,20 @@ func testPids(t *testing.T, systemd bool) {
|
|||
config := newTemplateConfig(t, &tParam{systemd: systemd})
|
||||
config.Cgroups.Resources.PidsLimit = -1
|
||||
|
||||
// Running multiple processes.
|
||||
_, ret, err := runContainer(t, config, "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true")
|
||||
ok(t, err)
|
||||
|
||||
if ret != 0 {
|
||||
t.Fatalf("expected fork() to succeed with no pids limit")
|
||||
}
|
||||
// Running multiple processes, expecting it to succeed with no pids limit.
|
||||
_ = runContainerOk(t, config, "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true")
|
||||
|
||||
// Enforce a permissive limit. This needs to be fairly hand-wavey due to the
|
||||
// issues with running Go binaries with pids restrictions (see below).
|
||||
config.Cgroups.Resources.PidsLimit = 64
|
||||
_, ret, err = runContainer(t, config, "/bin/sh", "-c", `
|
||||
_ = runContainerOk(t, config, "/bin/sh", "-c", `
|
||||
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
|
||||
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
|
||||
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
|
||||
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true`)
|
||||
ok(t, err)
|
||||
|
||||
if ret != 0 {
|
||||
t.Fatalf("expected fork() to succeed with permissive pids limit")
|
||||
}
|
||||
|
||||
// Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause this
|
||||
// to fail reliability.
|
||||
// Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause
|
||||
// this to fail reliably.
|
||||
config.Cgroups.Resources.PidsLimit = 64
|
||||
out, _, err := runContainer(t, config, "/bin/sh", "-c", `
|
||||
/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
|
||||
|
@ -933,13 +901,8 @@ func TestMountCgroupRO(t *testing.T) {
|
|||
return
|
||||
}
|
||||
config := newTemplateConfig(t, nil)
|
||||
buffers, exitCode, err := runContainer(t, config, "mount")
|
||||
if err != nil {
|
||||
t.Fatalf("%s: %s", buffers, err)
|
||||
}
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
|
||||
}
|
||||
buffers := runContainerOk(t, config, "mount")
|
||||
|
||||
mountInfo := buffers.Stdout.String()
|
||||
lines := strings.Split(mountInfo, "\n")
|
||||
for _, l := range lines {
|
||||
|
@ -980,13 +943,8 @@ func TestMountCgroupRW(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
buffers, exitCode, err := runContainer(t, config, "mount")
|
||||
if err != nil {
|
||||
t.Fatalf("%s: %s", buffers, err)
|
||||
}
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
|
||||
}
|
||||
buffers := runContainerOk(t, config, "mount")
|
||||
|
||||
mountInfo := buffers.Stdout.String()
|
||||
lines := strings.Split(mountInfo, "\n")
|
||||
for _, l := range lines {
|
||||
|
@ -1197,11 +1155,7 @@ func TestSTDIOPermissions(t *testing.T) {
|
|||
}
|
||||
|
||||
config := newTemplateConfig(t, nil)
|
||||
buffers, exitCode, err := runContainer(t, config, "sh", "-c", "echo hi > /dev/stderr")
|
||||
ok(t, err)
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
|
||||
}
|
||||
buffers := runContainerOk(t, config, "sh", "-c", "echo hi > /dev/stderr")
|
||||
|
||||
if actual := strings.Trim(buffers.Stderr.String(), "\n"); actual != "hi" {
|
||||
t.Fatalf("stderr should equal be equal %q %q", actual, "hi")
|
||||
|
@ -1444,12 +1398,7 @@ func TestPIDHost(t *testing.T) {
|
|||
|
||||
config := newTemplateConfig(t, nil)
|
||||
config.Namespaces.Remove(configs.NEWPID)
|
||||
buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/pid")
|
||||
ok(t, err)
|
||||
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
|
||||
}
|
||||
buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/pid")
|
||||
|
||||
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
|
||||
t.Fatalf("ipc link not equal to host link %q %q", actual, l)
|
||||
|
@ -1640,6 +1589,11 @@ func TestInitJoinNetworkAndUser(t *testing.T) {
|
|||
config2 := newTemplateConfig(t, &tParam{userns: true})
|
||||
config2.Namespaces.Add(configs.NEWNET, netns1)
|
||||
config2.Namespaces.Add(configs.NEWUSER, userns1)
|
||||
// Emulate specconv.setupUserNamespace().
|
||||
uidMap, gidMap, err := userns.GetUserNamespaceMappings(userns1)
|
||||
ok(t, err)
|
||||
config2.UidMappings = uidMap
|
||||
config2.GidMappings = gidMap
|
||||
config2.Cgroups.Path = "integration/test2"
|
||||
container2, err := newContainer(t, config2)
|
||||
ok(t, err)
|
||||
|
@ -1738,12 +1692,7 @@ func TestCGROUPPrivate(t *testing.T) {
|
|||
|
||||
config := newTemplateConfig(t, nil)
|
||||
config.Namespaces.Add(configs.NEWCGROUP, "")
|
||||
buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/cgroup")
|
||||
ok(t, err)
|
||||
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
|
||||
}
|
||||
buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/cgroup")
|
||||
|
||||
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
|
||||
t.Fatalf("cgroup link should be private to the container but equals host %q %q", actual, l)
|
||||
|
@ -1762,12 +1711,7 @@ func TestCGROUPHost(t *testing.T) {
|
|||
ok(t, err)
|
||||
|
||||
config := newTemplateConfig(t, nil)
|
||||
buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/cgroup")
|
||||
ok(t, err)
|
||||
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
|
||||
}
|
||||
buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/cgroup")
|
||||
|
||||
if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
|
||||
t.Fatalf("cgroup link not equal to host link %q %q", actual, l)
|
||||
|
@ -1790,6 +1734,16 @@ func testFdLeaks(t *testing.T, systemd bool) {
|
|||
return
|
||||
}
|
||||
|
||||
config := newTemplateConfig(t, &tParam{systemd: systemd})
|
||||
// Run a container once to exclude file descriptors that are only
|
||||
// opened once during the process lifetime by the library and are
|
||||
// never closed. Those are not considered leaks.
|
||||
//
|
||||
// Examples of these open-once file descriptors are:
|
||||
// - /sys/fs/cgroup dirfd opened by prepareOpenat2 in libct/cgroups;
|
||||
// - dbus connection opened by getConnection in libct/cgroups/systemd.
|
||||
_ = runContainerOk(t, config, "true")
|
||||
|
||||
pfd, err := os.Open("/proc/self/fd")
|
||||
ok(t, err)
|
||||
defer pfd.Close()
|
||||
|
@ -1798,13 +1752,7 @@ func testFdLeaks(t *testing.T, systemd bool) {
|
|||
_, err = pfd.Seek(0, 0)
|
||||
ok(t, err)
|
||||
|
||||
config := newTemplateConfig(t, &tParam{systemd: systemd})
|
||||
buffers, exitCode, err := runContainer(t, config, "true")
|
||||
ok(t, err)
|
||||
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
|
||||
}
|
||||
_ = runContainerOk(t, config, "true")
|
||||
|
||||
fds1, err := pfd.Readdirnames(0)
|
||||
ok(t, err)
|
||||
|
@ -1815,7 +1763,6 @@ func testFdLeaks(t *testing.T, systemd bool) {
|
|||
// Show the extra opened files.
|
||||
|
||||
excludedPaths := []string{
|
||||
"/sys/fs/cgroup", // opened once, see prepareOpenat2
|
||||
"anon_inode:bpf-prog", // FIXME: see https://github.com/opencontainers/runc/issues/2366#issuecomment-776411392
|
||||
}
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ import (
|
|||
libseccomp "github.com/seccomp/libseccomp-golang"
|
||||
)
|
||||
|
||||
func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
|
||||
func TestSeccompDenySyslogWithErrno(t *testing.T) {
|
||||
if testing.Short() {
|
||||
return
|
||||
}
|
||||
|
@ -25,7 +25,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
|
|||
DefaultAction: configs.Allow,
|
||||
Syscalls: []*configs.Syscall{
|
||||
{
|
||||
Name: "getcwd",
|
||||
Name: "syslog",
|
||||
Action: configs.Errno,
|
||||
ErrnoRet: &errnoRet,
|
||||
},
|
||||
|
@ -39,7 +39,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
|
|||
buffers := newStdBuffers()
|
||||
pwd := &libcontainer.Process{
|
||||
Cwd: "/",
|
||||
Args: []string{"pwd"},
|
||||
Args: []string{"dmesg"},
|
||||
Env: standardEnvironment,
|
||||
Stdin: buffers.Stdin,
|
||||
Stdout: buffers.Stdout,
|
||||
|
@ -65,17 +65,17 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
|
|||
}
|
||||
|
||||
if exitCode == 0 {
|
||||
t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
|
||||
t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode)
|
||||
}
|
||||
|
||||
expected := "pwd: getcwd: No such process"
|
||||
expected := "dmesg: klogctl: No such process"
|
||||
actual := strings.Trim(buffers.Stderr.String(), "\n")
|
||||
if actual != expected {
|
||||
t.Fatalf("Expected output %s but got %s\n", expected, actual)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSeccompDenyGetcwd(t *testing.T) {
|
||||
func TestSeccompDenySyslog(t *testing.T) {
|
||||
if testing.Short() {
|
||||
return
|
||||
}
|
||||
|
@ -85,7 +85,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
|
|||
DefaultAction: configs.Allow,
|
||||
Syscalls: []*configs.Syscall{
|
||||
{
|
||||
Name: "getcwd",
|
||||
Name: "syslog",
|
||||
Action: configs.Errno,
|
||||
},
|
||||
},
|
||||
|
@ -98,7 +98,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
|
|||
buffers := newStdBuffers()
|
||||
pwd := &libcontainer.Process{
|
||||
Cwd: "/",
|
||||
Args: []string{"pwd"},
|
||||
Args: []string{"dmesg"},
|
||||
Env: standardEnvironment,
|
||||
Stdin: buffers.Stdin,
|
||||
Stdout: buffers.Stdout,
|
||||
|
@ -124,10 +124,10 @@ func TestSeccompDenyGetcwd(t *testing.T) {
|
|||
}
|
||||
|
||||
if exitCode == 0 {
|
||||
t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
|
||||
t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode)
|
||||
}
|
||||
|
||||
expected := "pwd: getcwd: Operation not permitted"
|
||||
expected := "dmesg: klogctl: Operation not permitted"
|
||||
actual := strings.Trim(buffers.Stderr.String(), "\n")
|
||||
if actual != expected {
|
||||
t.Fatalf("Expected output %s but got %s\n", expected, actual)
|
||||
|
@ -282,13 +282,7 @@ func TestSeccompPermitWriteMultipleConditions(t *testing.T) {
|
|||
},
|
||||
}
|
||||
|
||||
buffers, exitCode, err := runContainer(t, config, "ls", "/")
|
||||
if err != nil {
|
||||
t.Fatalf("%s: %s", buffers, err)
|
||||
}
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers)
|
||||
}
|
||||
buffers := runContainerOk(t, config, "ls", "/")
|
||||
// We don't need to verify the actual thing printed
|
||||
// Just that something was written to stdout
|
||||
if len(buffers.Stdout.String()) == 0 {
|
||||
|
@ -375,13 +369,7 @@ func TestSeccompMultipleConditionSameArgDeniesStdout(t *testing.T) {
|
|||
},
|
||||
}
|
||||
|
||||
buffers, exitCode, err := runContainer(t, config, "ls", "/")
|
||||
if err != nil {
|
||||
t.Fatalf("%s: %s", buffers, err)
|
||||
}
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers)
|
||||
}
|
||||
buffers := runContainerOk(t, config, "ls", "/")
|
||||
// Verify that nothing was printed
|
||||
if len(buffers.Stdout.String()) != 0 {
|
||||
t.Fatalf("Something was written to stdout, write call succeeded!\n")
|
||||
|
|
|
@ -216,6 +216,22 @@ func runContainer(t *testing.T, config *configs.Config, args ...string) (buffers
|
|||
return
|
||||
}
|
||||
|
||||
// runContainerOk is a wrapper for runContainer, simplifying its use for cases
|
||||
// when the run is expected to succeed and return exit code of 0.
|
||||
func runContainerOk(t *testing.T, config *configs.Config, args ...string) *stdBuffers {
|
||||
buffers, exitCode, err := runContainer(t, config, args...)
|
||||
|
||||
t.Helper()
|
||||
if err != nil {
|
||||
t.Fatalf("%s: %s", buffers, err)
|
||||
}
|
||||
if exitCode != 0 {
|
||||
t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
|
||||
}
|
||||
|
||||
return buffers
|
||||
}
|
||||
|
||||
// destroyContainer calls Destroy on the given container and deliberately
// discards the returned error.
func destroyContainer(container libcontainer.Container) {
|
||||
_ = container.Destroy()
|
||||
}
|
||||
|
|
|
@ -1,11 +1,9 @@
|
|||
package intelrdt
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
|
@ -13,6 +11,8 @@ import (
|
|||
"sync"
|
||||
|
||||
"github.com/moby/sys/mountinfo"
|
||||
"golang.org/x/sys/unix"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
)
|
||||
|
@ -145,34 +145,31 @@ import (
|
|||
* }
|
||||
*/
|
||||
|
||||
// Manager groups the Intel RDT operations declared below: applying the
// configuration to a process, reading statistics, destroying the group,
// reporting its path, and writing the configured settings.
type Manager interface {
|
||||
// Apply applies Intel RDT configuration to the process with the specified pid.
|
||||
Apply(pid int) error
|
||||
|
||||
// GetStats returns statistics for Intel RDT.
|
||||
GetStats() (*Stats, error)
|
||||
|
||||
// Destroy destroys the Intel RDT container-specific 'container_id' group.
|
||||
Destroy() error
|
||||
|
||||
// GetPath returns the Intel RDT path to save in a state file, so that
|
||||
// the object can be restored later.
|
||||
GetPath() string
|
||||
|
||||
// Set sets the Intel RDT "resource control" filesystem as configured.
|
||||
Set(container *configs.Config) error
|
||||
}
|
||||
|
||||
// This implements interface Manager
|
||||
type intelRdtManager struct {
|
||||
type Manager struct {
|
||||
mu sync.Mutex
|
||||
config *configs.Config
|
||||
id string
|
||||
path string
|
||||
}
|
||||
|
||||
func NewManager(config *configs.Config, id string, path string) Manager {
|
||||
return &intelRdtManager{
|
||||
// NewManager returns a new instance of Manager, or nil if the Intel RDT
|
||||
// functionality is not specified in the config, not available from hardware,
|
||||
// or not enabled in the kernel.
|
||||
func NewManager(config *configs.Config, id string, path string) *Manager {
|
||||
if config.IntelRdt == nil {
|
||||
return nil
|
||||
}
|
||||
if _, err := Root(); err != nil {
|
||||
// Intel RDT is not available.
|
||||
return nil
|
||||
}
|
||||
return newManager(config, id, path)
|
||||
}
|
||||
|
||||
// newManager is the same as NewManager, except it does not check if the feature
|
||||
// is actually available. Used by unit tests that mock intelrdt paths.
|
||||
func newManager(config *configs.Config, id string, path string) *Manager {
|
||||
return &Manager{
|
||||
config: config,
|
||||
id: id,
|
||||
path: path,
|
||||
|
@ -188,71 +185,52 @@ var (
|
|||
catEnabled bool
|
||||
// The flag to indicate if Intel RDT/MBA is enabled
|
||||
mbaEnabled bool
|
||||
// The flag to indicate if Intel RDT/MBA Software Controller is enabled
|
||||
mbaScEnabled bool
|
||||
|
||||
// For Intel RDT initialization
|
||||
initOnce sync.Once
|
||||
|
||||
errNotFound = errors.New("Intel RDT resctrl mount point not found")
|
||||
errNotFound = errors.New("Intel RDT not available")
|
||||
)
|
||||
|
||||
// Check if Intel RDT sub-features are enabled in featuresInit()
|
||||
func featuresInit() {
|
||||
initOnce.Do(func() {
|
||||
// 1. Check if hardware and kernel support Intel RDT sub-features
|
||||
flagsSet, err := parseCpuInfoFile("/proc/cpuinfo")
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// 2. Check if Intel RDT "resource control" filesystem is available.
|
||||
// 1. Check if Intel RDT "resource control" filesystem is available.
|
||||
// The user guarantees to mount the filesystem.
|
||||
root, err := Root()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// 3. Double check if Intel RDT sub-features are available in
|
||||
// "resource control" filesystem. Intel RDT sub-features can be
|
||||
// 2. Check if Intel RDT sub-features are available in "resource
|
||||
// control" filesystem. Intel RDT sub-features can be
|
||||
// selectively disabled or enabled by kernel command line
|
||||
// (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel
|
||||
if flagsSet.CAT {
|
||||
if _, err := os.Stat(filepath.Join(root, "info", "L3")); err == nil {
|
||||
catEnabled = true
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(root, "info", "L3")); err == nil {
|
||||
catEnabled = true
|
||||
}
|
||||
if mbaScEnabled {
|
||||
// We confirm MBA Software Controller is enabled in step 2,
|
||||
// MBA should be enabled because MBA Software Controller
|
||||
// depends on MBA
|
||||
if _, err := os.Stat(filepath.Join(root, "info", "MB")); err == nil {
|
||||
mbaEnabled = true
|
||||
} else if flagsSet.MBA {
|
||||
if _, err := os.Stat(filepath.Join(root, "info", "MB")); err == nil {
|
||||
mbaEnabled = true
|
||||
}
|
||||
}
|
||||
if flagsSet.MBMTotal || flagsSet.MBMLocal || flagsSet.CMT {
|
||||
if _, err := os.Stat(filepath.Join(root, "info", "L3_MON")); err != nil {
|
||||
return
|
||||
}
|
||||
enabledMonFeatures, err = getMonFeatures(root)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if enabledMonFeatures.mbmTotalBytes || enabledMonFeatures.mbmLocalBytes {
|
||||
mbmEnabled = true
|
||||
}
|
||||
if enabledMonFeatures.llcOccupancy {
|
||||
cmtEnabled = true
|
||||
}
|
||||
if _, err := os.Stat(filepath.Join(root, "info", "L3_MON")); err != nil {
|
||||
return
|
||||
}
|
||||
enabledMonFeatures, err = getMonFeatures(root)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if enabledMonFeatures.mbmTotalBytes || enabledMonFeatures.mbmLocalBytes {
|
||||
mbmEnabled = true
|
||||
}
|
||||
if enabledMonFeatures.llcOccupancy {
|
||||
cmtEnabled = true
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Return the mount point path of Intel RDT "resource control" filesystem
|
||||
func findIntelRdtMountpointDir(f io.Reader) (string, error) {
|
||||
mi, err := mountinfo.GetMountsFromReader(f, func(m *mountinfo.Info) (bool, bool) {
|
||||
// findIntelRdtMountpointDir returns the mount point of the Intel RDT "resource control" filesystem.
|
||||
func findIntelRdtMountpointDir() (string, error) {
|
||||
mi, err := mountinfo.GetMounts(func(m *mountinfo.Info) (bool, bool) {
|
||||
// similar to mountinfo.FSTypeFilter but stops after the first match
|
||||
if m.FSType == "resctrl" {
|
||||
return false, true // don't skip, stop
|
||||
|
@ -266,97 +244,45 @@ func findIntelRdtMountpointDir(f io.Reader) (string, error) {
|
|||
return "", errNotFound
|
||||
}
|
||||
|
||||
// Check if MBA Software Controller is enabled through mount option "-o mba_MBps"
|
||||
if strings.Contains(","+mi[0].VFSOptions+",", ",mba_MBps,") {
|
||||
mbaScEnabled = true
|
||||
}
|
||||
|
||||
return mi[0].Mountpoint, nil
|
||||
}
|
||||
|
||||
// For Root() use only.
|
||||
var (
|
||||
intelRdtRoot string
|
||||
rootMu sync.Mutex
|
||||
intelRdtRoot string
|
||||
intelRdtRootErr error
|
||||
rootOnce sync.Once
|
||||
)
|
||||
|
||||
// The kernel creates this (empty) directory if resctrl is supported by the
|
||||
// hardware and kernel. The user is responsible for mounting the resctrl
|
||||
// filesystem, and they could mount it somewhere else if they wanted to.
|
||||
const defaultResctrlMountpoint = "/sys/fs/resctrl"
|
||||
|
||||
// Root returns the Intel RDT "resource control" filesystem mount point.
|
||||
func Root() (string, error) {
|
||||
rootMu.Lock()
|
||||
defer rootMu.Unlock()
|
||||
|
||||
if intelRdtRoot != "" {
|
||||
return intelRdtRoot, nil
|
||||
}
|
||||
|
||||
f, err := os.Open("/proc/self/mountinfo")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
root, err := findIntelRdtMountpointDir(f)
|
||||
f.Close()
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
if _, err := os.Stat(root); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
intelRdtRoot = root
|
||||
return intelRdtRoot, nil
|
||||
}
|
||||
|
||||
type cpuInfoFlags struct {
|
||||
CAT bool // Cache Allocation Technology
|
||||
MBA bool // Memory Bandwidth Allocation
|
||||
|
||||
// Memory Bandwidth Monitoring related.
|
||||
MBMTotal bool
|
||||
MBMLocal bool
|
||||
|
||||
CMT bool // Cache Monitoring Technology
|
||||
}
|
||||
|
||||
func parseCpuInfoFile(path string) (cpuInfoFlags, error) {
|
||||
infoFlags := cpuInfoFlags{}
|
||||
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return infoFlags, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
s := bufio.NewScanner(f)
|
||||
for s.Scan() {
|
||||
line := s.Text()
|
||||
|
||||
// Search "cat_l3" and "mba" flags in first "flags" line
|
||||
if strings.HasPrefix(line, "flags") {
|
||||
flags := strings.Split(line, " ")
|
||||
// "cat_l3" flag for CAT and "mba" flag for MBA
|
||||
for _, flag := range flags {
|
||||
switch flag {
|
||||
case "cat_l3":
|
||||
infoFlags.CAT = true
|
||||
case "mba":
|
||||
infoFlags.MBA = true
|
||||
case "cqm_mbm_total":
|
||||
infoFlags.MBMTotal = true
|
||||
case "cqm_mbm_local":
|
||||
infoFlags.MBMLocal = true
|
||||
case "cqm_occup_llc":
|
||||
infoFlags.CMT = true
|
||||
}
|
||||
rootOnce.Do(func() {
|
||||
// Does this system support resctrl?
|
||||
var statfs unix.Statfs_t
|
||||
if err := unix.Statfs(defaultResctrlMountpoint, &statfs); err != nil {
|
||||
if errors.Is(err, unix.ENOENT) {
|
||||
err = errNotFound
|
||||
}
|
||||
return infoFlags, nil
|
||||
intelRdtRootErr = err
|
||||
return
|
||||
}
|
||||
}
|
||||
if err := s.Err(); err != nil {
|
||||
return infoFlags, err
|
||||
}
|
||||
|
||||
return infoFlags, nil
|
||||
// Has the resctrl fs been mounted to the default mount point?
|
||||
if statfs.Type == unix.RDTGROUP_SUPER_MAGIC {
|
||||
intelRdtRoot = defaultResctrlMountpoint
|
||||
return
|
||||
}
|
||||
|
||||
// The resctrl fs could have been mounted somewhere nonstandard.
|
||||
intelRdtRoot, intelRdtRootErr = findIntelRdtMountpointDir()
|
||||
})
|
||||
|
||||
return intelRdtRoot, intelRdtRootErr
|
||||
}
|
||||
|
||||
// Gets a single uint64 value from the specified file.
|
||||
|
@ -502,14 +428,8 @@ func IsMBAEnabled() bool {
|
|||
return mbaEnabled
|
||||
}
|
||||
|
||||
// IsMBAScEnabled reports whether the Intel RDT/MBA Software Controller is
// enabled. It runs featuresInit before reading the mbaScEnabled flag.
|
||||
func IsMBAScEnabled() bool {
|
||||
featuresInit()
|
||||
return mbaScEnabled
|
||||
}
|
||||
|
||||
// Get the path of the clos group in "resource control" filesystem that the container belongs to
|
||||
func (m *intelRdtManager) getIntelRdtPath() (string, error) {
|
||||
func (m *Manager) getIntelRdtPath() (string, error) {
|
||||
rootPath, err := Root()
|
||||
if err != nil {
|
||||
return "", err
|
||||
|
@ -524,7 +444,7 @@ func (m *intelRdtManager) getIntelRdtPath() (string, error) {
|
|||
}
|
||||
|
||||
// Applies Intel RDT configuration to the process with the specified pid
|
||||
func (m *intelRdtManager) Apply(pid int) (err error) {
|
||||
func (m *Manager) Apply(pid int) (err error) {
|
||||
// If intelRdt is not specified in config, we do nothing
|
||||
if m.config.IntelRdt == nil {
|
||||
return nil
|
||||
|
@ -559,11 +479,11 @@ func (m *intelRdtManager) Apply(pid int) (err error) {
|
|||
}
|
||||
|
||||
// Destroys the Intel RDT container-specific 'container_id' group
|
||||
func (m *intelRdtManager) Destroy() error {
|
||||
func (m *Manager) Destroy() error {
|
||||
// Don't remove resctrl group if closid has been explicitly specified. The
|
||||
// group is likely externally managed, i.e. by some other entity than us.
|
||||
// There are probably other containers/tasks sharing the same group.
|
||||
if m.config.IntelRdt == nil || m.config.IntelRdt.ClosID == "" {
|
||||
if m.config.IntelRdt != nil && m.config.IntelRdt.ClosID == "" {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
if err := os.RemoveAll(m.GetPath()); err != nil {
|
||||
|
@ -576,7 +496,7 @@ func (m *intelRdtManager) Destroy() error {
|
|||
|
||||
// Returns Intel RDT path to save in a state file and to be able to
|
||||
// restore the object later
|
||||
func (m *intelRdtManager) GetPath() string {
|
||||
func (m *Manager) GetPath() string {
|
||||
if m.path == "" {
|
||||
m.path, _ = m.getIntelRdtPath()
|
||||
}
|
||||
|
@ -584,7 +504,7 @@ func (m *intelRdtManager) GetPath() string {
|
|||
}
|
||||
|
||||
// Returns statistics for Intel RDT
|
||||
func (m *intelRdtManager) GetStats() (*Stats, error) {
|
||||
func (m *Manager) GetStats() (*Stats, error) {
|
||||
// If intelRdt is not specified in config
|
||||
if m.config.IntelRdt == nil {
|
||||
return nil, nil
|
||||
|
@ -670,7 +590,7 @@ func (m *intelRdtManager) GetStats() (*Stats, error) {
|
|||
}
|
||||
|
||||
// Set Intel RDT "resource control" filesystem as configured.
|
||||
func (m *intelRdtManager) Set(container *configs.Config) error {
|
||||
func (m *Manager) Set(container *configs.Config) error {
|
||||
// About L3 cache schema:
|
||||
// It has allocation bitmasks/values for L3 cache on each socket,
|
||||
// which contains L3 cache id and capacity bitmask (CBM).
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
package intelrdt
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
@ -22,7 +20,7 @@ func TestIntelRdtSetL3CacheSchema(t *testing.T) {
|
|||
})
|
||||
|
||||
helper.config.IntelRdt.L3CacheSchema = l3CacheSchemeAfter
|
||||
intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
|
||||
intelrdt := newManager(helper.config, "", helper.IntelRdtPath)
|
||||
if err := intelrdt.Set(helper.config); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
@ -52,7 +50,7 @@ func TestIntelRdtSetMemBwSchema(t *testing.T) {
|
|||
})
|
||||
|
||||
helper.config.IntelRdt.MemBwSchema = memBwSchemeAfter
|
||||
intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
|
||||
intelrdt := newManager(helper.config, "", helper.IntelRdtPath)
|
||||
if err := intelrdt.Set(helper.config); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
@ -82,7 +80,7 @@ func TestIntelRdtSetMemBwScSchema(t *testing.T) {
|
|||
})
|
||||
|
||||
helper.config.IntelRdt.MemBwSchema = memBwScSchemeAfter
|
||||
intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
|
||||
intelrdt := newManager(helper.config, "", helper.IntelRdtPath)
|
||||
if err := intelrdt.Set(helper.config); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
@ -105,7 +103,7 @@ func TestApply(t *testing.T) {
|
|||
const closID = "test-clos"
|
||||
|
||||
helper.config.IntelRdt.ClosID = closID
|
||||
intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
|
||||
intelrdt := newManager(helper.config, "", helper.IntelRdtPath)
|
||||
if err := intelrdt.Apply(1234); err == nil {
|
||||
t.Fatal("unexpected success when applying pid")
|
||||
}
|
||||
|
@ -114,7 +112,7 @@ func TestApply(t *testing.T) {
|
|||
}
|
||||
|
||||
// Dir should be created if some schema has been specified
|
||||
intelrdt.(*intelRdtManager).config.IntelRdt.L3CacheSchema = "L3:0=f"
|
||||
intelrdt.config.IntelRdt.L3CacheSchema = "L3:0=f"
|
||||
if err := intelrdt.Apply(1235); err != nil {
|
||||
t.Fatalf("Apply() failed: %v", err)
|
||||
}
|
||||
|
@ -127,141 +125,3 @@ func TestApply(t *testing.T) {
|
|||
t.Fatalf("unexpected tasks file, expected '1235', got %q", pids)
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
mountinfoValid = `18 40 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw
|
||||
19 40 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
|
||||
20 40 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=131927256k,nr_inodes=32981814,mode=755
|
||||
21 18 0:17 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
|
||||
22 20 0:19 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw
|
||||
23 20 0:12 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,gid=5,mode=620,ptmxmode=000
|
||||
24 40 0:20 / /run rw,nosuid,nodev shared:22 - tmpfs tmpfs rw,mode=755
|
||||
25 18 0:21 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,mode=755
|
||||
26 25 0:22 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
|
||||
27 18 0:23 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw
|
||||
28 25 0:24 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,perf_event
|
||||
29 25 0:25 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuacct,cpu
|
||||
30 25 0:26 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,memory
|
||||
31 25 0:27 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,devices
|
||||
32 25 0:28 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,hugetlb
|
||||
33 25 0:29 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,blkio
|
||||
34 25 0:30 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,pids
|
||||
35 25 0:31 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset
|
||||
36 25 0:32 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,freezer
|
||||
37 25 0:33 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,net_prio,net_cls
|
||||
38 18 0:34 / /sys/kernel/config rw,relatime shared:21 - configfs configfs rw
|
||||
40 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
|
||||
16 18 0:6 / /sys/kernel/debug rw,relatime shared:23 - debugfs debugfs rw
|
||||
41 18 0:16 / /sys/fs/resctrl rw,relatime shared:24 - resctrl resctrl rw
|
||||
42 20 0:36 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw
|
||||
43 19 0:37 / /proc/sys/fs/binfmt_misc rw,relatime shared:26 - autofs systemd-1 rw,fd=32,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=35492
|
||||
44 20 0:15 / /dev/mqueue rw,relatime shared:27 - mqueue mqueue rw
|
||||
45 40 8:1 / /boot rw,relatime shared:28 - ext4 /dev/sda1 rw,stripe=4,data=ordered
|
||||
46 40 253:1 / /home rw,relatime shared:29 - ext4 /dev/mapper/vvhg-vvhg rw,data=ordered
|
||||
47 40 0:38 / /var/lib/nfs/rpc_pipefs rw,relatime shared:30 - rpc_pipefs sunrpc rw
|
||||
125 24 0:20 /mesos/containers /run/mesos/containers rw,nosuid shared:22 - tmpfs tmpfs rw,mode=755
|
||||
123 40 253:0 /var/lib/docker/containers /var/lib/docker/containers rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
|
||||
129 40 253:0 /var/lib/docker/overlay2 /var/lib/docker/overlay2 rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
|
||||
119 24 0:39 / /run/user/1009 rw,nosuid,nodev,relatime shared:100 - tmpfs tmpfs rw,size=26387788k,mode=700,uid=1009,gid=1009`
|
||||
|
||||
mountinfoMbaSc = `18 40 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw
|
||||
19 40 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
|
||||
20 40 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=131927256k,nr_inodes=32981814,mode=755
|
||||
21 18 0:17 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
|
||||
22 20 0:19 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw
|
||||
23 20 0:12 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,gid=5,mode=620,ptmxmode=000
|
||||
24 40 0:20 / /run rw,nosuid,nodev shared:22 - tmpfs tmpfs rw,mode=755
|
||||
25 18 0:21 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,mode=755
|
||||
26 25 0:22 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
|
||||
27 18 0:23 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw
|
||||
28 25 0:24 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,perf_event
|
||||
29 25 0:25 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuacct,cpu
|
||||
30 25 0:26 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,memory
|
||||
31 25 0:27 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,devices
|
||||
32 25 0:28 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,hugetlb
|
||||
33 25 0:29 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,blkio
|
||||
34 25 0:30 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,pids
|
||||
35 25 0:31 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset
|
||||
36 25 0:32 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,freezer
|
||||
37 25 0:33 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,net_prio,net_cls
|
||||
38 18 0:34 / /sys/kernel/config rw,relatime shared:21 - configfs configfs rw
|
||||
40 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
|
||||
16 18 0:6 / /sys/kernel/debug rw,relatime shared:23 - debugfs debugfs rw
|
||||
41 18 0:16 / /sys/fs/resctrl rw,relatime shared:24 - resctrl resctrl rw,mba_MBps
|
||||
42 20 0:36 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw
|
||||
43 19 0:37 / /proc/sys/fs/binfmt_misc rw,relatime shared:26 - autofs systemd-1 rw,fd=32,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=35492
|
||||
44 20 0:15 / /dev/mqueue rw,relatime shared:27 - mqueue mqueue rw
|
||||
45 40 8:1 / /boot rw,relatime shared:28 - ext4 /dev/sda1 rw,stripe=4,data=ordered
|
||||
46 40 253:1 / /home rw,relatime shared:29 - ext4 /dev/mapper/vvhg-vvhg rw,data=ordered
|
||||
47 40 0:38 / /var/lib/nfs/rpc_pipefs rw,relatime shared:30 - rpc_pipefs sunrpc rw
|
||||
125 24 0:20 /mesos/containers /run/mesos/containers rw,nosuid shared:22 - tmpfs tmpfs rw,mode=755
|
||||
123 40 253:0 /var/lib/docker/containers /var/lib/docker/containers rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
|
||||
129 40 253:0 /var/lib/docker/overlay2 /var/lib/docker/overlay2 rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
|
||||
119 24 0:39 / /run/user/1009 rw,nosuid,nodev,relatime shared:100 - tmpfs tmpfs rw,size=26387788k,mode=700,uid=1009,gid=1009`
|
||||
)
|
||||
|
||||
func TestFindIntelRdtMountpointDir(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
input io.Reader
|
||||
isNotFoundError bool
|
||||
isError bool
|
||||
mbaScEnabled bool
|
||||
mountpoint string
|
||||
}{
|
||||
{
|
||||
name: "Valid mountinfo with MBA Software Controller disabled",
|
||||
input: strings.NewReader(mountinfoValid),
|
||||
mountpoint: "/sys/fs/resctrl",
|
||||
},
|
||||
{
|
||||
name: "Valid mountinfo with MBA Software Controller enabled",
|
||||
input: strings.NewReader(mountinfoMbaSc),
|
||||
mbaScEnabled: true,
|
||||
mountpoint: "/sys/fs/resctrl",
|
||||
},
|
||||
{
|
||||
name: "Empty mountinfo",
|
||||
input: strings.NewReader(""),
|
||||
isNotFoundError: true,
|
||||
},
|
||||
{
|
||||
name: "Broken mountinfo",
|
||||
input: strings.NewReader("baa"),
|
||||
isError: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
tc := tc
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
mbaScEnabled = false
|
||||
mp, err := findIntelRdtMountpointDir(tc.input)
|
||||
if tc.isNotFoundError {
|
||||
if !errors.Is(err, errNotFound) {
|
||||
t.Errorf("expected errNotFound error, got %+v", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
if tc.isError {
|
||||
if err == nil {
|
||||
t.Error("expected error, got nil")
|
||||
}
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
t.Errorf("expected nil, got %+v", err)
|
||||
return
|
||||
}
|
||||
// no errors, check the results
|
||||
if tc.mbaScEnabled != mbaScEnabled {
|
||||
t.Errorf("expected mbaScEnabled=%v, got %v",
|
||||
tc.mbaScEnabled, mbaScEnabled)
|
||||
}
|
||||
if tc.mountpoint != mp {
|
||||
t.Errorf("expected mountpoint=%q, got %q",
|
||||
tc.mountpoint, mp)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,7 +26,12 @@ func NewIntelRdtTestUtil(t *testing.T) *intelRdtTestUtil {
|
|||
config := &configs.Config{
|
||||
IntelRdt: &configs.IntelRdt{},
|
||||
}
|
||||
|
||||
// Assign fake intelRdtRoot value, returned by Root().
|
||||
intelRdtRoot = t.TempDir()
|
||||
// Make sure Root() won't even try to parse mountinfo.
|
||||
rootOnce.Do(func() {})
|
||||
|
||||
testIntelRdtPath := filepath.Join(intelRdtRoot, "resctrl")
|
||||
|
||||
// Ensure the full mock Intel RDT "resource control" filesystem path exists
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package libcontainer
|
||||
|
||||
import (
|
||||
"io/fs"
|
||||
"strconv"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
|
@ -81,3 +82,20 @@ func unmount(target string, flags int) error {
|
|||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// syscallMode returns the syscall-specific mode bits from Go's portable mode bits.
|
||||
// Copy from https://cs.opensource.google/go/go/+/refs/tags/go1.20.7:src/os/file_posix.go;l=61-75
|
||||
func syscallMode(i fs.FileMode) (o uint32) {
|
||||
o |= uint32(i.Perm())
|
||||
if i&fs.ModeSetuid != 0 {
|
||||
o |= unix.S_ISUID
|
||||
}
|
||||
if i&fs.ModeSetgid != 0 {
|
||||
o |= unix.S_ISGID
|
||||
}
|
||||
if i&fs.ModeSticky != 0 {
|
||||
o |= unix.S_ISVTX
|
||||
}
|
||||
// No mapping for Go's ModeTemporary (plan9 only).
|
||||
return
|
||||
}
|
||||
|
|
|
@ -151,7 +151,7 @@ static int is_self_cloned(void)
|
|||
* Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
|
||||
* this, because you cannot write to a sealed memfd no matter what (so
|
||||
* sharing it isn't a bad thing -- and an admin could bind-mount a sealed
|
||||
* memfd to /usr/bin/runc to allow re-use).
|
||||
* memfd to /usr/bin/runc to allow reuse).
|
||||
*/
|
||||
ret = fcntl(fd, F_GET_SEALS);
|
||||
if (ret >= 0) {
|
||||
|
|
|
@ -168,15 +168,17 @@ static void write_log(int level, const char *format, ...)
|
|||
|
||||
message = escape_json_string(message);
|
||||
|
||||
if (current_stage == STAGE_SETUP)
|
||||
if (current_stage == STAGE_SETUP) {
|
||||
stage = strdup("nsexec");
|
||||
else
|
||||
if (stage == NULL)
|
||||
goto out;
|
||||
} else {
|
||||
ret = asprintf(&stage, "nsexec-%d", current_stage);
|
||||
if (ret < 0) {
|
||||
stage = NULL;
|
||||
goto out;
|
||||
if (ret < 0) {
|
||||
stage = NULL;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
ret = asprintf(&json, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n",
|
||||
level_str[level], stage, getpid(), message);
|
||||
if (ret < 0) {
|
||||
|
@ -416,11 +418,9 @@ static int getenv_int(const char *name)
|
|||
if (val == endptr || *endptr != '\0')
|
||||
bail("unable to parse %s=%s", name, val);
|
||||
/*
|
||||
* Sanity check: this must be a small non-negative number.
|
||||
* Practically, we pass two fds (3 and 4) and a log level,
|
||||
* for which the maximum is 6 (TRACE).
|
||||
* */
|
||||
if (ret < 0 || ret > TRACE)
|
||||
* Sanity check: this must be a non-negative number.
|
||||
*/
|
||||
if (ret < 0)
|
||||
bail("bad value for %s=%s (%d)", name, val, ret);
|
||||
|
||||
return ret;
|
||||
|
@ -832,6 +832,25 @@ void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mount
|
|||
bail("failed to close container mount namespace fd %d", container_mntns_fd);
|
||||
}
|
||||
|
||||
void try_unshare(int flags, const char *msg)
|
||||
{
|
||||
write_log(DEBUG, "unshare %s", msg);
|
||||
/*
|
||||
* Kernels prior to v4.3 may return EINVAL on unshare when another process
|
||||
* reads runc's /proc/$PID/status or /proc/$PID/maps. To work around this,
|
||||
* retry on EINVAL a few times.
|
||||
*/
|
||||
int retries = 5;
|
||||
for (; retries > 0; retries--) {
|
||||
if (unshare(flags) == 0) {
|
||||
return;
|
||||
}
|
||||
if (errno != EINVAL)
|
||||
break;
|
||||
}
|
||||
bail("failed to unshare %s", msg);
|
||||
}
|
||||
|
||||
void nsexec(void)
|
||||
{
|
||||
int pipenum;
|
||||
|
@ -1070,7 +1089,7 @@ void nsexec(void)
|
|||
|
||||
s = SYNC_MOUNTSOURCES_ACK;
|
||||
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
||||
kill(stage1_pid, SIGKILL);
|
||||
sane_kill(stage1_pid, SIGKILL);
|
||||
bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)");
|
||||
}
|
||||
break;
|
||||
|
@ -1170,9 +1189,7 @@ void nsexec(void)
|
|||
* problem.
|
||||
*/
|
||||
if (config.cloneflags & CLONE_NEWUSER) {
|
||||
write_log(DEBUG, "unshare user namespace");
|
||||
if (unshare(CLONE_NEWUSER) < 0)
|
||||
bail("failed to unshare user namespace");
|
||||
try_unshare(CLONE_NEWUSER, "user namespace");
|
||||
config.cloneflags &= ~CLONE_NEWUSER;
|
||||
|
||||
/*
|
||||
|
@ -1224,15 +1241,13 @@ void nsexec(void)
|
|||
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
|
||||
* was broken, so we'll just do it the long way anyway.
|
||||
*/
|
||||
write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
|
||||
if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
|
||||
bail("failed to unshare remaining namespaces (except cgroupns)");
|
||||
try_unshare(config.cloneflags & ~CLONE_NEWCGROUP, "remaining namespaces (except cgroupns)");
|
||||
|
||||
/* Ask our parent to send the mount sources fds. */
|
||||
if (config.mountsources) {
|
||||
s = SYNC_MOUNTSOURCES_PLS;
|
||||
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
||||
kill(stage2_pid, SIGKILL);
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with parent: write(SYNC_MOUNTSOURCES_PLS)");
|
||||
}
|
||||
|
||||
|
@ -1241,11 +1256,11 @@ void nsexec(void)
|
|||
|
||||
/* Parent finished to send the mount sources fds. */
|
||||
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
||||
kill(stage2_pid, SIGKILL);
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with parent: read(SYNC_MOUNTSOURCES_ACK)");
|
||||
}
|
||||
if (s != SYNC_MOUNTSOURCES_ACK) {
|
||||
kill(stage2_pid, SIGKILL);
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
|
||||
}
|
||||
}
|
||||
|
@ -1344,8 +1359,7 @@ void nsexec(void)
|
|||
}
|
||||
|
||||
if (config.cloneflags & CLONE_NEWCGROUP) {
|
||||
if (unshare(CLONE_NEWCGROUP) < 0)
|
||||
bail("failed to unshare cgroup namespace");
|
||||
try_unshare(CLONE_NEWCGROUP, "cgroup namespace");
|
||||
}
|
||||
|
||||
write_log(DEBUG, "signal completion to stage-0");
|
||||
|
|
|
@ -39,13 +39,9 @@ type parentProcess interface {
|
|||
|
||||
// startTime returns the process start time.
|
||||
startTime() (uint64, error)
|
||||
|
||||
signal(os.Signal) error
|
||||
|
||||
externalDescriptors() []string
|
||||
|
||||
setExternalDescriptors(fds []string)
|
||||
|
||||
forwardChildLogs() chan error
|
||||
}
|
||||
|
||||
|
@ -303,7 +299,7 @@ type initProcess struct {
|
|||
logFilePair filePair
|
||||
config *initConfig
|
||||
manager cgroups.Manager
|
||||
intelRdtManager intelrdt.Manager
|
||||
intelRdtManager *intelrdt.Manager
|
||||
container *linuxContainer
|
||||
fds []string
|
||||
process *Process
|
||||
|
|
|
@ -80,6 +80,8 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds []int) (err
|
|||
// Therefore, we can access mountFds[i] without any concerns.
|
||||
if mountFds != nil && mountFds[i] != -1 {
|
||||
mountConfig.fd = &mountFds[i]
|
||||
} else {
|
||||
mountConfig.fd = nil
|
||||
}
|
||||
|
||||
if err := mountToRootfs(m, mountConfig); err != nil {
|
||||
|
@ -327,26 +329,41 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
|
|||
if err := os.MkdirAll(dest, 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
return utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
|
||||
if err := mount(m.Source, m.Destination, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
|
||||
// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
|
||||
if errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY) {
|
||||
src := fs2.UnifiedMountpoint
|
||||
if c.cgroupns && c.cgroup2Path != "" {
|
||||
// Emulate cgroupns by bind-mounting
|
||||
// the container cgroup path rather than
|
||||
// the whole /sys/fs/cgroup.
|
||||
src = c.cgroup2Path
|
||||
}
|
||||
err = mount(src, m.Destination, procfd, "", uintptr(m.Flags)|unix.MS_BIND, "")
|
||||
if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
|
||||
err = nil
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
|
||||
return mount(m.Source, m.Destination, procfd, "cgroup2", uintptr(m.Flags), m.Data)
|
||||
})
|
||||
if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) {
|
||||
return err
|
||||
}
|
||||
|
||||
// When we are in UserNS but CgroupNS is not unshared, we cannot mount
|
||||
// cgroup2 (#2158), so fall back to bind mount.
|
||||
bindM := &configs.Mount{
|
||||
Device: "bind",
|
||||
Source: fs2.UnifiedMountpoint,
|
||||
Destination: m.Destination,
|
||||
Flags: unix.MS_BIND | m.Flags,
|
||||
PropagationFlags: m.PropagationFlags,
|
||||
}
|
||||
if c.cgroupns && c.cgroup2Path != "" {
|
||||
// Emulate cgroupns by bind-mounting the container cgroup path
|
||||
// rather than the whole /sys/fs/cgroup.
|
||||
bindM.Source = c.cgroup2Path
|
||||
}
|
||||
// mountToRootfs() handles remounting for MS_RDONLY.
|
||||
// No need to set c.fd here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate().
|
||||
err = mountToRootfs(bindM, c)
|
||||
if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
|
||||
// ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed
|
||||
// outside the userns+mountns.
|
||||
//
|
||||
// Mask `/sys/fs/cgroup` to ensure it is read-only, even when `/sys` is mounted
|
||||
// with `rbind,ro` (`runc spec --rootless` produces `rbind,ro` for `/sys`).
|
||||
err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
|
||||
return maskPath(procfd, c.label)
|
||||
})
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
|
||||
|
@ -396,6 +413,35 @@ func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
|
|||
|
||||
func mountToRootfs(m *configs.Mount, c *mountConfig) error {
|
||||
rootfs := c.root
|
||||
|
||||
// procfs and sysfs are special because we need to ensure they are actually
|
||||
// mounted on a specific path in a container without any funny business.
|
||||
switch m.Device {
|
||||
case "proc", "sysfs":
|
||||
// If the destination already exists and is not a directory, we bail
|
||||
// out. This is to avoid mounting through a symlink or similar -- which
|
||||
// has been a "fun" attack scenario in the past.
|
||||
// TODO: This won't be necessary once we switch to libpathrs and we can
|
||||
// stop all of these symlink-exchange attacks.
|
||||
dest := filepath.Clean(m.Destination)
|
||||
if !strings.HasPrefix(dest, rootfs) {
|
||||
// Do not use securejoin as it resolves symlinks.
|
||||
dest = filepath.Join(rootfs, dest)
|
||||
}
|
||||
if fi, err := os.Lstat(dest); err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
return err
|
||||
}
|
||||
} else if !fi.IsDir() {
|
||||
return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device)
|
||||
}
|
||||
if err := os.MkdirAll(dest, 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
// Selinux kernels do not support labeling of /proc or /sys.
|
||||
return mountPropagate(m, rootfs, "", nil)
|
||||
}
|
||||
|
||||
mountLabel := c.label
|
||||
mountFd := c.fd
|
||||
dest, err := securejoin.SecureJoin(rootfs, m.Destination)
|
||||
|
@ -404,24 +450,6 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
|
|||
}
|
||||
|
||||
switch m.Device {
|
||||
case "proc", "sysfs":
|
||||
// If the destination already exists and is not a directory, we bail
|
||||
// out This is to avoid mounting through a symlink or similar -- which
|
||||
// has been a "fun" attack scenario in the past.
|
||||
// TODO: This won't be necessary once we switch to libpathrs and we can
|
||||
// stop all of these symlink-exchange attacks.
|
||||
if fi, err := os.Lstat(dest); err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
return err
|
||||
}
|
||||
} else if fi.Mode()&os.ModeDir == 0 {
|
||||
return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device)
|
||||
}
|
||||
if err := os.MkdirAll(dest, 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
// Selinux kernels do not support labeling of /proc or /sys
|
||||
return mountPropagate(m, rootfs, "", nil)
|
||||
case "mqueue":
|
||||
if err := os.MkdirAll(dest, 0o755); err != nil {
|
||||
return err
|
||||
|
@ -431,11 +459,16 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
|
|||
}
|
||||
return label.SetFileLabel(dest, mountLabel)
|
||||
case "tmpfs":
|
||||
stat, err := os.Stat(dest)
|
||||
if err != nil {
|
||||
if stat, err := os.Stat(dest); err != nil {
|
||||
if err := os.MkdirAll(dest, 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode()))
|
||||
if m.Data != "" {
|
||||
dt = dt + "," + m.Data
|
||||
}
|
||||
m.Data = dt
|
||||
}
|
||||
|
||||
if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
|
||||
|
@ -444,16 +477,7 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
|
|||
err = mountPropagate(m, rootfs, mountLabel, nil)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if stat != nil {
|
||||
if err = os.Chmod(dest, stat.Mode()); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
return err
|
||||
case "bind":
|
||||
if err := prepareBindMount(m, rootfs, mountFd); err != nil {
|
||||
return err
|
||||
|
@ -577,6 +601,7 @@ func checkProcMount(rootfs, dest, source string) error {
|
|||
"/proc/loadavg",
|
||||
"/proc/slabinfo",
|
||||
"/proc/net/dev",
|
||||
"/proc/sys/kernel/ns_last_pid",
|
||||
}
|
||||
for _, valid := range validProcMounts {
|
||||
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
|
||||
|
|
|
@ -38,6 +38,14 @@ func TestCheckMountDestFalsePositive(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestCheckMountDestNsLastPid(t *testing.T) {
|
||||
dest := "/rootfs/proc/sys/kernel/ns_last_pid"
|
||||
err := checkProcMount("/rootfs", dest, "/proc")
|
||||
if err != nil {
|
||||
t.Fatal("/proc/sys/kernel/ns_last_pid should not return an error")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNeedsSetupDev(t *testing.T) {
|
||||
config := &configs.Config{
|
||||
Mounts: []*configs.Mount{
|
||||
|
|
|
@ -29,13 +29,15 @@ func KnownOperators() []string {
|
|||
}
|
||||
|
||||
var actions = map[string]configs.Action{
|
||||
"SCMP_ACT_KILL": configs.Kill,
|
||||
"SCMP_ACT_ERRNO": configs.Errno,
|
||||
"SCMP_ACT_TRAP": configs.Trap,
|
||||
"SCMP_ACT_ALLOW": configs.Allow,
|
||||
"SCMP_ACT_TRACE": configs.Trace,
|
||||
"SCMP_ACT_LOG": configs.Log,
|
||||
"SCMP_ACT_NOTIFY": configs.Notify,
|
||||
"SCMP_ACT_KILL": configs.Kill,
|
||||
"SCMP_ACT_ERRNO": configs.Errno,
|
||||
"SCMP_ACT_TRAP": configs.Trap,
|
||||
"SCMP_ACT_ALLOW": configs.Allow,
|
||||
"SCMP_ACT_TRACE": configs.Trace,
|
||||
"SCMP_ACT_LOG": configs.Log,
|
||||
"SCMP_ACT_NOTIFY": configs.Notify,
|
||||
"SCMP_ACT_KILL_THREAD": configs.KillThread,
|
||||
"SCMP_ACT_KILL_PROCESS": configs.KillProcess,
|
||||
}
|
||||
|
||||
// KnownActions returns the list of the known actions.
|
||||
|
@ -64,6 +66,7 @@ var archs = map[string]string{
|
|||
"SCMP_ARCH_PPC": "ppc",
|
||||
"SCMP_ARCH_PPC64": "ppc64",
|
||||
"SCMP_ARCH_PPC64LE": "ppc64le",
|
||||
"SCMP_ARCH_RISCV64": "riscv64",
|
||||
"SCMP_ARCH_S390": "s390",
|
||||
"SCMP_ARCH_S390X": "s390x",
|
||||
}
|
||||
|
|
|
@ -48,6 +48,13 @@ const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;
|
|||
#endif
|
||||
const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER;
|
||||
|
||||
#ifndef AUDIT_ARCH_RISCV64
|
||||
#ifndef EM_RISCV
|
||||
#define EM_RISCV 243
|
||||
#endif
|
||||
#define AUDIT_ARCH_RISCV64 (EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
|
||||
#endif
|
||||
|
||||
// We use the AUDIT_ARCH_* values because those are the ones used by the kernel
|
||||
// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
|
||||
// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.
|
||||
|
@ -67,11 +74,17 @@ const uint32_t C_AUDIT_ARCH_PPC64 = AUDIT_ARCH_PPC64;
|
|||
const uint32_t C_AUDIT_ARCH_PPC64LE = AUDIT_ARCH_PPC64LE;
|
||||
const uint32_t C_AUDIT_ARCH_S390 = AUDIT_ARCH_S390;
|
||||
const uint32_t C_AUDIT_ARCH_S390X = AUDIT_ARCH_S390X;
|
||||
const uint32_t C_AUDIT_ARCH_RISCV64 = AUDIT_ARCH_RISCV64;
|
||||
*/
|
||||
import "C"
|
||||
|
||||
var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)
|
||||
|
||||
// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
|
||||
// syscalls will end up with this syscall number, so we need to explicitly
|
||||
// return -ENOSYS for this syscall on those architectures.
|
||||
const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0
|
||||
|
||||
func isAllowAction(action configs.Action) bool {
|
||||
switch action {
|
||||
// Trace is considered an "allow" action because a good tracer should
|
||||
|
@ -197,6 +210,8 @@ func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
|
|||
return nativeArch(C.C_AUDIT_ARCH_S390), nil
|
||||
case libseccomp.ArchS390X:
|
||||
return nativeArch(C.C_AUDIT_ARCH_S390X), nil
|
||||
case libseccomp.ArchRISCV64:
|
||||
return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil
|
||||
default:
|
||||
return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
|
||||
}
|
||||
|
@ -305,7 +320,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
|
|||
// directly from the arch code so we need to do it here. Sadly we can't
|
||||
// share this code between architecture branches.
|
||||
section := []bpf.Instruction{
|
||||
// load [0]
|
||||
// load [0] (syscall number)
|
||||
bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4.
|
||||
}
|
||||
|
||||
|
@ -314,10 +329,37 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
|
|||
// No syscalls found for this arch -- skip it and move on.
|
||||
continue
|
||||
case 1:
|
||||
// Get the only syscall in the map.
|
||||
var sysno libseccomp.ScmpSyscall
|
||||
for _, no := range maxSyscalls {
|
||||
// Get the only syscall and scmpArch in the map.
|
||||
var (
|
||||
scmpArch libseccomp.ScmpArch
|
||||
sysno libseccomp.ScmpSyscall
|
||||
)
|
||||
for arch, no := range maxSyscalls {
|
||||
sysno = no
|
||||
scmpArch = arch
|
||||
}
|
||||
|
||||
switch scmpArch {
|
||||
// Return -ENOSYS for setup(2) on s390(x). This syscall is used for
|
||||
// multiplexing "large syscall number" syscalls, but if the syscall
|
||||
// number is not known to the kernel then the syscall number is
|
||||
// left unchanged (and because it is sysno=0, you'll end up with
|
||||
// EPERM for syscalls the kernel doesn't know about).
|
||||
//
|
||||
// The actual setup(2) syscall is never used by userspace anymore
|
||||
// (and hasn't existed for decades) outside of this multiplexing
|
||||
// scheme so returning -ENOSYS is fine.
|
||||
case libseccomp.ArchS390, libseccomp.ArchS390X:
|
||||
section = append(section, []bpf.Instruction{
|
||||
// jne [setup=0],1
|
||||
bpf.JumpIf{
|
||||
Cond: bpf.JumpNotEqual,
|
||||
Val: uint32(s390xMultiplexSyscall),
|
||||
SkipTrue: 1,
|
||||
},
|
||||
// ret [ENOSYS]
|
||||
bpf.RetConstant{Val: retErrnoEnosys},
|
||||
}...)
|
||||
}
|
||||
|
||||
// The simplest case just boils down to a single jgt instruction,
|
||||
|
@ -349,12 +391,6 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
|
|||
// If we're on x86 we need to add a check for x32 and if we're in
|
||||
// the wrong mode we jump over the section.
|
||||
if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
|
||||
// Grab the only architecture in the map.
|
||||
var scmpArch libseccomp.ScmpArch
|
||||
for arch := range maxSyscalls {
|
||||
scmpArch = arch
|
||||
}
|
||||
|
||||
// Generate a prefix to check the mode.
|
||||
switch scmpArch {
|
||||
case libseccomp.ArchAMD64:
|
||||
|
@ -512,7 +548,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
|
|||
|
||||
// Prepend the load instruction for the architecture.
|
||||
programTail = append([]bpf.Instruction{
|
||||
// load [4]
|
||||
// load [4] (architecture)
|
||||
bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4.
|
||||
}, programTail...)
|
||||
|
||||
|
|
|
@ -213,6 +213,19 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
|
|||
})
|
||||
}
|
||||
|
||||
// If we're on s390(x) make sure you get -ENOSYS for the "setup"
|
||||
// syscall (this is done to work around an issue with s390x's
|
||||
// syscall multiplexing which results in unknown syscalls being a
|
||||
// setup(2) invocation).
|
||||
switch scmpArch {
|
||||
case libseccomp.ArchS390, libseccomp.ArchS390X:
|
||||
syscallTests = append(syscallTests, syscallTest{
|
||||
sysno: s390xMultiplexSyscall,
|
||||
syscall: "setup",
|
||||
expected: retErrnoEnosys,
|
||||
})
|
||||
}
|
||||
|
||||
// Test syscalls in the explicit list.
|
||||
for _, test := range syscallTests {
|
||||
// Override the expected value in the two special cases.
|
||||
|
@ -282,7 +295,7 @@ func TestDisassembleHugeFilterDoesNotHang(t *testing.T) {
|
|||
}
|
||||
|
||||
for i := 1; i < 10000; i++ {
|
||||
if err := hugeFilter.AddRule(libseccomp.ScmpSyscall(i), libseccomp.ActKill); err != nil {
|
||||
if err := hugeFilter.AddRule(libseccomp.ScmpSyscall(i), libseccomp.ActKillThread); err != nil {
|
||||
t.Fatalf("failed to add rule to filter %d: %v", i, err)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -113,8 +113,8 @@ func InitSeccomp(config *configs.Seccomp) (int, error) {
|
|||
// Convert Libcontainer Action to Libseccomp ScmpAction
|
||||
func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error) {
|
||||
switch act {
|
||||
case configs.Kill:
|
||||
return libseccomp.ActKill, nil
|
||||
case configs.Kill, configs.KillThread:
|
||||
return libseccomp.ActKillThread, nil
|
||||
case configs.Errno:
|
||||
if errnoRet != nil {
|
||||
return libseccomp.ActErrno.SetReturnCode(int16(*errnoRet)), nil
|
||||
|
@ -133,8 +133,6 @@ func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error
|
|||
return libseccomp.ActLog, nil
|
||||
case configs.Notify:
|
||||
return libseccomp.ActNotify, nil
|
||||
case configs.KillThread:
|
||||
return libseccomp.ActKillThread, nil
|
||||
case configs.KillProcess:
|
||||
return libseccomp.ActKillProcess, nil
|
||||
default:
|
||||
|
|
|
@ -4,6 +4,7 @@ import (
|
|||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"strconv"
|
||||
|
||||
"github.com/opencontainers/selinux/go-selinux"
|
||||
|
@ -14,6 +15,7 @@ import (
|
|||
"github.com/opencontainers/runc/libcontainer/keys"
|
||||
"github.com/opencontainers/runc/libcontainer/seccomp"
|
||||
"github.com/opencontainers/runc/libcontainer/system"
|
||||
"github.com/opencontainers/runc/libcontainer/utils"
|
||||
)
|
||||
|
||||
// linuxSetnsInit performs the container's initialization for running a new process
|
||||
|
@ -82,6 +84,21 @@ func (l *linuxSetnsInit) Init() error {
|
|||
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Check for the arg before waiting to make sure it exists and it is
|
||||
// returned as a create time error.
|
||||
name, err := exec.LookPath(l.config.Args[0])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// exec.LookPath in Go < 1.20 might return no error for an executable
|
||||
// residing on a file system mounted with noexec flag, so perform this
|
||||
// extra check now while we can still return a proper error.
|
||||
// TODO: remove this once go < 1.20 is not supported.
|
||||
if err := eaccess(name); err != nil {
|
||||
return &os.PathError{Op: "eaccess", Path: name, Err: err}
|
||||
}
|
||||
|
||||
// Set seccomp as close to execve as possible, so as few syscalls take
|
||||
// place afterward (reducing the amount of syscalls that users need to
|
||||
// enable in their seccomp profiles).
|
||||
|
@ -101,5 +118,23 @@ func (l *linuxSetnsInit) Init() error {
|
|||
return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
|
||||
}
|
||||
|
||||
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
|
||||
// Close all file descriptors we are not passing to the container. This is
|
||||
// necessary because the execve target could use internal runc fds as the
|
||||
// execve path, potentially giving access to binary files from the host
|
||||
// (which can then be opened by container processes, leading to container
|
||||
// escapes). Note that because this operation will close any open file
|
||||
// descriptors that are referenced by (*os.File) handles from underneath
|
||||
// the Go runtime, we must not do any file operations after this point
|
||||
// (otherwise the (*os.File) finaliser could close the wrong file). See
|
||||
// CVE-2024-21626 for more information as to why this protection is
|
||||
// necessary.
|
||||
//
|
||||
// This is not needed for runc-dmz, because the extra execve(2) step means
|
||||
// that all O_CLOEXEC file descriptors have already been closed and thus
|
||||
// the second execve(2) from runc-dmz cannot access internal file
|
||||
// descriptors from runc.
|
||||
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
|
||||
return err
|
||||
}
|
||||
return system.Exec(name, l.config.Args[0:], os.Environ())
|
||||
}
|
||||
|
|
|
@ -18,6 +18,7 @@ import (
|
|||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/opencontainers/runc/libcontainer/devices"
|
||||
"github.com/opencontainers/runc/libcontainer/seccomp"
|
||||
"github.com/opencontainers/runc/libcontainer/userns"
|
||||
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/sirupsen/logrus"
|
||||
|
@ -176,18 +177,19 @@ func KnownMountOptions() []string {
|
|||
// AllowedDevices is the set of devices which are automatically included for
|
||||
// all containers.
|
||||
//
|
||||
// XXX (cyphar)
|
||||
// This behaviour is at the very least "questionable" (if not outright
|
||||
// wrong) according to the runtime-spec.
|
||||
// # XXX (cyphar)
|
||||
//
|
||||
// Yes, we have to include certain devices other than the ones the user
|
||||
// specifies, but several devices listed here are not part of the spec
|
||||
// (including "mknod for any device"?!). In addition, these rules are
|
||||
// appended to the user-provided set which means that users *cannot disable
|
||||
// this behaviour*.
|
||||
// This behaviour is at the very least "questionable" (if not outright
|
||||
// wrong) according to the runtime-spec.
|
||||
//
|
||||
// ... unfortunately I'm too scared to change this now because who knows how
|
||||
// many people depend on this (incorrect and arguably insecure) behaviour.
|
||||
// Yes, we have to include certain devices other than the ones the user
|
||||
// specifies, but several devices listed here are not part of the spec
|
||||
// (including "mknod for any device"?!). In addition, these rules are
|
||||
// appended to the user-provided set which means that users *cannot disable
|
||||
// this behaviour*.
|
||||
//
|
||||
// ... unfortunately I'm too scared to change this now because who knows how
|
||||
// many people depend on this (incorrect and arguably insecure) behaviour.
|
||||
var AllowedDevices = []*devices.Device{
|
||||
// allow mknod for any device
|
||||
{
|
||||
|
@ -925,9 +927,9 @@ next:
|
|||
func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
|
||||
create := func(m specs.LinuxIDMapping) configs.IDMap {
|
||||
return configs.IDMap{
|
||||
HostID: int(m.HostID),
|
||||
ContainerID: int(m.ContainerID),
|
||||
Size: int(m.Size),
|
||||
HostID: int64(m.HostID),
|
||||
ContainerID: int64(m.ContainerID),
|
||||
Size: int64(m.Size),
|
||||
}
|
||||
}
|
||||
if spec.Linux != nil {
|
||||
|
@ -938,6 +940,40 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
|
|||
config.GidMappings = append(config.GidMappings, create(m))
|
||||
}
|
||||
}
|
||||
if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" {
|
||||
// Cache the current userns mappings in our configuration, so that we
|
||||
// can calculate uid and gid mappings within runc. These mappings are
|
||||
// never used for configuring the container if the path is set.
|
||||
uidMap, gidMap, err := userns.GetUserNamespaceMappings(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to cache mappings for userns: %w", err)
|
||||
}
|
||||
// We cannot allow uid or gid mappings to be set if we are also asked
|
||||
// to join a userns.
|
||||
if config.UidMappings != nil || config.GidMappings != nil {
|
||||
// FIXME: It turns out that containerd and CRIO pass both a userns
|
||||
// path and the mappings of the namespace in the same config.json.
|
||||
// Such a configuration is technically not valid, but we used to
|
||||
// require mappings be specified, and thus users worked around our
|
||||
// bug -- so we can't regress it at the moment. But we also don't
|
||||
// want to produce broken behaviour if the mapping doesn't match
|
||||
// the userns. So (for now) we output a warning if the actual
|
||||
// userns mappings match the configuration, otherwise we return an
|
||||
// error.
|
||||
if !userns.IsSameMapping(uidMap, config.UidMappings) ||
|
||||
!userns.IsSameMapping(gidMap, config.GidMappings) {
|
||||
return errors.New("user namespaces enabled, but both namespace path and non-matching mapping specified -- you may only provide one")
|
||||
}
|
||||
logrus.Warnf("config.json has both a userns path to join and a matching userns mapping specified -- you may only provide one. Future versions of runc may return an error with this configuration, please report a bug on <https://github.com/opencontainers/runc> if you see this warning and cannot update your configuration.")
|
||||
}
|
||||
|
||||
config.UidMappings = uidMap
|
||||
config.GidMappings = gidMap
|
||||
logrus.WithFields(logrus.Fields{
|
||||
"uid_map": uidMap,
|
||||
"gid_map": gidMap,
|
||||
}).Debugf("config uses path-based userns configuration -- current uid and gid mappings cached")
|
||||
}
|
||||
rootUID, err := config.HostRootUID()
|
||||
if err != nil {
|
||||
return err
|
||||
|
|
|
@ -234,6 +234,14 @@ func TestSetupSeccomp(t *testing.T) {
|
|||
Names: []string{"mknod"},
|
||||
Action: "SCMP_ACT_NOTIFY",
|
||||
},
|
||||
{
|
||||
Names: []string{"rmdir"},
|
||||
Action: "SCMP_ACT_KILL_THREAD",
|
||||
},
|
||||
{
|
||||
Names: []string{"mkdir"},
|
||||
Action: "SCMP_ACT_KILL_PROCESS",
|
||||
},
|
||||
},
|
||||
}
|
||||
seccomp, err := SetupSeccomp(conf)
|
||||
|
@ -263,9 +271,8 @@ func TestSetupSeccomp(t *testing.T) {
|
|||
|
||||
calls := seccomp.Syscalls
|
||||
|
||||
callsLength := len(calls)
|
||||
if callsLength != 8 {
|
||||
t.Errorf("Expected 8 syscalls, got :%d", callsLength)
|
||||
if len(calls) != len(conf.Syscalls) {
|
||||
t.Error("Mismatched number of syscalls")
|
||||
}
|
||||
|
||||
for _, call := range calls {
|
||||
|
@ -317,6 +324,14 @@ func TestSetupSeccomp(t *testing.T) {
|
|||
if call.Action != configs.Notify {
|
||||
t.Errorf("Wrong conversion for the %s syscall action", call.Name)
|
||||
}
|
||||
case "rmdir":
|
||||
if call.Action != configs.KillThread {
|
||||
t.Errorf("Wrong conversion for the %s syscall action", call.Name)
|
||||
}
|
||||
case "mkdir":
|
||||
if call.Action != configs.KillProcess {
|
||||
t.Errorf("Wrong conversion for the %s syscall action", call.Name)
|
||||
}
|
||||
default:
|
||||
t.Errorf("Unexpected syscall %s found", call.Name)
|
||||
}
|
||||
|
@ -595,6 +610,40 @@ func TestDupNamespaces(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestUserNamespaceMappingAndPath(t *testing.T) {
|
||||
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
|
||||
t.Skip("Test requires userns.")
|
||||
}
|
||||
|
||||
spec := &specs.Spec{
|
||||
Root: &specs.Root{
|
||||
Path: "rootfs",
|
||||
},
|
||||
Linux: &specs.Linux{
|
||||
UIDMappings: []specs.LinuxIDMapping{
|
||||
{ContainerID: 0, HostID: 1000, Size: 1000},
|
||||
},
|
||||
GIDMappings: []specs.LinuxIDMapping{
|
||||
{ContainerID: 0, HostID: 2000, Size: 1000},
|
||||
},
|
||||
Namespaces: []specs.LinuxNamespace{
|
||||
{
|
||||
Type: "user",
|
||||
Path: "/proc/1/ns/user",
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
_, err := CreateLibcontainerConfig(&CreateOpts{
|
||||
Spec: spec,
|
||||
})
|
||||
|
||||
if !strings.Contains(err.Error(), "both namespace path and non-matching mapping specified") {
|
||||
t.Errorf("user namespace with path and non-matching mapping should be forbidden, got error %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNonZeroEUIDCompatibleSpecconvValidate(t *testing.T) {
|
||||
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
|
||||
t.Skip("Test requires userns.")
|
||||
|
|
|
@ -17,6 +17,7 @@ import (
|
|||
"github.com/opencontainers/runc/libcontainer/keys"
|
||||
"github.com/opencontainers/runc/libcontainer/seccomp"
|
||||
"github.com/opencontainers/runc/libcontainer/system"
|
||||
"github.com/opencontainers/runc/libcontainer/utils"
|
||||
)
|
||||
|
||||
type linuxStandardInit struct {
|
||||
|
@ -198,6 +199,14 @@ func (l *linuxStandardInit) Init() error {
|
|||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// exec.LookPath in Go < 1.20 might return no error for an executable
|
||||
// residing on a file system mounted with noexec flag, so perform this
|
||||
// extra check now while we can still return a proper error.
|
||||
// TODO: remove this once go < 1.20 is not supported.
|
||||
if err := eaccess(name); err != nil {
|
||||
return &os.PathError{Op: "eaccess", Path: name, Err: err}
|
||||
}
|
||||
|
||||
// Set seccomp as close to execve as possible, so as few syscalls take
|
||||
// place afterward (reducing the amount of syscalls that users need to
|
||||
// enable in their seccomp profiles). However, this needs to be done
|
||||
|
@ -250,5 +259,23 @@ func (l *linuxStandardInit) Init() error {
|
|||
return err
|
||||
}
|
||||
|
||||
// Close all file descriptors we are not passing to the container. This is
|
||||
// necessary because the execve target could use internal runc fds as the
|
||||
// execve path, potentially giving access to binary files from the host
|
||||
// (which can then be opened by container processes, leading to container
|
||||
// escapes). Note that because this operation will close any open file
|
||||
// descriptors that are referenced by (*os.File) handles from underneath
|
||||
// the Go runtime, we must not do any file operations after this point
|
||||
// (otherwise the (*os.File) finaliser could close the wrong file). See
|
||||
// CVE-2024-21626 for more information as to why this protection is
|
||||
// necessary.
|
||||
//
|
||||
// This is not needed for runc-dmz, because the extra execve(2) step means
|
||||
// that all O_CLOEXEC file descriptors have already been closed and thus
|
||||
// the second execve(2) from runc-dmz cannot access internal file
|
||||
// descriptors from runc.
|
||||
if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
|
||||
return err
|
||||
}
|
||||
return system.Exec(name, l.config.Args[0:], os.Environ())
|
||||
}
|
||||
|
|
|
@ -15,16 +15,16 @@ type syncType string
|
|||
// during container setup. They come in pairs (with procError being a generic
|
||||
// response which is followed by an &initError).
|
||||
//
|
||||
// [ child ] <-> [ parent ]
|
||||
// [ child ] <-> [ parent ]
|
||||
//
|
||||
// procHooks --> [run hooks]
|
||||
// <-- procResume
|
||||
// procHooks --> [run hooks]
|
||||
// <-- procResume
|
||||
//
|
||||
// procReady --> [final setup]
|
||||
// <-- procRun
|
||||
// procReady --> [final setup]
|
||||
// <-- procRun
|
||||
//
|
||||
// procSeccomp --> [pick up seccomp fd with pidfd_getfd()]
|
||||
// <-- procSeccompDone
|
||||
// procSeccomp --> [pick up seccomp fd with pidfd_getfd()]
|
||||
// <-- procSeccompDone
|
||||
const (
|
||||
procError syncType = "procError"
|
||||
procReady syncType = "procReady"
|
||||
|
|
|
@ -201,7 +201,7 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
|
|||
if err != nil {
|
||||
// We should return no error if EOF is reached
|
||||
// without a match.
|
||||
if err == io.EOF { //nolint:errorlint // comparison with io.EOF is legit, https://github.com/polyfloyd/go-errorlint/pull/12
|
||||
if err == io.EOF {
|
||||
err = nil
|
||||
}
|
||||
return out, err
|
||||
|
@ -280,13 +280,13 @@ func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath
|
|||
// found in any entry in passwd and group respectively.
|
||||
//
|
||||
// Examples of valid user specifications are:
|
||||
// * ""
|
||||
// * "user"
|
||||
// * "uid"
|
||||
// * "user:group"
|
||||
// * "uid:gid"
|
||||
// * "user:gid"
|
||||
// * "uid:group"
|
||||
// - ""
|
||||
// - "user"
|
||||
// - "uid"
|
||||
// - "user:group"
|
||||
// - "uid:gid"
|
||||
// - "user:gid"
|
||||
// - "uid:group"
|
||||
//
|
||||
// It should be noted that if you specify a numeric user or group id, they will
|
||||
// not be evaluated as usernames (only the metadata will be filled). So attempting
|
||||
|
|
|
@ -0,0 +1,79 @@
|
|||
#define _GNU_SOURCE
|
||||
#include <fcntl.h>
|
||||
#include <sched.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/*
|
||||
* All of the code here is run inside an async-signal-safe context, so we need
|
||||
* to be careful to not call any functions that could cause issues. In theory,
|
||||
* since we are a Go program, there are fewer restrictions, but in practice it's
|
||||
* better to be safe than sorry.
|
||||
*
|
||||
* The only exception is exit, which we need to call to make sure we don't
|
||||
* return into runc.
|
||||
*/
|
||||
|
||||
/*
 * Write a printf-style formatted message to the file descriptor pipefd and
 * then terminate the current process with exit status 1. This function never
 * returns to its caller.
 */
void bail(int pipefd, const char *fmt, ...)
{
	va_list ap;

	/* Forward the variadic arguments straight to vdprintf(). */
	va_start(ap, fmt);
	vdprintf(pipefd, fmt, ap);
	va_end(ap);

	exit(1);
}
|
||||
|
||||
/*
 * Fork a child process that joins the user namespace referenced by
 * userns_path and then copies the contents of path to outfd.
 *
 * In the parent, the return value of fork() is returned unchanged: the pid of
 * the child, or a negative value if fork() failed. The child never returns
 * from this function: it exits with status 0 once the file has been copied,
 * or calls bail() (which writes a message to errfd and exits with status 1)
 * on any failure.
 */
int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd)
{
	char buffer[4096] = { 0 };

	pid_t child = fork();
	if (child != 0)
		return child;
	/* in child */

	/* Join the target userns. */
	int nsfd = open(userns_path, O_RDONLY);
	if (nsfd < 0)
		bail(errfd, "open userns path %s failed: %m", userns_path);

	int err = setns(nsfd, CLONE_NEWUSER);
	if (err < 0)
		bail(errfd, "setns %s failed: %m", userns_path);

	close(nsfd);

	/* Pipe the requested file contents. */
	int fd = open(path, O_RDONLY);
	if (fd < 0)
		bail(errfd, "open %s in userns %s failed: %m", path, userns_path);

	int nread, ntotal = 0;
	while ((nread = read(fd, buffer, sizeof(buffer))) != 0) {
		if (nread < 0)
			bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal);
		ntotal += nread;

		int nwritten = 0;
		while (nwritten < nread) {
			/*
			 * write(2) may write fewer bytes than requested, so resume
			 * from the first unwritten byte instead of re-sending the
			 * start of the buffer.
			 */
			int n = write(outfd, buffer + nwritten, nread - nwritten);
			if (n < 0)
				bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m",
				     nread - nwritten, path, nwritten);
			nwritten += n;
		}
		if (nread != nwritten)
			bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten);
	}

	close(fd);
	close(outfd);
	close(errfd);

	/* We must exit here, otherwise we would return into a forked runc. */
	exit(0);
}
|
|
@ -0,0 +1,186 @@
|
|||
//go:build linux
|
||||
|
||||
package userns
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"unsafe"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
/*
|
||||
#include <stdlib.h>
|
||||
extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd);
|
||||
*/
|
||||
import "C"
|
||||
|
||||
func parseIdmapData(data []byte) (ms []configs.IDMap, err error) {
|
||||
scanner := bufio.NewScanner(bytes.NewReader(data))
|
||||
for scanner.Scan() {
|
||||
var m configs.IDMap
|
||||
line := scanner.Text()
|
||||
if _, err := fmt.Sscanf(line, "%d %d %d", &m.ContainerID, &m.HostID, &m.Size); err != nil {
|
||||
return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", line, err)
|
||||
}
|
||||
ms = append(ms, m)
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return nil, fmt.Errorf("parsing id map failed: %w", err)
|
||||
}
|
||||
return ms, nil
|
||||
}
|
||||
|
||||
// Do something equivalent to nsenter --user=<nsPath> cat <path>, but more
|
||||
// efficiently. Returns the contents of the requested file from within the user
|
||||
// namespace.
|
||||
func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) {
|
||||
rdr, wtr, err := os.Pipe()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err)
|
||||
}
|
||||
defer rdr.Close()
|
||||
defer wtr.Close()
|
||||
|
||||
errRdr, errWtr, err := os.Pipe()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err)
|
||||
}
|
||||
defer errRdr.Close()
|
||||
defer errWtr.Close()
|
||||
|
||||
cNsPath := C.CString(nsPath)
|
||||
defer C.free(unsafe.Pointer(cNsPath))
|
||||
cPath := C.CString(path)
|
||||
defer C.free(unsafe.Pointer(cPath))
|
||||
|
||||
childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd()))
|
||||
|
||||
if childPid < 0 {
|
||||
return nil, fmt.Errorf("failed to spawn fork for userns")
|
||||
} else if childPid == 0 {
|
||||
// this should never happen
|
||||
panic("runc executing inside fork child -- unsafe state!")
|
||||
}
|
||||
|
||||
// We are in the parent -- close the write end of the pipe before reading.
|
||||
wtr.Close()
|
||||
output, err := io.ReadAll(rdr)
|
||||
rdr.Close()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading from userns spawn failed: %w", err)
|
||||
}
|
||||
|
||||
// Ditto for the error pipe.
|
||||
errWtr.Close()
|
||||
errOutput, err := io.ReadAll(errRdr)
|
||||
errRdr.Close()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err)
|
||||
}
|
||||
errOutput = bytes.TrimSpace(errOutput)
|
||||
|
||||
// Clean up the child.
|
||||
child, err := os.FindProcess(int(childPid))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("could not find userns spawn process: %w", err)
|
||||
}
|
||||
state, err := child.Wait()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err)
|
||||
}
|
||||
if !state.Success() {
|
||||
errStr := string(errOutput)
|
||||
if errStr == "" {
|
||||
errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode())
|
||||
}
|
||||
return nil, fmt.Errorf("userns spawn: %s", errStr)
|
||||
} else if len(errOutput) > 0 {
|
||||
// We can just ignore weird output in the error pipe if the process
|
||||
// didn't bail(), but for completeness output for debugging.
|
||||
logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput))
|
||||
}
|
||||
// The subprocess succeeded, return whatever it wrote to the pipe.
|
||||
return output, nil
|
||||
}
|
||||
|
||||
func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) {
|
||||
var (
|
||||
pid int
|
||||
extra rune
|
||||
tryFastPath bool
|
||||
)
|
||||
|
||||
// nsPath is usually of the form /proc/<pid>/ns/user, which means that we
|
||||
// already have a pid that is part of the user namespace and thus we can
|
||||
// just use the pid to read from /proc/<pid>/*id_map.
|
||||
//
|
||||
// Note that Sscanf doesn't consume the whole input, so we check for any
|
||||
// trailing data with %c. That way, we can be sure the pattern matched
|
||||
// /proc/$pid/ns/user _exactly_ iff n === 1.
|
||||
if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 {
|
||||
tryFastPath = pid > 0
|
||||
}
|
||||
|
||||
for _, mapType := range []struct {
|
||||
name string
|
||||
idMap *[]configs.IDMap
|
||||
}{
|
||||
{"uid_map", &uidMap},
|
||||
{"gid_map", &gidMap},
|
||||
} {
|
||||
var mapData []byte
|
||||
|
||||
if tryFastPath {
|
||||
path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name)
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
// Do not error out here -- we need to try the slow path if the
|
||||
// fast path failed.
|
||||
logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err)
|
||||
} else {
|
||||
mapData = data
|
||||
}
|
||||
} else {
|
||||
logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath)
|
||||
}
|
||||
|
||||
if mapData == nil {
|
||||
// We have to actually join the namespace if we cannot take the
|
||||
// fast path. The path is resolved with respect to the child
|
||||
// process, so just use /proc/self.
|
||||
data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
mapData = data
|
||||
}
|
||||
idMap, err := parseIdmapData(mapData)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err)
|
||||
}
|
||||
*mapType.idMap = idMap
|
||||
}
|
||||
|
||||
return uidMap, gidMap, nil
|
||||
}
|
||||
|
||||
// IsSameMapping returns whether or not the two id mappings are the same. Note
|
||||
// that if the order of the mappings is different, or a mapping has been split,
|
||||
// the mappings will be considered different.
|
||||
func IsSameMapping(a, b []configs.IDMap) bool {
|
||||
if len(a) != len(b) {
|
||||
return false
|
||||
}
|
||||
for idx := range a {
|
||||
if a[idx] != b[idx] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue