Import Upstream version 1.1.12+ds1

2024-04-30 18:13:09 +08:00 · 2024-04-30 18:13:09 +08:00 · 68d1261cab
parent a9cf6bd6d6
commit 68d1261cab
1005 changed files with 3396 additions and 289043 deletions
--- a/.cirrus.yml
+++ b/.cirrus.yml
@ -1,7 +1,8 @@
 ---
-# We use Cirrus for Vagrant tests and native CentOS 7 and 8, because macOS
-# instances of GHA are too slow and flaky, and Linux instances of GHA do not
-# support KVM.
+# We use Cirrus for CentOS (native) and Fedora (in Vagrant), because neither
+# CentOS nor Fedora is available on GHA natively, so the only option is VM.
+# In GHA, nested virtualization is only supported on macOS instances, which
+# are slow and flaky.

 # NOTE Cirrus execution environments lack a terminal, needed for
 # some integration tests. So we use `ssh -tt` command to fake a terminal.
@ -24,25 +25,31 @@ task:
    platform: linux
    nested_virtualization: true
    # CPU limit: `16 / NTASK`: see https://cirrus-ci.org/faq/#are-there-any-limits
-    cpu: 8
+    cpu: 4
    # Memory limit: `4GB * NCPU`
-    memory: 32G
+    memory: 16G

  host_info_script: |
    uname -a
-    echo "-----"
+    # -----
    cat /etc/os-release
-    echo "-----"
-    cat /proc/cpuinfo
-    echo "-----"
+    # -----
    df -T
+    # -----
+    cat /proc/cpuinfo
  install_libvirt_vagrant_script: |
+    curl -fsSL https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg
+    echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
+    sudo sed -i 's/^# deb-src/deb-src/' /etc/apt/sources.list
    apt-get update
-    apt-get install -y libvirt-daemon libvirt-daemon-system vagrant vagrant-libvirt
+    apt-get install -y libvirt-daemon libvirt-daemon-system vagrant
    systemctl enable --now libvirtd
+    apt-get build-dep -y vagrant ruby-libvirt
+    apt-get install -y --no-install-recommends libxslt-dev libxml2-dev libvirt-dev ruby-bundler ruby-dev zlib1g-dev
+    vagrant plugin install vagrant-libvirt
  vagrant_cache:
-    fingerprint_script: uname -s ; cat Vagrantfile.$DISTRO
-    folder: /root/.vagrant.d
+    fingerprint_script: cat Vagrantfile.$DISTRO
+    folder: /root/.vagrant.d/boxes
  vagrant_up_script: |
    ln -sf Vagrantfile.$DISTRO Vagrantfile
    # Retry if it fails (download.fedoraproject.org returns 404 sometimes)
@ -50,7 +57,9 @@ task:
    mkdir -p -m 0700 /root/.ssh
    vagrant ssh-config >> /root/.ssh/config
  guest_info_script: |
-    ssh default 'sh -exc "uname -a && systemctl --version && df -T && cat /etc/os-release"'
+    ssh default 'sh -exc "uname -a && systemctl --version && df -T && cat /etc/os-release && go version"'
+  check_config_script: |
+    ssh default /vagrant/script/check-config.sh
  unit_tests_script: |
    ssh default 'sudo -i make -C /vagrant localunittest'
  integration_systemd_script: |
@ -68,12 +77,14 @@ task:
  env:
    HOME: /root
    CIRRUS_WORKING_DIR: /home/runc
-    GO_VERSION: "1.17.3"
-    BATS_VERSION: "v1.3.0"
+    GO_VERSION: "1.20"
+    BATS_VERSION: "v1.9.0"
+    RPMS: gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs
    # yamllint disable rule:key-duplicates
    matrix:
      DISTRO: centos-7
      DISTRO: centos-stream-8
+      DISTRO: centos-stream-9

  name: ci / $DISTRO

@ -88,6 +99,8 @@ task:
    case $DISTRO in
    centos-7)
      (cd /etc/yum.repos.d && curl -O https://copr.fedorainfracloud.org/coprs/adrian/criu-el7/repo/epel-7/adrian-criu-el7-epel-7.repo)
+      # EPEL is needed for jq and fuse-sshfs.
+      rpm -q epel-release || rpm -Uvh https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
      # sysctl
      echo "user.max_user_namespaces=15076" > /etc/sysctl.d/userns.conf
      sysctl --system
@ -95,15 +108,32 @@ task:
    centos-stream-8)
      yum config-manager --set-enabled powertools # for glibc-static
      ;;
+    centos-stream-9)
+      dnf config-manager --set-enabled crb # for glibc-static
+      dnf -y install epel-release epel-next-release # for fuse-sshfs
+      # Delegate all cgroup v2 controllers to rootless user via --systemd-cgroup.
+      # The default (since systemd v252) is "pids memory cpu".
+      mkdir -p /etc/systemd/system/user@.service.d
+      printf "[Service]\nDelegate=yes\n" > /etc/systemd/system/user@.service.d/delegate.conf
+      systemctl daemon-reload
+      ;;
    esac
    # Work around dnf mirror failures by retrying a few times.
    for i in $(seq 0 2); do
      sleep $i
-      yum install -y -q gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs && break
+      yum install -y $RPMS && break
    done
    [ $? -eq 0 ] # fail if yum failed
+
+    # Double check that all rpms were installed (yum from CentOS 7
+    # does not exit with an error if some packages were not found).
+    # Use --whatprovides since some packages are renamed.
+    rpm -q --whatprovides $RPMS
    # install Go
-    curl -fsSL "https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz" | tar Cxz /usr/local
+    PREFIX="https://go.dev/dl/"
+    # Find out the latest minor release URL.
+    eval $(curl -fsSL "${PREFIX}?mode=json" | jq -r  --arg Ver "$GO_VERSION" '.[] | select(.version | startswith("go\($Ver)")) | .files[] | select(.os == "linux" and .arch == "amd64" and .kind == "archive") | "filename=\"" + .filename + "\""')
+    curl -fsSL "$PREFIX$filename" | tar Cxz /usr/local
    # install bats
    cd /tmp
    git clone https://github.com/bats-core/bats-core
@ -131,14 +161,18 @@ task:
    systemctl restart sshd
  host_info_script: |
    uname -a
-    echo "-----"
-    cat /etc/os-release
-    echo "-----"
-    cat /proc/cpuinfo
-    echo "-----"
-    df -T
-    echo "-----"
+    # -----
+    /usr/local/go/bin/go version
+    # -----
    systemctl --version
+    # -----
+    cat /etc/os-release
+    # -----
+    df -T
+    # -----
+    cat /proc/cpuinfo
+  check_config_script: |
+    /home/runc/script/check-config.sh
  unit_tests_script: |
    ssh -tt localhost "make -C /home/runc localunittest"
  integration_systemd_script: |
@ -146,13 +180,19 @@ task:
  integration_fs_script: |
    ssh -tt localhost "make -C /home/runc localintegration"
  integration_systemd_rootless_script: |
-    echo "SKIP: integration_systemd_rootless_script requires cgroup v2"
+    case $DISTRO in
+    centos-7|centos-stream-8)
+      echo "SKIP: integration_systemd_rootless_script requires cgroup v2"
+      ;;
+    *)
+      ssh -tt localhost "make -C /home/runc localrootlessintegration RUNC_USE_SYSTEMD=yes"
+    esac
  integration_fs_rootless_script: |
    case $DISTRO in
    centos-7)
      echo "SKIP: FIXME: integration_fs_rootless_script is skipped because of EPERM on writing cgroup.procs"
        ;;
-    centos-stream-8)
+    *)
      ssh -tt localhost "make -C /home/runc localrootlessintegration"
      ;;
    esac
--- a/.codespellrc
+++ b/.codespellrc
@ -1,3 +1,3 @@
 [codespell]
-skip = ./vendor,./.git
-ignore-words-list = clos,creat
+skip = ./vendor,./.git,./go.sum
+ignore-words-list = clos,mis
--- a/.editorconfig
+++ b/.editorconfig
@ -0,0 +1,8 @@
+# This file is used by shfmt. See https://EditorConfig.org
+
+# This is a top-most EditorConfig file.
+root = true
+
+# Ignore the entire "vendor" directory.
+[vendor/**]
+ignore = true
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -21,13 +21,13 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        go-version: [1.16.x, 1.17.x]
+        go-version: [1.17.x, 1.20.x, 1.21.x]
        rootless: ["rootless", ""]
        race: ["-race", ""]
        criu: [""]
        include:
          # Also test against latest criu-dev
-          - go-version: 1.17.x
+          - go-version: 1.20.x
            rootless: ""
            race: ""
            criu: "criu-dev"
@ -35,7 +35,7 @@ jobs:
    steps:

    - name: checkout
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3

    - name: install deps
      if: matrix.criu == ''
@ -43,7 +43,7 @@ jobs:
        REPO: https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_20.04
      run: |
        # criu repo
-        curl -fSsl $REPO/Release.key | sudo apt-key add -
+        curl -fSsLl $REPO/Release.key | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/devel_tools_criu.gpg > /dev/null
        echo "deb $REPO/ /" | sudo tee /etc/apt/sources.list.d/criu.list
        sudo apt update
        sudo apt install libseccomp-dev criu sshfs
@ -60,9 +60,8 @@ jobs:
        rm -rf ~/criu

    - name: install go ${{ matrix.go-version }}
-      uses: actions/setup-go@v2
+      uses: actions/setup-go@v4
      with:
-        stable: '!contains(${{ matrix.go-version }}, "beta") && !contains(${{ matrix.go-version }}, "rc")'
        go-version: ${{ matrix.go-version }}

    - name: build
@ -71,7 +70,7 @@ jobs:
    - name: install bats
      uses: mig4/setup-bats@v1
      with:
-        bats-version: 1.3.0
+        bats-version: 1.9.0

    - name: unit test
      if: matrix.rootless != 'rootless'
@ -105,7 +104,7 @@ jobs:
    steps:

    - name: checkout
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3

    - name: install deps
      run: |
@ -120,10 +119,9 @@ jobs:
        sudo apt -q install libseccomp-dev libseccomp-dev:i386 gcc-multilib criu

    - name: install go
-      uses: actions/setup-go@v2
+      uses: actions/setup-go@v4
      with:
        go-version: 1.x # Latest stable

    - name: unit test
-      # cgo is disabled by default when cross-compiling
-      run: sudo -E PATH="$PATH" -- make GOARCH=386 CGO_ENABLED=1 localunittest
+      run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest
--- a/.github/workflows/validate.yml
+++ b/.github/workflows/validate.yml
@ -7,41 +7,39 @@ on:
      - master
      - release-*
  pull_request:
+env:
+  GO_VERSION: 1.20.x

 jobs:
+  keyring:
+    runs-on: ubuntu-22.04
+    steps:
+    - uses: actions/checkout@v3
+    - name: check runc.keyring
+      run: make validate-keyring

  lint:
    runs-on: ubuntu-20.04
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+      - uses: actions/setup-go@v4
+        with:
+          go-version: "${{ env.GO_VERSION }}"
+          cache: false # golangci-lint-action does its own caching
      - name: install deps
        run: |
          sudo apt -q update
          sudo apt -q install libseccomp-dev
-      - uses: golangci/golangci-lint-action@v2
+      - uses: golangci/golangci-lint-action@v3
        with:
-          # must be specified without patch version
-          version: v1.42
-
-  lint-extra:
-    # Extra linters, only checking new code from pull requests.
-    if: github.event_name == 'pull_request'
-    runs-on: ubuntu-20.04
-    permissions:
-      contents: read
-    steps:
-      - uses: actions/checkout@v2
-      - name: install deps
+          version: v1.53
+      # Extra linters, only checking new code from a pull request.
+      - name: lint-extra
+        if: github.event_name == 'pull_request'
        run: |
-          sudo apt -q update
-          sudo apt -q install libseccomp-dev
-      - uses: golangci/golangci-lint-action@v2
-        with:
-          only-new-issues: true
-          args: --config .golangci-extra.yml
-          # must be specified without patch version
-          version: v1.43
-
+          golangci-lint run --config .golangci-extra.yml --new-from-rev=HEAD~1 --out-format=github-actions

  compile-buildtags:
    runs-on: ubuntu-20.04
@ -49,18 +47,18 @@ jobs:
      # Don't ignore C warnings. Note that the output of "go env CGO_CFLAGS" by default is "-g -O2", so we keep them.
      CGO_CFLAGS: -g -O2 -Werror
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - name: install go
-        uses: actions/setup-go@v2
+        uses: actions/setup-go@v4
        with:
-          go-version: 1.x # Latest stable
+          go-version: "${{ env.GO_VERSION }}"
      - name: compile with no build tags
        run: make BUILDTAGS=""

  codespell:
    runs-on: ubuntu-20.04
    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
    - name: install deps
      # Version of codespell bundled with Ubuntu is way old, so use pip.
      run: pip install codespell
@ -70,35 +68,19 @@ jobs:
  shfmt:
    runs-on: ubuntu-20.04
    steps:
-    - uses: actions/checkout@v2
-    - name: vars
-      run: |
-        echo "VERSION=3.3.1" >> $GITHUB_ENV
-        echo "$(go env GOPATH)/bin" >> $GITHUB_PATH
-    - name: cache go mod and $GOCACHE
-      uses: actions/cache@v2
-      with:
-        path: |
-          ~/go/pkg/mod
-          ~/.cache/go-build
-        key: ${{ runner.os }}-shfmt-${{ env.VERSION }}
-        restore-keys: ${{ runner.os }}-shfmt-
-    - name: install shfmt
-      run: |
-        command -v shfmt || \
-          (cd ~ && GO111MODULE=on time go get mvdan.cc/sh/v3/cmd/shfmt@v$VERSION)
+    - uses: actions/checkout@v3
    - name: shfmt
      run: make shfmt

  shellcheck:
    runs-on: ubuntu-20.04
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - name: vars
        run: |
-          echo 'VERSION=v0.7.2' >> $GITHUB_ENV
+          echo 'VERSION=v0.8.0' >> $GITHUB_ENV
          echo 'BASEURL=https://github.com/koalaman/shellcheck/releases/download' >> $GITHUB_ENV
-          echo 'SHA256SUM=12ee2e0b90a3d1e9cae24ac9b2838be66b48573cb2c8e8f3c566b959df6f050c' >> $GITHUB_ENV
+          echo 'SHA256SUM=f4bce23c11c3919c1b20bcb0f206f6b44c44e26f2bc95f8aa708716095fa0651' >> $GITHUB_ENV
          echo ~/bin >> $GITHUB_PATH
      - name: install shellcheck
        run: |
@ -108,27 +90,21 @@ jobs:
          sha256sum ~/bin/shellcheck | grep -q $SHA256SUM
          # make sure to remove the old version
          sudo rm -f /usr/bin/shellcheck
-      - uses: lumaxis/shellcheck-problem-matchers@v1
+      - uses: lumaxis/shellcheck-problem-matchers@v2
      - name: shellcheck
        run: |
          make shellcheck
+      - name: check-config.sh
+        run : ./script/check-config.sh

  deps:
    runs-on: ubuntu-20.04
    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
    - name: install go
-      uses: actions/setup-go@v2
+      uses: actions/setup-go@v4
      with:
-        go-version: 1.x # Latest stable
-    - name: cache go mod and $GOCACHE
-      uses: actions/cache@v2
-      with:
-        path: |
-          ~/go/pkg/mod
-          ~/.cache/go-build
-        key: ${{ runner.os }}-go.sum-${{ hashFiles('**/go.sum') }}
-        restore-keys: ${{ runner.os }}-go.sum-
+        go-version: "${{ env.GO_VERSION }}"
    - name: verify deps
      run: make verify-dependencies

@ -151,12 +127,11 @@ jobs:
          pattern: '^.{0,72}(\n.*)*$'
          error: 'Subject too long (max 72)'

-
  cfmt:
    runs-on: ubuntu-20.04
    steps:
    - name: checkout
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3
      with:
        fetch-depth: 0
    - name: install deps
@ -173,9 +148,13 @@ jobs:
    runs-on: ubuntu-20.04
    steps:
    - name: checkout
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3
      with:
        fetch-depth: 0
+
+    - name: check CHANGELOG.md
+      run: make verify-changelog
+
      # We have to run this under Docker as Ubuntu (host) does not support all
      # the architectures we want to compile test against, and Dockerfile uses
      # Debian (which does).
@ -185,14 +164,12 @@ jobs:
      # under Docker will emerge, it will be good to have a separate make
      # runcimage job and share its result (the docker image) with whoever
      # needs it.
-    - uses: satackey/action-docker-layer-caching@v0.0.11
-      continue-on-error: true
    - name: build docker image
      run: make runcimage
    - name: make releaseall
      run: make releaseall
    - name: upload artifacts
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@v3
      with:
        name: release-${{ github.run_id }}
        path: release/*
--- a/.golangci-extra.yml
+++ b/.golangci-extra.yml
@ -1,5 +1,5 @@
 # This is golangci-lint config file which is used to check new code in
-# github PRs only (see lint-extra job in .github/workflows/validate.yml).
+# github PRs only (see lint-extra in .github/workflows/validate.yml).
 #
 # For the default linter config, see .golangci.yml. This config should
 # only enable additional linters not enabled in the default config.
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,21 +1,288 @@
-# Changelog/
+# Changelog
 This file documents all notable changes made to this project since runc 1.0.

 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

-## [Unreleased]
+## [Unreleased 1.1.z]
+
+## [1.1.12] - 2024-01-31
+
+> Now you're thinking with Portals™!
+
+### Security
+
+* Fix [CVE-2024-21626][cve-2024-21626], a container breakout attack that took
+  advantage of a file descriptor that was leaked internally within runc (but
+  never leaked to the container process). In addition to fixing the leak,
+  several strict hardening measures were added to ensure that future internal
+  leaks could not be used to break out in this manner again. Based on our
+  research, while no other container runtime had a similar leak, none had any
+  of the hardening steps we've introduced (and some runtimes would not check
+  for any file descriptors that a calling process may have leaked to them,
+  allowing for container breakouts due to basic user error).
+
+[cve-2024-21626]: https://github.com/opencontainers/runc/security/advisories/GHSA-xr7r-f8xq-vfvv
+
+## [1.1.11] - 2024-01-01
+
+> Happy New Year!
+
+### Fixed
+
+* Fix several issues with userns path handling. (#4122, #4124, #4134, #4144)
+
+### Changed
+
+ * Support memory.peak and memory.swap.peak in cgroups v2.
+   Add `swapOnlyUsage` in `MemoryStats`. This field reports swap-only usage.
+   For cgroupv1, `Usage` and `Failcnt` are set by subtracting memory usage
+   from memory+swap usage. For cgroupv2, `Usage`, `Limit`, and `MaxUsage`
+   are set. (#4000, #4010, #4131)
+ * build(deps): bump github.com/cyphar/filepath-securejoin. (#4140)
+
+## [1.1.10] - 2023-10-31
+
+> Śruba, przykręcona we śnie, nie zmieni sytuacji, jaka panuje na jawie.
+
+### Added
+
+* Support for `hugetlb.<pagesize>.rsvd` limiting and accounting. Fixes the
+  issue of postgres failing when hugepage limits are set. (#3859, #4077)
+
+### Fixed
+
+* Fixed permissions of newly created directories to not depend on the value
+  of umask in tmpcopyup feature implementation. (#3991, #4060)
+* libcontainer: cgroup v1 GetStats now ignores missing `kmem.limit_in_bytes`
+  (fixes the compatibility with Linux kernel 6.1+). (#4028)
+* Fix a semi-arbitrary cgroup write bug when given a malicious hugetlb
+  configuration. This issue is not a security issue because it requires a
+  malicious `config.json`, which is outside of our threat model. (#4103)
+* Various CI fixes. (#4081, #4055)
+
+## [1.1.9] - 2023-08-10
+
+> There is a crack in everything. That's how the light gets in.
+
+### Added
+
+* Added go 1.21 to the CI matrix; other CI updates. (#3976, #3958)
+
+### Fixed
+
+* Fixed losing sticky bit on tmpfs (a regression in 1.1.8). (#3952, #3961)
+* intelrdt: fixed ignoring ClosID on some systems. (#3550, #3978)
+
+### Changed
+
+ * Sum `anon` and `file` from `memory.stat` for cgroupv2 root usage,
+   as the root does not have `memory.current` for cgroupv2.
+   This aligns cgroupv2 root usage more closely with cgroupv1 reporting.
+   Additionally, report root swap usage as sum of swap and memory usage,
+   aligned with v1 and existing non-root v2 reporting. (#3933)
+
+## [1.1.8] - 2023-07-20
+
+> 海纳百川 有容乃大
+
+### Added
+
+* Support riscv64. (#3905)
+
+### Fixed
+
+* init: do not print environment variable value. (#3879)
+* libct: fix a race with systemd removal. (#3877)
+* tests/int: increase num retries for oom tests. (#3891)
+* man/runc: fixes. (#3892)
+* Fix tmpfs mode opts when dir already exists. (#3916)
+* docs/systemd: fix a broken link. (#3917)
+* ci/cirrus: enable some rootless tests on cs9. (#3918)
+* runc delete: call systemd's reset-failed. (#3932)
+* libct/cg/sd/v1: do not update non-frozen cgroup after frozen failed. (#3921)
+
+### Changed
+
+* CI: bump Fedora, Vagrant, bats. (#3878)
+* `.codespellrc`: update for 2.2.5. (#3909)
+
+## [1.1.7] - 2023-04-26
+
+> Ночевала тучка золотая на груди утеса-великана.
+
+### Fixed
+
+* When used with systemd v240+, systemd cgroup drivers no longer skip
+  `DeviceAllow` rules if the device does not exist (a regression introduced
+  in runc 1.1.3). This fix also reverts the workaround added in runc 1.1.5,
+  removing an extra warning emitted by runc run/start. (#3845, #3708, #3671)
+
+### Added
+
+* The source code now has a new file, `runc.keyring`, which contains the keys
+  used to sign runc releases. (#3838)
+
+## [1.1.6] - 2023-04-11
+
+> In this world nothing is certain but death and taxes.
+
+### Compatibility
+
+* This release can no longer be built from sources using Go 1.16. Using the
+  latest maintained Go 1.20.x or Go 1.19.x release is recommended.
+  Go 1.17 can still be used.
+
+### Fixed
+
+* systemd cgroup v1 and v2 drivers were deliberately ignoring `UnitExist` error
+  from systemd while trying to create a systemd unit, which in some scenarios
+  may result in a container not being added to the proper systemd unit and
+  cgroup. (#3780, #3806)
+* systemd cgroup v2 driver was incorrectly translating cpuset range from spec's
+  `resources.cpu.cpus` to systemd unit property (`AllowedCPUs`) in case of more
+  than 8 CPUs, resulting in the wrong AllowedCPUs setting. (#3808)
+* systemd cgroup v1 driver was prefixing container's cgroup path with the path
+  of PID 1 cgroup, resulting in inability to place PID 1 in a non-root cgroup.
+  (#3811)
+* runc run/start may return "permission denied" error when starting a rootless
+  container when the file to be executed does not have executable bit set for
+  the user, not taking the `CAP_DAC_OVERRIDE` capability into account. This is
+  a regression in runc 1.1.4, as well as in Go 1.20 and 1.20.1 (#3715, #3817)
+* cgroup v1 drivers are now aware of `misc` controller. (#3823)
+* Various CI fixes and improvements, mostly to ensure Go 1.19.x and Go 1.20.x
+  compatibility.
+
+## [1.1.5] - 2023-03-29
+
+> 囚われた屈辱は
+> 反撃の嚆矢だ
+
+### Security
+
+The following CVEs were fixed in this release:
+
+* [CVE-2023-25809][] is a vulnerability involving rootless containers where
+  (under specific configurations), the container would have write access to the
+  `/sys/fs/cgroup/user.slice/...` cgroup hierarchy. No other hierarchies on the
+  host were affected. This vulnerability was discovered by Akihiro Suda.
+
+* [CVE-2023-27561][] was a regression in our protections against tricky `/proc`
+  and `/sys` configurations (where the container mountpoint is a symlink)
+  causing us to be tricked into incorrectly configuring the container, which
+  effectively re-introduced [CVE-2019-19921][]. This regression was present
+  from v1.0.0-rc95 to v1.1.4 and was discovered by @Beuc. (#3785)
+
+* [CVE-2023-28642][] is a different attack vector using the same regression
+  as in [CVE-2023-27561][]. This was reported by Lei Wang.
+
+[CVE-2019-19921]: https://github.com/advisories/GHSA-fh74-hm69-rqjw
+[CVE-2023-25809]: https://github.com/opencontainers/runc/security/advisories/GHSA-m8cg-xc2p-r3fc
+[CVE-2023-27561]: https://github.com/advisories/GHSA-vpvm-3wq2-2wvm
+[CVE-2023-28642]: https://github.com/opencontainers/runc/security/advisories/GHSA-g2j6-57v7-gm8c
+
+### Fixed
+
+* Fix the inability to use `/dev/null` when inside a container. (#3620)
+* Fix changing the ownership of host's `/dev/null` caused by fd redirection
+  (a regression in 1.1.1). (#3674, #3731)
+* Fix rare runc exec/enter unshare error on older kernels, including
+  CentOS < 7.7. (#3776)
+* nsexec: Check for errors in `write_log()`. (#3721)
+* Various CI fixes and updates. (#3618, #3630, #3640, #3729)
+
+## [1.1.4] - 2022-08-24
+
+> If you look for perfection, you'll never be content.
+
+### Fixed
+
+* Fix mounting via wrong proc fd.
+  When the user and mount namespaces are used, and the bind mount is followed by
+  the cgroup mount in the spec, the cgroup was mounted using the bind mount's
+  mount fd. (#3511)
+* Switch `kill()` in `libcontainer/nsenter` to `sane_kill()`. (#3536)
+* Fix "permission denied" error from `runc run` on `noexec` fs. (#3541)
+* Fix failed exec after `systemctl daemon-reload`.
+  Due to a regression in v1.1.3, the `DeviceAllow=char-pts rwm` rule was no
+  longer added and was causing an error `open /dev/pts/0: operation not permitted: unknown`
+  when systemd was reloaded. (#3554)
+* Various CI fixes. (#3538, #3558, #3562)
+
+## [1.1.3] - 2022-06-09
+
+> In the beginning there was nothing, which exploded.
+
+### Fixed
+ * Our seccomp `-ENOSYS` stub now correctly handles multiplexed syscalls on
+   s390 and s390x. This solves the issue where syscalls the host kernel did not
+   support would return `-EPERM` despite the existence of the `-ENOSYS` stub
+   code (this was due to how s390x does syscall multiplexing). (#3478)
+ * Retry on dbus disconnect logic in libcontainer/cgroups/systemd now works as
+   intended; this fix does not affect runc binary itself but is important for
+   libcontainer users such as Kubernetes. (#3476)
+ * Inability to compile with recent clang due to an issue with duplicate
+   constants in libseccomp-golang. (#3477)
+ * When using systemd cgroup driver, skip adding device paths that don't exist,
+   to stop systemd from emitting warnings about those paths. (#3504)
+ * Socket activation was failing when more than 3 sockets were used. (#3494)
+ * Various CI fixes. (#3472, #3479)
+
+### Added
+ * Allow bind-mounting /proc/sys/kernel/ns_last_pid inside a container. (#3493)
+
+### Changed
+ * runc static binaries are now linked against libseccomp v2.5.4. (#3481)
+
+
+## [1.1.2] - 2022-05-11
+
+> I should think I'm going to be a perpetual student.
+
+### Security
+ * A bug was found in runc where runc exec --cap executed processes with
+   non-empty inheritable Linux process capabilities, creating an atypical Linux
+   environment. For more information, see [GHSA-f3fp-gc8g-vw66][] and
+   CVE-2022-29162.
+
+### Changed
+ * `runc spec` no longer sets any inheritable capabilities in the created
+   example OCI spec (`config.json`) file.
+
+[GHSA-f3fp-gc8g-vw66]: https://github.com/opencontainers/runc/security/advisories/GHSA-f3fp-gc8g-vw66
+
+
+## [1.1.1] - 2022-03-28
+
+> Violence is the last refuge of the incompetent.
+
+### Added
+ * CI is now also run on centos-stream-9. (#3436)
+
+### Fixed
+ * `runc run/start` can now run a container with read-only `/dev` in OCI spec,
+   rather than error out. (#3355)
+ * `runc exec` now ensures that `--cgroup` argument is a sub-cgroup. (#3403)
+ * libcontainer systemd v2 manager no longer errors out if one of the files
+   listed in `/sys/kernel/cgroup/delegate` does not exist in container's cgroup.
+   (#3387, #3404)
+ * Loosen OCI spec validation to avoid bogus "Intel RDT is not supported" error.
+   (#3406)
+ * libcontainer/cgroups no longer panics in cgroup v1 managers if `stat`
+   of `/sys/fs/cgroup/unified` returns an error other than ENOENT. (#3435)
+

 ## [1.1.0] - 2022-01-14

 > A plan depends as much upon execution as it does upon concept.

-## Changed
+### Changed
 * libcontainer will now refuse to build without the nsenter package being
   correctly compiled (specifically this requires CGO to be enabled). This
   should avoid folks accidentally creating broken runc binaries (and
   incorrectly importing our internal libraries into their projects). (#3331)

+
 ## [1.1.0-rc.1] - 2021-12-14

 > He who controls the spice controls the universe.
@ -41,7 +308,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   binary etc.) and failures of the command being executed. (#3073)
 * runc run: new `--keep` option to skip removal exited containers artefacts.
   This might be useful to check the state (e.g. of cgroup controllers) after
-   the container hasexited. (#2817, #2825)
+   the container has exited. (#2817, #2825)
 * seccomp: add support for `SCMP_ACT_KILL_PROCESS` and `SCMP_ACT_KILL_THREAD`
   (the latter is just an alias for `SCMP_ACT_KILL`). (#3204)
 * seccomp: add support for `SCMP_ACT_NOTIFY` (seccomp actions). This allows
@ -130,13 +397,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Fixed
 * Fixed inability to start a container with read-write bind mount of a
   read-only fuse host mount. (#3283, #3292)
- * Fixed inability to start when read-only /dev in set in spec (#3276, #3277)
+ * Fixed inability to start when read-only /dev is set in spec. (#3276, #3277)
 * Fixed not removing sub-cgroups upon container delete, when rootless cgroup v2
   is used with older systemd. (#3226, #3297)
 * Fixed returning error from GetStats when hugetlb is unsupported (which causes
   excessive logging for Kubernetes). (#3233, #3295)
 * Improved an error message when dbus-user-session is not installed and
-   rootless + cgroup2 + systemd are used (#3212)
+   rootless + cgroup2 + systemd are used. (#3212)

 [GHSA-v95c-p5hm-xq8f]: https://github.com/opencontainers/runc/security/advisories/GHSA-v95c-p5hm-xq8f

@ -216,7 +483,7 @@ implementation (libcontainer) is *not* covered by this policy.
   code, optimize the method for checking whether a cgroup is frozen. (#2955)
 * cgroups/systemd: fixed "retry on dbus disconnect" logic introduced in rc94
 * cgroups/systemd: fixed returning "unit already exists" error from a systemd
-   cgroup manager (regression in rc94) (#2997, #2996)
+   cgroup manager (regression in rc94). (#2997, #2996)

 ### Added
 * cgroupv2: support SkipDevices with systemd driver. (#2958, #3019)
@ -225,7 +492,7 @@ implementation (libcontainer) is *not* covered by this policy.
   (#3022)

 ### Changed
- * cgroup/systemd: return, not ignore, stop unit error from Destroy (#2946)
+ * cgroup/systemd: return, not ignore, stop unit error from Destroy. (#2946)
 * Fix all golangci-lint failures. (#2781, #2962)
 * Make `runc --version` output sane even when built with `go get` or
   otherwise outside of our build scripts. (#2962)
@ -244,5 +511,17 @@ implementation (libcontainer) is *not* covered by this policy.
 [1.0.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.0.1

 <!-- 1.1.z patch releases -->
-[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.0...release-1.1
+[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.12...release-1.1
+[1.1.12]: https://github.com/opencontainers/runc/compare/v1.1.11...v1.1.12
+[1.1.11]: https://github.com/opencontainers/runc/compare/v1.1.10...v1.1.11
+[1.1.10]: https://github.com/opencontainers/runc/compare/v1.1.9...v1.1.10
+[1.1.9]: https://github.com/opencontainers/runc/compare/v1.1.8...v1.1.9
+[1.1.8]: https://github.com/opencontainers/runc/compare/v1.1.7...v1.1.8
+[1.1.7]: https://github.com/opencontainers/runc/compare/v1.1.6...v1.1.7
+[1.1.6]: https://github.com/opencontainers/runc/compare/v1.1.5...v1.1.6
+[1.1.5]: https://github.com/opencontainers/runc/compare/v1.1.4...v1.1.5
+[1.1.4]: https://github.com/opencontainers/runc/compare/v1.1.3...v1.1.4
+[1.1.3]: https://github.com/opencontainers/runc/compare/v1.1.2...v1.1.3
+[1.1.2]: https://github.com/opencontainers/runc/compare/v1.1.1...v1.1.2
+[1.1.1]: https://github.com/opencontainers/runc/compare/v1.1.0...v1.1.1
 [1.1.0-rc.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.1.0-rc.1
--- a/32
+++ b/32
@ -1,6 +1,6 @@
-ARG GO_VERSION=1.17
-ARG BATS_VERSION=v1.3.0
-ARG LIBSECCOMP_VERSION=2.5.3
+ARG GO_VERSION=1.20
+ARG BATS_VERSION=v1.9.0
+ARG LIBSECCOMP_VERSION=2.5.4

 FROM golang:${GO_VERSION}-bullseye
 ARG DEBIAN_FRONTEND=noninteractive
@ -9,19 +9,16 @@ ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debi
 RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
    wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \
    && echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \
-    && dpkg --add-architecture armel \
-    && dpkg --add-architecture armhf \
-    && dpkg --add-architecture arm64 \
-    && dpkg --add-architecture ppc64el \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        criu \
-        crossbuild-essential-arm64 \
-        crossbuild-essential-armel \
-        crossbuild-essential-armhf \
-        crossbuild-essential-ppc64el \
-        crossbuild-essential-s390x \
+        gcc-aarch64-linux-gnu libc-dev-arm64-cross \
+        gcc-arm-linux-gnueabi libc-dev-armel-cross \
+        gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
+        gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
+        gcc-s390x-linux-gnu libc-dev-s390x-cross \
+        gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
        curl \
        gawk \
        gcc \
@ -54,11 +51,18 @@ RUN cd /tmp \

 # install libseccomp
 ARG LIBSECCOMP_VERSION
-COPY script/* /tmp/script/
+COPY script/seccomp.sh script/lib.sh /tmp/script/
 RUN mkdir -p /opt/libseccomp \
-    && /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le s390x
+    && /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le riscv64 s390x
 ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION
 ENV LD_LIBRARY_PATH=/opt/libseccomp/lib
 ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig

+# Prevent the "fatal: detected dubious ownership in repository" git complaint during build.
+RUN git config --global --add safe.directory /go/src/github.com/opencontainers/runc
+
 WORKDIR /go/src/github.com/opencontainers/runc
+
+# Fixup for cgroup v2.
+COPY script/prepare-cgroup-v2.sh /
+ENTRYPOINT [ "/prepare-cgroup-v2.sh" ]
--- a/75
+++ b/75
@ -10,23 +10,51 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
 RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
 PROJECT := github.com/opencontainers/runc
 BUILDTAGS ?= seccomp
+
 COMMIT ?= $(shell git describe --dirty --long --always)
 VERSION := $(shell cat ./VERSION)
+LDFLAGS_COMMON := -X main.gitCommit=$(COMMIT) -X main.version=$(VERSION)

-ifeq ($(shell $(GO) env GOOS),linux)
-	ifeq (,$(filter $(shell $(GO) env GOARCH),mips mipsle mips64 mips64le ppc64))
-		ifeq (,$(findstring -race,$(EXTRA_FLAGS)))
-			GO_BUILDMODE := "-buildmode=pie"
-		endif
+GOARCH := $(shell $(GO) env GOARCH)
+
+GO_BUILDMODE :=
+# Enable dynamic PIE executables on supported platforms.
+ifneq (,$(filter $(GOARCH),386 amd64 arm arm64 ppc64le riscv64 s390x))
+	ifeq (,$(findstring -race,$(EXTRA_FLAGS)))
+		GO_BUILDMODE := "-buildmode=pie"
 	endif
 endif
-GO_BUILD := $(GO) build -trimpath $(GO_BUILDMODE) $(EXTRA_FLAGS) -tags "$(BUILDTAGS)" \
-	-ldflags "-X main.gitCommit=$(COMMIT) -X main.version=$(VERSION) $(EXTRA_LDFLAGS)"
-GO_BUILD_STATIC := CGO_ENABLED=1 $(GO) build -trimpath $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" \
-	-ldflags "-extldflags -static -X main.gitCommit=$(COMMIT) -X main.version=$(VERSION) $(EXTRA_LDFLAGS)"
+GO_BUILD := $(GO) build -trimpath $(GO_BUILDMODE) \
+	$(EXTRA_FLAGS) -tags "$(BUILDTAGS)" \
+	-ldflags "$(LDFLAGS_COMMON) $(EXTRA_LDFLAGS)"
+
+GO_BUILDMODE_STATIC :=
+LDFLAGS_STATIC := -extldflags -static
+# Enable static PIE executables on supported platforms.
+# This (among other things) requires libc support (rcrt1.o), which seems
+# to be available only for arm64 and amd64 (Debian Bullseye).
+ifneq (,$(filter $(GOARCH),arm64 amd64))
+	ifeq (,$(findstring -race,$(EXTRA_FLAGS)))
+		GO_BUILDMODE_STATIC := -buildmode=pie
+		LDFLAGS_STATIC := -linkmode external -extldflags --static-pie
+	endif
+endif
+# Build command for static binaries (static PIE where enabled above).
+GO_BUILD_STATIC := $(GO) build -trimpath $(GO_BUILDMODE_STATIC) \
+	$(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" \
+	-ldflags "$(LDFLAGS_COMMON) $(LDFLAGS_STATIC) $(EXTRA_LDFLAGS)"

 GPG_KEYID ?= asarai@suse.de

+# Some targets need cgo, which is disabled by default when cross compiling.
+# Enable cgo explicitly for those.
+# Both runc and libcontainer/integration need libcontainer/nsenter.
+runc static localunittest: export CGO_ENABLED=1
+# seccompagent needs libseccomp (when seccomp build tag is set).
+ifneq (,$(filter $(BUILDTAGS),seccomp))
+seccompagent: export CGO_ENABLED=1
+endif
+
 .DEFAULT: runc

 runc:
@ -40,7 +68,7 @@ recvtty sd-helper seccompagent:
 static:
 	$(GO_BUILD_STATIC) -o runc .

-releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a s390x"
+releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
 releaseall: release

 release: runcimage
@ -50,7 +78,7 @@ release: runcimage
 		$(RUNC_IMAGE) make localrelease
 	script/release_sign.sh -S $(GPG_KEYID) -r release/$(VERSION) -v $(VERSION)

-localrelease:
+localrelease: verify-changelog
 	script/release_build.sh -r release/$(VERSION) -v $(VERSION) $(RELEASE_ARGS)

 dbuild: runcimage
@ -133,26 +161,39 @@ cfmt:
 shellcheck:
 	shellcheck tests/integration/*.bats tests/integration/*.sh \
 		tests/integration/*.bash tests/*.sh \
-		script/release_*.sh script/seccomp.sh script/lib.sh
-	# TODO: add shellcheck for more sh files
+		man/*.sh script/*
+	# TODO: add shellcheck for more sh files (contrib/completions/bash/runc).

 shfmt:
-	shfmt -ln bats -d -w tests/integration/*.bats
-	shfmt -ln bash -d -w man/*.sh script/* tests/*.sh tests/integration/*.bash
+	$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
+		--rm -v $(CURDIR):/src -w /src \
+		mvdan/shfmt:v3.5.1 -d -w .
+
+localshfmt:
+	shfmt -d -w .

 vendor:
 	$(GO) mod tidy
 	$(GO) mod vendor
 	$(GO) mod verify

+verify-changelog:
+	# No space at EOL.
+	! grep -n '\s$$' CHANGELOG.md
+	# Period before issue/PR references.
+	! grep -n '[0-9a-zA-Z][^.] (#[1-9][0-9, #]*)$$' CHANGELOG.md
+
 verify-dependencies: vendor
 	@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
 		|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
 		&& echo "all vendor files are up to date."

+validate-keyring:
+	script/keyring_validate.sh
+
 .PHONY: runc all recvtty sd-helper seccompagent static releaseall release \
 	localrelease dbuild lint man runcimage \
 	test localtest unittest localunittest integration localintegration \
 	rootlessintegration localrootlessintegration shell install install-bash \
-	install-man clean cfmt shfmt shellcheck \
-	vendor verify-dependencies
+	install-man clean cfmt shfmt localshfmt shellcheck \
+	vendor verify-changelog verify-dependencies validate-keyring
--- a/README.md
+++ b/README.md
@ -1,10 +1,11 @@
 # runc

 [![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
-[![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
+[![Go Reference](https://pkg.go.dev/badge/github.com/opencontainers/runc.svg)](https://pkg.go.dev/github.com/opencontainers/runc)
 [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588)
 [![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate)
 [![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci)
+[![CirrusCI](https://api.cirrus-ci.com/github/opencontainers/runc.svg)](https://cirrus-ci.com/github/opencontainers/runc)

 ## Introduction

@ -14,6 +15,8 @@

 You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.

+All releases are signed by one of the keys listed in the [`runc.keyring` file in the root of this repository](runc.keyring).
+
 ## Security

 The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
@ -23,7 +26,7 @@ A third party security audit was performed by Cure53, you can see the full repor

 ## Building

-`runc` only supports Linux. It must be built with Go version 1.16 or higher.
+`runc` only supports Linux. It must be built with Go version 1.17 or higher.

 In order to enable seccomp support you will need to install `libseccomp` on your platform.
 > e.g. `libseccomp-devel` for CentOS, or `libseccomp-dev` for Ubuntu
--- a/2
+++ b/2
@ -1 +1 @@
-1.1.0
+1.1.12
--- a/Vagrantfile.fedora
+++ b/Vagrantfile.fedora
@ -3,7 +3,7 @@

 Vagrant.configure("2") do |config|
 # Fedora box is used for testing cgroup v2 support
-  config.vm.box = "fedora/35-cloud-base"
+  config.vm.box = "fedora/38-cloud-base"
  config.vm.provider :virtualbox do |v|
    v.memory = 2048
    v.cpus = 2
@ -29,6 +29,9 @@ EOF
    done
    dnf clean all

+    # Prevent the "fatal: unsafe repository" git complaint during build.
+    git config --global --add safe.directory /vagrant
+
    # Add a user for rootless tests
    useradd -u2000 -m -d/home/rootless -s/bin/bash rootless

--- a/contrib/cmd/seccompagent/gen-seccomp-example-cfg.sh
+++ b/contrib/cmd/seccompagent/gen-seccomp-example-cfg.sh
@ -12,7 +12,7 @@ fi
 # exits when not running inside bats. We can do hacks, but just to redefine
 # update_config() seems clearer. We don't even really need to keep them in sync.
 function update_config() {
-        jq "$1" "./config.json" | awk 'BEGIN{RS="";getline<"-";print>ARGV[1]}' "./config.json"
+	jq "$1" "./config.json" | awk 'BEGIN{RS="";getline<"-";print>ARGV[1]}' "./config.json"
 }

 update_config '.linux.seccomp = {
--- a/debian/changelog
+++ b/debian/changelog
@ -1,29 +0,0 @@
-runc (1.1.0-ok3) yangtze; urgency=medium
-
-  * CVE-2022-29162 安全修复
-
- -- chenxinquan <chenxinquan@kylinos.cn>  Fri, 28 Jul 2023 16:16:46 +0800
-
-runc (1.1.0-ok2) yangtze; urgency=medium
-
-  * yangfs215 CVE-2022-29162 runc: do not set inheritable capabilities
-
- -- yangfengsheng <yangfs@whu.edu.cn>  Tue, 18 Jul 2023 00:10:28 +0800
-
-runc (1.1.0-ok1) yangtze; urgency=medium
-
-  * Merge new upstream version 1.1.0 
-
- -- Luoyaoming <luoyaoming@kylinos.cn>  Fri, 30 Dec 2022 11:11:29 +0800
-
-runc (1.0.0~rc10-ok2) yangtze; urgency=medium
-
-  * Update version.
-
- -- zhouganqing <zhouganqing@kylinos.cn>  Thu, 28 Jul 2022 16:49:00 +0800
-
-runc (1.0.0~rc10-ok1) yangtze; urgency=medium
-
-  * Build for openKylin.
-
- -- openKylinBot <openKylinBot@openkylin.com>  Mon, 25 Apr 2022 22:03:04 +0800
--- a/debian/clean
+++ b/debian/clean
@ -1,17 +0,0 @@
-## Remove generated man pages:
-man/man8/*
-
-## Drop hanging test (introduced in 0.0.9).
-## https://github.com/opencontainers/runc/issues/692
-libcontainer/nsenter/nsenter_test.go
-
-## Failing tests:
-
-## Privileged tests:
-### couldn't get cgroup root: mountpoint for cgroup not found
-libcontainer/cgroups/fs/apply_raw_test.go
-
-### FAIL: TestXattr (0.00s)
-###     xattr_test.go:26: Success
-###     xattr_test.go:30: failed
-libcontainer/xattr/xattr_test.go
--- a/debian/compat
+++ b/debian/compat
@ -1 +0,0 @@
-10
--- a/debian/control
+++ b/debian/control
@ -1,43 +0,0 @@
-Source: runc
-Section: devel
-Priority: optional
-Maintainer: Openkylin Developers <packaging@lists.openkylin.top>
-XSBC-Original-Maintainer: Debian Go Packaging Team <pkg-go-maintainers@lists.alioth.debian.org>
-Uploaders: Alexandre Viau <aviau@debian.org>,
-           Dmitry Smirnov <onlyjob@debian.org>,
-           Tim Potter <tpot@hpe.com>
-Build-Depends: debhelper (>= 11~),
-               dh-golang,
-               go-md2man,
-               golang-any,
-               libapparmor-dev,
-               libseccomp-dev,
-               pkg-config,
-               protobuf-compiler
-Standards-Version: 4.1.4
-Homepage: https://github.com/opencontainers/runc
-Vcs-Git: https://salsa.debian.org/go-team/packages/runc.git
-Vcs-Browser: https://salsa.debian.org/go-team/packages/runc
-XS-Go-Import-Path: github.com/opencontainers/runc
-
-Package: runc
-Architecture: any
-Depends: ${misc:Depends}, ${shlibs:Depends}
-Breaks: docker.io (<= 1.13.1~ds1-0)
-Built-Using: ${misc:Built-Using}
-Description: Open Container Project - runtime
- "runc" is a command line client for running applications packaged according
- to the Open Container Format (OCF) and is a compliant implementation of
- the Open Container Project specification.
-
-Package: golang-github-opencontainers-runc-dev
-Architecture: all
-Depends: ${misc:Depends}
-Description: Open Container Project - development files
- "runc" is a command line client for running applications packaged according
- to the Open Container Format (OCF) and is a compliant implementation of
- the Open Container Project specification.
- .
- This package provides development files formerly known as
- "github.com/docker/libcontainer".
-
--- a/debian/copyright
+++ b/debian/copyright
@ -1,82 +0,0 @@
-Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: runc
-Source: https://github.com/opencontainers/runc
-
-Files: *
-Copyright: 2012-2015 Docker, Inc.
-License: Apache-2.0
-
-Files:
-    vendor/github.com/cyphar/filepath-securejoin/*
-Copyright:
-    2014-2015 Docker Inc & Go Authors. All rights reserved.
-    2017      SUSE LLC. All rights reserved.
-License: BSD-3-Clause~Google
-
-Files: debian/*
-Copyright:
-    2015      Alexandre Viau <alexandre@alexandreviau.net>
-    2015-2016 Dmitry Smirnov <onlyjob@debian.org>
-License: GPL-3+
-
-Files: debian/patches/*
-Copyright: 2015 Dmitry Smirnov <onlyjob@debian.org>
-License: GPL-3+ or Apache-2.0
-Comment: patches can be licensed under the same terms as upstream.
-
-License: Apache-2.0
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- .
- http://www.apache.org/licenses/LICENSE-2.0
- .
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- .
- The complete text of the Apache version 2.0 license
- can be found in "/usr/share/common-licenses/Apache-2.0".
-
-License: GPL-3+
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- ․
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- GNU General Public License for more details.
- ․
- The complete text of the GNU General Public License version 3
- can be found in "/usr/share/common-licenses/GPL-3".
-
-License: BSD-3-Clause~Google
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are
- met:
- .
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the following disclaimer.
-   * Redistributions in binary form must reproduce the above
-     copyright notice, this list of conditions and the following disclaimer
-     in the documentation and/or other materials provided with the
-     distribution.
-   * Neither the name of Google Inc. nor the names of its
-     contributors may be used to endorse or promote products derived from
-     this software without specific prior written permission.
- .
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/debian/gitlab-ci.yml
+++ b/debian/gitlab-ci.yml
@ -1,28 +0,0 @@
-
-# auto-generated, DO NOT MODIFY.
-# The authoritative copy of this file lives at:
-# https://salsa.debian.org/go-team/ci/blob/master/cmd/ci/gitlabciyml.go
-
-# TODO: publish under debian-go-team/ci
-image: stapelberg/ci2
-
-test_the_archive:
-  artifacts:
-    paths:
-    - before-applying-commit.json
-    - after-applying-commit.json
-  script:
-    # Create an overlay to discard writes to /srv/gopath/src after the build:
-    - "rm -rf /cache/overlay/{upper,work}"
-    - "mkdir -p /cache/overlay/{upper,work}"
-    - "mount -t overlay overlay -o lowerdir=/srv/gopath/src,upperdir=/cache/overlay/upper,workdir=/cache/overlay/work /srv/gopath/src"
-    - "export GOPATH=/srv/gopath"
-    - "export GOCACHE=/cache/go"
-    # Build the world as-is:
-    - "ci-build -exemptions=/var/lib/ci-build/exemptions.json > before-applying-commit.json"
-    # Copy this package into the overlay:
-    - "GBP_CONF_FILES=:debian/gbp.conf gbp buildpackage --git-no-pristine-tar --git-ignore-branch --git-ignore-new --git-export-dir=/tmp/export --git-no-overlay --git-tarball-dir=/nonexistant --git-cleaner=/bin/true --git-builder='dpkg-buildpackage -S -d --no-sign'"
-    - "pgt-gopath -dsc /tmp/export/*.dsc"
-    # Rebuild the world:
-    - "ci-build -exemptions=/var/lib/ci-build/exemptions.json > after-applying-commit.json"
-    - "ci-diff before-applying-commit.json after-applying-commit.json"
--- a/debian/golang-github-opencontainers-runc-dev.install
+++ b/debian/golang-github-opencontainers-runc-dev.install
@ -1 +0,0 @@
-usr/share/gocode/src
--- a/debian/patches/test--fix_TestGetAdditionalGroups.patch
+++ b/debian/patches/test--fix_TestGetAdditionalGroups.patch
@ -1,39 +0,0 @@
-From: Dmitry Smirnov <onlyjob@debian.org>
-Date: Thu, 28 Jul 2022 16:28:22 +0800
-Subject: fix FTBFS on i686
-
-src/github.com/opencontainers/runc/libcontainer/user/user_test.go:448:36: constant 2147483648 overflows int
-Last-Update: 2018-06-16
-Forwarded: https://github.com/opencontainers/runc/pull/1821
-Bug-Upstream: https://github.com/opencontainers/runc/issues/941
---
- libcontainer/user/user.go      | 2 +-
- libcontainer/user/user_test.go | 2 +-
- 2 files changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/libcontainer/user/user.go b/libcontainer/user/user.go
-index 7b912bb..38caded 100644
--- a/libcontainer/user/user.go
-+++ b/libcontainer/user/user.go
-@@ -473,7 +473,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
- 				return nil, fmt.Errorf("Unable to find group %s", ag)
- 			}
- 			// Ensure gid is inside gid range.
-			if gid < minId || gid > maxId {
-+			if gid < minId || gid >= maxId {
- 				return nil, ErrRange
- 			}
- 			gidMap[gid] = struct{}{}
-diff --git a/libcontainer/user/user_test.go b/libcontainer/user/user_test.go
-index 24ee559..a4aabdc 100644
--- a/libcontainer/user/user_test.go
-+++ b/libcontainer/user/user_test.go
-@@ -445,7 +445,7 @@ this is just some garbage data
- 	if utils.GetIntSize() > 4 {
- 		tests = append(tests, foo{
- 			// groups with too large id
-			groups:   []string{strconv.Itoa(1 << 31)},
-+			groups:   []string{strconv.Itoa( 1<<31 -1 )},
- 			expected: nil,
- 			hasError: true,
- 		})
--- a/debian/patches/test--skip-Hugetlb.patch
+++ b/debian/patches/test--skip-Hugetlb.patch
@ -1,48 +0,0 @@
-From: Dmitry Smirnov <onlyjob@debian.org>
-Date: Thu, 28 Jul 2022 16:28:22 +0800
-Subject: disabled unreliable tests due to random failures on [ppc64el,
- s390x].
-
-Last-Update: 2018-09-27
-Forwarded: not-needed
-Bug-Upstream: https://github.com/opencontainers/runc/issues/1822
---
- libcontainer/cgroups/fs/hugetlb_test.go | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/libcontainer/cgroups/fs/hugetlb_test.go b/libcontainer/cgroups/fs/hugetlb_test.go
-index 9ddacfe..9b60650 100644
--- a/libcontainer/cgroups/fs/hugetlb_test.go
-+++ b/libcontainer/cgroups/fs/hugetlb_test.go
-@@ -89,6 +89,7 @@ func TestHugetlbStats(t *testing.T) {
- }
- 
- func TestHugetlbStatsNoUsageFile(t *testing.T) {
-+t.Skip("Disabled unreliable test")
- 	helper := NewCgroupTestUtil("hugetlb", t)
- 	defer helper.cleanup()
- 	helper.writeFileContents(map[string]string{
-@@ -104,6 +105,7 @@ func TestHugetlbStatsNoUsageFile(t *testing.T) {
- }
- 
- func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
-+t.Skip("Disabled unreliable test")
- 	helper := NewCgroupTestUtil("hugetlb", t)
- 	defer helper.cleanup()
- 	for _, pageSize := range HugePageSizes {
-@@ -121,6 +123,7 @@ func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
- }
- 
- func TestHugetlbStatsBadUsageFile(t *testing.T) {
-+t.Skip("Disabled unreliable test")
- 	helper := NewCgroupTestUtil("hugetlb", t)
- 	defer helper.cleanup()
- 	for _, pageSize := range HugePageSizes {
-@@ -139,6 +142,7 @@ func TestHugetlbStatsBadUsageFile(t *testing.T) {
- }
- 
- func TestHugetlbStatsBadMaxUsageFile(t *testing.T) {
-+t.Skip("Disabled unreliable test")
- 	helper := NewCgroupTestUtil("hugetlb", t)
- 	defer helper.cleanup()
- 	helper.writeFileContents(map[string]string{
--- a/debian/patches/test--skip_TestFactoryNewTmpfs.patch
+++ b/debian/patches/test--skip_TestFactoryNewTmpfs.patch
@ -1,22 +0,0 @@
-From: Dmitry Smirnov <onlyjob@debian.org>
-Date: Thu, 28 Jul 2022 16:28:22 +0800
-Subject: disable test (requires root)
-
-Last-Update: 2018-06-15
-Forwarded: not-needed
---
- libcontainer/factory_linux_test.go | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/libcontainer/factory_linux_test.go b/libcontainer/factory_linux_test.go
-index 8d0ca8a..1dc0180 100644
--- a/libcontainer/factory_linux_test.go
-+++ b/libcontainer/factory_linux_test.go
-@@ -78,6 +78,7 @@ func TestFactoryNewIntelRdt(t *testing.T) {
- }
- 
- func TestFactoryNewTmpfs(t *testing.T) {
-+t.Skip("DM - skipping privileged test")
- 	root, rerr := newTestRoot()
- 	if rerr != nil {
- 		t.Fatal(rerr)
--- a/debian/rules
+++ b/debian/rules
@ -1,26 +0,0 @@
-#!/usr/bin/make -f
-
-# Uncomment this to turn on verbose mode.
-#export DH_VERBOSE=1
-
-export DH_GOPKG := github.com/opencontainers/runc
-export DH_GOLANG_INSTALL_EXTRA := libcontainer/seccomp/fixtures libcontainer/criurpc
-TAGS=apparmor seccomp selinux ambient
-
-%:
-	dh $@ --buildsystem=golang --with=golang --builddirectory=_build
-
-override_dh_auto_configure:
-	cd man && ./md2man-all.sh
-	dh_auto_configure
-	## Remove extra license files:
-	$(RM) -v \
-            _build/src/$(DH_GOPKG)/vendor/github.com/docker/docker/*/*/LICENSE* \
-        ;
-
-override_dh_auto_build:
-	dh_auto_build -- -tags "$(TAGS)"
-
-override_dh_auto_test:
-	DH_GOLANG_EXCLUDES="libcontainer/integration" \
-        dh_auto_test -- -tags "$(TAGS)"
--- a/debian/runc.docs
+++ b/debian/runc.docs
@ -1,2 +0,0 @@
-NOTICE
-README*
--- a/debian/runc.install
+++ b/debian/runc.install
@ -1 +0,0 @@
-usr/bin/*    /usr/sbin/
--- a/debian/runc.lintian-overrides
+++ b/debian/runc.lintian-overrides
@ -1 +0,0 @@
-runc: spelling-error-in-binary
--- a/debian/runc.manpages
+++ b/debian/runc.manpages
@ -1 +0,0 @@
-man/man8/*.8
--- a/debian/source/format
+++ b/debian/source/format
@ -1 +0,0 @@
-3.0 (native)
--- a/debian/source/lintian-overrides
+++ b/debian/source/lintian-overrides
@ -1,2 +0,0 @@
-# Result of Files-Excluded:
-source-contains-empty-directory vendor/*
--- a/debian/tests/basic-smoke
+++ b/debian/tests/basic-smoke
@ -1,34 +0,0 @@
-#!/bin/bash
-set -Eeuo pipefail
-set -x
-
-runc --version
-
-tempDir="$(mktemp -d)"
-trap 'rm -rf "$tempDir"' EXIT
-
-# build up rootfs with busybox
-busybox="$(which busybox)" # from busybox-static
-mkdir "$tempDir/rootfs"
-cp -a "$busybox" "$tempDir/rootfs/"
-
-# rough "rootfs" smoke test (makes sure "busybox" is actually static)
-chroot "$tempDir/rootfs" /busybox true
-
-# make a config.json file for our "bundle"
-runc spec --bundle "$tempDir"
-
-# edit the default command to something we can actually run with our rootfs
-grep '"sh"' "$tempDir/config.json"
-sed -i 's@"sh"@"/busybox","echo","success"@g' "$tempDir/config.json"
-grep '"/busybox","echo","success"' "$tempDir/config.json"
-# and disable the TTY
-grep '"terminal": true,' "$tempDir/config.json"
-sed -i 's/"terminal": true,/"terminal": false,/g' "$tempDir/config.json"
-grep '"terminal": false,' "$tempDir/config.json"
-
-# run it and capture the output
-output="$(runc run --bundle "$tempDir" "test-$$-$RANDOM")"
-
-# ensure the output was exactly what we expected
-[ "$output" = 'success' ]
--- a/debian/tests/control
+++ b/debian/tests/control
@ -1,7 +0,0 @@
-Tests: basic-smoke
-Depends: busybox-static, @
-Restrictions: allow-stderr, isolation-machine, needs-root
-
-Test-Command: /usr/bin/dh_golang_autopkgtest
-Depends: @, @builddeps@, dh-golang
-Restrictions: allow-stderr, isolation-machine
--- a/debian/watch
+++ b/debian/watch
@ -1,9 +0,0 @@
-version=3
-
-opts=\
-repack,\
-repacksuffix=+dfsg1,\
-uversionmangle=s/-rc/~rc/,\
-dversionmangle=s/[~+]dfsg\d*$// \
- https://github.com/opencontainers/runc/releases \
- .*archive/v?(\d\.\d\.\d.*)\.tar\.gz
--- a/docs/systemd.md
+++ b/docs/systemd.md
@ -123,8 +123,8 @@ The above will set the following properties:
 * `TimeoutStopSec` to 2 minutes and 3 seconds;
 * `CollectMode` to "inactive-or-failed".

-The values must be in the gvariant format (for details, see
-[gvariant documentation](https://developer.gnome.org/glib/stable/gvariant-text.html)).
+The values must be in the gvariant text format, as described in
+[gvariant documentation](https://docs.gtk.org/glib/gvariant-text.html).

 To find out which type systemd expects for a particular parameter, please
 consult systemd sources.
--- a/go.mod
+++ b/go.mod
@ -1,26 +1,33 @@
 module github.com/opencontainers/runc

-go 1.16
+go 1.17

 require (
 	github.com/checkpoint-restore/go-criu/v5 v5.3.0
 	github.com/cilium/ebpf v0.7.0
 	github.com/containerd/console v1.0.3
 	github.com/coreos/go-systemd/v22 v22.3.2
-	github.com/cyphar/filepath-securejoin v0.2.3
+	github.com/cyphar/filepath-securejoin v0.2.4
 	github.com/docker/go-units v0.4.0
 	github.com/godbus/dbus/v5 v5.0.6
 	github.com/moby/sys/mountinfo v0.5.0
-	github.com/mrunalp/fileutils v0.5.0
+	github.com/mrunalp/fileutils v0.5.1
 	github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
 	github.com/opencontainers/selinux v1.10.0
-	github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921
+	github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646
 	github.com/sirupsen/logrus v1.8.1
 	github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
 	// NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092
 	github.com/urfave/cli v1.22.1
 	github.com/vishvananda/netlink v1.1.0
-	golang.org/x/net v0.0.0-20201224014010-6772e930b67b
-	golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c
+	golang.org/x/net v0.8.0
+	golang.org/x/sys v0.6.0
 	google.golang.org/protobuf v1.27.1
 )
+
+require (
+	github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d // indirect
+	github.com/russross/blackfriday/v2 v2.0.1 // indirect
+	github.com/shurcooL/sanitized_anchor_name v1.0.0 // indirect
+	github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df // indirect
+)
--- a/go.sum
+++ b/go.sum
@ -9,8 +9,8 @@ github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzA
 github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
 github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY=
 github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
-github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI=
-github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4=
+github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg=
+github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw=
@ -31,8 +31,8 @@ github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
 github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9ObI=
 github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU=
-github.com/mrunalp/fileutils v0.5.0 h1:NKzVxiH7eSk+OQ4M+ZYW1K6h27RUV3MI6NUTsHhU6Z4=
-github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
+github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q=
+github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
 github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 h1:3snG66yBm59tKhhSPQrQ/0bCrv1LQbKt40LnUPiUxdc=
 github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
 github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU=
@ -41,8 +41,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
 github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
-github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921 h1:58EBmR2dMNL2n/FnbQewK3D14nXr0V9CObDSvMJLq+Y=
-github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg=
+github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646 h1:RpforrEYXWkmGwJHIGnLZ3tTWStkjVVstwzNGqxX2Ds=
+github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg=
 github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
 github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
 github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE=
@ -57,20 +57,48 @@ github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJ
 github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE=
 github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k=
 github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU=
-golang.org/x/net v0.0.0-20201224014010-6772e930b67b h1:iFwSg7t5GZmB/Q5TjiEAsdoLDrdJRC1RiF2WhuV29Qw=
-golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
+golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
+golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
+golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
+golang.org/x/net v0.8.0 h1:Zrh2ngAOFYneWTAIAPethzeaQLuHwhuBkuV6ZiRnUaQ=
+golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c h1:DHcbWVXeY+0Y8HHKR+rbLwnoh2F4tNCY7rTiHJ30RmA=
-golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ=
+golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
+golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
+golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
+golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
+golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
+golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
+golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
+golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
--- a/libcontainer/README.md
+++ b/libcontainer/README.md
@ -1,6 +1,6 @@
 # libcontainer

-[![GoDoc](https://godoc.org/github.com/opencontainers/runc/libcontainer?status.svg)](https://godoc.org/github.com/opencontainers/runc/libcontainer)
+[![Go Reference](https://pkg.go.dev/badge/github.com/opencontainers/runc/libcontainer.svg)](https://pkg.go.dev/github.com/opencontainers/runc/libcontainer)

 Libcontainer provides a native Go implementation for creating containers
 with namespaces, cgroups, capabilities, and filesystem access controls.
--- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go
+++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go
@ -153,8 +153,7 @@ func TestDeviceFilter_Privileged(t *testing.T) {
 			Allow:       true,
 		},
 	}
-	expected :=
-		`
+	expected := `
 // load parameters into registers
        0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
        1: And32Imm dst: r2 imm: 65535
--- a/libcontainer/cgroups/ebpf/ebpf_linux.go
+++ b/libcontainer/cgroups/ebpf/ebpf_linux.go
@ -93,7 +93,7 @@ var (
 )

 // Loosely based on the BPF_F_REPLACE support check in
-//   <https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go>.
+// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
 //
 // TODO: move this logic to cilium/ebpf
 func haveBpfProgReplace() bool {
--- a/libcontainer/cgroups/file.go
+++ b/libcontainer/cgroups/file.go
@ -10,6 +10,7 @@ import (
 	"strings"
 	"sync"

+	"github.com/opencontainers/runc/libcontainer/utils"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
@ -76,16 +77,16 @@ var (
 	// TestMode is set to true by unit tests that need "fake" cgroupfs.
 	TestMode bool

-	cgroupFd     int = -1
-	prepOnce     sync.Once
-	prepErr      error
-	resolveFlags uint64
+	cgroupRootHandle *os.File
+	prepOnce         sync.Once
+	prepErr          error
+	resolveFlags     uint64
 )

 func prepareOpenat2() error {
 	prepOnce.Do(func() {
 		fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
-			Flags: unix.O_DIRECTORY | unix.O_PATH,
+			Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC,
 		})
 		if err != nil {
 			prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
@ -96,15 +97,16 @@ func prepareOpenat2() error {
 			}
 			return
 		}
+		file := os.NewFile(uintptr(fd), cgroupfsDir)
+
 		var st unix.Statfs_t
-		if err = unix.Fstatfs(fd, &st); err != nil {
+		if err := unix.Fstatfs(int(file.Fd()), &st); err != nil {
 			prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
 			logrus.Warnf("falling back to securejoin: %s", prepErr)
 			return
 		}

-		cgroupFd = fd
-
+		cgroupRootHandle = file
 		resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
 		if st.Type == unix.CGROUP2_SUPER_MAGIC {
 			// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
@ -122,7 +124,7 @@ func openFile(dir, file string, flags int) (*os.File, error) {
 		flags |= os.O_TRUNC | os.O_CREATE
 		mode = 0o600
 	}
-	path := path.Join(dir, file)
+	path := path.Join(dir, utils.CleanPath(file))
 	if prepareOpenat2() != nil {
 		return openFallback(path, flags, mode)
 	}
@ -131,7 +133,7 @@ func openFile(dir, file string, flags int) (*os.File, error) {
 		return openFallback(path, flags, mode)
 	}

-	fd, err := unix.Openat2(cgroupFd, relPath,
+	fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath,
 		&unix.OpenHow{
 			Resolve: resolveFlags,
 			Flags:   uint64(flags) | unix.O_CLOEXEC,
@ -139,20 +141,20 @@ func openFile(dir, file string, flags int) (*os.File, error) {
 		})
 	if err != nil {
 		err = &os.PathError{Op: "openat2", Path: path, Err: err}
-		// Check if cgroupFd is still opened to cgroupfsDir
+		// Check if cgroupRootHandle is still opened to cgroupfsDir
 		// (happens when this package is incorrectly used
 		// across the chroot/pivot_root/mntns boundary, or
 		// when /sys/fs/cgroup is remounted).
 		//
 		// TODO: if such usage will ever be common, amend this
-		// to reopen cgroupFd and retry openat2.
-		fdStr := strconv.Itoa(cgroupFd)
+		// to reopen cgroupRootHandle and retry openat2.
+		fdStr := strconv.Itoa(int(cgroupRootHandle.Fd()))
 		fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
 		if fdDest != cgroupfsDir {
-			// Wrap the error so it is clear that cgroupFd
+			// Wrap the error so it is clear that cgroupRootHandle
 			// is opened to an unexpected/wrong directory.
-			err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w",
-				fdStr, fdDest, cgroupfsDir, err)
+			err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w",
+				cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err)
 		}
 		return nil, err
 	}
--- a/libcontainer/cgroups/file_test.go
+++ b/libcontainer/cgroups/file_test.go
@ -58,8 +58,6 @@ func TestOpenat2(t *testing.T) {
 		{"/sys/fs/cgroup", "/cgroup.controllers"},
 		{"/sys/fs/cgroup/", "cgroup.controllers"},
 		{"/sys/fs/cgroup/", "/cgroup.controllers"},
-		{"/sys/fs/cgroup/user.slice", "cgroup.controllers"},
-		{"/sys/fs/cgroup/user.slice/", "/cgroup.controllers"},
 		{"/", "/sys/fs/cgroup/cgroup.controllers"},
 		{"/", "sys/fs/cgroup/cgroup.controllers"},
 		{"/sys/fs/cgroup/cgroup.controllers", ""},
--- a/libcontainer/cgroups/fs/fs.go
+++ b/libcontainer/cgroups/fs/fs.go
@ -28,6 +28,7 @@ var subsystems = []subsystem{
 	&FreezerGroup{},
 	&RdmaGroup{},
 	&NameGroup{GroupName: "name=systemd", Join: true},
+	&NameGroup{GroupName: "misc", Join: true},
 }

 var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
--- a/libcontainer/cgroups/fs/hugetlb.go
+++ b/libcontainer/cgroups/fs/hugetlb.go
@ -1,6 +1,8 @@
 package fs

 import (
+	"errors"
+	"os"
 	"strconv"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
@ -19,8 +21,23 @@ func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error {
 }

 func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
+	const suffix = ".limit_in_bytes"
+	skipRsvd := false
+
 	for _, hugetlb := range r.HugetlbLimit {
-		if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
+		prefix := "hugetlb." + hugetlb.Pagesize
+		val := strconv.FormatUint(hugetlb.Limit, 10)
+		if err := cgroups.WriteFile(path, prefix+suffix, val); err != nil {
+			return err
+		}
+		if skipRsvd {
+			continue
+		}
+		if err := cgroups.WriteFile(path, prefix+".rsvd"+suffix, val); err != nil {
+			if errors.Is(err, os.ErrNotExist) {
+				skipRsvd = true
+				continue
+			}
 			return err
 		}
 	}
@ -32,24 +49,29 @@ func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
 	if !cgroups.PathExists(path) {
 		return nil
 	}
+	rsvd := ".rsvd"
 	hugetlbStats := cgroups.HugetlbStats{}
 	for _, pageSize := range cgroups.HugePageSizes() {
-		usage := "hugetlb." + pageSize + ".usage_in_bytes"
-		value, err := fscommon.GetCgroupParamUint(path, usage)
+	again:
+		prefix := "hugetlb." + pageSize + rsvd
+
+		value, err := fscommon.GetCgroupParamUint(path, prefix+".usage_in_bytes")
 		if err != nil {
+			if rsvd != "" && errors.Is(err, os.ErrNotExist) {
+				rsvd = ""
+				goto again
+			}
 			return err
 		}
 		hugetlbStats.Usage = value

-		maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
-		value, err = fscommon.GetCgroupParamUint(path, maxUsage)
+		value, err = fscommon.GetCgroupParamUint(path, prefix+".max_usage_in_bytes")
 		if err != nil {
 			return err
 		}
 		hugetlbStats.MaxUsage = value

-		failcnt := "hugetlb." + pageSize + ".failcnt"
-		value, err = fscommon.GetCgroupParamUint(path, failcnt)
+		value, err = fscommon.GetCgroupParamUint(path, prefix+".failcnt")
 		if err != nil {
 			return err
 		}
--- a/libcontainer/cgroups/fs/hugetlb_test.go
+++ b/libcontainer/cgroups/fs/hugetlb_test.go
@ -21,6 +21,11 @@ const (
 	limit    = "hugetlb.%s.limit_in_bytes"
 	maxUsage = "hugetlb.%s.max_usage_in_bytes"
 	failcnt  = "hugetlb.%s.failcnt"
+
+	rsvdUsage    = "hugetlb.%s.rsvd.usage_in_bytes"
+	rsvdLimit    = "hugetlb.%s.rsvd.limit_in_bytes"
+	rsvdMaxUsage = "hugetlb.%s.rsvd.max_usage_in_bytes"
+	rsvdFailcnt  = "hugetlb.%s.rsvd.failcnt"
 )

 func TestHugetlbSetHugetlb(t *testing.T) {
@ -52,13 +57,15 @@ func TestHugetlbSetHugetlb(t *testing.T) {
 	}

 	for _, pageSize := range cgroups.HugePageSizes() {
-		limit := fmt.Sprintf(limit, pageSize)
-		value, err := fscommon.GetCgroupParamUint(path, limit)
-		if err != nil {
-			t.Fatal(err)
-		}
-		if value != hugetlbAfter {
-			t.Fatalf("Set hugetlb.limit_in_bytes failed. Expected: %v, Got: %v", hugetlbAfter, value)
+		for _, f := range []string{limit, rsvdLimit} {
+			limit := fmt.Sprintf(f, pageSize)
+			value, err := fscommon.GetCgroupParamUint(path, limit)
+			if err != nil {
+				t.Fatal(err)
+			}
+			if value != hugetlbAfter {
+				t.Fatalf("Set %s failed. Expected: %v, Got: %v", limit, hugetlbAfter, value)
+			}
 		}
 	}
 }
@ -85,6 +92,28 @@ func TestHugetlbStats(t *testing.T) {
 	}
 }

+func TestHugetlbRStatsRsvd(t *testing.T) {
+	path := tempDir(t, "hugetlb")
+	for _, pageSize := range cgroups.HugePageSizes() {
+		writeFileContents(t, path, map[string]string{
+			fmt.Sprintf(rsvdUsage, pageSize):    hugetlbUsageContents,
+			fmt.Sprintf(rsvdMaxUsage, pageSize): hugetlbMaxUsageContents,
+			fmt.Sprintf(rsvdFailcnt, pageSize):  hugetlbFailcnt,
+		})
+	}
+
+	hugetlb := &HugetlbGroup{}
+	actualStats := *cgroups.NewStats()
+	err := hugetlb.GetStats(path, &actualStats)
+	if err != nil {
+		t.Fatal(err)
+	}
+	expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100}
+	for _, pageSize := range cgroups.HugePageSizes() {
+		expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize])
+	}
+}
+
 func TestHugetlbStatsNoUsageFile(t *testing.T) {
 	path := tempDir(t, "hugetlb")
 	writeFileContents(t, path, map[string]string{
--- a/libcontainer/cgroups/fs/memory.go
+++ b/libcontainer/cgroups/fs/memory.go
@ -170,6 +170,10 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
 		return err
 	}
 	stats.MemoryStats.SwapUsage = swapUsage
+	stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{
+		Usage:   swapUsage.Usage - memoryUsage.Usage,
+		Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt,
+	}
 	kernelUsage, err := getMemoryData(path, "kmem")
 	if err != nil {
 		return err
@ -234,6 +238,12 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
 	memoryData.Failcnt = value
 	value, err = fscommon.GetCgroupParamUint(path, limit)
 	if err != nil {
+		if name == "kmem" && os.IsNotExist(err) {
+			// Ignore ENOENT as kmem.limit_in_bytes has
+			// been removed in newer kernels.
+			return memoryData, nil
+		}
+
 		return cgroups.MemoryData{}, err
 	}
 	memoryData.Limit = value
--- a/libcontainer/cgroups/fs/memory_test.go
+++ b/libcontainer/cgroups/fs/memory_test.go
@ -249,12 +249,13 @@ func TestMemoryStats(t *testing.T) {
 		t.Fatal(err)
 	}
 	expectedStats := cgroups.MemoryStats{
-		Cache:        512,
-		Usage:        cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
-		SwapUsage:    cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
-		KernelUsage:  cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
-		Stats:        map[string]uint64{"cache": 512, "rss": 1024},
-		UseHierarchy: true,
+		Cache:         512,
+		Usage:         cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
+		SwapUsage:     cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
+		SwapOnlyUsage: cgroups.MemoryData{Usage: 0, MaxUsage: 0, Failcnt: 0, Limit: 0},
+		KernelUsage:   cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
+		Stats:         map[string]uint64{"cache": 512, "rss": 1024},
+		UseHierarchy:  true,
 		PageUsageByNUMA: cgroups.PageUsageByNUMA{
 			PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{
 				Total:       cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}},
--- a/libcontainer/cgroups/fs/paths.go
+++ b/libcontainer/cgroups/fs/paths.go
@ -83,6 +83,7 @@ func tryDefaultCgroupRoot() string {
 	if err != nil {
 		return ""
 	}
+	defer dir.Close()
 	names, err := dir.Readdirnames(1)
 	if err != nil {
 		return ""
--- a/libcontainer/cgroups/fs2/hugetlb.go
+++ b/libcontainer/cgroups/fs2/hugetlb.go
@ -1,6 +1,8 @@
 package fs2

 import (
+	"errors"
+	"os"
 	"strconv"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
@ -16,8 +18,22 @@ func setHugeTlb(dirPath string, r *configs.Resources) error {
 	if !isHugeTlbSet(r) {
 		return nil
 	}
+	const suffix = ".max"
+	skipRsvd := false
 	for _, hugetlb := range r.HugetlbLimit {
-		if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
+		prefix := "hugetlb." + hugetlb.Pagesize
+		val := strconv.FormatUint(hugetlb.Limit, 10)
+		if err := cgroups.WriteFile(dirPath, prefix+suffix, val); err != nil {
+			return err
+		}
+		if skipRsvd {
+			continue
+		}
+		if err := cgroups.WriteFile(dirPath, prefix+".rsvd"+suffix, val); err != nil {
+			if errors.Is(err, os.ErrNotExist) {
+				skipRsvd = true
+				continue
+			}
 			return err
 		}
 	}
@ -27,15 +43,21 @@ func setHugeTlb(dirPath string, r *configs.Resources) error {

 func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
 	hugetlbStats := cgroups.HugetlbStats{}
+	rsvd := ".rsvd"
 	for _, pagesize := range cgroups.HugePageSizes() {
-		value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current")
+	again:
+		prefix := "hugetlb." + pagesize + rsvd
+		value, err := fscommon.GetCgroupParamUint(dirPath, prefix+".current")
 		if err != nil {
+			if rsvd != "" && errors.Is(err, os.ErrNotExist) {
+				rsvd = ""
+				goto again
+			}
 			return err
 		}
 		hugetlbStats.Usage = value

-		fileName := "hugetlb." + pagesize + ".events"
-		value, err = fscommon.GetValueByKey(dirPath, fileName, "max")
+		value, err = fscommon.GetValueByKey(dirPath, prefix+".events", "max")
 		if err != nil {
 			return err
 		}
--- a/libcontainer/cgroups/fs2/memory.go
+++ b/libcontainer/cgroups/fs2/memory.go
@ -100,17 +100,20 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
 	memoryUsage, err := getMemoryDataV2(dirPath, "")
 	if err != nil {
 		if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint {
-			// The root cgroup does not have memory.{current,max}
-			// so emulate those using data from /proc/meminfo.
-			return statsFromMeminfo(stats)
+			// The root cgroup does not have memory.{current,max,peak}
+			// so emulate those using data from /proc/meminfo and
+			// /sys/fs/cgroup/memory.stat
+			return rootStatsFromMeminfo(stats)
 		}
 		return err
 	}
 	stats.MemoryStats.Usage = memoryUsage
-	swapUsage, err := getMemoryDataV2(dirPath, "swap")
+	swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap")
 	if err != nil {
 		return err
 	}
+	stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage
+	swapUsage := swapOnlyUsage
 	// As cgroup v1 reports SwapUsage values as mem+swap combined,
 	// while in cgroup v2 swap values do not include memory,
 	// report combined mem+swap for v1 compatibility.
@ -118,6 +121,9 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
 	if swapUsage.Limit != math.MaxUint64 {
 		swapUsage.Limit += memoryUsage.Limit
 	}
+	// The `MaxUsage` of mem+swap cannot simply combine mem with
+	// swap. So set it to 0 for v1 compatibility.
+	swapUsage.MaxUsage = 0
 	stats.MemoryStats.SwapUsage = swapUsage

 	return nil
@ -132,6 +138,7 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
 	}
 	usage := moduleName + ".current"
 	limit := moduleName + ".max"
+	maxUsage := moduleName + ".peak"

 	value, err := fscommon.GetCgroupParamUint(path, usage)
 	if err != nil {
@ -151,10 +158,18 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
 	}
 	memoryData.Limit = value

+	// `memory.peak` since kernel 5.19
+	// `memory.swap.peak` since kernel 6.5
+	value, err = fscommon.GetCgroupParamUint(path, maxUsage)
+	if err != nil && !os.IsNotExist(err) {
+		return cgroups.MemoryData{}, err
+	}
+	memoryData.MaxUsage = value
+
 	return memoryData, nil
 }

-func statsFromMeminfo(stats *cgroups.Stats) error {
+func rootStatsFromMeminfo(stats *cgroups.Stats) error {
 	const file = "/proc/meminfo"
 	f, err := os.Open(file)
 	if err != nil {
@ -166,14 +181,10 @@ func statsFromMeminfo(stats *cgroups.Stats) error {
 	var (
 		swap_free  uint64
 		swap_total uint64
-		main_total uint64
-		main_free  uint64
 	)
 	mem := map[string]*uint64{
 		"SwapFree":  &swap_free,
 		"SwapTotal": &swap_total,
-		"MemTotal":  &main_total,
-		"MemFree":   &main_free,
 	}

 	found := 0
@ -206,11 +217,18 @@ func statsFromMeminfo(stats *cgroups.Stats) error {
 		return &parseError{Path: "", File: file, Err: err}
 	}

+	// cgroup v1 `usage_in_bytes` reports memory usage as the sum of
+	// - rss (NR_ANON_MAPPED)
+	// - cache (NR_FILE_PAGES)
+	// cgroup v1 reports SwapUsage values as mem+swap combined
+	// cgroup v2 reports rss and cache as anon and file.
+	// sum `anon` + `file` to report the same value as `usage_in_bytes` in v1.
+	// sum swap usage as combined mem+swap usage for consistency as well.
+	stats.MemoryStats.Usage.Usage = stats.MemoryStats.Stats["anon"] + stats.MemoryStats.Stats["file"]
+	stats.MemoryStats.Usage.Limit = math.MaxUint64
 	stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024
 	stats.MemoryStats.SwapUsage.Limit = math.MaxUint64
-
-	stats.MemoryStats.Usage.Usage = (main_total - main_free) * 1024
-	stats.MemoryStats.Usage.Limit = math.MaxUint64
+	stats.MemoryStats.SwapUsage.Usage += stats.MemoryStats.Usage.Usage

 	return nil
 }
--- a/libcontainer/cgroups/fs2/memory_test.go
+++ b/libcontainer/cgroups/fs2/memory_test.go
@ -0,0 +1,155 @@
+package fs2
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+)
+
+const exampleMemoryStatData = `anon 790425600
+file 6502666240
+kernel_stack 7012352
+pagetables 8867840
+percpu 2445520
+sock 40960
+shmem 6721536
+file_mapped 656187392
+file_dirty 1122304
+file_writeback 0
+swapcached 10
+anon_thp 438304768
+file_thp 0
+shmem_thp 0
+inactive_anon 892223488
+active_anon 2973696
+inactive_file 5307346944
+active_file 1179316224
+unevictable 31477760
+slab_reclaimable 348866240
+slab_unreclaimable 10099808
+slab 358966048
+workingset_refault_anon 0
+workingset_refault_file 0
+workingset_activate_anon 0
+workingset_activate_file 0
+workingset_restore_anon 0
+workingset_restore_file 0
+workingset_nodereclaim 0
+pgfault 103216687
+pgmajfault 6879
+pgrefill 0
+pgscan 0
+pgsteal 0
+pgactivate 1110217
+pgdeactivate 292
+pglazyfree 267
+pglazyfreed 0
+thp_fault_alloc 57411
+thp_collapse_alloc 443`
+
+// TestStatMemoryPodCgroupNotFound checks that statMemory fails for a non-root cgroup without memory.current.
+func TestStatMemoryPodCgroupNotFound(t *testing.T) {
+	// We're using a fake cgroupfs.
+	cgroups.TestMode = true
+	fakeCgroupDir := t.TempDir()
+
+	// Write only memory.stat, so that memory.current is missing.
+	statPath := filepath.Join(fakeCgroupDir, "memory.stat")
+	if err := os.WriteFile(statPath, []byte(exampleMemoryStatData), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	gotStats := cgroups.NewStats()
+
+	// Not the cgroup v2 root, so the missing memory.current must be an error.
+	err := statMemory(fakeCgroupDir, gotStats)
+	if err == nil {
+		// Fatal, not Errorf: err.Error() below would panic on a nil error.
+		t.Fatal("expected error when statting memory for cgroupv2 root, but was nil")
+	}
+
+	if !strings.Contains(err.Error(), "memory.current: no such file or directory") {
+		t.Errorf("expected error to contain 'memory.current: no such file or directory', but was %s", err.Error())
+	}
+}
+
+func TestStatMemoryPodCgroup(t *testing.T) {
+	// Use a fake cgroupfs: the memory.* files below live in a temp dir.
+	cgroups.TestMode = true
+	fakeCgroupDir := t.TempDir()
+
+	statPath := filepath.Join(fakeCgroupDir, "memory.stat")
+	if err := os.WriteFile(statPath, []byte(exampleMemoryStatData), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.current"), []byte("123456789"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.max"), []byte("999999999"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.peak"), []byte("987654321"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	gotStats := cgroups.NewStats()
+
+	// Stat the temp dir directly; the values are read from the files written above.
+	err := statMemory(fakeCgroupDir, gotStats)
+	if err != nil {
+		t.Errorf("expected no error when statting memory for cgroupv2 root, but got %#+v", err)
+	}
+
+	// Usage.Usage must be the value written to memory.current.
+	var expectedUsageBytes uint64 = 123456789
+	if gotStats.MemoryStats.Usage.Usage != expectedUsageBytes {
+		t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Usage, expectedUsageBytes)
+	}
+
+	// Usage.Limit must be the value written to memory.max.
+	var expectedLimitBytes uint64 = 999999999
+	if gotStats.MemoryStats.Usage.Limit != expectedLimitBytes {
+		t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Limit, expectedLimitBytes)
+	}
+
+	// Usage.MaxUsage must be the value written to memory.peak.
+	var expectedMaxUsageBytes uint64 = 987654321
+	if gotStats.MemoryStats.Usage.MaxUsage != expectedMaxUsageBytes {
+		t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.MaxUsage, expectedMaxUsageBytes)
+	}
+}
+
+func TestRootStatsFromMeminfo(t *testing.T) {
+	stats := &cgroups.Stats{
+		MemoryStats: cgroups.MemoryStats{
+			Stats: map[string]uint64{
+				"anon": 790425600,
+				"file": 6502666240,
+			},
+		},
+	}
+
+	if err := rootStatsFromMeminfo(stats); err != nil {
+		t.Fatal(err)
+	}
+
+	// Usage is computed as anon + file (790425600 + 6502666240).
+	var expectedUsageBytes uint64 = 7293091840
+	if stats.MemoryStats.Usage.Usage != expectedUsageBytes {
+		t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %d\nexpected %d\n", stats.MemoryStats.Usage.Usage, expectedUsageBytes)
+	}
+
+	// SwapUsage is reported as mem+swap, so it must not be below Usage.
+	if stats.MemoryStats.SwapUsage.Usage < stats.MemoryStats.Usage.Usage {
+		t.Errorf("swap usage %d should be at least mem usage %d", stats.MemoryStats.SwapUsage.Usage, stats.MemoryStats.Usage.Usage)
+	}
+	if stats.MemoryStats.SwapUsage.Limit < stats.MemoryStats.Usage.Limit {
+		t.Errorf("swap limit %d should be at least mem limit %d", stats.MemoryStats.SwapUsage.Limit, stats.MemoryStats.Usage.Limit)
+	}
+}
--- a/libcontainer/cgroups/manager/manager_test.go
+++ b/libcontainer/cgroups/manager/manager_test.go
@ -3,6 +3,7 @@ package manager
 import (
 	"testing"

+	"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

@ -10,35 +11,45 @@ import (
 // config.Resources is nil. While it does not make sense to use a
 // manager with no resources, it should not result in a panic.
 //
-// This tests either v1 or v2 managers (both fs and systemd),
-// depending on what cgroup version is available on the host.
+// This tests either v1 or v2 fs cgroup manager, depending on which
+// cgroup version is available.
 func TestNilResources(t *testing.T) {
-	for _, sd := range []bool{false, true} {
-		cg := &configs.Cgroup{} // .Resources is nil
-		cg.Systemd = sd
-		mgr, err := New(cg)
-		if err != nil {
-			// Some managers require non-nil Resources during
-			// instantiation -- provide and retry. In such case
-			// we're mostly testing Set(nil) below.
-			cg.Resources = &configs.Resources{}
-			mgr, err = New(cg)
-			if err != nil {
-				t.Error(err)
-				continue
-			}
-		}
-		_ = mgr.Apply(-1)
-		_ = mgr.Set(nil)
-		_ = mgr.Freeze(configs.Thawed)
-		_ = mgr.Exists()
-		_, _ = mgr.GetAllPids()
-		_, _ = mgr.GetCgroups()
-		_, _ = mgr.GetFreezerState()
-		_ = mgr.Path("")
-		_ = mgr.GetPaths()
-		_, _ = mgr.GetStats()
-		_, _ = mgr.OOMKillCount()
-		_ = mgr.Destroy()
-	}
+	testNilResources(t, false)
+}
+
+// TestNilResourcesSystemd is TestNilResources for the systemd cgroup
+// manager; it is skipped when the host is not running systemd.
+func TestNilResourcesSystemd(t *testing.T) {
+	if !systemd.IsRunningSystemd() {
+		t.Skip("requires systemd")
+	}
+	testNilResources(t, true)
+}
+
+func testNilResources(t *testing.T, systemd bool) {
+	cg := &configs.Cgroup{} // .Resources is nil
+	cg.Systemd = systemd
+	mgr, err := New(cg)
+	if err != nil {
+		// Some managers require non-nil Resources during
+		// instantiation -- provide and retry. In such case
+		// we're mostly testing Set(nil) below.
+		cg.Resources = &configs.Resources{}
+		mgr, err = New(cg)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+	_ = mgr.Apply(-1)
+	_ = mgr.Set(nil)
+	_ = mgr.Freeze(configs.Thawed)
+	_ = mgr.Exists()
+	_, _ = mgr.GetAllPids()
+	_, _ = mgr.GetCgroups()
+	_, _ = mgr.GetFreezerState()
+	_ = mgr.Path("")
+	_ = mgr.GetPaths()
+	_, _ = mgr.GetStats()
+	_, _ = mgr.OOMKillCount()
+	_ = mgr.Destroy()
 }
--- a/libcontainer/cgroups/stats.go
+++ b/libcontainer/cgroups/stats.go
@ -78,6 +78,8 @@ type MemoryStats struct {
 	Usage MemoryData `json:"usage,omitempty"`
 	// usage of memory + swap
 	SwapUsage MemoryData `json:"swap_usage,omitempty"`
+	// usage of swap only
+	SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"`
 	// usage of kernel memory
 	KernelUsage MemoryData `json:"kernel_usage,omitempty"`
 	// usage of kernel TCP memory
--- a/libcontainer/cgroups/systemd/common.go
+++ b/libcontainer/cgroups/systemd/common.go
@ -177,7 +177,7 @@ func allowAllDevices() []systemdDbus.Property {

 // generateDeviceProperties takes the configured device rules and generates a
 // corresponding set of systemd properties to configure the devices correctly.
-func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, error) {
+func generateDeviceProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) {
 	if r.SkipDevices {
 		return nil, nil
 	}
@ -238,9 +238,10 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err
 		// trickery to convert things:
 		//
 		//  * Concrete rules with non-wildcard major/minor numbers have to use
-		//    /dev/{block,char} paths. This is slightly odd because it means
-		//    that we cannot add whitelist rules for devices that don't exist,
-		//    but there's not too much we can do about that.
+		//    /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses
+		//    stat(2) on such paths to look up device properties, meaning we
+		//    cannot add whitelist rules for devices that don't exist. Since v240,
+		//    device properties are parsed from the path string.
 		//
 		//    However, path globbing is not support for path-based rules so we
 		//    need to handle wildcards in some other manner.
@ -288,6 +289,17 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err
 			case devices.CharDevice:
 				entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
 			}
+			if sdVer < 240 {
+				// Old systemd versions use stat(2) on path to find out device major:minor
+				// numbers and type. If the path doesn't exist, it will not add the rule,
+				// emitting a warning instead.
+				// Since all of this logic is best-effort anyway (we manually set these
+				// rules separately to systemd) we can safely skip entries that don't
+				// have a corresponding path.
+				if _, err := os.Stat(entry.Path); err != nil {
+					continue
+				}
+			}
 		}
 		deviceAllowList = append(deviceAllowList, entry)
 	}
@ -335,32 +347,55 @@ func isUnitExists(err error) bool {
 	return isDbusError(err, "org.freedesktop.systemd1.UnitExists")
 }

-func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property) error {
+func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error {
 	statusChan := make(chan string, 1)
+	retry := true
+
+retry:
 	err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
 		_, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan)
 		return err
 	})
-	if err == nil {
-		timeout := time.NewTimer(30 * time.Second)
-		defer timeout.Stop()
-
-		select {
-		case s := <-statusChan:
-			close(statusChan)
-			// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
-			if s != "done" {
-				resetFailedUnit(cm, unitName)
-				return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
-			}
-		case <-timeout.C:
-			resetFailedUnit(cm, unitName)
-			return errors.New("Timeout waiting for systemd to create " + unitName)
+	if err != nil {
+		if !isUnitExists(err) {
+			return err
+		}
+		if ignoreExist {
+			// TODO: remove this hack.
+			// This is kubelet making sure a slice exists (see
+			// https://github.com/opencontainers/runc/pull/1124).
+			return nil
+		}
+		if retry {
+			// In case a unit with the same name exists, this may
+			// be a leftover failed unit. Reset it, so systemd can
+			// remove it, and retry once.
+			err = resetFailedUnit(cm, unitName)
+			if err != nil {
+				logrus.Warnf("unable to reset failed unit: %v", err)
+			}
+			retry = false
+			goto retry
 		}
-	} else if !isUnitExists(err) {
 		return err
 	}

+	timeout := time.NewTimer(30 * time.Second)
+	defer timeout.Stop()
+
+	select {
+	case s := <-statusChan:
+		close(statusChan)
+		// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
+		if s != "done" {
+			_ = resetFailedUnit(cm, unitName)
+			return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
+		}
+	case <-timeout.C:
+		_ = resetFailedUnit(cm, unitName)
+		return errors.New("Timeout waiting for systemd to create " + unitName)
+	}
+
 	return nil
 }

@ -385,16 +420,17 @@ func stopUnit(cm *dbusConnManager, unitName string) error {
 			return errors.New("Timed out while waiting for systemd to remove " + unitName)
 		}
 	}
+
+	// In case of a failed unit, let systemd remove it.
+	_ = resetFailedUnit(cm, unitName)
+
 	return nil
 }

-func resetFailedUnit(cm *dbusConnManager, name string) {
-	err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
+func resetFailedUnit(cm *dbusConnManager, name string) error {
+	return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
 		return c.ResetFailedUnitContext(context.TODO(), name)
 	})
-	if err != nil {
-		logrus.Warnf("unable to reset failed unit: %v", err)
-	}
 }

 func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) {
--- a/libcontainer/cgroups/systemd/cpuset.go
+++ b/libcontainer/cgroups/systemd/cpuset.go
@ -51,5 +51,10 @@ func RangeToBits(str string) ([]byte, error) {
 		// do not allow empty values
 		return nil, errors.New("empty value")
 	}
+
+	// Reverse the byte order to match the order in which systemd parses cpusets.
+	for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 {
+		ret[l], ret[r] = ret[r], ret[l]
+	}
 	return ret, nil
 }
--- a/libcontainer/cgroups/systemd/cpuset_test.go
+++ b/libcontainer/cgroups/systemd/cpuset_test.go
@ -22,13 +22,13 @@ func TestRangeToBits(t *testing.T) {
 		{in: "4-7", out: []byte{0xf0}},
 		{in: "0-7", out: []byte{0xff}},
 		{in: "0-15", out: []byte{0xff, 0xff}},
-		{in: "16", out: []byte{1, 0, 0}},
-		{in: "0-3,32-33", out: []byte{3, 0, 0, 0, 0x0f}},
+		{in: "16", out: []byte{0, 0, 1}},
+		{in: "0-3,32-33", out: []byte{0x0f, 0, 0, 0, 3}},
 		// extra spaces and tabs are ok
 		{in: "1, 2, 1-2", out: []byte{6}},
 		{in: "    , 1   , 3  ,  5-7,	", out: []byte{0xea}},
 		// somewhat large values
-		{in: "128-130,1", out: []byte{7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2}},
+		{in: "128-130,1", out: []byte{2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7}},

 		{in: "-", isErr: true},
 		{in: "1-", isErr: true},
--- a/libcontainer/cgroups/systemd/dbus.go
+++ b/libcontainer/cgroups/systemd/dbus.go
@ -2,6 +2,7 @@ package systemd

 import (
 	"context"
+	"errors"
 	"fmt"
 	"sync"

@ -80,8 +81,6 @@ func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) {
 	}
 }

-var errDbusConnClosed = dbus.ErrClosed.Error()
-
 // retryOnDisconnect calls op, and if the error it returns is about closed dbus
 // connection, the connection is re-established and the op is retried. This helps
 // with the situation when dbus is restarted and we have a stale connection.
@ -92,7 +91,10 @@ func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) er
 			return err
 		}
 		err = op(conn)
-		if !isDbusError(err, errDbusConnClosed) {
+		if err == nil {
+			return nil
+		}
+		if !errors.Is(err, dbus.ErrClosed) {
 			return err
 		}
 		d.resetConnection(conn)
--- a/libcontainer/cgroups/systemd/systemd_test.go
+++ b/libcontainer/cgroups/systemd/systemd_test.go
@ -127,7 +127,7 @@ func TestPodSkipDevicesUpdate(t *testing.T) {

 	// Create a "container" within the "pod" cgroup.
 	// This is not a real container, just a process in the cgroup.
-	cmd := exec.Command("bash", "-c", "while true; do echo > /dev/null; done")
+	cmd := exec.Command("sleep", "infinity")
 	cmd.Env = append(os.Environ(), "LANG=C")
 	var stderr bytes.Buffer
 	cmd.Stderr = &stderr
@ -183,6 +183,11 @@ func testSkipDevices(t *testing.T, skipDevices bool, expected []string) {
 	if os.Geteuid() != 0 {
 		t.Skip("Test requires root.")
 	}
+	// https://github.com/opencontainers/runc/issues/3743
+	centosVer, _ := exec.Command("rpm", "-q", "--qf", "%{version}", "centos-release").CombinedOutput()
+	if string(centosVer) == "7" {
+		t.Skip("Flaky on CentOS 7")
+	}

 	podConfig := &configs.Cgroup{
 		Parent: "system.slice",
--- a/libcontainer/cgroups/systemd/v1.go
+++ b/libcontainer/cgroups/systemd/v1.go
@ -71,12 +71,13 @@ var legacySubsystems = []subsystem{
 	&fs.NetClsGroup{},
 	&fs.NameGroup{GroupName: "name=systemd"},
 	&fs.RdmaGroup{},
+	&fs.NameGroup{GroupName: "misc"},
 }

 func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
 	var properties []systemdDbus.Property

-	deviceProperties, err := generateDeviceProperties(r)
+	deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm))
 	if err != nil {
 		return nil, err
 	}
@ -206,7 +207,7 @@ func (m *legacyManager) Apply(pid int) error {

 	properties = append(properties, c.SystemdProps...)

-	if err := startUnit(m.dbus, unitName, properties); err != nil {
+	if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
 		return err
 	}

@ -273,14 +274,7 @@ func getSubsystemPath(slice, unit, subsystem string) (string, error) {
 		return "", err
 	}

-	initPath, err := cgroups.GetInitCgroup(subsystem)
-	if err != nil {
-		return "", err
-	}
-	// if pid 1 is systemd 226 or later, it will be in init.scope, not the root
-	initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")
-
-	return filepath.Join(mountpoint, initPath, slice, unit), nil
+	return filepath.Join(mountpoint, slice, unit), nil
 }

 func (m *legacyManager) Freeze(state configs.FreezerState) error {
@ -423,6 +417,15 @@ func (m *legacyManager) Set(r *configs.Resources) error {
 		if err := m.doFreeze(configs.Frozen); err != nil {
 			// If freezer cgroup isn't supported, we just warn about it.
 			logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
+			// Skip the cgroup update if freezing failed, unless there is no freezer subsystem (#3803).
+			if !errors.Is(err, errSubsystemDoesNotExist) {
+				if needsThaw {
+					if thawErr := m.doFreeze(configs.Thawed); thawErr != nil {
+						logrus.Infof("thaw container after doFreeze failed: %v", thawErr)
+					}
+				}
+				return err
+			}
 		}
 	}
 	setErr := setUnitProperties(m.dbus, unitName, properties...)
--- a/libcontainer/cgroups/systemd/v2.go
+++ b/libcontainer/cgroups/systemd/v2.go
@ -2,6 +2,7 @@ package systemd

 import (
 	"bufio"
+	"errors"
 	"fmt"
 	"math"
 	"os"
@ -181,7 +182,7 @@ func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
 	//       aren't the end of the world, but it is a bit concerning. However
 	//       it's unclear if systemd removes all eBPF programs attached when
 	//       doing SetUnitProperties...
-	deviceProperties, err := generateDeviceProperties(r)
+	deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm))
 	if err != nil {
 		return nil, err
 	}
@ -283,7 +284,7 @@ func (m *unifiedManager) Apply(pid int) error {

 	properties = append(properties, c.SystemdProps...)

-	if err := startUnit(m.dbus, unitName, properties); err != nil {
+	if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
 		return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err)
 	}

@ -292,6 +293,12 @@ func (m *unifiedManager) Apply(pid int) error {
 	}

 	if c.OwnerUID != nil {
+		// The directory itself must be chowned.
+		err := os.Chown(m.path, *c.OwnerUID, -1)
+		if err != nil {
+			return err
+		}
+
 		filesToChown, err := cgroupFilesToChown()
 		if err != nil {
 			return err
@ -299,7 +306,8 @@ func (m *unifiedManager) Apply(pid int) error {

 		for _, v := range filesToChown {
 			err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
-			if err != nil {
+			// Some files might not be present.
+			if err != nil && !errors.Is(err, os.ErrNotExist) {
 				return err
 			}
 		}
@ -312,21 +320,23 @@ func (m *unifiedManager) Apply(pid int) error {
 // uid in /sys/kernel/cgroup/delegate.  If the file is not present
 // (Linux < 4.15), use the initial values mentioned in cgroups(7).
 func cgroupFilesToChown() ([]string, error) {
-	filesToChown := []string{"."} // the directory itself must be chowned
 	const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"
+
 	f, err := os.Open(cgroupDelegateFile)
-	if err == nil {
-		defer f.Close()
-		scanner := bufio.NewScanner(f)
-		for scanner.Scan() {
-			filesToChown = append(filesToChown, scanner.Text())
-		}
-		if err := scanner.Err(); err != nil {
-			return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
-		}
-	} else {
-		filesToChown = append(filesToChown, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads")
+	if err != nil {
+		return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil
 	}
+	defer f.Close()
+
+	filesToChown := []string{}
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		filesToChown = append(filesToChown, scanner.Text())
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
+	}
+
 	return filesToChown, nil
 }

--- a/libcontainer/cgroups/utils.go
+++ b/libcontainer/cgroups/utils.go
@ -55,12 +55,12 @@ func IsCgroup2HybridMode() bool {
 		var st unix.Statfs_t
 		err := unix.Statfs(hybridMountpoint, &st)
 		if err != nil {
-			if os.IsNotExist(err) {
-				// ignore the "not found" error
-				isHybrid = false
-				return
+			isHybrid = false
+			if !os.IsNotExist(err) {
+				// Report unexpected errors.
+				logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint)
 			}
-			panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
+			return
 		}
 		isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC
 	})
@ -162,8 +162,10 @@ func readProcsFile(dir string) ([]int, error) {

 // ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
 // or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
-//   "cpu": "/user.slice/user-1000.slice"
-//   "pids": "/user.slice/user-1000.slice"
+//
+//	"cpu": "/user.slice/user-1000.slice"
+//	"pids": "/user.slice/user-1000.slice"
+//
 // etc.
 //
 // Note that for cgroup v2 unified hierarchy, there are no per-controller
--- a/libcontainer/configs/config.go
+++ b/libcontainer/configs/config.go
@ -21,9 +21,9 @@ type Rlimit struct {

 // IDMap represents UID/GID Mappings for User Namespaces.
 type IDMap struct {
-	ContainerID int `json:"container_id"`
-	HostID      int `json:"host_id"`
-	Size        int `json:"size"`
+	ContainerID int64 `json:"container_id"`
+	HostID      int64 `json:"host_id"`
+	Size        int64 `json:"size"`
 }

 // Seccomp represents syscall restrictions
--- a/libcontainer/configs/config_linux.go
+++ b/libcontainer/configs/config_linux.go
@ -1,6 +1,10 @@
 package configs

-import "errors"
+import (
+	"errors"
+	"fmt"
+	"math"
+)

 var (
 	errNoUIDMap   = errors.New("User namespaces enabled, but no uid mappings found.")
@ -16,11 +20,18 @@ func (c Config) HostUID(containerId int) (int, error) {
 		if c.UidMappings == nil {
 			return -1, errNoUIDMap
 		}
-		id, found := c.hostIDFromMapping(containerId, c.UidMappings)
+		id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings)
 		if !found {
 			return -1, errNoUserMap
 		}
-		return id, nil
+		// If we are a 32-bit binary running on a 64-bit system, it's possible
+		// the mapped user is too large to store in an int, which means we
+		// cannot do the mapping. We can't just return an int64, because
+		// os.Setuid() takes an int.
+		if id > math.MaxInt {
+			return -1, fmt.Errorf("mapping for uid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
+		}
+		return int(id), nil
 	}
 	// Return unchanged id.
 	return containerId, nil
@ -39,11 +50,18 @@ func (c Config) HostGID(containerId int) (int, error) {
 		if c.GidMappings == nil {
 			return -1, errNoGIDMap
 		}
-		id, found := c.hostIDFromMapping(containerId, c.GidMappings)
+		id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings)
 		if !found {
 			return -1, errNoGroupMap
 		}
-		return id, nil
+		// If we are a 32-bit binary running on a 64-bit system, it's possible
+		// the mapped group is too large to store in an int, which means we
+		// cannot do the mapping. We can't just return an int64, because
+		// os.Setgid() takes an int.
+		if id > math.MaxInt {
+			return -1, fmt.Errorf("mapping for gid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt)
+		}
+		return int(id), nil
 	}
 	// Return unchanged id.
 	return containerId, nil
@ -57,7 +75,7 @@ func (c Config) HostRootGID() (int, error) {

 // Utility function that gets a host ID for a container ID from user namespace map
 // if that ID is present in the map.
-func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
+func (c Config) hostIDFromMapping(containerID int64, uMap []IDMap) (int64, bool) {
 	for _, m := range uMap {
 		if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
 			hostID := m.HostID + (containerID - m.ContainerID)
--- a/libcontainer/configs/validate/rootless.go
+++ b/libcontainer/configs/validate/rootless.go
@ -28,25 +28,18 @@ func (v *ConfigValidator) rootlessEUID(config *configs.Config) error {
 	return nil
 }

-func hasIDMapping(id int, mappings []configs.IDMap) bool {
-	for _, m := range mappings {
-		if id >= m.ContainerID && id < m.ContainerID+m.Size {
-			return true
-		}
-	}
-	return false
-}
-
 func rootlessEUIDMappings(config *configs.Config) error {
 	if !config.Namespaces.Contains(configs.NEWUSER) {
 		return errors.New("rootless container requires user namespaces")
 	}
-
-	if len(config.UidMappings) == 0 {
-		return errors.New("rootless containers requires at least one UID mapping")
-	}
-	if len(config.GidMappings) == 0 {
-		return errors.New("rootless containers requires at least one GID mapping")
+	// We only require mappings if we are not joining another userns.
+	if path := config.Namespaces.PathOf(configs.NEWUSER); path == "" {
+		if len(config.UidMappings) == 0 {
+			return errors.New("rootless containers requires at least one UID mapping")
+		}
+		if len(config.GidMappings) == 0 {
+			return errors.New("rootless containers requires at least one GID mapping")
+		}
 	}
 	return nil
 }
@ -70,8 +63,8 @@ func rootlessEUIDMount(config *configs.Config) error {
 					// Ignore unknown mount options.
 					continue
 				}
-				if !hasIDMapping(uid, config.UidMappings) {
-					return errors.New("cannot specify uid= mount options for unmapped uid in rootless containers")
+				if _, err := config.HostUID(uid); err != nil {
+					return fmt.Errorf("cannot specify uid=%d mount option for rootless container: %w", uid, err)
 				}
 			}

@ -82,8 +75,8 @@ func rootlessEUIDMount(config *configs.Config) error {
 					// Ignore unknown mount options.
 					continue
 				}
-				if !hasIDMapping(gid, config.GidMappings) {
-					return errors.New("cannot specify gid= mount options for unmapped gid in rootless containers")
+				if _, err := config.HostGID(gid); err != nil {
+					return fmt.Errorf("cannot specify gid=%d mount option for rootless container: %w", gid, err)
 				}
 			}
 		}
--- a/libcontainer/configs/validate/validator.go
+++ b/libcontainer/configs/validate/validator.go
@ -109,11 +109,19 @@ func (v *ConfigValidator) security(config *configs.Config) error {
 func (v *ConfigValidator) usernamespace(config *configs.Config) error {
 	if config.Namespaces.Contains(configs.NEWUSER) {
 		if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
-			return errors.New("USER namespaces aren't enabled in the kernel")
+			return errors.New("user namespaces aren't enabled in the kernel")
 		}
+		hasPath := config.Namespaces.PathOf(configs.NEWUSER) != ""
+		hasMappings := config.UidMappings != nil || config.GidMappings != nil
+		if !hasPath && !hasMappings {
+			return errors.New("user namespaces enabled, but no namespace path to join nor mappings to apply specified")
+		}
+		// The hasPath && hasMappings validation case is handled in specconv --
+		// we cache the mappings in Config during specconv in the hasPath case,
+		// so we cannot do that validation here.
 	} else {
 		if config.UidMappings != nil || config.GidMappings != nil {
-			return errors.New("User namespace mappings specified, but USER namespace isn't enabled in the config")
+			return errors.New("user namespace mappings specified, but user namespace isn't enabled in the config")
 		}
 	}
 	return nil
@ -131,9 +139,8 @@ func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
 // convertSysctlVariableToDotsSeparator can return sysctl variables in dots separator format.
 // The '/' separator is also accepted in place of a '.'.
 // Convert the sysctl variables to dots separator format for validation.
-// More info:
-//   https://man7.org/linux/man-pages/man8/sysctl.8.html
-//   https://man7.org/linux/man-pages/man5/sysctl.d.5.html
+// More info: sysctl(8), sysctl.d(5).
+//
 // For example:
 // Input sysctl variable "net/ipv4/conf/eno2.100.rp_filter"
 // will return the converted value "net.ipv4.conf.eno2/100.rp_filter"
@ -229,10 +236,6 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {

 func (v *ConfigValidator) intelrdt(config *configs.Config) error {
 	if config.IntelRdt != nil {
-		if !intelrdt.IsCATEnabled() && !intelrdt.IsMBAEnabled() {
-			return errors.New("intelRdt is specified in config, but Intel RDT is not supported or enabled")
-		}
-
 		if config.IntelRdt.ClosID == "." || config.IntelRdt.ClosID == ".." || strings.Contains(config.IntelRdt.ClosID, "/") {
 			return fmt.Errorf("invalid intelRdt.ClosID %q", config.IntelRdt.ClosID)
 		}
--- a/libcontainer/configs/validate/validator_test.go
+++ b/libcontainer/configs/validate/validator_test.go
@ -150,7 +150,7 @@ func TestValidateSecurityWithoutNEWNS(t *testing.T) {
 	}
 }

-func TestValidateUsernamespace(t *testing.T) {
+func TestValidateUserNamespace(t *testing.T) {
 	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
 		t.Skip("Test requires userns.")
 	}
@ -161,6 +161,8 @@ func TestValidateUsernamespace(t *testing.T) {
 				{Type: configs.NEWUSER},
 			},
 		),
+		UidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}},
+		GidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}},
 	}

 	validator := New()
@ -170,11 +172,11 @@ func TestValidateUsernamespace(t *testing.T) {
 	}
 }

-func TestValidateUsernamespaceWithoutUserNS(t *testing.T) {
-	uidMap := configs.IDMap{ContainerID: 123}
+func TestValidateUsernsMappingWithoutNamespace(t *testing.T) {
 	config := &configs.Config{
 		Rootfs:      "/var",
-		UidMappings: []configs.IDMap{uidMap},
+		UidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}},
+		GidMappings: []configs.IDMap{{HostID: 0, ContainerID: 123, Size: 100}},
 	}

 	validator := New()
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@ -40,7 +40,7 @@ type linuxContainer struct {
 	root                 string
 	config               *configs.Config
 	cgroupManager        cgroups.Manager
-	intelRdtManager      intelrdt.Manager
+	intelRdtManager      *intelrdt.Manager
 	initPath             string
 	initArgs             []string
 	initProcess          parentProcess
@ -146,19 +146,21 @@ func (c *linuxContainer) OCIState() (*specs.State, error) {
 	return c.currentOCIState()
 }

-func (c *linuxContainer) Processes() ([]int, error) {
-	var pids []int
-	status, err := c.currentStatus()
-	if err != nil {
-		return pids, err
+// ignoreCgroupError filters out cgroup-related errors that can be ignored,
+// because the container is stopped and its cgroup is gone.
+func (c *linuxContainer) ignoreCgroupError(err error) error {
+	if err == nil {
+		return nil
 	}
-	// for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
-	if status == Stopped && !c.cgroupManager.Exists() {
-		return pids, nil
+	if errors.Is(err, os.ErrNotExist) && c.runType() == Stopped && !c.cgroupManager.Exists() {
+		return nil
 	}
+	return err
+}

-	pids, err = c.cgroupManager.GetAllPids()
-	if err != nil {
+func (c *linuxContainer) Processes() ([]int, error) {
+	pids, err := c.cgroupManager.GetAllPids()
+	if err = c.ignoreCgroupError(err); err != nil {
 		return nil, fmt.Errorf("unable to get all container pids: %w", err)
 	}
 	return pids, nil
@ -351,6 +353,15 @@ func (c *linuxContainer) start(process *Process) (retErr error) {
 		}()
 	}

+	// Before starting "runc init", mark all non-stdio open files as O_CLOEXEC
+	// to make sure we don't leak any files into "runc init". Any files to be
+	// passed to "runc init" through ExtraFiles will get dup2'd by the Go
+	// runtime and thus their O_CLOEXEC flag will be cleared. This is some
+	// additional protection against attacks like CVE-2024-21626, by making
+	// sure we never leak files to "runc init" we didn't intend to.
+	if err := utils.CloseExecFrom(3); err != nil {
+		return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
+	}
 	if err := parent.start(); err != nil {
 		return fmt.Errorf("unable to start container process: %w", err)
 	}
@ -382,11 +393,12 @@ func (c *linuxContainer) Signal(s os.Signal, all bool) error {
 		return err
 	}
 	if all {
-		// for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited
 		if status == Stopped && !c.cgroupManager.Exists() {
+			// Avoid calling signalAllProcesses which may print
+			// a warning trying to freeze a non-existing cgroup.
 			return nil
 		}
-		return signalAllProcesses(c.cgroupManager, s)
+		return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, s))
 	}
 	// to avoid a PID reuse attack
 	if status == Running || status == Created || status == Paused {
@ -636,7 +648,11 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP
 			// cgroup v1: using the same path for all controllers.
 			// cgroup v2: the only possible way.
 			for k := range proc.cgroupPaths {
-				proc.cgroupPaths[k] = path.Join(proc.cgroupPaths[k], add)
+				subPath := path.Join(proc.cgroupPaths[k], add)
+				if !strings.HasPrefix(subPath, proc.cgroupPaths[k]) {
+					return nil, fmt.Errorf("%s is not a sub cgroup path", add)
+				}
+				proc.cgroupPaths[k] = subPath
 			}
 			// cgroup v2: do not try to join init process's cgroup
 			// as a fallback (see (*setnsProcess).start).
@ -645,7 +661,11 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP
 			// Per-controller paths.
 			for ctrl, add := range p.SubCgroupPaths {
 				if val, ok := proc.cgroupPaths[ctrl]; ok {
-					proc.cgroupPaths[ctrl] = path.Join(val, add)
+					subPath := path.Join(val, add)
+					if !strings.HasPrefix(subPath, val) {
+						return nil, fmt.Errorf("%s is not a sub cgroup path", add)
+					}
+					proc.cgroupPaths[ctrl] = subPath
 				} else {
 					return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
 				}
@ -918,7 +938,7 @@ func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool {
 }

 func criuNsToKey(t configs.NamespaceType) string {
-	return "extRoot" + strings.Title(configs.NsName(t)) + "NS"
+	return "extRoot" + strings.Title(configs.NsName(t)) + "NS" //nolint:staticcheck // SA1019: strings.Title is deprecated
 }

 func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error {
@ -2257,7 +2277,7 @@ func ignoreTerminateErrors(err error) error {

 func requiresRootOrMappingTool(c *configs.Config) bool {
 	gidMap := []configs.IDMap{
-		{ContainerID: 0, HostID: os.Getegid(), Size: 1},
+		{ContainerID: 0, HostID: int64(os.Getegid()), Size: 1},
 	}
 	return !reflect.DeepEqual(c.GidMappings, gidMap)
 }
--- a/libcontainer/container_linux_test.go
+++ b/libcontainer/container_linux_test.go
@ -7,22 +7,15 @@ import (

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
-	"github.com/opencontainers/runc/libcontainer/intelrdt"
 	"github.com/opencontainers/runc/libcontainer/system"
 )

 type mockCgroupManager struct {
 	pids    []int
 	allPids []int
-	stats   *cgroups.Stats
 	paths   map[string]string
 }

-type mockIntelRdtManager struct {
-	stats *intelrdt.Stats
-	path  string
-}
-
 func (m *mockCgroupManager) GetPids() ([]int, error) {
 	return m.pids, nil
 }
@ -32,7 +25,7 @@ func (m *mockCgroupManager) GetAllPids() ([]int, error) {
 }

 func (m *mockCgroupManager) GetStats() (*cgroups.Stats, error) {
-	return m.stats, nil
+	return nil, nil
 }

 func (m *mockCgroupManager) Apply(pid int) error {
@ -76,30 +69,6 @@ func (m *mockCgroupManager) GetFreezerState() (configs.FreezerState, error) {
 	return configs.Thawed, nil
 }

-func (m *mockIntelRdtManager) Apply(pid int) error {
-	return nil
-}
-
-func (m *mockIntelRdtManager) GetStats() (*intelrdt.Stats, error) {
-	return m.stats, nil
-}
-
-func (m *mockIntelRdtManager) Destroy() error {
-	return nil
-}
-
-func (m *mockIntelRdtManager) GetPath() string {
-	return m.path
-}
-
-func (m *mockIntelRdtManager) Set(container *configs.Config) error {
-	return nil
-}
-
-func (m *mockIntelRdtManager) GetCgroups() (*configs.Cgroup, error) {
-	return nil, nil
-}
-
 type mockProcess struct {
 	_pid    int
 	started uint64
@ -173,61 +142,11 @@ func TestGetContainerPids(t *testing.T) {
 	}
 }

-func TestGetContainerStats(t *testing.T) {
-	container := &linuxContainer{
-		id:     "myid",
-		config: &configs.Config{},
-		cgroupManager: &mockCgroupManager{
-			pids: []int{1, 2, 3},
-			stats: &cgroups.Stats{
-				MemoryStats: cgroups.MemoryStats{
-					Usage: cgroups.MemoryData{
-						Usage: 1024,
-					},
-				},
-			},
-		},
-		intelRdtManager: &mockIntelRdtManager{
-			stats: &intelrdt.Stats{
-				L3CacheSchema: "L3:0=f;1=f0",
-				MemBwSchema:   "MB:0=20;1=70",
-			},
-		},
-	}
-	stats, err := container.Stats()
-	if err != nil {
-		t.Fatal(err)
-	}
-	if stats.CgroupStats == nil {
-		t.Fatal("cgroup stats are nil")
-	}
-	if stats.CgroupStats.MemoryStats.Usage.Usage != 1024 {
-		t.Fatalf("expected memory usage 1024 but received %d", stats.CgroupStats.MemoryStats.Usage.Usage)
-	}
-	if intelrdt.IsCATEnabled() {
-		if stats.IntelRdtStats == nil {
-			t.Fatal("intel rdt stats are nil")
-		}
-		if stats.IntelRdtStats.L3CacheSchema != "L3:0=f;1=f0" {
-			t.Fatalf("expected L3CacheSchema L3:0=f;1=f0 but received %s", stats.IntelRdtStats.L3CacheSchema)
-		}
-	}
-	if intelrdt.IsMBAEnabled() {
-		if stats.IntelRdtStats == nil {
-			t.Fatal("intel rdt stats are nil")
-		}
-		if stats.IntelRdtStats.MemBwSchema != "MB:0=20;1=70" {
-			t.Fatalf("expected MemBwSchema MB:0=20;1=70 but received %s", stats.IntelRdtStats.MemBwSchema)
-		}
-	}
-}
-
 func TestGetContainerState(t *testing.T) {
 	var (
-		pid                  = os.Getpid()
-		expectedMemoryPath   = "/sys/fs/cgroup/memory/myid"
-		expectedNetworkPath  = fmt.Sprintf("/proc/%d/ns/net", pid)
-		expectedIntelRdtPath = "/sys/fs/resctrl/myid"
+		pid                 = os.Getpid()
+		expectedMemoryPath  = "/sys/fs/cgroup/memory/myid"
+		expectedNetworkPath = fmt.Sprintf("/proc/%d/ns/net", pid)
 	)
 	container := &linuxContainer{
 		id: "myid",
@ -248,24 +167,10 @@ func TestGetContainerState(t *testing.T) {
 		},
 		cgroupManager: &mockCgroupManager{
 			pids: []int{1, 2, 3},
-			stats: &cgroups.Stats{
-				MemoryStats: cgroups.MemoryStats{
-					Usage: cgroups.MemoryData{
-						Usage: 1024,
-					},
-				},
-			},
 			paths: map[string]string{
 				"memory": expectedMemoryPath,
 			},
 		},
-		intelRdtManager: &mockIntelRdtManager{
-			stats: &intelrdt.Stats{
-				L3CacheSchema: "L3:0=f0;1=f",
-				MemBwSchema:   "MB:0=70;1=20",
-			},
-			path: expectedIntelRdtPath,
-		},
 	}
 	container.state = &createdState{c: container}
 	state, err := container.State()
@ -285,15 +190,6 @@ func TestGetContainerState(t *testing.T) {
 	if memPath := paths["memory"]; memPath != expectedMemoryPath {
 		t.Fatalf("expected memory path %q but received %q", expectedMemoryPath, memPath)
 	}
-	if intelrdt.IsCATEnabled() || intelrdt.IsMBAEnabled() {
-		intelRdtPath := state.IntelRdtPath
-		if intelRdtPath == "" {
-			t.Fatal("intel rdt path should not be empty")
-		}
-		if intelRdtPath != expectedIntelRdtPath {
-			t.Fatalf("expected intel rdt path %q but received %q", expectedIntelRdtPath, intelRdtPath)
-		}
-	}
 	for _, ns := range container.config.Namespaces {
 		path := state.NamespacePaths[ns.Type]
 		if path == "" {
--- a/libcontainer/devices/device_unix_go116_test.go
+++ b/libcontainer/devices/device_unix_go116_test.go
@ -1,39 +0,0 @@
-//go:build !go1.17
-// +build !go1.17
-
-package devices
-
-import "io/fs"
-
-// The following code is adapted from go1.17.1/src/io/fs/readdir.go
-// to compensate for the lack of fs.FileInfoToDirEntry in Go 1.16.
-
-// dirInfo is a DirEntry based on a FileInfo.
-type dirInfo struct {
-	fileInfo fs.FileInfo
-}
-
-func (di dirInfo) IsDir() bool {
-	return di.fileInfo.IsDir()
-}
-
-func (di dirInfo) Type() fs.FileMode {
-	return di.fileInfo.Mode().Type()
-}
-
-func (di dirInfo) Info() (fs.FileInfo, error) {
-	return di.fileInfo, nil
-}
-
-func (di dirInfo) Name() string {
-	return di.fileInfo.Name()
-}
-
-// fileInfoToDirEntry returns a DirEntry that returns information from info.
-// If info is nil, FileInfoToDirEntry returns nil.
-func fileInfoToDirEntry(info fs.FileInfo) fs.DirEntry {
-	if info == nil {
-		return nil
-	}
-	return dirInfo{fileInfo: info}
-}
--- a/libcontainer/devices/device_unix_go117_test.go
+++ b/libcontainer/devices/device_unix_go117_test.go
@ -1,8 +0,0 @@
-//go:build go1.17
-// +build go1.17
-
-package devices
-
-import "io/fs"
-
-var fileInfoToDirEntry = fs.FileInfoToDirEntry
--- a/libcontainer/devices/device_unix_test.go
+++ b/libcontainer/devices/device_unix_test.go
@ -64,7 +64,7 @@ func TestHostDevicesIoutilReadDirDeepFailure(t *testing.T) {
 			t.Fatalf("Unexpected error %v", err)
 		}

-		return []fs.DirEntry{fileInfoToDirEntry(fi)}, nil
+		return []fs.DirEntry{fs.FileInfoToDirEntry(fi)}, nil
 	}
 	defer cleanupTest()

--- a/libcontainer/eaccess_go119.go
+++ b/libcontainer/eaccess_go119.go
@ -0,0 +1,17 @@
+//go:build !go1.20
+// +build !go1.20
+
+package libcontainer
+
+import "golang.org/x/sys/unix"
+
+func eaccess(path string) error {
+	// This check is similar to access(2) with X_OK except for
+	// setuid/setgid binaries where it checks against the effective
+	// (rather than real) uid and gid. It is not needed in go 1.20
+	// and beyond and will be removed later.
+
+	// Relies on code added in https://go-review.googlesource.com/c/sys/+/468877
+	// and older CLs linked from there.
+	return unix.Faccessat(unix.AT_FDCWD, path, unix.X_OK, unix.AT_EACCESS)
+}
--- a/libcontainer/eaccess_stub.go
+++ b/libcontainer/eaccess_stub.go
@ -0,0 +1,10 @@
+//go:build go1.20
+
+package libcontainer
+
+func eaccess(path string) error {
+	// Not needed in Go 1.20+ as the functionality is already in there
+	// (added by https://go.dev/cl/416115, https://go.dev/cl/414824,
+	// and fixed in Go 1.20.2 by https://go.dev/cl/469956).
+	return nil
+}
--- a/libcontainer/factory_linux.go
+++ b/libcontainer/factory_linux.go
@ -48,20 +48,6 @@ func InitArgs(args ...string) func(*LinuxFactory) error {
 	}
 }

-// IntelRdtfs is an options func to configure a LinuxFactory to return
-// containers that use the Intel RDT "resource control" filesystem to
-// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth).
-func IntelRdtFs(l *LinuxFactory) error {
-	if !intelrdt.IsCATEnabled() && !intelrdt.IsMBAEnabled() {
-		l.NewIntelRdtManager = nil
-	} else {
-		l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager {
-			return intelrdt.NewManager(config, id, path)
-		}
-	}
-	return nil
-}
-
 // TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
 func TmpfsRoot(l *LinuxFactory) error {
 	mounted, err := mountinfo.Mounted(l.Root)
@ -136,9 +122,6 @@ type LinuxFactory struct {

 	// Validator provides validation to container configurations.
 	Validator validate.Validator
-
-	// NewIntelRdtManager returns an initialized Intel RDT manager for a single container.
-	NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager
 }

 func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
@ -179,6 +162,12 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
 			return nil, fmt.Errorf("unable to get cgroup PIDs: %w", err)
 		}
 		if len(pids) != 0 {
+			if config.Cgroups.Systemd {
+				// systemd cgroup driver can't add a pid to an
+				// existing systemd unit and will return an
+				// error anyway, so let's error out early.
+				return nil, fmt.Errorf("container's cgroup is not empty: %d process(es) found", len(pids))
+			}
 			// TODO: return an error.
 			logrus.Warnf("container's cgroup is not empty: %d process(es) found", len(pids))
 			logrus.Warn("DEPRECATED: running container in a non-empty cgroup won't be supported in runc 1.2; https://github.com/opencontainers/runc/issues/3132")
@ -202,18 +191,16 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
 		return nil, err
 	}
 	c := &linuxContainer{
-		id:            id,
-		root:          containerRoot,
-		config:        config,
-		initPath:      l.InitPath,
-		initArgs:      l.InitArgs,
-		criuPath:      l.CriuPath,
-		newuidmapPath: l.NewuidmapPath,
-		newgidmapPath: l.NewgidmapPath,
-		cgroupManager: cm,
-	}
-	if l.NewIntelRdtManager != nil {
-		c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
+		id:              id,
+		root:            containerRoot,
+		config:          config,
+		initPath:        l.InitPath,
+		initArgs:        l.InitArgs,
+		criuPath:        l.CriuPath,
+		newuidmapPath:   l.NewuidmapPath,
+		newgidmapPath:   l.NewgidmapPath,
+		cgroupManager:   cm,
+		intelRdtManager: intelrdt.NewManager(config, id, ""),
 	}
 	c.state = &stoppedState{c: c}
 	return c, nil
@ -255,12 +242,10 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
 		newuidmapPath:        l.NewuidmapPath,
 		newgidmapPath:        l.NewgidmapPath,
 		cgroupManager:        cm,
+		intelRdtManager:      intelrdt.NewManager(&state.Config, id, state.IntelRdtPath),
 		root:                 containerRoot,
 		created:              state.Created,
 	}
-	if l.NewIntelRdtManager != nil {
-		c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath)
-	}
 	c.state = &loadedState{c: c}
 	if err := c.refreshState(); err != nil {
 		return nil, err
@ -338,7 +323,11 @@ func (l *LinuxFactory) StartInitialization() (err error) {

 	defer func() {
 		if e := recover(); e != nil {
-			err = fmt.Errorf("panic from initialization: %w, %v", e, string(debug.Stack()))
+			if ee, ok := e.(error); ok {
+				err = fmt.Errorf("panic from initialization: %w, %s", ee, debug.Stack())
+			} else {
+				err = fmt.Errorf("panic from initialization: %v, %s", e, debug.Stack())
+			}
 		}
 	}()

--- a/libcontainer/factory_linux_test.go
+++ b/libcontainer/factory_linux_test.go
@ -37,28 +37,6 @@ func TestFactoryNew(t *testing.T) {
 	}
 }

-func TestFactoryNewIntelRdt(t *testing.T) {
-	root := t.TempDir()
-	factory, err := New(root, IntelRdtFs)
-	if err != nil {
-		t.Fatal(err)
-	}
-	if factory == nil {
-		t.Fatal("factory should not be nil")
-	}
-	lfactory, ok := factory.(*LinuxFactory)
-	if !ok {
-		t.Fatal("expected linux factory returned on linux based systems")
-	}
-	if lfactory.Root != root {
-		t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root)
-	}
-
-	if factory.Type() != "libcontainer" {
-		t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer")
-	}
-}
-
 func TestFactoryNewTmpfs(t *testing.T) {
 	root := t.TempDir()
 	factory, err := New(root, TmpfsRoot)
@ -157,7 +135,7 @@ func TestFactoryLoadContainer(t *testing.T) {
 	if err := marshal(filepath.Join(root, id, stateFilename), expectedState); err != nil {
 		t.Fatal(err)
 	}
-	factory, err := New(root, IntelRdtFs)
+	factory, err := New(root)
 	if err != nil {
 		t.Fatal(err)
 	}
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@ -8,7 +8,7 @@ import (
 	"io"
 	"net"
 	"os"
-	"strconv"
+	"path/filepath"
 	"strings"
 	"unsafe"

@ -117,17 +117,17 @@ func populateProcessEnvironment(env []string) error {
 	for _, pair := range env {
 		p := strings.SplitN(pair, "=", 2)
 		if len(p) < 2 {
-			return fmt.Errorf("invalid environment variable: %q", pair)
+			return errors.New("invalid environment variable: missing '='")
 		}
 		name, val := p[0], p[1]
 		if name == "" {
-			return fmt.Errorf("environment variable name can't be empty: %q", pair)
+			return errors.New("invalid environment variable: name cannot be empty")
 		}
 		if strings.IndexByte(name, 0) >= 0 {
-			return fmt.Errorf("environment variable name can't contain null(\\x00): %q", pair)
+			return fmt.Errorf("invalid environment variable %q: name contains nul byte (\\x00)", name)
 		}
 		if strings.IndexByte(val, 0) >= 0 {
-			return fmt.Errorf("environment variable value can't contain null(\\x00): %q", pair)
+			return fmt.Errorf("invalid environment variable %q: value contains nul byte (\\x00)", name)
 		}
 		if err := os.Setenv(name, val); err != nil {
 			return err
@ -136,6 +136,32 @@ func populateProcessEnvironment(env []string) error {
 	return nil
 }

+// verifyCwd ensures that the current directory is actually inside the mount
+// namespace root of the current process.
+func verifyCwd() error {
+	// getcwd(2) on Linux detects if cwd is outside of the rootfs of the
+	// current mount namespace root, and in that case prefixes "(unreachable)"
+	// to the returned string. glibc's getcwd(3) and Go's Getwd() both detect
+	// when this happens and return ENOENT rather than returning a non-absolute
+	// path. In both cases we can therefore easily detect if we have an invalid
+	// cwd by checking the return value of getcwd(3). See getcwd(3) for more
+	// details, and CVE-2024-21626 for the security issue that motivated this
+	// check.
+	//
+	// We have to use unix.Getwd() here because os.Getwd() has a workaround for
+	// $PWD which involves doing stat(.), which can fail if the current
+	// directory is inaccessible to the container process.
+	if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) {
+		return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected")
+	} else if err != nil {
+		return fmt.Errorf("failed to verify if current working directory is safe: %w", err)
+	} else if !filepath.IsAbs(wd) {
+		// We shouldn't ever hit this, but check just in case.
+		return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd)
+	}
+	return nil
+}
+
 // finalizeNamespace drops the caps, sets the correct user
 // and working dir, and closes any leaked file descriptors
 // before executing the command inside the namespace
@ -194,6 +220,10 @@ func finalizeNamespace(config *initConfig) error {
 			return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
 		}
 	}
+	// Make sure our final working directory is inside the container.
+	if err := verifyCwd(); err != nil {
+		return err
+	}
 	if err := system.ClearKeepCaps(); err != nil {
 		return fmt.Errorf("unable to clear keep caps: %w", err)
 	}
@ -406,40 +436,37 @@ func fixStdioPermissions(u *user.ExecUser) error {
 	if err := unix.Stat("/dev/null", &null); err != nil {
 		return &os.PathError{Op: "stat", Path: "/dev/null", Err: err}
 	}
-	for _, fd := range []uintptr{
-		os.Stdin.Fd(),
-		os.Stderr.Fd(),
-		os.Stdout.Fd(),
-	} {
+	for _, file := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
 		var s unix.Stat_t
-		if err := unix.Fstat(int(fd), &s); err != nil {
-			return &os.PathError{Op: "fstat", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
+		if err := unix.Fstat(int(file.Fd()), &s); err != nil {
+			return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
 		}

-		// Skip chown of /dev/null if it was used as one of the STDIO fds.
-		if s.Rdev == null.Rdev {
+		// Skip chown if uid is already the one we want or any of the STDIO descriptors
+		// were redirected to /dev/null.
+		if int(s.Uid) == u.Uid || s.Rdev == null.Rdev {
 			continue
 		}

-		// We only change the uid owner (as it is possible for the mount to
+		// We only change the uid (as it is possible for the mount to
 		// prefer a different gid, and there's no reason for us to change it).
 		// The reason why we don't just leave the default uid=X mount setup is
 		// that users expect to be able to actually use their console. Without
 		// this code, you couldn't effectively run as a non-root user inside a
 		// container and also have a console set up.
-		if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
+		if err := file.Chown(u.Uid, int(s.Gid)); err != nil {
 			// If we've hit an EINVAL then s.Gid isn't mapped in the user
 			// namespace. If we've hit an EPERM then the inode's current owner
 			// is not mapped in our user namespace (in particular,
-			// privileged_wrt_inode_uidgid() has failed). In either case, we
-			// are in a configuration where it's better for us to just not
-			// touch the stdio rather than bail at this point.
+			// privileged_wrt_inode_uidgid() has failed). Read-only
+			// /dev can result in EROFS error. In any case, it's
+			// better for us to just not touch the stdio rather
+			// than bail at this point.

-			// nolint:errorlint // unix errors are bare
-			if err == unix.EINVAL || err == unix.EPERM {
+			if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.EPERM) || errors.Is(err, unix.EROFS) {
 				continue
 			}
-			return &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
+			return err
 		}
 	}
 	return nil
--- a/libcontainer/integration/checkpoint_test.go
+++ b/libcontainer/integration/checkpoint_test.go
@ -6,6 +6,7 @@ import (
 	"os"
 	"os/exec"
 	"path/filepath"
+	"regexp"
 	"strings"
 	"testing"

@ -61,6 +62,12 @@ func testCheckpoint(t *testing.T, userns bool) {
 		t.Skipf("criu binary not found: %v", err)
 	}

+	// Workaround for https://github.com/opencontainers/runc/issues/3532.
+	out, err := exec.Command("rpm", "-q", "criu").CombinedOutput()
+	if err == nil && regexp.MustCompile(`^criu-3\.17-[123]\.el9`).Match(out) {
+		t.Skip("Test requires criu >= 3.17-4 on CentOS Stream 9.")
+	}
+
 	config := newTemplateConfig(t, &tParam{userns: userns})
 	factory, err := libcontainer.New(t.TempDir())
 	ok(t, err)
--- a/libcontainer/integration/exec_test.go
+++ b/libcontainer/integration/exec_test.go
@ -18,6 +18,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/userns"
 	"github.com/opencontainers/runtime-spec/specs-go"

 	"golang.org/x/sys/unix"
@ -40,13 +41,7 @@ func testExecPS(t *testing.T, userns bool) {
 	}
 	config := newTemplateConfig(t, &tParam{userns: userns})

-	buffers, exitCode, err := runContainer(t, config, "ps", "-o", "pid,user,comm")
-	if err != nil {
-		t.Fatalf("%s: %s", buffers, err)
-	}
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
-	}
+	buffers := runContainerOk(t, config, "ps", "-o", "pid,user,comm")
 	lines := strings.Split(buffers.Stdout.String(), "\n")
 	if len(lines) < 2 {
 		t.Fatalf("more than one process running for output %q", buffers.Stdout.String())
@ -67,12 +62,7 @@ func TestIPCPrivate(t *testing.T) {
 	ok(t, err)

 	config := newTemplateConfig(t, nil)
-	buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/ipc")
-	ok(t, err)
-
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
-	}
+	buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/ipc")

 	if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
 		t.Fatalf("ipc link should be private to the container but equals host %q %q", actual, l)
@ -89,12 +79,7 @@ func TestIPCHost(t *testing.T) {

 	config := newTemplateConfig(t, nil)
 	config.Namespaces.Remove(configs.NEWIPC)
-	buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/ipc")
-	ok(t, err)
-
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
-	}
+	buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/ipc")

 	if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
 		t.Fatalf("ipc link not equal to host link %q %q", actual, l)
@ -111,13 +96,7 @@ func TestIPCJoinPath(t *testing.T) {

 	config := newTemplateConfig(t, nil)
 	config.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipc")
-
-	buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/ipc")
-	ok(t, err)
-
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
-	}
+	buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/ipc")

 	if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
 		t.Fatalf("ipc link not equal to host link %q %q", actual, l)
@ -163,8 +142,7 @@ func testRlimit(t *testing.T, userns bool) {
 		Cur: 1024,
 	}))

-	out, _, err := runContainer(t, config, "/bin/sh", "-c", "ulimit -n")
-	ok(t, err)
+	out := runContainerOk(t, config, "/bin/sh", "-c", "ulimit -n")
 	if limit := strings.TrimSpace(out.Stdout.String()); limit != "1025" {
 		t.Fatalf("expected rlimit to be 1025, got %s", limit)
 	}
@ -537,7 +515,7 @@ func testCpuShares(t *testing.T, systemd bool) {
 	config.Cgroups.Resources.CpuShares = 1

 	if _, _, err := runContainer(t, config, "ps"); err == nil {
-		t.Fatalf("runContainer should failed with invalid CpuShares")
+		t.Fatal("runContainer should fail with invalid CpuShares")
 	}
 }

@ -560,30 +538,20 @@ func testPids(t *testing.T, systemd bool) {
 	config := newTemplateConfig(t, &tParam{systemd: systemd})
 	config.Cgroups.Resources.PidsLimit = -1

-	// Running multiple processes.
-	_, ret, err := runContainer(t, config, "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true")
-	ok(t, err)
-
-	if ret != 0 {
-		t.Fatalf("expected fork() to succeed with no pids limit")
-	}
+	// Running multiple processes, expecting it to succeed with no pids limit.
+	_ = runContainerOk(t, config, "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true")

 	// Enforce a permissive limit. This needs to be fairly hand-wavey due to the
 	// issues with running Go binaries with pids restrictions (see below).
 	config.Cgroups.Resources.PidsLimit = 64
-	_, ret, err = runContainer(t, config, "/bin/sh", "-c", `
+	_ = runContainerOk(t, config, "/bin/sh", "-c", `
 	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
 	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
 	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
 	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true`)
-	ok(t, err)

-	if ret != 0 {
-		t.Fatalf("expected fork() to succeed with permissive pids limit")
-	}
-
-	// Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause this
-	// to fail reliability.
+	// Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause
+	// this to fail reliably.
 	config.Cgroups.Resources.PidsLimit = 64
 	out, _, err := runContainer(t, config, "/bin/sh", "-c", `
 	/bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true |
@ -933,13 +901,8 @@ func TestMountCgroupRO(t *testing.T) {
 		return
 	}
 	config := newTemplateConfig(t, nil)
-	buffers, exitCode, err := runContainer(t, config, "mount")
-	if err != nil {
-		t.Fatalf("%s: %s", buffers, err)
-	}
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
-	}
+	buffers := runContainerOk(t, config, "mount")
+
 	mountInfo := buffers.Stdout.String()
 	lines := strings.Split(mountInfo, "\n")
 	for _, l := range lines {
@ -980,13 +943,8 @@ func TestMountCgroupRW(t *testing.T) {
 		}
 	}

-	buffers, exitCode, err := runContainer(t, config, "mount")
-	if err != nil {
-		t.Fatalf("%s: %s", buffers, err)
-	}
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
-	}
+	buffers := runContainerOk(t, config, "mount")
+
 	mountInfo := buffers.Stdout.String()
 	lines := strings.Split(mountInfo, "\n")
 	for _, l := range lines {
@ -1197,11 +1155,7 @@ func TestSTDIOPermissions(t *testing.T) {
 	}

 	config := newTemplateConfig(t, nil)
-	buffers, exitCode, err := runContainer(t, config, "sh", "-c", "echo hi > /dev/stderr")
-	ok(t, err)
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
-	}
+	buffers := runContainerOk(t, config, "sh", "-c", "echo hi > /dev/stderr")

 	if actual := strings.Trim(buffers.Stderr.String(), "\n"); actual != "hi" {
 		t.Fatalf("stderr should equal be equal %q %q", actual, "hi")
@ -1444,12 +1398,7 @@ func TestPIDHost(t *testing.T) {

 	config := newTemplateConfig(t, nil)
 	config.Namespaces.Remove(configs.NEWPID)
-	buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/pid")
-	ok(t, err)
-
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
-	}
+	buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/pid")

 	if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
 		t.Fatalf("ipc link not equal to host link %q %q", actual, l)
@ -1640,6 +1589,11 @@ func TestInitJoinNetworkAndUser(t *testing.T) {
 	config2 := newTemplateConfig(t, &tParam{userns: true})
 	config2.Namespaces.Add(configs.NEWNET, netns1)
 	config2.Namespaces.Add(configs.NEWUSER, userns1)
+	// Emulate specconv.setupUserNamespace().
+	uidMap, gidMap, err := userns.GetUserNamespaceMappings(userns1)
+	ok(t, err)
+	config2.UidMappings = uidMap
+	config2.GidMappings = gidMap
 	config2.Cgroups.Path = "integration/test2"
 	container2, err := newContainer(t, config2)
 	ok(t, err)
@ -1738,12 +1692,7 @@ func TestCGROUPPrivate(t *testing.T) {

 	config := newTemplateConfig(t, nil)
 	config.Namespaces.Add(configs.NEWCGROUP, "")
-	buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/cgroup")
-	ok(t, err)
-
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
-	}
+	buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/cgroup")

 	if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l {
 		t.Fatalf("cgroup link should be private to the container but equals host %q %q", actual, l)
@ -1762,12 +1711,7 @@ func TestCGROUPHost(t *testing.T) {
 	ok(t, err)

 	config := newTemplateConfig(t, nil)
-	buffers, exitCode, err := runContainer(t, config, "readlink", "/proc/self/ns/cgroup")
-	ok(t, err)
-
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
-	}
+	buffers := runContainerOk(t, config, "readlink", "/proc/self/ns/cgroup")

 	if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l {
 		t.Fatalf("cgroup link not equal to host link %q %q", actual, l)
@ -1790,6 +1734,16 @@ func testFdLeaks(t *testing.T, systemd bool) {
 		return
 	}

+	config := newTemplateConfig(t, &tParam{systemd: systemd})
+	// Run a container once to exclude file descriptors that are only
+	// opened once during the process lifetime by the library and are
+	// never closed. Those are not considered leaks.
+	//
+	// Examples of these open-once file descriptors are:
+	//  - /sys/fs/cgroup dirfd opened by prepareOpenat2 in libct/cgroups;
+	//  - dbus connection opened by getConnection in libct/cgroups/systemd.
+	_ = runContainerOk(t, config, "true")
+
 	pfd, err := os.Open("/proc/self/fd")
 	ok(t, err)
 	defer pfd.Close()
@ -1798,13 +1752,7 @@ func testFdLeaks(t *testing.T, systemd bool) {
 	_, err = pfd.Seek(0, 0)
 	ok(t, err)

-	config := newTemplateConfig(t, &tParam{systemd: systemd})
-	buffers, exitCode, err := runContainer(t, config, "true")
-	ok(t, err)
-
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
-	}
+	_ = runContainerOk(t, config, "true")

 	fds1, err := pfd.Readdirnames(0)
 	ok(t, err)
@ -1815,7 +1763,6 @@ func testFdLeaks(t *testing.T, systemd bool) {
 	// Show the extra opened files.

 	excludedPaths := []string{
-		"/sys/fs/cgroup",      // opened once, see prepareOpenat2
 		"anon_inode:bpf-prog", // FIXME: see https://github.com/opencontainers/runc/issues/2366#issuecomment-776411392
 	}

--- a/libcontainer/integration/seccomp_test.go
+++ b/libcontainer/integration/seccomp_test.go
@ -13,7 +13,7 @@ import (
 	libseccomp "github.com/seccomp/libseccomp-golang"
 )

-func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
+func TestSeccompDenySyslogWithErrno(t *testing.T) {
 	if testing.Short() {
 		return
 	}
@ -25,7 +25,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
 		DefaultAction: configs.Allow,
 		Syscalls: []*configs.Syscall{
 			{
-				Name:     "getcwd",
+				Name:     "syslog",
 				Action:   configs.Errno,
 				ErrnoRet: &errnoRet,
 			},
@ -39,7 +39,7 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
 	buffers := newStdBuffers()
 	pwd := &libcontainer.Process{
 		Cwd:    "/",
-		Args:   []string{"pwd"},
+		Args:   []string{"dmesg"},
 		Env:    standardEnvironment,
 		Stdin:  buffers.Stdin,
 		Stdout: buffers.Stdout,
@ -65,17 +65,17 @@ func TestSeccompDenyGetcwdWithErrno(t *testing.T) {
 	}

 	if exitCode == 0 {
-		t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
+		t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode)
 	}

-	expected := "pwd: getcwd: No such process"
+	expected := "dmesg: klogctl: No such process"
 	actual := strings.Trim(buffers.Stderr.String(), "\n")
 	if actual != expected {
 		t.Fatalf("Expected output %s but got %s\n", expected, actual)
 	}
 }

-func TestSeccompDenyGetcwd(t *testing.T) {
+func TestSeccompDenySyslog(t *testing.T) {
 	if testing.Short() {
 		return
 	}
@ -85,7 +85,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
 		DefaultAction: configs.Allow,
 		Syscalls: []*configs.Syscall{
 			{
-				Name:   "getcwd",
+				Name:   "syslog",
 				Action: configs.Errno,
 			},
 		},
@ -98,7 +98,7 @@ func TestSeccompDenyGetcwd(t *testing.T) {
 	buffers := newStdBuffers()
 	pwd := &libcontainer.Process{
 		Cwd:    "/",
-		Args:   []string{"pwd"},
+		Args:   []string{"dmesg"},
 		Env:    standardEnvironment,
 		Stdin:  buffers.Stdin,
 		Stdout: buffers.Stdout,
@ -124,10 +124,10 @@ func TestSeccompDenyGetcwd(t *testing.T) {
 	}

 	if exitCode == 0 {
-		t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode)
+		t.Fatalf("dmesg should fail with negative exit code, instead got %d!", exitCode)
 	}

-	expected := "pwd: getcwd: Operation not permitted"
+	expected := "dmesg: klogctl: Operation not permitted"
 	actual := strings.Trim(buffers.Stderr.String(), "\n")
 	if actual != expected {
 		t.Fatalf("Expected output %s but got %s\n", expected, actual)
@ -282,13 +282,7 @@ func TestSeccompPermitWriteMultipleConditions(t *testing.T) {
 		},
 	}

-	buffers, exitCode, err := runContainer(t, config, "ls", "/")
-	if err != nil {
-		t.Fatalf("%s: %s", buffers, err)
-	}
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers)
-	}
+	buffers := runContainerOk(t, config, "ls", "/")
 	// We don't need to verify the actual thing printed
 	// Just that something was written to stdout
 	if len(buffers.Stdout.String()) == 0 {
@ -375,13 +369,7 @@ func TestSeccompMultipleConditionSameArgDeniesStdout(t *testing.T) {
 		},
 	}

-	buffers, exitCode, err := runContainer(t, config, "ls", "/")
-	if err != nil {
-		t.Fatalf("%s: %s", buffers, err)
-	}
-	if exitCode != 0 {
-		t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers)
-	}
+	buffers := runContainerOk(t, config, "ls", "/")
 	// Verify that nothing was printed
 	if len(buffers.Stdout.String()) != 0 {
 		t.Fatalf("Something was written to stdout, write call succeeded!\n")
--- a/libcontainer/integration/utils_test.go
+++ b/libcontainer/integration/utils_test.go
@ -216,6 +216,22 @@ func runContainer(t *testing.T, config *configs.Config, args ...string) (buffers
 	return
 }

+// runContainerOk is a wrapper for runContainer, simplifying its use for cases
+// when the run is expected to succeed and return exit code of 0.
+func runContainerOk(t *testing.T, config *configs.Config, args ...string) *stdBuffers {
+	buffers, exitCode, err := runContainer(t, config, args...)
+
+	t.Helper()
+	if err != nil {
+		t.Fatalf("%s: %s", buffers, err)
+	}
+	if exitCode != 0 {
+		t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr)
+	}
+
+	return buffers
+}
+
 func destroyContainer(container libcontainer.Container) {
 	_ = container.Destroy()
 }
--- a/libcontainer/intelrdt/intelrdt.go
+++ b/libcontainer/intelrdt/intelrdt.go
@ -1,11 +1,9 @@
 package intelrdt

 import (
-	"bufio"
 	"bytes"
 	"errors"
 	"fmt"
-	"io"
 	"os"
 	"path/filepath"
 	"strconv"
@ -13,6 +11,8 @@ import (
 	"sync"

 	"github.com/moby/sys/mountinfo"
+	"golang.org/x/sys/unix"
+
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
 )
@ -145,34 +145,31 @@ import (
 * }
 */

-type Manager interface {
-	// Applies Intel RDT configuration to the process with the specified pid
-	Apply(pid int) error
-
-	// Returns statistics for Intel RDT
-	GetStats() (*Stats, error)
-
-	// Destroys the Intel RDT container-specific 'container_id' group
-	Destroy() error
-
-	// Returns Intel RDT path to save in a state file and to be able to
-	// restore the object later
-	GetPath() string
-
-	// Set Intel RDT "resource control" filesystem as configured.
-	Set(container *configs.Config) error
-}
-
-// This implements interface Manager
-type intelRdtManager struct {
+type Manager struct {
 	mu     sync.Mutex
 	config *configs.Config
 	id     string
 	path   string
 }

-func NewManager(config *configs.Config, id string, path string) Manager {
-	return &intelRdtManager{
+// NewManager returns a new instance of Manager, or nil if the Intel RDT
+// functionality is not specified in the config, not available from the
+// hardware, or not enabled in the kernel.
+func NewManager(config *configs.Config, id string, path string) *Manager {
+	if config.IntelRdt == nil {
+		return nil
+	}
+	if _, err := Root(); err != nil {
+		// Intel RDT is not available.
+		return nil
+	}
+	return newManager(config, id, path)
+}
+
+// newManager is the same as NewManager, except it does not check if the feature
+// is actually available. Used by unit tests that mock intelrdt paths.
+func newManager(config *configs.Config, id string, path string) *Manager {
+	return &Manager{
 		config: config,
 		id:     id,
 		path:   path,
@ -188,71 +185,52 @@ var (
 	catEnabled bool
 	// The flag to indicate if Intel RDT/MBA is enabled
 	mbaEnabled bool
-	// The flag to indicate if Intel RDT/MBA Software Controller is enabled
-	mbaScEnabled bool

 	// For Intel RDT initialization
 	initOnce sync.Once

-	errNotFound = errors.New("Intel RDT resctrl mount point not found")
+	errNotFound = errors.New("Intel RDT not available")
 )

 // Check if Intel RDT sub-features are enabled in featuresInit()
 func featuresInit() {
 	initOnce.Do(func() {
-		// 1. Check if hardware and kernel support Intel RDT sub-features
-		flagsSet, err := parseCpuInfoFile("/proc/cpuinfo")
-		if err != nil {
-			return
-		}
-
-		// 2. Check if Intel RDT "resource control" filesystem is available.
+		// 1. Check if Intel RDT "resource control" filesystem is available.
 		// The user guarantees to mount the filesystem.
 		root, err := Root()
 		if err != nil {
 			return
 		}

-		// 3. Double check if Intel RDT sub-features are available in
-		// "resource control" filesystem. Intel RDT sub-features can be
+		// 2. Check if Intel RDT sub-features are available in "resource
+		// control" filesystem. Intel RDT sub-features can be
 		// selectively disabled or enabled by kernel command line
 		// (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel
-		if flagsSet.CAT {
-			if _, err := os.Stat(filepath.Join(root, "info", "L3")); err == nil {
-				catEnabled = true
-			}
+		if _, err := os.Stat(filepath.Join(root, "info", "L3")); err == nil {
+			catEnabled = true
 		}
-		if mbaScEnabled {
-			// We confirm MBA Software Controller is enabled in step 2,
-			// MBA should be enabled because MBA Software Controller
-			// depends on MBA
+		if _, err := os.Stat(filepath.Join(root, "info", "MB")); err == nil {
 			mbaEnabled = true
-		} else if flagsSet.MBA {
-			if _, err := os.Stat(filepath.Join(root, "info", "MB")); err == nil {
-				mbaEnabled = true
-			}
 		}
-		if flagsSet.MBMTotal || flagsSet.MBMLocal || flagsSet.CMT {
-			if _, err := os.Stat(filepath.Join(root, "info", "L3_MON")); err != nil {
-				return
-			}
-			enabledMonFeatures, err = getMonFeatures(root)
-			if err != nil {
-				return
-			}
-			if enabledMonFeatures.mbmTotalBytes || enabledMonFeatures.mbmLocalBytes {
-				mbmEnabled = true
-			}
-			if enabledMonFeatures.llcOccupancy {
-				cmtEnabled = true
-			}
+		if _, err := os.Stat(filepath.Join(root, "info", "L3_MON")); err != nil {
+			return
+		}
+		enabledMonFeatures, err = getMonFeatures(root)
+		if err != nil {
+			return
+		}
+		if enabledMonFeatures.mbmTotalBytes || enabledMonFeatures.mbmLocalBytes {
+			mbmEnabled = true
+		}
+		if enabledMonFeatures.llcOccupancy {
+			cmtEnabled = true
 		}
 	})
 }

-// Return the mount point path of Intel RDT "resource control" filesysem
-func findIntelRdtMountpointDir(f io.Reader) (string, error) {
-	mi, err := mountinfo.GetMountsFromReader(f, func(m *mountinfo.Info) (bool, bool) {
+// findIntelRdtMountpointDir returns the mount point of the Intel RDT "resource control" filesystem.
+func findIntelRdtMountpointDir() (string, error) {
+	mi, err := mountinfo.GetMounts(func(m *mountinfo.Info) (bool, bool) {
 		// similar to mountinfo.FSTypeFilter but stops after the first match
 		if m.FSType == "resctrl" {
 			return false, true // don't skip, stop
@ -266,97 +244,45 @@ func findIntelRdtMountpointDir(f io.Reader) (string, error) {
 		return "", errNotFound
 	}

-	// Check if MBA Software Controller is enabled through mount option "-o mba_MBps"
-	if strings.Contains(","+mi[0].VFSOptions+",", ",mba_MBps,") {
-		mbaScEnabled = true
-	}
-
 	return mi[0].Mountpoint, nil
 }

 // For Root() use only.
 var (
-	intelRdtRoot string
-	rootMu       sync.Mutex
+	intelRdtRoot    string
+	intelRdtRootErr error
+	rootOnce        sync.Once
 )

+// The kernel creates this (empty) directory if resctrl is supported by the
+// hardware and kernel. The user is responsible for mounting the resctrl
+// filesystem, and they could mount it somewhere else if they wanted to.
+const defaultResctrlMountpoint = "/sys/fs/resctrl"
+
 // Root returns the Intel RDT "resource control" filesystem mount point.
 func Root() (string, error) {
-	rootMu.Lock()
-	defer rootMu.Unlock()
-
-	if intelRdtRoot != "" {
-		return intelRdtRoot, nil
-	}
-
-	f, err := os.Open("/proc/self/mountinfo")
-	if err != nil {
-		return "", err
-	}
-	root, err := findIntelRdtMountpointDir(f)
-	f.Close()
-	if err != nil {
-		return "", err
-	}
-
-	if _, err := os.Stat(root); err != nil {
-		return "", err
-	}
-
-	intelRdtRoot = root
-	return intelRdtRoot, nil
-}
-
-type cpuInfoFlags struct {
-	CAT bool // Cache Allocation Technology
-	MBA bool // Memory Bandwidth Allocation
-
-	// Memory Bandwidth Monitoring related.
-	MBMTotal bool
-	MBMLocal bool
-
-	CMT bool // Cache Monitoring Technology
-}
-
-func parseCpuInfoFile(path string) (cpuInfoFlags, error) {
-	infoFlags := cpuInfoFlags{}
-
-	f, err := os.Open(path)
-	if err != nil {
-		return infoFlags, err
-	}
-	defer f.Close()
-
-	s := bufio.NewScanner(f)
-	for s.Scan() {
-		line := s.Text()
-
-		// Search "cat_l3" and "mba" flags in first "flags" line
-		if strings.HasPrefix(line, "flags") {
-			flags := strings.Split(line, " ")
-			// "cat_l3" flag for CAT and "mba" flag for MBA
-			for _, flag := range flags {
-				switch flag {
-				case "cat_l3":
-					infoFlags.CAT = true
-				case "mba":
-					infoFlags.MBA = true
-				case "cqm_mbm_total":
-					infoFlags.MBMTotal = true
-				case "cqm_mbm_local":
-					infoFlags.MBMLocal = true
-				case "cqm_occup_llc":
-					infoFlags.CMT = true
-				}
+	rootOnce.Do(func() {
+		// Does this system support resctrl?
+		var statfs unix.Statfs_t
+		if err := unix.Statfs(defaultResctrlMountpoint, &statfs); err != nil {
+			if errors.Is(err, unix.ENOENT) {
+				err = errNotFound
 			}
-			return infoFlags, nil
+			intelRdtRootErr = err
+			return
 		}
-	}
-	if err := s.Err(); err != nil {
-		return infoFlags, err
-	}

-	return infoFlags, nil
+		// Has the resctrl fs been mounted to the default mount point?
+		if statfs.Type == unix.RDTGROUP_SUPER_MAGIC {
+			intelRdtRoot = defaultResctrlMountpoint
+			return
+		}
+
+		// The resctrl fs could have been mounted somewhere nonstandard.
+		intelRdtRoot, intelRdtRootErr = findIntelRdtMountpointDir()
+	})
+
+	return intelRdtRoot, intelRdtRootErr
 }

 // Gets a single uint64 value from the specified file.
@ -502,14 +428,8 @@ func IsMBAEnabled() bool {
 	return mbaEnabled
 }

-// Check if Intel RDT/MBA Software Controller is enabled
-func IsMBAScEnabled() bool {
-	featuresInit()
-	return mbaScEnabled
-}
-
 // Get the path of the clos group in "resource control" filesystem that the container belongs to
-func (m *intelRdtManager) getIntelRdtPath() (string, error) {
+func (m *Manager) getIntelRdtPath() (string, error) {
 	rootPath, err := Root()
 	if err != nil {
 		return "", err
@ -524,7 +444,7 @@ func (m *intelRdtManager) getIntelRdtPath() (string, error) {
 }

 // Applies Intel RDT configuration to the process with the specified pid
-func (m *intelRdtManager) Apply(pid int) (err error) {
+func (m *Manager) Apply(pid int) (err error) {
 	// If intelRdt is not specified in config, we do nothing
 	if m.config.IntelRdt == nil {
 		return nil
@ -559,11 +479,11 @@ func (m *intelRdtManager) Apply(pid int) (err error) {
 }

 // Destroys the Intel RDT container-specific 'container_id' group
-func (m *intelRdtManager) Destroy() error {
+func (m *Manager) Destroy() error {
 	// Don't remove resctrl group if closid has been explicitly specified. The
 	// group is likely externally managed, i.e. by some other entity than us.
 	// There are probably other containers/tasks sharing the same group.
-	if m.config.IntelRdt == nil || m.config.IntelRdt.ClosID == "" {
+	if m.config.IntelRdt != nil && m.config.IntelRdt.ClosID == "" {
 		m.mu.Lock()
 		defer m.mu.Unlock()
 		if err := os.RemoveAll(m.GetPath()); err != nil {
@ -576,7 +496,7 @@ func (m *intelRdtManager) Destroy() error {

 // Returns Intel RDT path to save in a state file and to be able to
 // restore the object later
-func (m *intelRdtManager) GetPath() string {
+func (m *Manager) GetPath() string {
 	if m.path == "" {
 		m.path, _ = m.getIntelRdtPath()
 	}
@ -584,7 +504,7 @@ func (m *intelRdtManager) GetPath() string {
 }

 // Returns statistics for Intel RDT
-func (m *intelRdtManager) GetStats() (*Stats, error) {
+func (m *Manager) GetStats() (*Stats, error) {
 	// If intelRdt is not specified in config
 	if m.config.IntelRdt == nil {
 		return nil, nil
@ -670,7 +590,7 @@ func (m *intelRdtManager) GetStats() (*Stats, error) {
 }

 // Set Intel RDT "resource control" filesystem as configured.
-func (m *intelRdtManager) Set(container *configs.Config) error {
+func (m *Manager) Set(container *configs.Config) error {
 	// About L3 cache schema:
 	// It has allocation bitmasks/values for L3 cache on each socket,
 	// which contains L3 cache id and capacity bitmask (CBM).
--- a/libcontainer/intelrdt/intelrdt_test.go
+++ b/libcontainer/intelrdt/intelrdt_test.go
@ -1,8 +1,6 @@
 package intelrdt

 import (
-	"errors"
-	"io"
 	"os"
 	"path/filepath"
 	"strings"
@ -22,7 +20,7 @@ func TestIntelRdtSetL3CacheSchema(t *testing.T) {
 	})

 	helper.config.IntelRdt.L3CacheSchema = l3CacheSchemeAfter
-	intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
+	intelrdt := newManager(helper.config, "", helper.IntelRdtPath)
 	if err := intelrdt.Set(helper.config); err != nil {
 		t.Fatal(err)
 	}
@ -52,7 +50,7 @@ func TestIntelRdtSetMemBwSchema(t *testing.T) {
 	})

 	helper.config.IntelRdt.MemBwSchema = memBwSchemeAfter
-	intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
+	intelrdt := newManager(helper.config, "", helper.IntelRdtPath)
 	if err := intelrdt.Set(helper.config); err != nil {
 		t.Fatal(err)
 	}
@ -82,7 +80,7 @@ func TestIntelRdtSetMemBwScSchema(t *testing.T) {
 	})

 	helper.config.IntelRdt.MemBwSchema = memBwScSchemeAfter
-	intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
+	intelrdt := newManager(helper.config, "", helper.IntelRdtPath)
 	if err := intelrdt.Set(helper.config); err != nil {
 		t.Fatal(err)
 	}
@ -105,7 +103,7 @@ func TestApply(t *testing.T) {
 	const closID = "test-clos"

 	helper.config.IntelRdt.ClosID = closID
-	intelrdt := NewManager(helper.config, "", helper.IntelRdtPath)
+	intelrdt := newManager(helper.config, "", helper.IntelRdtPath)
 	if err := intelrdt.Apply(1234); err == nil {
 		t.Fatal("unexpected success when applying pid")
 	}
@ -114,7 +112,7 @@ func TestApply(t *testing.T) {
 	}

 	// Dir should be created if some schema has been specified
-	intelrdt.(*intelRdtManager).config.IntelRdt.L3CacheSchema = "L3:0=f"
+	intelrdt.config.IntelRdt.L3CacheSchema = "L3:0=f"
 	if err := intelrdt.Apply(1235); err != nil {
 		t.Fatalf("Apply() failed: %v", err)
 	}
@ -127,141 +125,3 @@ func TestApply(t *testing.T) {
 		t.Fatalf("unexpected tasks file, expected '1235', got %q", pids)
 	}
 }
-
-const (
-	mountinfoValid = `18 40 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw
-19 40 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
-20 40 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=131927256k,nr_inodes=32981814,mode=755
-21 18 0:17 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
-22 20 0:19 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw
-23 20 0:12 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,gid=5,mode=620,ptmxmode=000
-24 40 0:20 / /run rw,nosuid,nodev shared:22 - tmpfs tmpfs rw,mode=755
-25 18 0:21 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,mode=755
-26 25 0:22 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
-27 18 0:23 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw
-28 25 0:24 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,perf_event
-29 25 0:25 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuacct,cpu
-30 25 0:26 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,memory
-31 25 0:27 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,devices
-32 25 0:28 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,hugetlb
-33 25 0:29 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,blkio
-34 25 0:30 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,pids
-35 25 0:31 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset
-36 25 0:32 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,freezer
-37 25 0:33 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,net_prio,net_cls
-38 18 0:34 / /sys/kernel/config rw,relatime shared:21 - configfs configfs rw
-40 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
-16 18 0:6 / /sys/kernel/debug rw,relatime shared:23 - debugfs debugfs rw
-41 18 0:16 / /sys/fs/resctrl rw,relatime shared:24 - resctrl resctrl rw
-42 20 0:36 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw
-43 19 0:37 / /proc/sys/fs/binfmt_misc rw,relatime shared:26 - autofs systemd-1 rw,fd=32,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=35492
-44 20 0:15 / /dev/mqueue rw,relatime shared:27 - mqueue mqueue rw
-45 40 8:1 / /boot rw,relatime shared:28 - ext4 /dev/sda1 rw,stripe=4,data=ordered
-46 40 253:1 / /home rw,relatime shared:29 - ext4 /dev/mapper/vvhg-vvhg rw,data=ordered
-47 40 0:38 / /var/lib/nfs/rpc_pipefs rw,relatime shared:30 - rpc_pipefs sunrpc rw
-125 24 0:20 /mesos/containers /run/mesos/containers rw,nosuid shared:22 - tmpfs tmpfs rw,mode=755
-123 40 253:0 /var/lib/docker/containers /var/lib/docker/containers rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
-129 40 253:0 /var/lib/docker/overlay2 /var/lib/docker/overlay2 rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
-119 24 0:39 / /run/user/1009 rw,nosuid,nodev,relatime shared:100 - tmpfs tmpfs rw,size=26387788k,mode=700,uid=1009,gid=1009`
-
-	mountinfoMbaSc = `18 40 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw
-19 40 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw
-20 40 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=131927256k,nr_inodes=32981814,mode=755
-21 18 0:17 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw
-22 20 0:19 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw
-23 20 0:12 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,gid=5,mode=620,ptmxmode=000
-24 40 0:20 / /run rw,nosuid,nodev shared:22 - tmpfs tmpfs rw,mode=755
-25 18 0:21 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,mode=755
-26 25 0:22 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd
-27 18 0:23 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw
-28 25 0:24 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,perf_event
-29 25 0:25 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuacct,cpu
-30 25 0:26 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,memory
-31 25 0:27 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,devices
-32 25 0:28 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,hugetlb
-33 25 0:29 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,blkio
-34 25 0:30 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,pids
-35 25 0:31 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset
-36 25 0:32 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,freezer
-37 25 0:33 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,net_prio,net_cls
-38 18 0:34 / /sys/kernel/config rw,relatime shared:21 - configfs configfs rw
-40 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
-16 18 0:6 / /sys/kernel/debug rw,relatime shared:23 - debugfs debugfs rw
-41 18 0:16 / /sys/fs/resctrl rw,relatime shared:24 - resctrl resctrl rw,mba_MBps
-42 20 0:36 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw
-43 19 0:37 / /proc/sys/fs/binfmt_misc rw,relatime shared:26 - autofs systemd-1 rw,fd=32,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=35492
-44 20 0:15 / /dev/mqueue rw,relatime shared:27 - mqueue mqueue rw
-45 40 8:1 / /boot rw,relatime shared:28 - ext4 /dev/sda1 rw,stripe=4,data=ordered
-46 40 253:1 / /home rw,relatime shared:29 - ext4 /dev/mapper/vvhg-vvhg rw,data=ordered
-47 40 0:38 / /var/lib/nfs/rpc_pipefs rw,relatime shared:30 - rpc_pipefs sunrpc rw
-125 24 0:20 /mesos/containers /run/mesos/containers rw,nosuid shared:22 - tmpfs tmpfs rw,mode=755
-123 40 253:0 /var/lib/docker/containers /var/lib/docker/containers rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
-129 40 253:0 /var/lib/docker/overlay2 /var/lib/docker/overlay2 rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered
-119 24 0:39 / /run/user/1009 rw,nosuid,nodev,relatime shared:100 - tmpfs tmpfs rw,size=26387788k,mode=700,uid=1009,gid=1009`
-)
-
-func TestFindIntelRdtMountpointDir(t *testing.T) {
-	testCases := []struct {
-		name            string
-		input           io.Reader
-		isNotFoundError bool
-		isError         bool
-		mbaScEnabled    bool
-		mountpoint      string
-	}{
-		{
-			name:       "Valid mountinfo with MBA Software Controller disabled",
-			input:      strings.NewReader(mountinfoValid),
-			mountpoint: "/sys/fs/resctrl",
-		},
-		{
-			name:         "Valid mountinfo with MBA Software Controller enabled",
-			input:        strings.NewReader(mountinfoMbaSc),
-			mbaScEnabled: true,
-			mountpoint:   "/sys/fs/resctrl",
-		},
-		{
-			name:            "Empty mountinfo",
-			input:           strings.NewReader(""),
-			isNotFoundError: true,
-		},
-		{
-			name:    "Broken mountinfo",
-			input:   strings.NewReader("baa"),
-			isError: true,
-		},
-	}
-
-	for _, tc := range testCases {
-		tc := tc
-		t.Run(tc.name, func(t *testing.T) {
-			mbaScEnabled = false
-			mp, err := findIntelRdtMountpointDir(tc.input)
-			if tc.isNotFoundError {
-				if !errors.Is(err, errNotFound) {
-					t.Errorf("expected errNotFound error, got %+v", err)
-				}
-				return
-			}
-			if tc.isError {
-				if err == nil {
-					t.Error("expected error, got nil")
-				}
-				return
-			}
-			if err != nil {
-				t.Errorf("expected nil, got %+v", err)
-				return
-			}
-			// no errors, check the results
-			if tc.mbaScEnabled != mbaScEnabled {
-				t.Errorf("expected mbaScEnabled=%v, got %v",
-					tc.mbaScEnabled, mbaScEnabled)
-			}
-			if tc.mountpoint != mp {
-				t.Errorf("expected mountpoint=%q, got %q",
-					tc.mountpoint, mp)
-			}
-		})
-	}
-}
--- a/libcontainer/intelrdt/util_test.go
+++ b/libcontainer/intelrdt/util_test.go
@ -26,7 +26,12 @@ func NewIntelRdtTestUtil(t *testing.T) *intelRdtTestUtil {
 	config := &configs.Config{
 		IntelRdt: &configs.IntelRdt{},
 	}
+
+	// Assign fake intelRdtRoot value, returned by Root().
 	intelRdtRoot = t.TempDir()
+	// Make sure Root() won't even try to parse mountinfo.
+	rootOnce.Do(func() {})
+
 	testIntelRdtPath := filepath.Join(intelRdtRoot, "resctrl")

 	// Ensure the full mock Intel RDT "resource control" filesystem path exists
--- a/libcontainer/mount_linux.go
+++ b/libcontainer/mount_linux.go
@ -1,6 +1,7 @@
 package libcontainer

 import (
+	"io/fs"
 	"strconv"

 	"golang.org/x/sys/unix"
@ -81,3 +82,20 @@ func unmount(target string, flags int) error {
 	}
 	return nil
 }
+
+// syscallMode returns the syscall-specific mode bits from Go's portable mode bits.
+// Copied from https://cs.opensource.google/go/go/+/refs/tags/go1.20.7:src/os/file_posix.go;l=61-75
+func syscallMode(i fs.FileMode) (o uint32) {
+	o |= uint32(i.Perm())
+	if i&fs.ModeSetuid != 0 {
+		o |= unix.S_ISUID
+	}
+	if i&fs.ModeSetgid != 0 {
+		o |= unix.S_ISGID
+	}
+	if i&fs.ModeSticky != 0 {
+		o |= unix.S_ISVTX
+	}
+	// No mapping for Go's ModeTemporary (plan9 only).
+	return
+}
--- a/libcontainer/nsenter/cloned_binary.c
+++ b/libcontainer/nsenter/cloned_binary.c
@ -151,7 +151,7 @@ static int is_self_cloned(void)
 	 * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
 	 * this, because you cannot write to a sealed memfd no matter what (so
 	 * sharing it isn't a bad thing -- and an admin could bind-mount a sealed
-	 * memfd to /usr/bin/runc to allow re-use).
+	 * memfd to /usr/bin/runc to allow reuse).
 	 */
 	ret = fcntl(fd, F_GET_SEALS);
 	if (ret >= 0) {
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@ -168,15 +168,17 @@ static void write_log(int level, const char *format, ...)

 	message = escape_json_string(message);

-	if (current_stage == STAGE_SETUP)
+	if (current_stage == STAGE_SETUP) {
 		stage = strdup("nsexec");
-	else
+		if (stage == NULL)
+			goto out;
+	} else {
 		ret = asprintf(&stage, "nsexec-%d", current_stage);
-	if (ret < 0) {
-		stage = NULL;
-		goto out;
+		if (ret < 0) {
+			stage = NULL;
+			goto out;
+		}
 	}
-
 	ret = asprintf(&json, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n",
 		       level_str[level], stage, getpid(), message);
 	if (ret < 0) {
@ -416,11 +418,9 @@ static int getenv_int(const char *name)
 	if (val == endptr || *endptr != '\0')
 		bail("unable to parse %s=%s", name, val);
 	/*
-	 * Sanity check: this must be a small non-negative number.
-	 * Practically, we pass two fds (3 and 4) and a log level,
-	 * for which the maximum is 6 (TRACE).
-	 * */
-	if (ret < 0 || ret > TRACE)
+	 * Sanity check: this must be a non-negative number.
+	 */
+	if (ret < 0)
 		bail("bad value for %s=%s (%d)", name, val, ret);

 	return ret;
@ -832,6 +832,25 @@ void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mount
 		bail("failed to close container mount namespace fd %d", container_mntns_fd);
 }

+void try_unshare(int flags, const char *msg)
+{
+	write_log(DEBUG, "unshare %s", msg);
+	/*
+	 * Kernels prior to v4.3 may return EINVAL on unshare when another process
+	 * reads runc's /proc/$PID/status or /proc/$PID/maps. To work around this,
+	 * retry on EINVAL a few times.
+	 */
+	int retries = 5;
+	for (; retries > 0; retries--) {
+		if (unshare(flags) == 0) {
+			return;
+		}
+		if (errno != EINVAL)
+			break;
+	}
+	bail("failed to unshare %s", msg);
+}
+
 void nsexec(void)
 {
 	int pipenum;
@ -1070,7 +1089,7 @@ void nsexec(void)

 					s = SYNC_MOUNTSOURCES_ACK;
 					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-						kill(stage1_pid, SIGKILL);
+						sane_kill(stage1_pid, SIGKILL);
 						bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)");
 					}
 					break;
@ -1170,9 +1189,7 @@ void nsexec(void)
 			 * problem.
 			 */
 			if (config.cloneflags & CLONE_NEWUSER) {
-				write_log(DEBUG, "unshare user namespace");
-				if (unshare(CLONE_NEWUSER) < 0)
-					bail("failed to unshare user namespace");
+				try_unshare(CLONE_NEWUSER, "user namespace");
 				config.cloneflags &= ~CLONE_NEWUSER;

 				/*
@ -1224,15 +1241,13 @@ void nsexec(void)
 			 * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
 			 * was broken, so we'll just do it the long way anyway.
 			 */
-			write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
-			if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
-				bail("failed to unshare remaining namespaces (except cgroupns)");
+			try_unshare(config.cloneflags & ~CLONE_NEWCGROUP, "remaining namespaces (except cgroupns)");

 			/* Ask our parent to send the mount sources fds. */
 			if (config.mountsources) {
 				s = SYNC_MOUNTSOURCES_PLS;
 				if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-					kill(stage2_pid, SIGKILL);
+					sane_kill(stage2_pid, SIGKILL);
 					bail("failed to sync with parent: write(SYNC_MOUNTSOURCES_PLS)");
 				}

@ -1241,11 +1256,11 @@ void nsexec(void)

 				/* Parent finished to send the mount sources fds. */
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
-					kill(stage2_pid, SIGKILL);
+					sane_kill(stage2_pid, SIGKILL);
 					bail("failed to sync with parent: read(SYNC_MOUNTSOURCES_ACK)");
 				}
 				if (s != SYNC_MOUNTSOURCES_ACK) {
-					kill(stage2_pid, SIGKILL);
+					sane_kill(stage2_pid, SIGKILL);
 					bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
 				}
 			}
@ -1344,8 +1359,7 @@ void nsexec(void)
 			}

 			if (config.cloneflags & CLONE_NEWCGROUP) {
-				if (unshare(CLONE_NEWCGROUP) < 0)
-					bail("failed to unshare cgroup namespace");
+				try_unshare(CLONE_NEWCGROUP, "cgroup namespace");
 			}

 			write_log(DEBUG, "signal completion to stage-0");
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@ -39,13 +39,9 @@ type parentProcess interface {

 	// startTime returns the process start time.
 	startTime() (uint64, error)
-
 	signal(os.Signal) error
-
 	externalDescriptors() []string
-
 	setExternalDescriptors(fds []string)
-
 	forwardChildLogs() chan error
 }

@ -303,7 +299,7 @@ type initProcess struct {
 	logFilePair     filePair
 	config          *initConfig
 	manager         cgroups.Manager
-	intelRdtManager intelrdt.Manager
+	intelRdtManager *intelrdt.Manager
 	container       *linuxContainer
 	fds             []string
 	process         *Process
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@ -80,6 +80,8 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds []int) (err
 		// Therefore, we can access mountFds[i] without any concerns.
 		if mountFds != nil && mountFds[i] != -1 {
 			mountConfig.fd = &mountFds[i]
+		} else {
+			mountConfig.fd = nil
 		}

 		if err := mountToRootfs(m, mountConfig); err != nil {
@ -327,26 +329,41 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
 	if err := os.MkdirAll(dest, 0o755); err != nil {
 		return err
 	}
-	return utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
-		if err := mount(m.Source, m.Destination, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
-			// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
-			if errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY) {
-				src := fs2.UnifiedMountpoint
-				if c.cgroupns && c.cgroup2Path != "" {
-					// Emulate cgroupns by bind-mounting
-					// the container cgroup path rather than
-					// the whole /sys/fs/cgroup.
-					src = c.cgroup2Path
-				}
-				err = mount(src, m.Destination, procfd, "", uintptr(m.Flags)|unix.MS_BIND, "")
-				if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
-					err = nil
-				}
-			}
-			return err
-		}
-		return nil
+	err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
+		return mount(m.Source, m.Destination, procfd, "cgroup2", uintptr(m.Flags), m.Data)
 	})
+	if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) {
+		return err
+	}
+
+	// When we are in UserNS but CgroupNS is not unshared, we cannot mount
+	// cgroup2 (#2158), so fall back to bind mount.
+	bindM := &configs.Mount{
+		Device:           "bind",
+		Source:           fs2.UnifiedMountpoint,
+		Destination:      m.Destination,
+		Flags:            unix.MS_BIND | m.Flags,
+		PropagationFlags: m.PropagationFlags,
+	}
+	if c.cgroupns && c.cgroup2Path != "" {
+		// Emulate cgroupns by bind-mounting the container cgroup path
+		// rather than the whole /sys/fs/cgroup.
+		bindM.Source = c.cgroup2Path
+	}
+	// mountToRootfs() handles remounting for MS_RDONLY.
+	// No need to set c.fd here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate().
+	err = mountToRootfs(bindM, c)
+	if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
+		// ENOENT (for `bindM.Source = c.cgroup2Path`) happens when rootless runc is being executed
+		// outside the userns+mountns.
+		//
+		// Mask `/sys/fs/cgroup` to ensure it is read-only, even when `/sys` is mounted
+		// with `rbind,ro` (`runc spec --rootless` produces `rbind,ro` for `/sys`).
+		err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
+			return maskPath(procfd, c.label)
+		})
+	}
+	return err
 }

 func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
@ -396,6 +413,35 @@ func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {

 func mountToRootfs(m *configs.Mount, c *mountConfig) error {
 	rootfs := c.root
+
+	// procfs and sysfs are special because we need to ensure they are actually
+	// mounted on a specific path in a container without any funny business.
+	switch m.Device {
+	case "proc", "sysfs":
+		// If the destination already exists and is not a directory, we bail
+		// out. This is to avoid mounting through a symlink or similar -- which
+		// has been a "fun" attack scenario in the past.
+		// TODO: This won't be necessary once we switch to libpathrs and we can
+		//       stop all of these symlink-exchange attacks.
+		dest := filepath.Clean(m.Destination)
+		if !strings.HasPrefix(dest, rootfs) {
+			// Do not use securejoin as it resolves symlinks.
+			dest = filepath.Join(rootfs, dest)
+		}
+		if fi, err := os.Lstat(dest); err != nil {
+			if !os.IsNotExist(err) {
+				return err
+			}
+		} else if !fi.IsDir() {
+			return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device)
+		}
+		if err := os.MkdirAll(dest, 0o755); err != nil {
+			return err
+		}
+		// SELinux kernels do not support labeling of /proc or /sys.
+		return mountPropagate(m, rootfs, "", nil)
+	}
+
 	mountLabel := c.label
 	mountFd := c.fd
 	dest, err := securejoin.SecureJoin(rootfs, m.Destination)
@ -404,24 +450,6 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
 	}

 	switch m.Device {
-	case "proc", "sysfs":
-		// If the destination already exists and is not a directory, we bail
-		// out This is to avoid mounting through a symlink or similar -- which
-		// has been a "fun" attack scenario in the past.
-		// TODO: This won't be necessary once we switch to libpathrs and we can
-		//       stop all of these symlink-exchange attacks.
-		if fi, err := os.Lstat(dest); err != nil {
-			if !os.IsNotExist(err) {
-				return err
-			}
-		} else if fi.Mode()&os.ModeDir == 0 {
-			return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device)
-		}
-		if err := os.MkdirAll(dest, 0o755); err != nil {
-			return err
-		}
-		// Selinux kernels do not support labeling of /proc or /sys
-		return mountPropagate(m, rootfs, "", nil)
 	case "mqueue":
 		if err := os.MkdirAll(dest, 0o755); err != nil {
 			return err
@ -431,11 +459,16 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
 		}
 		return label.SetFileLabel(dest, mountLabel)
 	case "tmpfs":
-		stat, err := os.Stat(dest)
-		if err != nil {
+		if stat, err := os.Stat(dest); err != nil {
 			if err := os.MkdirAll(dest, 0o755); err != nil {
 				return err
 			}
+		} else {
+			dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode()))
+			if m.Data != "" {
+				dt = dt + "," + m.Data
+			}
+			m.Data = dt
 		}

 		if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
@ -444,16 +477,7 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
 			err = mountPropagate(m, rootfs, mountLabel, nil)
 		}

-		if err != nil {
-			return err
-		}
-
-		if stat != nil {
-			if err = os.Chmod(dest, stat.Mode()); err != nil {
-				return err
-			}
-		}
-		return nil
+		return err
 	case "bind":
 		if err := prepareBindMount(m, rootfs, mountFd); err != nil {
 			return err
@ -577,6 +601,7 @@ func checkProcMount(rootfs, dest, source string) error {
 		"/proc/loadavg",
 		"/proc/slabinfo",
 		"/proc/net/dev",
+		"/proc/sys/kernel/ns_last_pid",
 	}
 	for _, valid := range validProcMounts {
 		path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
--- a/libcontainer/rootfs_linux_test.go
+++ b/libcontainer/rootfs_linux_test.go
@ -38,6 +38,14 @@ func TestCheckMountDestFalsePositive(t *testing.T) {
 	}
 }

+func TestCheckMountDestNsLastPid(t *testing.T) {
+	dest := "/rootfs/proc/sys/kernel/ns_last_pid"
+	err := checkProcMount("/rootfs", dest, "/proc")
+	if err != nil {
+		t.Fatal("/proc/sys/kernel/ns_last_pid should not return an error")
+	}
+}
+
 func TestNeedsSetupDev(t *testing.T) {
 	config := &configs.Config{
 		Mounts: []*configs.Mount{
--- a/libcontainer/seccomp/config.go
+++ b/libcontainer/seccomp/config.go
@ -29,13 +29,15 @@ func KnownOperators() []string {
 }

 var actions = map[string]configs.Action{
-	"SCMP_ACT_KILL":   configs.Kill,
-	"SCMP_ACT_ERRNO":  configs.Errno,
-	"SCMP_ACT_TRAP":   configs.Trap,
-	"SCMP_ACT_ALLOW":  configs.Allow,
-	"SCMP_ACT_TRACE":  configs.Trace,
-	"SCMP_ACT_LOG":    configs.Log,
-	"SCMP_ACT_NOTIFY": configs.Notify,
+	"SCMP_ACT_KILL":         configs.Kill,
+	"SCMP_ACT_ERRNO":        configs.Errno,
+	"SCMP_ACT_TRAP":         configs.Trap,
+	"SCMP_ACT_ALLOW":        configs.Allow,
+	"SCMP_ACT_TRACE":        configs.Trace,
+	"SCMP_ACT_LOG":          configs.Log,
+	"SCMP_ACT_NOTIFY":       configs.Notify,
+	"SCMP_ACT_KILL_THREAD":  configs.KillThread,
+	"SCMP_ACT_KILL_PROCESS": configs.KillProcess,
 }

 // KnownActions returns the list of the known actions.
@ -64,6 +66,7 @@ var archs = map[string]string{
 	"SCMP_ARCH_PPC":         "ppc",
 	"SCMP_ARCH_PPC64":       "ppc64",
 	"SCMP_ARCH_PPC64LE":     "ppc64le",
+	"SCMP_ARCH_RISCV64":     "riscv64",
 	"SCMP_ARCH_S390":        "s390",
 	"SCMP_ARCH_S390X":       "s390x",
 }
--- a/libcontainer/seccomp/patchbpf/enosys_linux.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux.go
@ -48,6 +48,13 @@ const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;
 #endif
 const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER;

+#ifndef AUDIT_ARCH_RISCV64
+#ifndef EM_RISCV
+#define EM_RISCV		243
+#endif
+#define AUDIT_ARCH_RISCV64	(EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#endif
+
 // We use the AUDIT_ARCH_* values because those are the ones used by the kernel
 // and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
 // use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.
@ -67,11 +74,17 @@ const uint32_t C_AUDIT_ARCH_PPC64        = AUDIT_ARCH_PPC64;
 const uint32_t C_AUDIT_ARCH_PPC64LE      = AUDIT_ARCH_PPC64LE;
 const uint32_t C_AUDIT_ARCH_S390         = AUDIT_ARCH_S390;
 const uint32_t C_AUDIT_ARCH_S390X        = AUDIT_ARCH_S390X;
+const uint32_t C_AUDIT_ARCH_RISCV64      = AUDIT_ARCH_RISCV64;
 */
 import "C"

 var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)

+// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown
+// syscalls will end up with this syscall number, so we need to explicitly
+// return -ENOSYS for this syscall on those architectures.
+const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0
+
 func isAllowAction(action configs.Action) bool {
 	switch action {
 	// Trace is considered an "allow" action because a good tracer should
@ -197,6 +210,8 @@ func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
 		return nativeArch(C.C_AUDIT_ARCH_S390), nil
 	case libseccomp.ArchS390X:
 		return nativeArch(C.C_AUDIT_ARCH_S390X), nil
+	case libseccomp.ArchRISCV64:
+		return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil
 	default:
 		return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
 	}
@ -305,7 +320,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
 		// directly from the arch code so we need to do it here. Sadly we can't
 		// share this code between architecture branches.
 		section := []bpf.Instruction{
-			// load [0]
+			// load [0] (syscall number)
 			bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4.
 		}

@ -314,10 +329,37 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
 			// No syscalls found for this arch -- skip it and move on.
 			continue
 		case 1:
-			// Get the only syscall in the map.
-			var sysno libseccomp.ScmpSyscall
-			for _, no := range maxSyscalls {
+			// Get the only syscall and scmpArch in the map.
+			var (
+				scmpArch libseccomp.ScmpArch
+				sysno    libseccomp.ScmpSyscall
+			)
+			for arch, no := range maxSyscalls {
 				sysno = no
+				scmpArch = arch
+			}
+
+			switch scmpArch {
+			// Return -ENOSYS for setup(2) on s390(x). This syscall is used for
+			// multiplexing "large syscall number" syscalls, but if the syscall
+			// number is not known to the kernel then the syscall number is
+			// left unchanged (and because it is sysno=0, you'll end up with
+			// EPERM for syscalls the kernel doesn't know about).
+			//
+			// The actual setup(2) syscall is never used by userspace anymore
+			// (and hasn't existed for decades) outside of this multiplexing
+			// scheme so returning -ENOSYS is fine.
+			case libseccomp.ArchS390, libseccomp.ArchS390X:
+				section = append(section, []bpf.Instruction{
+					// jne [setup=0],1
+					bpf.JumpIf{
+						Cond:     bpf.JumpNotEqual,
+						Val:      uint32(s390xMultiplexSyscall),
+						SkipTrue: 1,
+					},
+					// ret [ENOSYS]
+					bpf.RetConstant{Val: retErrnoEnosys},
+				}...)
 			}

 			// The simplest case just boils down to a single jgt instruction,
@ -349,12 +391,6 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
 			// If we're on x86 we need to add a check for x32 and if we're in
 			// the wrong mode we jump over the section.
 			if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
-				// Grab the only architecture in the map.
-				var scmpArch libseccomp.ScmpArch
-				for arch := range maxSyscalls {
-					scmpArch = arch
-				}
-
 				// Generate a prefix to check the mode.
 				switch scmpArch {
 				case libseccomp.ArchAMD64:
@ -512,7 +548,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)

 	// Prepend the load instruction for the architecture.
 	programTail = append([]bpf.Instruction{
-		// load [4]
+		// load [4] (architecture)
 		bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4.
 	}, programTail...)

--- a/libcontainer/seccomp/patchbpf/enosys_linux_test.go
+++ b/libcontainer/seccomp/patchbpf/enosys_linux_test.go
@ -213,6 +213,19 @@ func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string)
 				})
 			}

+			// If we're on s390(x) make sure you get -ENOSYS for the "setup"
+			// syscall (this is done to work around an issue with s390x's
+			// syscall multiplexing which results in unknown syscalls being a
+			// setup(2) invocation).
+			switch scmpArch {
+			case libseccomp.ArchS390, libseccomp.ArchS390X:
+				syscallTests = append(syscallTests, syscallTest{
+					sysno:    s390xMultiplexSyscall,
+					syscall:  "setup",
+					expected: retErrnoEnosys,
+				})
+			}
+
 			// Test syscalls in the explicit list.
 			for _, test := range syscallTests {
 				// Override the expected value in the two special cases.
@ -282,7 +295,7 @@ func TestDisassembleHugeFilterDoesNotHang(t *testing.T) {
 	}

 	for i := 1; i < 10000; i++ {
-		if err := hugeFilter.AddRule(libseccomp.ScmpSyscall(i), libseccomp.ActKill); err != nil {
+		if err := hugeFilter.AddRule(libseccomp.ScmpSyscall(i), libseccomp.ActKillThread); err != nil {
 			t.Fatalf("failed to add rule to filter %d: %v", i, err)
 		}
 	}
--- a/libcontainer/seccomp/seccomp_linux.go
+++ b/libcontainer/seccomp/seccomp_linux.go
@ -113,8 +113,8 @@ func InitSeccomp(config *configs.Seccomp) (int, error) {
 // Convert Libcontainer Action to Libseccomp ScmpAction
 func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error) {
 	switch act {
-	case configs.Kill:
-		return libseccomp.ActKill, nil
+	case configs.Kill, configs.KillThread:
+		return libseccomp.ActKillThread, nil
 	case configs.Errno:
 		if errnoRet != nil {
 			return libseccomp.ActErrno.SetReturnCode(int16(*errnoRet)), nil
@ -133,8 +133,6 @@ func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error
 		return libseccomp.ActLog, nil
 	case configs.Notify:
 		return libseccomp.ActNotify, nil
-	case configs.KillThread:
-		return libseccomp.ActKillThread, nil
 	case configs.KillProcess:
 		return libseccomp.ActKillProcess, nil
 	default:
--- a/libcontainer/setns_init_linux.go
+++ b/libcontainer/setns_init_linux.go
@ -4,6 +4,7 @@ import (
 	"errors"
 	"fmt"
 	"os"
+	"os/exec"
 	"strconv"

 	"github.com/opencontainers/selinux/go-selinux"
@ -14,6 +15,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/keys"
 	"github.com/opencontainers/runc/libcontainer/seccomp"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/utils"
 )

 // linuxSetnsInit performs the container's initialization for running a new process
@ -82,6 +84,21 @@ func (l *linuxSetnsInit) Init() error {
 	if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
 		return err
 	}
+
+	// Check for the arg before waiting to make sure it exists and it is
+	// returned as a create time error.
+	name, err := exec.LookPath(l.config.Args[0])
+	if err != nil {
+		return err
+	}
+	// exec.LookPath in Go < 1.20 might return no error for an executable
+	// residing on a file system mounted with noexec flag, so perform this
+	// extra check now while we can still return a proper error.
+	// TODO: remove this once go < 1.20 is not supported.
+	if err := eaccess(name); err != nil {
+		return &os.PathError{Op: "eaccess", Path: name, Err: err}
+	}
+
 	// Set seccomp as close to execve as possible, so as few syscalls take
 	// place afterward (reducing the amount of syscalls that users need to
 	// enable in their seccomp profiles).
@ -101,5 +118,23 @@ func (l *linuxSetnsInit) Init() error {
 		return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
 	}

-	return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
+	// Close all file descriptors we are not passing to the container. This is
+	// necessary because the execve target could use internal runc fds as the
+	// execve path, potentially giving access to binary files from the host
+	// (which can then be opened by container processes, leading to container
+	// escapes). Note that because this operation will close any open file
+	// descriptors that are referenced by (*os.File) handles from underneath
+	// the Go runtime, we must not do any file operations after this point
+	// (otherwise the (*os.File) finaliser could close the wrong file). See
+	// CVE-2024-21626 for more information as to why this protection is
+	// necessary.
+	//
+	// This is not needed for runc-dmz, because the extra execve(2) step means
+	// that all O_CLOEXEC file descriptors have already been closed and thus
+	// the second execve(2) from runc-dmz cannot access internal file
+	// descriptors from runc.
+	if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
+		return err
+	}
+	return system.Exec(name, l.config.Args[0:], os.Environ())
 }
--- a/libcontainer/specconv/spec_linux.go
+++ b/libcontainer/specconv/spec_linux.go
@ -18,6 +18,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runc/libcontainer/seccomp"
+	"github.com/opencontainers/runc/libcontainer/userns"
 	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
@ -176,18 +177,19 @@ func KnownMountOptions() []string {
 // AllowedDevices is the set of devices which are automatically included for
 // all containers.
 //
-// XXX (cyphar)
-//    This behaviour is at the very least "questionable" (if not outright
-//    wrong) according to the runtime-spec.
+// # XXX (cyphar)
 //
-//    Yes, we have to include certain devices other than the ones the user
-//    specifies, but several devices listed here are not part of the spec
-//    (including "mknod for any device"?!). In addition, these rules are
-//    appended to the user-provided set which means that users *cannot disable
-//    this behaviour*.
+// This behaviour is at the very least "questionable" (if not outright
+// wrong) according to the runtime-spec.
 //
-//    ... unfortunately I'm too scared to change this now because who knows how
-//    many people depend on this (incorrect and arguably insecure) behaviour.
+// Yes, we have to include certain devices other than the ones the user
+// specifies, but several devices listed here are not part of the spec
+// (including "mknod for any device"?!). In addition, these rules are
+// appended to the user-provided set which means that users *cannot disable
+// this behaviour*.
+//
+// ... unfortunately I'm too scared to change this now because who knows how
+// many people depend on this (incorrect and arguably insecure) behaviour.
 var AllowedDevices = []*devices.Device{
 	// allow mknod for any device
 	{
@ -925,9 +927,9 @@ next:
 func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
 	create := func(m specs.LinuxIDMapping) configs.IDMap {
 		return configs.IDMap{
-			HostID:      int(m.HostID),
-			ContainerID: int(m.ContainerID),
-			Size:        int(m.Size),
+			HostID:      int64(m.HostID),
+			ContainerID: int64(m.ContainerID),
+			Size:        int64(m.Size),
 		}
 	}
 	if spec.Linux != nil {
@ -938,6 +940,40 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error {
 			config.GidMappings = append(config.GidMappings, create(m))
 		}
 	}
+	if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" {
+		// Cache the current userns mappings in our configuration, so that we
+		// can calculate uid and gid mappings within runc. These mappings are
+		// never used for configuring the container if the path is set.
+		uidMap, gidMap, err := userns.GetUserNamespaceMappings(path)
+		if err != nil {
+			return fmt.Errorf("failed to cache mappings for userns: %w", err)
+		}
+		// We cannot allow uid or gid mappings to be set if we are also asked
+		// to join a userns.
+		if config.UidMappings != nil || config.GidMappings != nil {
+			// FIXME: It turns out that containerd and CRIO pass both a userns
+			// path and the mappings of the namespace in the same config.json.
+			// Such a configuration is technically not valid, but we used to
+			// require mappings be specified, and thus users worked around our
+			// bug -- so we can't regress it at the moment. But we also don't
+			// want to produce broken behaviour if the mapping doesn't match
+			// the userns. So (for now) we output a warning if the actual
+			// userns mappings match the configuration, otherwise we return an
+			// error.
+			if !userns.IsSameMapping(uidMap, config.UidMappings) ||
+				!userns.IsSameMapping(gidMap, config.GidMappings) {
+				return errors.New("user namespaces enabled, but both namespace path and non-matching mapping specified -- you may only provide one")
+			}
+			logrus.Warnf("config.json has both a userns path to join and a matching userns mapping specified -- you may only provide one. Future versions of runc may return an error with this configuration, please report a bug on <https://github.com/opencontainers/runc> if you see this warning and cannot update your configuration.")
+		}
+
+		config.UidMappings = uidMap
+		config.GidMappings = gidMap
+		logrus.WithFields(logrus.Fields{
+			"uid_map": uidMap,
+			"gid_map": gidMap,
+		}).Debugf("config uses path-based userns configuration -- current uid and gid mappings cached")
+	}
 	rootUID, err := config.HostRootUID()
 	if err != nil {
 		return err
--- a/libcontainer/specconv/spec_linux_test.go
+++ b/libcontainer/specconv/spec_linux_test.go
@ -234,6 +234,14 @@ func TestSetupSeccomp(t *testing.T) {
 				Names:  []string{"mknod"},
 				Action: "SCMP_ACT_NOTIFY",
 			},
+			{
+				Names:  []string{"rmdir"},
+				Action: "SCMP_ACT_KILL_THREAD",
+			},
+			{
+				Names:  []string{"mkdir"},
+				Action: "SCMP_ACT_KILL_PROCESS",
+			},
 		},
 	}
 	seccomp, err := SetupSeccomp(conf)
@ -263,9 +271,8 @@ func TestSetupSeccomp(t *testing.T) {

 	calls := seccomp.Syscalls

-	callsLength := len(calls)
-	if callsLength != 8 {
-		t.Errorf("Expected 8 syscalls, got :%d", callsLength)
+	if len(calls) != len(conf.Syscalls) {
+		t.Error("Mismatched number of syscalls")
 	}

 	for _, call := range calls {
@ -317,6 +324,14 @@ func TestSetupSeccomp(t *testing.T) {
 			if call.Action != configs.Notify {
 				t.Errorf("Wrong conversion for the %s syscall action", call.Name)
 			}
+		case "rmdir":
+			if call.Action != configs.KillThread {
+				t.Errorf("Wrong conversion for the %s syscall action", call.Name)
+			}
+		case "mkdir":
+			if call.Action != configs.KillProcess {
+				t.Errorf("Wrong conversion for the %s syscall action", call.Name)
+			}
 		default:
 			t.Errorf("Unexpected syscall %s found", call.Name)
 		}
@ -595,6 +610,40 @@ func TestDupNamespaces(t *testing.T) {
 	}
 }

+// TestUserNamespaceMappingAndPath checks that a spec with both a userns path
+// and id mappings that do not match that namespace is rejected with an error.
+func TestUserNamespaceMappingAndPath(t *testing.T) {
+	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
+		t.Skip("Test requires userns.")
+	}
+
+	spec := &specs.Spec{
+		Root: &specs.Root{
+			Path: "rootfs",
+		},
+		Linux: &specs.Linux{
+			UIDMappings: []specs.LinuxIDMapping{
+				{ContainerID: 0, HostID: 1000, Size: 1000},
+			},
+			GIDMappings: []specs.LinuxIDMapping{
+				{ContainerID: 0, HostID: 2000, Size: 1000},
+			},
+			Namespaces: []specs.LinuxNamespace{
+				{
+					Type: "user",
+					Path: "/proc/1/ns/user",
+				},
+			},
+		},
+	}
+
+	_, err := CreateLibcontainerConfig(&CreateOpts{Spec: spec})
+	// Guard against a nil error: err.Error() would panic instead of failing the test.
+	if err == nil || !strings.Contains(err.Error(), "both namespace path and non-matching mapping specified") {
+		t.Errorf("user namespace with path and non-matching mapping should be forbidden, got error %v", err)
+	}
+}
+
 func TestNonZeroEUIDCompatibleSpecconvValidate(t *testing.T) {
 	if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
 		t.Skip("Test requires userns.")
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@ -17,6 +17,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/keys"
 	"github.com/opencontainers/runc/libcontainer/seccomp"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/utils"
 )

 type linuxStandardInit struct {
@ -198,6 +199,14 @@ func (l *linuxStandardInit) Init() error {
 	if err != nil {
 		return err
 	}
+	// exec.LookPath in Go < 1.20 might return no error for an executable
+	// residing on a file system mounted with noexec flag, so perform this
+	// extra check now while we can still return a proper error.
+	// TODO: remove this once go < 1.20 is not supported.
+	if err := eaccess(name); err != nil {
+		return &os.PathError{Op: "eaccess", Path: name, Err: err}
+	}
+
 	// Set seccomp as close to execve as possible, so as few syscalls take
 	// place afterward (reducing the amount of syscalls that users need to
 	// enable in their seccomp profiles). However, this needs to be done
@ -250,5 +259,23 @@ func (l *linuxStandardInit) Init() error {
 		return err
 	}

+	// Close all file descriptors we are not passing to the container. This is
+	// necessary because the execve target could use internal runc fds as the
+	// execve path, potentially giving access to binary files from the host
+	// (which can then be opened by container processes, leading to container
+	// escapes). Note that because this operation will close any open file
+	// descriptors that are referenced by (*os.File) handles from underneath
+	// the Go runtime, we must not do any file operations after this point
+	// (otherwise the (*os.File) finaliser could close the wrong file). See
+	// CVE-2024-21626 for more information as to why this protection is
+	// necessary.
+	//
+	// This is not needed for runc-dmz, because the extra execve(2) step means
+	// that all O_CLOEXEC file descriptors have already been closed and thus
+	// the second execve(2) from runc-dmz cannot access internal file
+	// descriptors from runc.
+	if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil {
+		return err
+	}
 	return system.Exec(name, l.config.Args[0:], os.Environ())
 }
--- a/libcontainer/sync.go
+++ b/libcontainer/sync.go
@ -15,16 +15,16 @@ type syncType string
 // during container setup. They come in pairs (with procError being a generic
 // response which is followed by an &initError).
 //
-// [  child  ] <-> [   parent   ]
+//	[  child  ] <-> [   parent   ]
 //
-// procHooks   --> [run hooks]
-//             <-- procResume
+//	procHooks   --> [run hooks]
+//	            <-- procResume
 //
-// procReady   --> [final setup]
-//             <-- procRun
+//	procReady   --> [final setup]
+//	            <-- procRun
 //
-// procSeccomp --> [pick up seccomp fd with pidfd_getfd()]
-//             <-- procSeccompDone
+//	procSeccomp --> [pick up seccomp fd with pidfd_getfd()]
+//	            <-- procSeccompDone
 const (
 	procError       syncType = "procError"
 	procReady       syncType = "procReady"
--- a/libcontainer/user/user.go
+++ b/libcontainer/user/user.go
@ -201,7 +201,7 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
 			if err != nil {
 				// We should return no error if EOF is reached
 				// without a match.
-				if err == io.EOF { //nolint:errorlint // comparison with io.EOF is legit, https://github.com/polyfloyd/go-errorlint/pull/12
+				if err == io.EOF {
 					err = nil
 				}
 				return out, err
@ -280,13 +280,13 @@ func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath
 // found in any entry in passwd and group respectively.
 //
 // Examples of valid user specifications are:
-//     * ""
-//     * "user"
-//     * "uid"
-//     * "user:group"
-//     * "uid:gid
-//     * "user:gid"
-//     * "uid:group"
+//   - ""
+//   - "user"
+//   - "uid"
+//   - "user:group"
+//   - "uid:gid"
+//   - "user:gid"
+//   - "uid:group"
 //
 // It should be noted that if you specify a numeric user or group id, they will
 // not be evaluated as usernames (only the metadata will be filled). So attempting
--- a/libcontainer/userns/userns_maps.c
+++ b/libcontainer/userns/userns_maps.c
@ -0,0 +1,79 @@
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <stdlib.h>
+
+/*
+ * All of the code here is run inside an async-signal-safe context, so we need
+ * to be careful to not call any functions that could cause issues. In theory,
+ * since we are a Go program, there are fewer restrictions, but in practice it's
+ * better to be safe than sorry.
+ *
+ * The only exception is exit, which we need to call to make sure we don't
+ * return into runc.
+ */
+
+void bail(int pipefd, const char *fmt, ...)
+{
+	va_list args;
+	/* Write the printf-style message to pipefd, then exit with status 1. */
+	va_start(args, fmt);
+	vdprintf(pipefd, fmt, args);
+	va_end(args);
+
+	exit(1);
+}
+
+/* Fork a child that joins userns_path and copies path to outfd; returns the child pid (or -1). */
+int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd)
+{
+	char buffer[4096] = { 0 };
+	pid_t child = fork();
+	if (child != 0)
+		return child;
+	/* Only the forked child gets past this point; failures are reported on errfd via bail(). */
+
+	/* Join the target userns. */
+	int nsfd = open(userns_path, O_RDONLY);
+	if (nsfd < 0)
+		bail(errfd, "open userns path %s failed: %m", userns_path);
+
+	int err = setns(nsfd, CLONE_NEWUSER);
+	if (err < 0)
+		bail(errfd, "setns %s failed: %m", userns_path);
+
+	close(nsfd);
+
+	/* Pipe the requested file contents. */
+	int fd = open(path, O_RDONLY);
+	if (fd < 0)
+		bail(errfd, "open %s in userns %s failed: %m", path, userns_path);
+
+	int nread, ntotal = 0;
+	while ((nread = read(fd, buffer, sizeof(buffer))) != 0) {
+		if (nread < 0)
+			bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal);
+		ntotal += nread;
+		/* After a short write, resume at buffer + nwritten rather than resending the chunk's start. */
+		int nwritten = 0;
+		while (nwritten < nread) {
+			int n = write(outfd, buffer + nwritten, nread - nwritten);
+			if (n < 0)
+				bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m",
+				     nread - nwritten, path, nwritten);
+			nwritten += n;
+		}
+		if (nread != nwritten)
+			bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten);
+	}
+
+	close(fd);
+	close(outfd);
+	close(errfd);
+
+	/* We must exit here, otherwise we would return into a forked runc. */
+	exit(0);
+}
--- a/libcontainer/userns/userns_maps_linux.go
+++ b/libcontainer/userns/userns_maps_linux.go
@ -0,0 +1,186 @@
+//go:build linux
+
+package userns
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"io"
+	"os"
+	"unsafe"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/sirupsen/logrus"
+)
+
+/*
+#include <stdlib.h>
+extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd);
+*/
+import "C"
+
+// parseIdmapData parses lines of the form "<container id> <host id> <size>" into IDMap entries.
+func parseIdmapData(data []byte) (ms []configs.IDMap, err error) {
+	lines := bufio.NewScanner(bytes.NewReader(data))
+	for lines.Scan() {
+		text, entry := lines.Text(), configs.IDMap{}
+		if _, scanErr := fmt.Sscanf(text, "%d %d %d", &entry.ContainerID, &entry.HostID, &entry.Size); scanErr != nil {
+			return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", text, scanErr)
+		}
+		ms = append(ms, entry)
+	}
+	if readErr := lines.Err(); readErr != nil {
+		return nil, fmt.Errorf("parsing id map failed: %w", readErr)
+	}
+	return ms, nil
+}
+
+// spawnUserNamespaceCat does something equivalent to `nsenter --user=<nsPath>
+// cat <path>`, but more efficiently (via the forked C helper spawn_userns_cat).
+// It returns the contents of the requested file from within the user namespace.
+func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) {
+	rdr, wtr, err := os.Pipe()
+	if err != nil {
+		return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err)
+	}
+	defer rdr.Close()
+	defer wtr.Close()
+
+	errRdr, errWtr, err := os.Pipe()
+	if err != nil {
+		return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err)
+	}
+	defer errRdr.Close()
+	defer errWtr.Close()
+
+	cNsPath := C.CString(nsPath)
+	defer C.free(unsafe.Pointer(cNsPath))
+	cPath := C.CString(path)
+	defer C.free(unsafe.Pointer(cPath))
+
+	childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd()))
+
+	if childPid < 0 {
+		return nil, fmt.Errorf("failed to spawn fork for userns")
+	} else if childPid == 0 {
+		// This should never happen: the C helper exits in the child instead of returning.
+		panic("runc executing inside fork child -- unsafe state!")
+	}
+
+	// We are in the parent -- close our write end first so that reading can reach EOF.
+	wtr.Close()
+	output, err := io.ReadAll(rdr)
+	rdr.Close()
+	if err != nil {
+		return nil, fmt.Errorf("reading from userns spawn failed: %w", err)
+	}
+
+	// Same for the error pipe: close our write end, then read everything the child wrote.
+	errWtr.Close()
+	errOutput, err := io.ReadAll(errRdr)
+	errRdr.Close()
+	if err != nil {
+		return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err)
+	}
+	errOutput = bytes.TrimSpace(errOutput)
+
+	// Reap the child; its exit status decides whether we report success or failure.
+	child, err := os.FindProcess(int(childPid))
+	if err != nil {
+		return nil, fmt.Errorf("could not find userns spawn process: %w", err)
+	}
+	state, err := child.Wait()
+	if err != nil {
+		return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err)
+	}
+	if !state.Success() {
+		errStr := string(errOutput)
+		if errStr == "" {
+			errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode())
+		}
+		return nil, fmt.Errorf("userns spawn: %s", errStr)
+	} else if len(errOutput) > 0 {
+		// We can just ignore weird output in the error pipe if the process
+		// didn't bail(), but for completeness output for debugging.
+		logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput))
+	}
+	// The subprocess succeeded, return whatever it wrote to the pipe.
+	return output, nil
+}
+
+// GetUserNamespaceMappings returns the uid and gid mappings of the user namespace at nsPath.
+func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) {
+	var (
+		pid         int
+		extra       rune
+		tryFastPath bool
+	)
+
+	// nsPath is usually of the form /proc/<pid>/ns/user, which means that we
+	// already have a pid that is part of the user namespace and thus we can
+	// just use the pid to read from /proc/<pid>/*id_map.
+	// Note that Sscanf doesn't consume the whole input, so we check for any
+	// trailing data with %c. That way, we can be sure the pattern matched
+	// /proc/$pid/ns/user _exactly_ iff n == 1.
+	if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 {
+		tryFastPath = pid > 0
+	}
+
+	for _, mapType := range []struct {
+		name  string
+		idMap *[]configs.IDMap
+	}{
+		{"uid_map", &uidMap},
+		{"gid_map", &gidMap},
+	} {
+		var mapData []byte
+
+		if tryFastPath {
+			path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name)
+			data, err := os.ReadFile(path)
+			if err != nil {
+				// Do not error out here -- we need to try the slow path if the
+				// fast path failed.
+				logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err)
+			} else {
+				mapData = data
+			}
+		} else {
+			logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath)
+		}
+
+		if mapData == nil {
+			// We have to actually join the namespace if we cannot take the
+			// fast path. The path is resolved with respect to the child
+			// process, so just use /proc/self.
+			data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name)
+			if err != nil {
+				return nil, nil, err
+			}
+			mapData = data
+		}
+		idMap, err := parseIdmapData(mapData)
+		if err != nil {
+			return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err)
+		}
+		*mapType.idMap = idMap
+	}
+
+	return uidMap, gidMap, nil
+}
+
+// IsSameMapping reports whether the two id mappings are identical. Mappings
+// that differ only in entry order, or in which an entry has been split into
+// several, are treated as different.
+func IsSameMapping(a, b []configs.IDMap) bool {
+	if len(a) != len(b) {
+		return false
+	}
+	for i, entry := range a {
+		if entry != b[i] {
+			return false
+		}
+	}
+	return true
+}
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .1.0
 .1.12