merge upstream 1.1.0

2022-12-30 11:21:19 +08:00 · 2022-12-30 11:21:19 +08:00 · 1dc29861c3
parent f67506f80e
commit 1dc29861c3
1285 changed files with 161305 additions and 143317 deletions
--- a/.cirrus.yml
+++ b/.cirrus.yml
@ -0,0 +1,158 @@
+---
+# We use Cirrus for Vagrant tests and native CentOS 7 and 8, because macOS
+# instances of GHA are too slow and flaky, and Linux instances of GHA do not
+# support KVM.
+
+# NOTE Cirrus execution environments lack a terminal, needed for
+# some integration tests. So we use `ssh -tt` command to fake a terminal.
+
+task:
+  timeout_in: 30m
+
+  env:
+    DEBIAN_FRONTEND: noninteractive
+    HOME: /root
+    # yamllint disable rule:key-duplicates
+    matrix:
+      DISTRO: fedora
+
+  name: vagrant DISTRO:$DISTRO
+
+  compute_engine_instance:
+    image_project: cirrus-images
+    image: family/docker-kvm
+    platform: linux
+    nested_virtualization: true
+    # CPU limit: `16 / NTASK`: see https://cirrus-ci.org/faq/#are-there-any-limits
+    cpu: 8
+    # Memory limit: `4GB * NCPU`
+    memory: 32G
+
+  host_info_script: |
+    uname -a
+    echo "-----"
+    cat /etc/os-release
+    echo "-----"
+    cat /proc/cpuinfo
+    echo "-----"
+    df -T
+  install_libvirt_vagrant_script: |
+    apt-get update
+    apt-get install -y libvirt-daemon libvirt-daemon-system vagrant vagrant-libvirt
+    systemctl enable --now libvirtd
+  vagrant_cache:
+    fingerprint_script: uname -s ; cat Vagrantfile.$DISTRO
+    folder: /root/.vagrant.d
+  vagrant_up_script: |
+    ln -sf Vagrantfile.$DISTRO Vagrantfile
+    # Retry if it fails (download.fedoraproject.org returns 404 sometimes)
+    vagrant up --no-tty || vagrant up --no-tty
+    mkdir -p -m 0700 /root/.ssh
+    vagrant ssh-config >> /root/.ssh/config
+  guest_info_script: |
+    ssh default 'sh -exc "uname -a && systemctl --version && df -T && cat /etc/os-release"'
+  unit_tests_script: |
+    ssh default 'sudo -i make -C /vagrant localunittest'
+  integration_systemd_script: |
+    ssh -tt default "sudo -i make -C /vagrant localintegration RUNC_USE_SYSTEMD=yes"
+  integration_fs_script: |
+    ssh -tt default "sudo -i make -C /vagrant localintegration"
+  integration_systemd_rootless_script: |
+    ssh -tt default "sudo -i make -C /vagrant localrootlessintegration RUNC_USE_SYSTEMD=yes"
+  integration_fs_rootless_script: |
+    ssh -tt default "sudo -i make -C /vagrant localrootlessintegration"
+
+task:
+  timeout_in: 30m
+
+  env:
+    HOME: /root
+    CIRRUS_WORKING_DIR: /home/runc
+    GO_VERSION: "1.17.3"
+    BATS_VERSION: "v1.3.0"
+    # yamllint disable rule:key-duplicates
+    matrix:
+      DISTRO: centos-7
+      DISTRO: centos-stream-8
+
+  name: ci / $DISTRO
+
+  compute_engine_instance:
+    image_project: centos-cloud
+    image: family/$DISTRO
+    platform: linux
+    cpu: 4
+    memory: 8G
+
+  install_dependencies_script: |
+    case $DISTRO in
+    centos-7)
+      (cd /etc/yum.repos.d && curl -O https://copr.fedorainfracloud.org/coprs/adrian/criu-el7/repo/epel-7/adrian-criu-el7-epel-7.repo)
+      # sysctl
+      echo "user.max_user_namespaces=15076" > /etc/sysctl.d/userns.conf
+      sysctl --system
+      ;;
+    centos-stream-8)
+      yum config-manager --set-enabled powertools # for glibc-static
+      ;;
+    esac
+    # Work around dnf mirror failures by retrying a few times.
+    for i in $(seq 0 2); do
+      sleep $i
+      yum install -y -q gcc git iptables jq glibc-static libseccomp-devel make criu fuse-sshfs && break
+    done
+    [ $? -eq 0 ] # fail if yum failed
+    # install Go
+    curl -fsSL "https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz" | tar Cxz /usr/local
+    # install bats
+    cd /tmp
+    git clone https://github.com/bats-core/bats-core
+    cd bats-core
+    git checkout $BATS_VERSION
+    ./install.sh /usr/local
+    cd -
+    # Add a user for rootless tests
+    useradd -u2000 -m -d/home/rootless -s/bin/bash rootless
+    # Allow root and rootless itself to execute `ssh rootless@localhost` in tests/rootless.sh
+    ssh-keygen -t ecdsa -N "" -f /root/rootless.key
+    mkdir -m 0700 -p /home/rootless/.ssh
+    cp /root/rootless.key /home/rootless/.ssh/id_ecdsa
+    cat /root/rootless.key.pub >> /home/rootless/.ssh/authorized_keys
+    chown -R rootless.rootless /home/rootless
+    # set PATH
+    echo 'export PATH=/usr/local/go/bin:/usr/local/bin:$PATH' >> /root/.bashrc
+    # Setup ssh localhost for terminal emulation (script -e did not work)
+    ssh-keygen -t ed25519 -f /root/.ssh/id_ed25519 -N ""
+    cat /root/.ssh/id_ed25519.pub >> /root/.ssh/authorized_keys
+    chmod 400 /root/.ssh/authorized_keys
+    ssh-keyscan localhost >> /root/.ssh/known_hosts
+    echo -e "Host localhost\n\tStrictHostKeyChecking no\t\nIdentityFile /root/.ssh/id_ed25519\n" >> /root/.ssh/config
+    sed -e "s,PermitRootLogin.*,PermitRootLogin prohibit-password,g" -i /etc/ssh/sshd_config
+    systemctl restart sshd
+  host_info_script: |
+    uname -a
+    echo "-----"
+    cat /etc/os-release
+    echo "-----"
+    cat /proc/cpuinfo
+    echo "-----"
+    df -T
+    echo "-----"
+    systemctl --version
+  unit_tests_script: |
+    ssh -tt localhost "make -C /home/runc localunittest"
+  integration_systemd_script: |
+    ssh -tt localhost "make -C /home/runc localintegration RUNC_USE_SYSTEMD=yes"
+  integration_fs_script: |
+    ssh -tt localhost "make -C /home/runc localintegration"
+  integration_systemd_rootless_script: |
+    echo "SKIP: integration_systemd_rootless_script requires cgroup v2"
+  integration_fs_rootless_script: |
+    case $DISTRO in
+    centos-7)
+      echo "SKIP: FIXME: integration_fs_rootless_script is skipped because of EPERM on writing cgroup.procs"
+        ;;
+    centos-stream-8)
+      ssh -tt localhost "make -C /home/runc localrootlessintegration"
+      ;;
+    esac
--- a/.codespellrc
+++ b/.codespellrc
@ -0,0 +1,3 @@
+[codespell]
+skip = ./vendor,./.git
+ignore-words-list = clos,creat
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -0,0 +1,25 @@
+# Please see the documentation for all configuration options:
+# https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+version: 2
+updates:
+  # Dependencies listed in go.mod
+  - package-ecosystem: "gomod"
+    directory: "/" # Location of package manifests
+    schedule:
+      interval: "daily"
+    ignore:
+      # a regression in v1.22.2, see https://github.com/urfave/cli/issues/1092
+      - dependency-name: "github.com/urfave/cli"
+
+  # Dependencies listed in .github/workflows/*.yml
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "daily"
+
+  # Dependencies listed in Dockerfile
+  - package-ecosystem: "docker"
+    directory: "/"
+    schedule:
+      interval: "daily"
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -0,0 +1,129 @@
+# NOTE Github Actions execution environments lack a terminal, needed for
+# some integration tests. So we use `script` command to fake a terminal.
+
+name: ci
+on:
+  push:
+    tags:
+      - v*
+    branches:
+      - master
+      - release-*
+  pull_request:
+
+env:
+  # Don't ignore C warnings. Note that the output of "go env CGO_CFLAGS" by default is "-g -O2", so we keep them.
+  CGO_CFLAGS: -g -O2 -Werror
+
+jobs:
+  test:
+    runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+      matrix:
+        go-version: [1.16.x, 1.17.x]
+        rootless: ["rootless", ""]
+        race: ["-race", ""]
+        criu: [""]
+        include:
+          # Also test against latest criu-dev
+          - go-version: 1.17.x
+            rootless: ""
+            race: ""
+            criu: "criu-dev"
+
+    steps:
+
+    - name: checkout
+      uses: actions/checkout@v2
+
+    - name: install deps
+      if: matrix.criu == ''
+      env:
+        REPO: https://download.opensuse.org/repositories/devel:/tools:/criu/xUbuntu_20.04
+      run: |
+        # criu repo
+        curl -fSsl $REPO/Release.key | sudo apt-key add -
+        echo "deb $REPO/ /" | sudo tee /etc/apt/sources.list.d/criu.list
+        sudo apt update
+        sudo apt install libseccomp-dev criu sshfs
+
+    - name: install deps (criu ${{ matrix.criu }})
+      if: matrix.criu != ''
+      run: |
+        sudo apt -q update
+        sudo apt -q install libseccomp-dev sshfs \
+          libcap-dev libnet1-dev libnl-3-dev \
+          libprotobuf-c-dev libprotobuf-dev protobuf-c-compiler protobuf-compiler
+        git clone https://github.com/checkpoint-restore/criu.git ~/criu
+        (cd ~/criu && git checkout ${{ matrix.criu }} && sudo make install-criu)
+        rm -rf ~/criu
+
+    - name: install go ${{ matrix.go-version }}
+      uses: actions/setup-go@v2
+      with:
+        stable: '!contains(${{ matrix.go-version }}, "beta") && !contains(${{ matrix.go-version }}, "rc")'
+        go-version: ${{ matrix.go-version }}
+
+    - name: build
+      run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all
+
+    - name: install bats
+      uses: mig4/setup-bats@v1
+      with:
+        bats-version: 1.3.0
+
+    - name: unit test
+      if: matrix.rootless != 'rootless'
+      run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest
+
+    - name: add rootless user
+      if: matrix.rootless == 'rootless'
+      run: |
+        sudo useradd -u2000 -m -d/home/rootless -s/bin/bash rootless
+        # Allow root and rootless itself to execute `ssh rootless@localhost` in tests/rootless.sh
+        ssh-keygen -t ecdsa -N "" -f $HOME/rootless.key
+        sudo mkdir -m 0700 -p /home/rootless/.ssh
+        sudo cp $HOME/rootless.key /home/rootless/.ssh/id_ecdsa
+        sudo cp $HOME/rootless.key.pub /home/rootless/.ssh/authorized_keys
+        sudo chown -R rootless.rootless /home/rootless
+
+    - name: integration test (fs driver)
+      run: sudo -E PATH="$PATH" script -e -c 'make local${{ matrix.rootless }}integration'
+
+    - name: integration test (systemd driver)
+      # can't use systemd driver with cgroupv1
+      if: matrix.rootless != 'rootless'
+      run: sudo -E PATH="$PATH" script -e -c 'make RUNC_USE_SYSTEMD=yes local${{ matrix.rootless }}integration'
+
+  # We need to continue support for 32-bit ARM.
+  # However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff.
+  # We are not interested in providing official support for i386.
+  cross-i386:
+    runs-on: ubuntu-20.04
+
+    steps:
+
+    - name: checkout
+      uses: actions/checkout@v2
+
+    - name: install deps
+      run: |
+        sudo dpkg --add-architecture i386
+        # add criu repo
+        sudo add-apt-repository -y ppa:criu/ppa
+        # apt-add-repository runs apt update so we don't have to.
+
+        # Due to a bug in apt, we have to update it first
+        # (see https://bugs.launchpad.net/ubuntu-cdimage/+bug/1871268)
+        sudo apt -q install apt
+        sudo apt -q install libseccomp-dev libseccomp-dev:i386 gcc-multilib criu
+
+    - name: install go
+      uses: actions/setup-go@v2
+      with:
+        go-version: 1.x # Latest stable
+
+    - name: unit test
+      # cgo is disabled by default when cross-compiling
+      run: sudo -E PATH="$PATH" -- make GOARCH=386 CGO_ENABLED=1 localunittest
--- a/.github/workflows/validate.yml
+++ b/.github/workflows/validate.yml
@ -0,0 +1,198 @@
+name: validate
+on:
+  push:
+    tags:
+      - v*
+    branches:
+      - master
+      - release-*
+  pull_request:
+
+jobs:
+
+  lint:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v2
+      - name: install deps
+        run: |
+          sudo apt -q update
+          sudo apt -q install libseccomp-dev
+      - uses: golangci/golangci-lint-action@v2
+        with:
+          # must be specified without patch version
+          version: v1.42
+
+  lint-extra:
+    # Extra linters, only checking new code from pull requests.
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-20.04
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@v2
+      - name: install deps
+        run: |
+          sudo apt -q update
+          sudo apt -q install libseccomp-dev
+      - uses: golangci/golangci-lint-action@v2
+        with:
+          only-new-issues: true
+          args: --config .golangci-extra.yml
+          # must be specified without patch version
+          version: v1.43
+
+
+  compile-buildtags:
+    runs-on: ubuntu-20.04
+    env:
+      # Don't ignore C warnings. Note that the output of "go env CGO_CFLAGS" by default is "-g -O2", so we keep them.
+      CGO_CFLAGS: -g -O2 -Werror
+    steps:
+      - uses: actions/checkout@v2
+      - name: install go
+        uses: actions/setup-go@v2
+        with:
+          go-version: 1.x # Latest stable
+      - name: compile with no build tags
+        run: make BUILDTAGS=""
+
+  codespell:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v2
+    - name: install deps
+      # Version of codespell bundled with Ubuntu is way old, so use pip.
+      run: pip install codespell
+    - name: run codespell
+      run: codespell
+
+  shfmt:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v2
+    - name: vars
+      run: |
+        echo "VERSION=3.3.1" >> $GITHUB_ENV
+        echo "$(go env GOPATH)/bin" >> $GITHUB_PATH
+    - name: cache go mod and $GOCACHE
+      uses: actions/cache@v2
+      with:
+        path: |
+          ~/go/pkg/mod
+          ~/.cache/go-build
+        key: ${{ runner.os }}-shfmt-${{ env.VERSION }}
+        restore-keys: ${{ runner.os }}-shfmt-
+    - name: install shfmt
+      run: |
+        command -v shfmt || \
+          (cd ~ && GO111MODULE=on time go get mvdan.cc/sh/v3/cmd/shfmt@v$VERSION)
+    - name: shfmt
+      run: make shfmt
+
+  shellcheck:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v2
+      - name: vars
+        run: |
+          echo 'VERSION=v0.7.2' >> $GITHUB_ENV
+          echo 'BASEURL=https://github.com/koalaman/shellcheck/releases/download' >> $GITHUB_ENV
+          echo 'SHA256SUM=12ee2e0b90a3d1e9cae24ac9b2838be66b48573cb2c8e8f3c566b959df6f050c' >> $GITHUB_ENV
+          echo ~/bin >> $GITHUB_PATH
+      - name: install shellcheck
+        run: |
+          mkdir ~/bin
+          curl -sSfL --retry 5 $BASEURL/$VERSION/shellcheck-$VERSION.linux.x86_64.tar.xz |
+            tar xfJ - -C ~/bin --strip 1 shellcheck-$VERSION/shellcheck
+          sha256sum ~/bin/shellcheck | grep -q $SHA256SUM
+          # make sure to remove the old version
+          sudo rm -f /usr/bin/shellcheck
+      - uses: lumaxis/shellcheck-problem-matchers@v1
+      - name: shellcheck
+        run: |
+          make shellcheck
+
+  deps:
+    runs-on: ubuntu-20.04
+    steps:
+    - uses: actions/checkout@v2
+    - name: install go
+      uses: actions/setup-go@v2
+      with:
+        go-version: 1.x # Latest stable
+    - name: cache go mod and $GOCACHE
+      uses: actions/cache@v2
+      with:
+        path: |
+          ~/go/pkg/mod
+          ~/.cache/go-build
+        key: ${{ runner.os }}-go.sum-${{ hashFiles('**/go.sum') }}
+        restore-keys: ${{ runner.os }}-go.sum-
+    - name: verify deps
+      run: make verify-dependencies
+
+
+  commit:
+    runs-on: ubuntu-20.04
+    # Only check commits on pull requests.
+    if: github.event_name == 'pull_request'
+    steps:
+      - name: get pr commits
+        id: 'get-pr-commits'
+        uses: tim-actions/get-pr-commits@v1.1.0
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: check subject line length
+        uses: tim-actions/commit-message-checker-with-regex@v0.3.1
+        with:
+          commits: ${{ steps.get-pr-commits.outputs.commits }}
+          pattern: '^.{0,72}(\n.*)*$'
+          error: 'Subject too long (max 72)'
+
+
+  cfmt:
+    runs-on: ubuntu-20.04
+    steps:
+    - name: checkout
+      uses: actions/checkout@v2
+      with:
+        fetch-depth: 0
+    - name: install deps
+      run: |
+        sudo apt -qq update
+        sudo apt -qq install indent
+    - name: cfmt
+      run: |
+        make cfmt
+        git diff --exit-code
+
+
+  release:
+    runs-on: ubuntu-20.04
+    steps:
+    - name: checkout
+      uses: actions/checkout@v2
+      with:
+        fetch-depth: 0
+      # We have to run this under Docker as Ubuntu (host) does not support all
+      # the architectures we want to compile test against, and Dockerfile uses
+      # Debian (which does).
+      #
+      # XXX: as currently this is the only job that is using Docker, we are
+      # building and using the runcimage locally. In case more jobs running
+      # under Docker will emerge, it will be good to have a separate make
+      # runcimage job and share its result (the docker image) with whoever
+      # needs it.
+    - uses: satackey/action-docker-layer-caching@v0.0.11
+      continue-on-error: true
+    - name: build docker image
+      run: make runcimage
+    - name: make releaseall
+      run: make releaseall
+    - name: upload artifacts
+      uses: actions/upload-artifact@v2
+      with:
+        name: release-${{ github.run_id }}
+        path: release/*
--- a/.gitignore
+++ b/.gitignore
@ -2,5 +2,9 @@ vendor/pkg
 /runc
 /runc-*
 contrib/cmd/recvtty/recvtty
+contrib/cmd/sd-helper/sd-helper
+contrib/cmd/seccompagent/seccompagent
 man/man8
 release
+Vagrantfile
+.vagrant
--- a/.golangci-extra.yml
+++ b/.golangci-extra.yml
@ -0,0 +1,15 @@
+# This is golangci-lint config file which is used to check new code in
+# github PRs only (see lint-extra job in .github/workflows/validate.yml).
+#
+# For the default linter config, see .golangci.yml. This config should
+# only enable additional linters not enabled in the default config.
+
+run:
+  build-tags:
+    - seccomp
+
+linters:
+  disable-all: true
+  enable:
+    - godot
+    - revive
--- a/.golangci.yml
+++ b/.golangci.yml
@ -0,0 +1,12 @@
+# For documentation, see https://golangci-lint.run/usage/configuration/
+
+run:
+  build-tags:
+    - seccomp
+
+linters:
+  enable:
+    - gofumpt
+    - errorlint
+    - unconvert
+    - unparam
--- a/.pullapprove.yml
+++ b/.pullapprove.yml
@ -1,10 +0,0 @@
-approve_by_comment: true
-approve_regex: ^LGTM
-reject_regex: ^Rejected
-reset_on_push: true
-author_approval: ignored
-reviewers:
-  teams:
-    - runc-maintainers
-  name: default
-  required: 2
--- a/.travis.yml
+++ b/.travis.yml
@ -1,54 +0,0 @@
-dist: bionic
-language: go
-go:
-  - 1.11.x
-  - 1.12.x
-  - tip
-
-matrix:
-  include:
-    - go: 1.12.x
-      env:
-        - RUNC_USE_SYSTEMD=1
-      script:
-        - make BUILDTAGS="${BUILDTAGS}" all
-        - sudo PATH="$PATH" make localintegration RUNC_USE_SYSTEMD=1
-    - go: 1.12.x
-      env:
-        - VIRTUALBOX_VERSION=6.0
-        - VAGRANT_VERSION=2.2.6
-        - FEDORA_VERSION=31
-      before_install:
-        - cat /proc/cpuinfo
-        - wget -q https://www.virtualbox.org/download/oracle_vbox_2016.asc -O- | sudo apt-key add - && sudo sh -c "echo deb https://download.virtualbox.org/virtualbox/debian $(lsb_release -cs) contrib >> /etc/apt/sources.list" && sudo apt-get update && sudo apt-get install -yq build-essential gcc make linux-headers-$(uname -r) virtualbox-${VIRTUALBOX_VERSION} && sudo usermod -aG vboxusers $(whoami)
-        - wget https://releases.hashicorp.com/vagrant/${VAGRANT_VERSION}/vagrant_${VAGRANT_VERSION}_$(uname -m).deb && sudo dpkg -i vagrant_${VAGRANT_VERSION}_$(uname -m).deb
-        - vagrant init bento/fedora-${FEDORA_VERSION} && vagrant up && mkdir -p ~/.ssh && vagrant ssh-config >> ~/.ssh/config
-        - ssh default sudo dnf install -y podman
-      script:
-        - ssh default sudo podman build -t test /vagrant
-        - ssh default sudo podman run --privileged --cgroupns=private test make localunittest
-  allow_failures:
-    - go: tip
-
-go_import_path: github.com/opencontainers/runc
-
-# `make ci` uses Docker.
-sudo: required
-services:
-  - docker
-
-env:
-  global:
-    - BUILDTAGS="seccomp apparmor selinux ambient"
-
-before_install:
-  - sudo apt-get -qq update
-  - sudo apt-get install -y libseccomp-dev
-  - go get -u golang.org/x/lint/golint
-  - go get -u github.com/vbatts/git-validation
-  - env | grep TRAVIS_
-
-script:
-  - git-validation -run DCO,short-subject -v
-  - make BUILDTAGS="${BUILDTAGS}"
-  - make BUILDTAGS="${BUILDTAGS}" clean ci cross
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -0,0 +1,248 @@
+# Changelog/
+This file documents all notable changes made to this project since runc 1.0.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+## [1.1.0] - 2022-01-14
+
+> A plan depends as much upon execution as it does upon concept.
+
+## Changed
+ * libcontainer will now refuse to build without the nsenter package being
+   correctly compiled (specifically this requires CGO to be enabled). This
+   should avoid folks accidentally creating broken runc binaries (and
+   incorrectly importing our internal libraries into their projects). (#3331)
+
+## [1.1.0-rc.1] - 2021-12-14
+
+> He who controls the spice controls the universe.
+
+### Deprecated
+ * runc run/start now warns if a new container cgroup is non-empty or frozen;
+   this warning will become an error in runc 1.2. (#3132, #3223)
+ * runc can only be built with Go 1.16 or later from this release onwards.
+   (#3100, #3245, #3325)
+
+### Removed
+ * `cgroup.GetHugePageSizes` has been removed entirely, and been replaced with
+   `cgroup.HugePageSizes` which is more efficient. (#3234)
+ * `intelrdt.GetIntelRdtPath` has been removed. Users who were using this
+   function to get the intelrdt root should use the new `intelrdt.Root`
+   instead. (#2920, #3239)
+
+### Added
+ * Add support for RDMA cgroup added in Linux 4.11. (#2883)
+ * runc exec now produces exit code of 255 when the exec failed.
+   This may help in distinguishing between runc exec failures
+   (such as invalid options, non-running container or non-existent
+   binary etc.) and failures of the command being executed. (#3073)
+ * runc run: new `--keep` option to skip removal exited containers artefacts.
+   This might be useful to check the state (e.g. of cgroup controllers) after
+   the container hasexited. (#2817, #2825)
+ * seccomp: add support for `SCMP_ACT_KILL_PROCESS` and `SCMP_ACT_KILL_THREAD`
+   (the latter is just an alias for `SCMP_ACT_KILL`). (#3204)
+ * seccomp: add support for `SCMP_ACT_NOTIFY` (seccomp actions). This allows
+   users to create sophisticated seccomp filters where syscalls can be
+   efficiently emulated by privileged processes on the host. (#2682)
+ * checkpoint/restore: add an option (`--lsm-mount-context`) to set
+   a different LSM mount context on restore. (#3068)
+ * runc releases are now cross-compiled for several architectures. Static
+   builds for said architectures will be available for all future releases.
+   (#3197)
+ * intelrdt: support ClosID parameter. (#2920)
+ * runc exec --cgroup: an option to specify a (non-top) in-container cgroup
+   to use for the process being executed. (#3040, #3059)
+ * cgroup v1 controllers now support hybrid hierarchy (i.e. when on a cgroup v1
+   machine a cgroup2 filesystem is mounted to /sys/fs/cgroup/unified, runc
+   run/exec now adds the container to the appropriate cgroup under it). (#2087,
+   #3059)
+ * sysctl: allow slashes in sysctl names, to better match `sysctl(8)`'s
+   behaviour. (#3254, #3257)
+ * mounts: add support for bind-mounts which are inaccessible after switching
+   the user namespace. Note that this does not permit the container any
+   additional access to the host filesystem, it simply allows containers to
+   have bind-mounts configured for paths the user can access but have
+   restrictive access control settings for other users. (#2576)
+ * Add support for recursive mount attributes using `mount_setattr(2)`. These
+   have the same names as the proposed `mount(8)` options -- just prepend `r`
+   to the option name (such as `rro`). (#3272)
+ * Add `runc features` subcommand to allow runc users to detect what features
+   runc has been built with. This includes critical information such as
+   supported mount flags, hook names, and so on. Note that the output of this
+   command is subject to change and will not be considered stable until runc
+   1.2 at the earliest. The runtime-spec specification for this feature is
+   being developed in [opencontainers/runtime-spec#1130]. (#3296)
+
+[opencontainers/runtime-spec#1130]: https://github.com/opencontainers/runtime-spec/pull/1130
+
+### Changed
+ * system: improve performance of `/proc/$pid/stat` parsing. (#2696)
+ * cgroup2: when `/sys/fs/cgroup` is configured as a read-write mount, change
+   the ownership of certain cgroup control files (as per
+   `/sys/kernel/cgroup/delegate`) to allow for proper deferral to the container
+   process. (#3057)
+ * docs: series of improvements to man pages to make them easier to read and
+   use. (#3032)
+
+#### libcontainer API
+ * internal api: remove internal error types and handling system, switch to Go
+   wrapped errors. (#3033)
+ * New configs.Cgroup structure fields (#3177):
+   * Systemd (whether to use systemd cgroup manager); and
+   * Rootless (whether to use rootless cgroups).
+ * New cgroups/manager package aiming to simplify cgroup manager instantiation.
+   (#3177)
+ * All cgroup managers' instantiation methods now initialize cgroup paths and
+   can return errors. This allows to use any cgroup manager method (e.g.
+   Exists, Destroy, Set, GetStats) right after instantiation, which was not
+   possible before (as paths were initialized in Apply only). (#3178)
+
+### Fixed
+ * nsenter: do not try to close already-closed fds during container setup and
+   bail on close(2) failures. (#3058)
+ * runc checkpoint/restore: fixed for containers with an external bind mount
+   which destination is a symlink. (#3047).
+ * cgroup: improve openat2 handling for cgroup directory handle hardening.
+   (#3030)
+ * `runc delete -f` now succeeds (rather than timing out) on a paused
+   container. (#3134)
+ * runc run/start/exec now refuses a frozen cgroup (paused container in case of
+   exec). Users can disable this using `--ignore-paused`. (#3132, #3223)
+ * config: do not permit null bytes in mount fields. (#3287)
+
+
+## [1.0.3] - 2021-12-06
+
+> If you were waiting for the opportune moment, that was it.
+
+### Security
+ * A potential vulnerability was discovered in runc (related to an internal
+   usage of netlink), however upon further investigation we discovered that
+   while this bug was exploitable on the master branch of runc, no released
+   version of runc could be exploited using this bug. The exploit required being
+   able to create a netlink attribute with a length that would overflow a uint16
+   but this was not possible in any released version of runc. For more
+   information, see [GHSA-v95c-p5hm-xq8f][] and CVE-2021-43784.
+
+### Fixed
+ * Fixed inability to start a container with read-write bind mount of a
+   read-only fuse host mount. (#3283, #3292)
+ * Fixed inability to start when read-only /dev in set in spec (#3276, #3277)
+ * Fixed not removing sub-cgroups upon container delete, when rootless cgroup v2
+   is used with older systemd. (#3226, #3297)
+ * Fixed returning error from GetStats when hugetlb is unsupported (which causes
+   excessive logging for Kubernetes). (#3233, #3295)
+ * Improved an error message when dbus-user-session is not installed and
+   rootless + cgroup2 + systemd are used (#3212)
+
+[GHSA-v95c-p5hm-xq8f]: https://github.com/opencontainers/runc/security/advisories/GHSA-v95c-p5hm-xq8f
+
+
+## [1.0.2] - 2021-07-16
+
+> Given the right lever, you can move a planet.
+
+### Changed
+ * Made release builds reproducible from now on. (#3099, #3142)
+
+### Fixed
+ * Fixed a failure to set CPU quota period in some cases on cgroup v1. (#3090
+   #3115)
+ * Fixed the inability to start a container with the "adding seccomp filter
+   rule for syscall ..." error, caused by redundant seccomp rules (i.e. those
+   that has action equal to the default one). Such redundant rules are now
+   skipped. (#3109, #3129)
+ * Fixed a rare debug log race in runc init, which can result in occasional
+   harmful "failed to decode ..." errors from runc run or exec. (#3120, #3130)
+ * Fixed the check in cgroup v1 systemd manager if a container needs to be
+   frozen before Set, and add a setting to skip such freeze unconditionally.
+   The previous fix for that issue, done in  runc 1.0.1, was not working.
+   (#3166, #3167)
+
+
+## [1.0.1] - 2021-07-16
+
+> If in doubt, Meriadoc, always follow your nose.
+
+### Fixed
+ * Fixed occasional runc exec/run failure ("interrupted system call") on an
+   Azure volume. (#3045, #3074)
+ * Fixed "unable to find groups ... token too long" error with /etc/group
+   containing lines longer than 64K characters. (#3062, #3079)
+ * cgroup/systemd/v1: fix leaving cgroup frozen after Set if a parent cgroup is
+   frozen.  This is a regression in 1.0.0, not affecting runc itself but some
+   of libcontainer users (e.g Kubernetes). (#3081, #3085)
+ * cgroupv2: bpf: Ignore inaccessible existing programs in case of
+   permission error when handling replacement of existing bpf cgroup
+   programs. This fixes a regression in 1.0.0, where some SELinux
+   policies would block runc from being able to run entirely. (#3055, #3087)
+ * cgroup/systemd/v2: don't freeze cgroup on Set. (#3067, #3092)
+ * cgroup/systemd/v1: avoid unnecessary freeze on Set. (#3082, #3093)
+
+
+## [1.0.0] - 2021-06-22
+
+> A wizard is never late, nor is he early, he arrives precisely when he means
+> to.
+
+As runc follows Semantic Versioning, we will endeavour to not make any
+breaking changes without bumping the major version number of runc.
+However, it should be noted that Go API usage of runc's internal
+implementation (libcontainer) is *not* covered by this policy.
+
+### Removed
+ * Removed libcontainer/configs.Device* identifiers (deprecated since rc94,
+   use libcontainer/devices). (#2999)
+ * Removed libcontainer/system.RunningInUserNS function (deprecated since
+   rc94, use libcontainer/userns). (#2999)
+
+### Deprecated
+ * The usage of relative paths for mountpoints will now produce a warning
+   (such configurations are outside of the spec, and in future runc will
+   produce an error when given such configurations). (#2917, #3004)
+
+### Fixed
+ * cgroupv2: devices: rework the filter generation to produce consistent
+   results with cgroupv1, and always clobber any existing eBPF
+   program(s) to fix `runc update` and avoid leaking eBPF programs
+   (resulting in errors when managing containers).  (#2951)
+ * cgroupv2: correctly convert "number of IOs" statistics in a
+   cgroupv1-compatible way. (#2965, #2967, #2968, #2964)
+ * cgroupv2: support larger than 32-bit IO statistics on 32-bit architectures.
+ * cgroupv2: wait for freeze to finish before returning from the freezing
+   code, optimize the method for checking whether a cgroup is frozen. (#2955)
+ * cgroups/systemd: fixed "retry on dbus disconnect" logic introduced in rc94
+ * cgroups/systemd: fixed returning "unit already exists" error from a systemd
+   cgroup manager (regression in rc94) (#2997, #2996)
+
+### Added
+ * cgroupv2: support SkipDevices with systemd driver. (#2958, #3019)
+ * cgroup1: blkio: support BFQ weights. (#3010)
+ * cgroupv2: set per-device io weights if BFQ IO scheduler is available.
+   (#3022)
+
+### Changed
+ * cgroup/systemd: return, not ignore, stop unit error from Destroy (#2946)
+ * Fix all golangci-lint failures. (#2781, #2962)
+ * Make `runc --version` output sane even when built with `go get` or
+   otherwise outside of our build scripts. (#2962)
+ * cgroups: set SkipDevices during runc update (so we don't modify
+   cgroups at all during `runc update`). (#2994)
+
+<!-- minor releases -->
+[Unreleased]: https://github.com/opencontainers/runc/compare/v1.1.0...HEAD
+[1.1.0]: https://github.com/opencontainers/runc/compare/v1.1.0-rc.1...v1.1.0
+[1.0.0]: https://github.com/opencontainers/runc/releases/tag/v1.0.0
+
+<!-- 1.0.z patch releases -->
+[Unreleased 1.0.z]: https://github.com/opencontainers/runc/compare/v1.0.3...release-1.0
+[1.0.3]: https://github.com/opencontainers/runc/compare/v1.0.2...v1.0.3
+[1.0.2]: https://github.com/opencontainers/runc/compare/v1.0.1...v1.0.2
+[1.0.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.0.1
+
+<!-- 1.1.z patch releases -->
+[Unreleased 1.1.z]: https://github.com/opencontainers/runc/compare/v1.1.0...release-1.1
+[1.1.0-rc.1]: https://github.com/opencontainers/runc/compare/v1.0.0...v1.1.0-rc.1
--- a/96
+++ b/96
@ -1,34 +1,41 @@
-FROM golang:1.12-stretch
+ARG GO_VERSION=1.17
+ARG BATS_VERSION=v1.3.0
+ARG LIBSECCOMP_VERSION=2.5.3

-RUN dpkg --add-architecture armel \
+FROM golang:${GO_VERSION}-bullseye
+ARG DEBIAN_FRONTEND=noninteractive
+ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debian_11
+
+RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
+    wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \
+    && echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \
+    && dpkg --add-architecture armel \
    && dpkg --add-architecture armhf \
    && dpkg --add-architecture arm64 \
    && dpkg --add-architecture ppc64el \
-    && apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    sudo \
-    gawk \
-    iptables \
-    jq \
-    pkg-config \
-    libaio-dev \
-    libcap-dev \
-    libprotobuf-dev \
-    libprotobuf-c0-dev \
-    libnl-3-dev \
-    libnet-dev \
-    libseccomp2 \
-    libseccomp-dev \
-    protobuf-c-compiler \
-    protobuf-compiler \
-    python-minimal \
-    uidmap \
-    kmod \
-    crossbuild-essential-armel crossbuild-essential-armhf crossbuild-essential-arm64 crossbuild-essential-ppc64el \
-    libseccomp-dev:armel libseccomp-dev:armhf libseccomp-dev:arm64 libseccomp-dev:ppc64el \
-    --no-install-recommends \
-    && apt-get clean
+    && apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        criu \
+        crossbuild-essential-arm64 \
+        crossbuild-essential-armel \
+        crossbuild-essential-armhf \
+        crossbuild-essential-ppc64el \
+        crossbuild-essential-s390x \
+        curl \
+        gawk \
+        gcc \
+        gperf \
+        iptables \
+        jq \
+        kmod \
+        pkg-config \
+        python3-minimal \
+        sshfs \
+        sudo \
+        uidmap \
+    && apt-get clean \
+    && rm -rf /var/cache/apt /var/lib/apt/lists/* /etc/apt/sources.list.d/*.list

 # Add a dummy user for the rootless integration tests. While runC does
 # not require an entry in /etc/passwd to operate, one of the tests uses
@ -37,30 +44,21 @@ RUN dpkg --add-architecture armel \
 RUN useradd -u1000 -m -d/home/rootless -s/bin/bash rootless

 # install bats
+ARG BATS_VERSION
 RUN cd /tmp \
-    && git clone https://github.com/sstephenson/bats.git \
-    && cd bats \
-    && git reset --hard 03608115df2071fff4eaaff1605768c275e5f81f \
+    && git clone https://github.com/bats-core/bats-core.git \
+    && cd bats-core \
+    && git reset --hard "${BATS_VERSION}" \
    && ./install.sh /usr/local \
-    && rm -rf /tmp/bats
+    && rm -rf /tmp/bats-core

-# install criu
-ENV CRIU_VERSION v3.12
-RUN mkdir -p /usr/src/criu \
-    && curl -sSL https://github.com/checkpoint-restore/criu/archive/${CRIU_VERSION}.tar.gz | tar -v -C /usr/src/criu/ -xz --strip-components=1 \
-    && cd /usr/src/criu \
-    && make install-criu \
-    && rm -rf /usr/src/criu
+# install libseccomp
+ARG LIBSECCOMP_VERSION
+COPY script/* /tmp/script/
+RUN mkdir -p /opt/libseccomp \
+    && /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le s390x
+ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION
+ENV LD_LIBRARY_PATH=/opt/libseccomp/lib
+ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig

-# setup a playground for us to spawn containers in
-ENV ROOTFS /busybox
-RUN mkdir -p ${ROOTFS}
-
-COPY script/tmpmount /
 WORKDIR /go/src/github.com/opencontainers/runc
-ENTRYPOINT ["/tmpmount"]
-
-ADD . /go/src/github.com/opencontainers/runc
-
-RUN . tests/integration/multi-arch.bash \
-    && curl -o- -sSL `get_busybox` | tar xfJC - ${ROOTFS}
--- a/EMERITUS.md
+++ b/EMERITUS.md
@ -0,0 +1,11 @@
+## Emeritus ##
+
+We would like to acknowledge previous runc maintainers and their huge
+contributions to our collective success:
+
+ * Alexander Morozov (@lk4d4)
+ * Andrei Vagin (@avagin)
+ * Rohit Jnagal (@rjnagal)
+ * Victor Marmol (@vmarmol)
+
+We thank these members for their service to the OCI community.
--- a/7
+++ b/7
@ -1,5 +1,8 @@
-Michael Crosby <michael@docker.com> (@crosbymichael)
+Michael Crosby <michael@thepasture.io> (@crosbymichael)
 Mrunal Patel <mpatel@redhat.com> (@mrunalp)
 Daniel, Dao Quang Minh <dqminh89@gmail.com> (@dqminh)
 Qiang Huang <h.huangqiang@huawei.com> (@hqhq)
-Aleksa Sarai <asarai@suse.de> (@cyphar)
+Aleksa Sarai <cyphar@cyphar.com> (@cyphar)
+Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp> (@AkihiroSuda)
+Kir Kolyshkin <kolyshkin@gmail.com> (@kolyshkin)
+Sebastiaan van Stijn <github@gone.nl> (@thaJeztah)
--- a/175
+++ b/175
@ -1,133 +1,158 @@
-.PHONY: all shell dbuild man release \
-	    localtest localunittest localintegration \
-	    test unittest integration \
-	    cross localcross
-
 CONTAINER_ENGINE := docker
-GO := go
+GO ?= go

-SOURCES := $(shell find . 2>&1 | grep -E '.*\.(c|h|go)$$')
-PREFIX := $(DESTDIR)/usr/local
+PREFIX ?= /usr/local
 BINDIR := $(PREFIX)/sbin
+MANDIR := $(PREFIX)/share/man
+
 GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null)
 GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
 RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
 PROJECT := github.com/opencontainers/runc
 BUILDTAGS ?= seccomp
-COMMIT_NO := $(shell git rev-parse HEAD 2> /dev/null || true)
-COMMIT ?= $(if $(shell git status --porcelain --untracked-files=no),"${COMMIT_NO}-dirty","${COMMIT_NO}")
+COMMIT ?= $(shell git describe --dirty --long --always)
+VERSION := $(shell cat ./VERSION)

-MAN_DIR := $(CURDIR)/man/man8
-MAN_PAGES = $(shell ls $(MAN_DIR)/*.8)
-MAN_PAGES_BASE = $(notdir $(MAN_PAGES))
-MAN_INSTALL_PATH := ${PREFIX}/share/man/man8/
+ifeq ($(shell $(GO) env GOOS),linux)
+	ifeq (,$(filter $(shell $(GO) env GOARCH),mips mipsle mips64 mips64le ppc64))
+		ifeq (,$(findstring -race,$(EXTRA_FLAGS)))
+			GO_BUILDMODE := "-buildmode=pie"
+		endif
+	endif
+endif
+GO_BUILD := $(GO) build -trimpath $(GO_BUILDMODE) $(EXTRA_FLAGS) -tags "$(BUILDTAGS)" \
+	-ldflags "-X main.gitCommit=$(COMMIT) -X main.version=$(VERSION) $(EXTRA_LDFLAGS)"
+GO_BUILD_STATIC := CGO_ENABLED=1 $(GO) build -trimpath $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" \
+	-ldflags "-extldflags -static -X main.gitCommit=$(COMMIT) -X main.version=$(VERSION) $(EXTRA_LDFLAGS)"

-RELEASE_DIR := $(CURDIR)/release
-
-VERSION := ${shell cat ./VERSION}
-
-SHELL := $(shell command -v bash 2>/dev/null)
+GPG_KEYID ?= asarai@suse.de

 .DEFAULT: runc

-runc: $(SOURCES)
-	$(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc .
+runc:
+	$(GO_BUILD) -o runc .

-all: runc recvtty
+all: runc recvtty sd-helper seccompagent

-recvtty: contrib/cmd/recvtty/recvtty
+recvtty sd-helper seccompagent:
+	$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@

-contrib/cmd/recvtty/recvtty: $(SOURCES)
-	$(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty
+static:
+	$(GO_BUILD_STATIC) -o runc .

-static: $(SOURCES)
-	CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o runc .
-	CGO_ENABLED=1 $(GO) build $(EXTRA_FLAGS) -tags "$(BUILDTAGS) netgo osusergo" -installsuffix netgo -ldflags "-w -extldflags -static -X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty
+releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a s390x"
+releaseall: release

-release:
-	script/release.sh -r release/$(VERSION) -v $(VERSION)
+release: runcimage
+	$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
+		--rm -v $(CURDIR):/go/src/$(PROJECT) \
+		-e RELEASE_ARGS=$(RELEASE_ARGS) \
+		$(RUNC_IMAGE) make localrelease
+	script/release_sign.sh -S $(GPG_KEYID) -r release/$(VERSION) -v $(VERSION)
+
+localrelease:
+	script/release_build.sh -r release/$(VERSION) -v $(VERSION) $(RELEASE_ARGS)

 dbuild: runcimage
-	$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} --rm -v $(CURDIR):/go/src/$(PROJECT) --privileged $(RUNC_IMAGE) make clean all
+	$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
+		--privileged --rm \
+		-v $(CURDIR):/go/src/$(PROJECT) \
+		$(RUNC_IMAGE) make clean all

 lint:
-	$(GO) vet $(allpackages)
-	$(GO) fmt $(allpackages)
+	golangci-lint run ./...

 man:
 	man/md2man-all.sh

 runcimage:
-	$(CONTAINER_ENGINE) build ${CONTAINER_ENGINE_BUILD_FLAGS} -t $(RUNC_IMAGE) .
+	$(CONTAINER_ENGINE) build $(CONTAINER_ENGINE_BUILD_FLAGS) -t $(RUNC_IMAGE) .

-test:
-	make unittest integration rootlessintegration
+test: unittest integration rootlessintegration

-localtest:
-	make localunittest localintegration localrootlessintegration
+localtest: localunittest localintegration localrootlessintegration

 unittest: runcimage
-	$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localunittest TESTFLAGS=${TESTFLAGS}
+	$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
+		-t --privileged --rm \
+		-v /lib/modules:/lib/modules:ro \
+		-v $(CURDIR):/go/src/$(PROJECT) \
+		$(RUNC_IMAGE) make localunittest TESTFLAGS=$(TESTFLAGS)

 localunittest: all
-	$(GO) test -timeout 3m -tags "$(BUILDTAGS)" ${TESTFLAGS} -v $(allpackages)
+	$(GO) test -timeout 3m -tags "$(BUILDTAGS)" $(TESTFLAGS) -v ./...

 integration: runcimage
-	$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v /lib/modules:/lib/modules:ro -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localintegration TESTPATH=${TESTPATH}
+	$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
+		-t --privileged --rm \
+		-v /lib/modules:/lib/modules:ro \
+		-v $(CURDIR):/go/src/$(PROJECT) \
+		$(RUNC_IMAGE) make localintegration TESTPATH=$(TESTPATH)

 localintegration: all
-	bats -t tests/integration${TESTPATH}
+	bats -t tests/integration$(TESTPATH)

 rootlessintegration: runcimage
-	$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -t --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localrootlessintegration
+	$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
+		-t --privileged --rm \
+		-v $(CURDIR):/go/src/$(PROJECT) \
+		-e ROOTLESS_TESTPATH \
+		$(RUNC_IMAGE) make localrootlessintegration

 localrootlessintegration: all
 	tests/rootless.sh

 shell: runcimage
-	$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -ti --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) bash
+	$(CONTAINER_ENGINE) run $(CONTAINER_ENGINE_RUN_FLAGS) \
+		-ti --privileged --rm \
+		-v $(CURDIR):/go/src/$(PROJECT) \
+		$(RUNC_IMAGE) bash

 install:
-	install -D -m0755 runc $(BINDIR)/runc
+	install -D -m0755 runc $(DESTDIR)$(BINDIR)/runc

 install-bash:
-	install -D -m0644 contrib/completions/bash/runc $(PREFIX)/share/bash-completion/completions/runc
+	install -D -m0644 contrib/completions/bash/runc $(DESTDIR)$(PREFIX)/share/bash-completion/completions/runc

-install-man:
-	install -d -m 755 $(MAN_INSTALL_PATH)
-	install -m 644 $(MAN_PAGES) $(MAN_INSTALL_PATH)
-
-uninstall:
-	rm -f $(BINDIR)/runc
-
-uninstall-bash:
-	rm -f $(PREFIX)/share/bash-completion/completions/runc
-
-uninstall-man:
-	rm -f $(addprefix $(MAN_INSTALL_PATH),$(MAN_PAGES_BASE))
+install-man: man
+	install -d -m 755 $(DESTDIR)$(MANDIR)/man8
+	install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8

 clean:
 	rm -f runc runc-*
 	rm -f contrib/cmd/recvtty/recvtty
-	rm -rf $(RELEASE_DIR)
-	rm -rf $(MAN_DIR)
+	rm -f contrib/cmd/sd-helper/sd-helper
+	rm -f contrib/cmd/seccompagent/seccompagent
+	rm -rf release
+	rm -rf man/man8

-validate:
-	script/validate-gofmt
-	script/validate-c
-	$(GO) vet $(allpackages)
+cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
+cfmt:
+	indent -linux -l120 -il0 -ppi2 -cp1 -T size_t -T jmp_buf $(C_SRC)

-ci: validate test release
+shellcheck:
+	shellcheck tests/integration/*.bats tests/integration/*.sh \
+		tests/integration/*.bash tests/*.sh \
+		script/release_*.sh script/seccomp.sh script/lib.sh
+	# TODO: add shellcheck for more sh files

-cross: runcimage
-	$(CONTAINER_ENGINE) run ${CONTAINER_ENGINE_RUN_FLAGS} -e BUILDTAGS="$(BUILDTAGS)" --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localcross
+shfmt:
+	shfmt -ln bats -d -w tests/integration/*.bats
+	shfmt -ln bash -d -w man/*.sh script/* tests/*.sh tests/integration/*.bash

-localcross:
-	CGO_ENABLED=1 GOARCH=arm GOARM=6 CC=arm-linux-gnueabi-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armel .
-	CGO_ENABLED=1 GOARCH=arm GOARM=7 CC=arm-linux-gnueabihf-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-armhf .
-	CGO_ENABLED=1 GOARCH=arm64 CC=aarch64-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-arm64 .
-	CGO_ENABLED=1 GOARCH=ppc64le CC=powerpc64le-linux-gnu-gcc $(GO) build -buildmode=pie $(EXTRA_FLAGS) -ldflags "-X main.gitCommit=${COMMIT} -X main.version=${VERSION} $(EXTRA_LDFLAGS)" -tags "$(BUILDTAGS)" -o runc-ppc64le .
+vendor:
+	$(GO) mod tidy
+	$(GO) mod vendor
+	$(GO) mod verify

-# memoize allpackages, so that it's executed only once and only if used
-_allpackages = $(shell $(GO) list ./... | grep -v vendor)
-allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages)
+verify-dependencies: vendor
+	@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
+		|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
+		&& echo "all vendor files are up to date."
+
+.PHONY: runc all recvtty sd-helper seccompagent static releaseall release \
+	localrelease dbuild lint man runcimage \
+	test localtest unittest localunittest integration localintegration \
+	rootlessintegration localrootlessintegration shell install install-bash \
+	install-man clean cfmt shfmt shellcheck \
+	vendor verify-dependencies
--- a/README.md
+++ b/README.md
@ -1,39 +1,33 @@
 # runc

-[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc)
 [![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
 [![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
+[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588)
+[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate)
+[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci)

 ## Introduction

-`runc` is a CLI tool for spawning and running containers according to the OCI specification.
+`runc` is a CLI tool for spawning and running containers on Linux according to the OCI specification.

 ## Releases

-`runc` depends on and tracks the [runtime-spec](https://github.com/opencontainers/runtime-spec) repository.
-We will try to make sure that `runc` and the OCI specification major versions stay in lockstep.
-This means that `runc` 1.0.0 should implement the 1.0 version of the specification.
-
 You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.

-Currently, the following features are not considered to be production-ready:
-
-* Support for cgroup v2
-
 ## Security

-The reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/).
+The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
+
+### Security Audit
+A third party security audit was performed by Cure53, you can see the full report [here](https://github.com/opencontainers/runc/blob/master/docs/Security-Audit.pdf).

 ## Building

-`runc` currently supports the Linux platform with various architecture support.
-It must be built with Go version 1.6 or higher in order for some features to function properly.
+`runc` only supports Linux. It must be built with Go version 1.16 or higher.

 In order to enable seccomp support you will need to install `libseccomp` on your platform.
 > e.g. `libseccomp-devel` for CentOS, or `libseccomp-dev` for Ubuntu

-Otherwise, if you do not want to build `runc` with seccomp support you can add `BUILDTAGS=""` when running make.
-
 ```bash
 # create a 'github.com/opencontainers' in your GOPATH/src
 cd github.com/opencontainers
@ -58,21 +52,24 @@ sudo make install

 #### Build Tags

-`runc` supports optional build tags for compiling support of various features.
-To add build tags to the make option the `BUILDTAGS` variable must be set.
+`runc` supports optional build tags for compiling support of various features,
+with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`).
+
+To change build tags from the default, set the `BUILDTAGS` variable for make,
+e.g. to disable seccomp:

 ```bash
-make BUILDTAGS='seccomp apparmor'
+make BUILDTAGS=""
 ```

-| Build Tag | Feature                            | Dependency  |
-|-----------|------------------------------------|-------------|
-| seccomp   | Syscall filtering                  | libseccomp  |
-| selinux   | selinux process and mount labeling | <none>      |
-| apparmor  | apparmor profile support           | <none>      |
-| ambient   | ambient capability support         | kernel 4.3  |
-| nokmem    | disable kernel memory account      | <none>      |
+| Build Tag | Feature                            | Enabled by default | Dependency |
+|-----------|------------------------------------|--------------------|------------|
+| seccomp   | Syscall filtering                  | yes                | libseccomp |

+The following build tags were used earlier, but are now obsoleted:
+ - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
+ - **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
+ - **selinux**  (since runc v1.0.0-rc93 the feature is always enabled)

 ### Running the test suite

@ -97,20 +94,41 @@ You can run a specific integration test by setting the `TESTPATH` variable.
 # make test TESTPATH="/checkpoint.bats"
 ```

-You can run a test in your proxy environment by setting `DOCKER_BUILD_PROXY` and `DOCKER_RUN_PROXY` variables.
+You can run a specific rootless integration test by setting the `ROOTLESS_TESTPATH` variable.

 ```bash
-# make test DOCKER_BUILD_PROXY="--build-arg HTTP_PROXY=http://yourproxy/" DOCKER_RUN_PROXY="-e HTTP_PROXY=http://yourproxy/"
+# make test ROOTLESS_TESTPATH="/checkpoint.bats"
+```
+
+You can run a test using your container engine's flags by setting `CONTAINER_ENGINE_BUILD_FLAGS` and `CONTAINER_ENGINE_RUN_FLAGS` variables.
+
+```bash
+# make test CONTAINER_ENGINE_BUILD_FLAGS="--build-arg http_proxy=http://yourproxy/" CONTAINER_ENGINE_RUN_FLAGS="-e http_proxy=http://yourproxy/"
 ```

 ### Dependencies Management

-`runc` uses [vndr](https://github.com/LK4D4/vndr) for dependencies management.
-Please refer to [vndr](https://github.com/LK4D4/vndr) for how to add or update
+`runc` uses [Go Modules](https://github.com/golang/go/wiki/Modules) for dependencies management.
+Please refer to [Go Modules](https://github.com/golang/go/wiki/Modules) for how to add or update
 new dependencies.

+```
+# Update vendored dependencies
+make vendor
+# Verify all dependencies
+make verify-dependencies
+```
+
 ## Using runc

+Please note that runc is a low level tool not designed with an end user
+in mind. It is mostly employed by other higher level container software.
+
+Therefore, unless there is some specific use case that prevents the use
+of tools like Docker or Podman, it is not recommended to use runc directly.
+
+If you still want to use runc, here's how.
+
 ### Creating an OCI Bundle

 In order to use runc you must have your container in the format of an OCI bundle.
@ -152,7 +170,9 @@ If you used the unmodified `runc spec` template this should give you a `sh` sess

 The second way to start a container is using the specs lifecycle operations.
 This gives you more power over how the container is created and managed while it is running.
-This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here.
+This will also launch the container in the background so you will have to edit
+the `config.json` to remove the `terminal` setting for the simple examples
+below (see more details about [runc terminal handling](docs/terminals.md)).
 Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`.


@ -275,6 +295,14 @@ PIDFile=/run/mycontainerid.pid
 WantedBy=multi-user.target
 ```

+## More documentation
+
+* [cgroup v2](./docs/cgroup-v2.md)
+* [Checkpoint and restore](./docs/checkpoint-restore.md)
+* [systemd cgroup driver](./docs/systemd.md)
+* [Terminals and standard IO](./docs/terminals.md)
+* [Experimental features](./docs/experimental.md)
+
 ## License

 The code and docs are released under the [Apache 2.0 license](LICENSE).
--- a/SECURITY.md
+++ b/SECURITY.md
@ -1,3 +1,3 @@
 # Security

-The reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/).
+The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
--- a/2
+++ b/2
@ -1 +1 @@
-1.0.0-rc10
+1.1.0
--- a/Vagrantfile.fedora
+++ b/Vagrantfile.fedora
@ -0,0 +1,52 @@
+# -*- mode: ruby -*-
+# vi: set ft=ruby :
+
+Vagrant.configure("2") do |config|
+# Fedora box is used for testing cgroup v2 support
+  config.vm.box = "fedora/35-cloud-base"
+  config.vm.provider :virtualbox do |v|
+    v.memory = 2048
+    v.cpus = 2
+  end
+  config.vm.provider :libvirt do |v|
+    v.memory = 2048
+    v.cpus = 2
+  end
+  config.vm.provision "shell", inline: <<-SHELL
+    set -e -u -o pipefail
+    # Work around dnf mirror failures by retrying a few times
+    for i in $(seq 0 2); do
+      sleep $i
+      # "config exclude" dnf shell command is not working in Fedora 35
+      # (see https://bugzilla.redhat.com/show_bug.cgi?id=2022571);
+      # the workaround is to specify it as an option.
+      cat << EOF | dnf -y --exclude=kernel,kernel-core shell && break
+config install_weak_deps false
+update
+install iptables gcc make golang-go glibc-static libseccomp-devel bats jq git-core criu fuse-sshfs
+ts run
+EOF
+    done
+    dnf clean all
+
+    # Add a user for rootless tests
+    useradd -u2000 -m -d/home/rootless -s/bin/bash rootless
+
+    # Allow root and rootless itself to execute `ssh rootless@localhost` in tests/rootless.sh
+    ssh-keygen -t ecdsa -N "" -f /root/rootless.key
+    mkdir -m 0700 -p /home/rootless/.ssh
+    cp /root/rootless.key /home/rootless/.ssh/id_ecdsa
+    cat /root/rootless.key.pub >> /home/rootless/.ssh/authorized_keys
+    chown -R rootless.rootless /home/rootless
+
+    # Delegate cgroup v2 controllers to rootless user via --systemd-cgroup
+    mkdir -p /etc/systemd/system/user@.service.d
+    cat > /etc/systemd/system/user@.service.d/delegate.conf << EOF
+[Service]
+# default: Delegate=pids memory
+# NOTE: delegation of cpuset requires systemd >= 244 (Fedora >= 32, Ubuntu >= 20.04).
+Delegate=yes
+EOF
+    systemctl daemon-reload
+  SHELL
+end
--- a/checkpoint.go
+++ b/checkpoint.go
@ -1,19 +1,19 @@
-// +build linux
-
 package main

 import (
+	"errors"
 	"fmt"
+	"net"
 	"os"
+	"path/filepath"
 	"strconv"
-	"strings"

+	criu "github.com/checkpoint-restore/go-criu/v5/rpc"
 	"github.com/opencontainers/runc/libcontainer"
-	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/userns"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/sirupsen/logrus"
 	"github.com/urfave/cli"
-
 	"golang.org/x/sys/unix"
 )

@ -34,7 +34,7 @@ checkpointed.`,
 		cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external unix sockets"},
 		cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"},
 		cli.BoolFlag{Name: "lazy-pages", Usage: "use userfaultfd to lazily restore memory pages"},
-		cli.StringFlag{Name: "status-fd", Value: "", Usage: "criu writes \\0 to this FD once lazy-pages is ready"},
+		cli.IntFlag{Name: "status-fd", Value: -1, Usage: "criu writes \\0 to this FD once lazy-pages is ready"},
 		cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"},
 		cli.BoolFlag{Name: "file-locks", Usage: "handle file locks, for safety"},
 		cli.BoolFlag{Name: "pre-dump", Usage: "dump container's memory information only, leave the container running after this"},
@ -47,7 +47,7 @@ checkpointed.`,
 			return err
 		}
 		// XXX: Currently this is untested with rootless containers.
-		if os.Geteuid() != 0 || system.RunningInUserNS() {
+		if os.Geteuid() != 0 || userns.RunningInUserNS() {
 			logrus.Warn("runc checkpoint is untested with rootless containers")
 		}

@ -60,10 +60,13 @@ checkpointed.`,
 			return err
 		}
 		if status == libcontainer.Created || status == libcontainer.Stopped {
-			fatalf("Container cannot be checkpointed in %s state", status.String())
+			fatal(fmt.Errorf("Container cannot be checkpointed in %s state", status.String()))
 		}
-		defer destroy(container)
 		options := criuOptions(context)
+		if !(options.LeaveRunning || options.PreDump) {
+			// destroy container unless we tell CRIU to keep it
+			defer destroy(container)
+		}
 		// these are the mandatory criu options for a container
 		setPageServer(context, options)
 		setManageCgroupsMode(context, options)
@ -74,28 +77,53 @@ checkpointed.`,
 	},
 }

-func getCheckpointImagePath(context *cli.Context) string {
+func prepareImagePaths(context *cli.Context) (string, string, error) {
 	imagePath := context.String("image-path")
 	if imagePath == "" {
-		imagePath = getDefaultImagePath(context)
+		imagePath = getDefaultImagePath()
 	}
-	return imagePath
+
+	if err := os.MkdirAll(imagePath, 0o600); err != nil {
+		return "", "", err
+	}
+
+	parentPath := context.String("parent-path")
+	if parentPath == "" {
+		return imagePath, parentPath, nil
+	}
+
+	if filepath.IsAbs(parentPath) {
+		return "", "", errors.New("--parent-path must be relative")
+	}
+
+	realParent := filepath.Join(imagePath, parentPath)
+	fi, err := os.Stat(realParent)
+	if err == nil && !fi.IsDir() {
+		err = &os.PathError{Path: realParent, Err: unix.ENOTDIR}
+	}
+
+	if err != nil {
+		return "", "", fmt.Errorf("invalid --parent-path: %w", err)
+	}
+
+	return imagePath, parentPath, nil
 }

 func setPageServer(context *cli.Context, options *libcontainer.CriuOpts) {
 	// xxx following criu opts are optional
 	// The dump image can be sent to a criu page server
 	if psOpt := context.String("page-server"); psOpt != "" {
-		addressPort := strings.Split(psOpt, ":")
-		if len(addressPort) != 2 {
-			fatal(fmt.Errorf("Use --page-server ADDRESS:PORT to specify page server"))
+		address, port, err := net.SplitHostPort(psOpt)
+
+		if err != nil || address == "" || port == "" {
+			fatal(errors.New("Use --page-server ADDRESS:PORT to specify page server"))
 		}
-		portInt, err := strconv.Atoi(addressPort[1])
+		portInt, err := strconv.Atoi(port)
 		if err != nil {
-			fatal(fmt.Errorf("Invalid port number"))
+			fatal(errors.New("Invalid port number"))
 		}
 		options.PageServer = libcontainer.CriuPageServerInfo{
-			Address: addressPort[0],
+			Address: address,
 			Port:    int32(portInt),
 		}
 	}
@ -105,13 +133,13 @@ func setManageCgroupsMode(context *cli.Context, options *libcontainer.CriuOpts)
 	if cgOpt := context.String("manage-cgroups-mode"); cgOpt != "" {
 		switch cgOpt {
 		case "soft":
-			options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_SOFT
+			options.ManageCgroupsMode = criu.CriuCgMode_SOFT
 		case "full":
-			options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_FULL
+			options.ManageCgroupsMode = criu.CriuCgMode_FULL
 		case "strict":
-			options.ManageCgroupsMode = libcontainer.CRIU_CG_MODE_STRICT
+			options.ManageCgroupsMode = criu.CriuCgMode_STRICT
 		default:
-			fatal(fmt.Errorf("Invalid manage cgroups mode"))
+			fatal(errors.New("Invalid manage cgroups mode"))
 		}
 	}
 }
--- a/contrib/cmd/recvtty/recvtty.go
+++ b/contrib/cmd/recvtty/recvtty.go
@ -17,12 +17,13 @@
 package main

 import (
+	"errors"
 	"fmt"
 	"io"
-	"io/ioutil"
 	"net"
 	"os"
 	"strings"
+	"sync"

 	"github.com/containerd/console"
 	"github.com/opencontainers/runc/libcontainer/utils"
@ -65,7 +66,7 @@ func bail(err error) {
 	os.Exit(1)
 }

-func handleSingle(path string) error {
+func handleSingle(path string, noStdin bool) error {
 	// Open a socket.
 	ln, err := net.Listen("unix", path)
 	if err != nil {
@ -87,7 +88,7 @@ func handleSingle(path string) error {
 	// Get the fd of the connection.
 	unixconn, ok := conn.(*net.UnixConn)
 	if !ok {
-		return fmt.Errorf("failed to cast to unixconn")
+		return errors.New("failed to cast to unixconn")
 	}

 	socket, err := unixconn.File()
@ -105,23 +106,37 @@ func handleSingle(path string) error {
 	if err != nil {
 		return err
 	}
-	console.ClearONLCR(c.Fd())
+	if err := console.ClearONLCR(c.Fd()); err != nil {
+		return err
+	}

 	// Copy from our stdio to the master fd.
-	quitChan := make(chan struct{})
+	var (
+		wg            sync.WaitGroup
+		inErr, outErr error
+	)
+	wg.Add(1)
 	go func() {
-		io.Copy(os.Stdout, c)
-		quitChan <- struct{}{}
-	}()
-	go func() {
-		io.Copy(c, os.Stdin)
-		quitChan <- struct{}{}
+		_, outErr = io.Copy(os.Stdout, c)
+		wg.Done()
 	}()
+	if !noStdin {
+		wg.Add(1)
+		go func() {
+			_, inErr = io.Copy(c, os.Stdin)
+			wg.Done()
+		}()
+	}

 	// Only close the master fd once we've stopped copying.
-	<-quitChan
+	wg.Wait()
 	c.Close()
-	return nil
+
+	if outErr != nil {
+		return outErr
+	}
+
+	return inErr
 }

 func handleNull(path string) error {
@ -161,15 +176,7 @@ func handleNull(path string) error {
 				return
 			}

-			// Just do a dumb copy to /dev/null.
-			devnull, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
-			if err != nil {
-				// TODO: Handle this nicely.
-				return
-			}
-
-			io.Copy(devnull, master)
-			devnull.Close()
+			_, _ = io.Copy(io.Discard, master)
 		}(conn)
 	}
 }
@ -185,7 +192,7 @@ func main() {
 		v = append(v, version)
 	}
 	if gitCommit != "" {
-		v = append(v, fmt.Sprintf("commit: %s", gitCommit))
+		v = append(v, "commit: "+gitCommit)
 	}
 	app.Version = strings.Join(v, "\n")

@ -201,26 +208,31 @@ func main() {
 			Value: "",
 			Usage: "Path to write daemon process ID to",
 		},
+		cli.BoolFlag{
+			Name:  "no-stdin",
+			Usage: "Disable stdin handling (no-op for null mode)",
+		},
 	}

 	app.Action = func(ctx *cli.Context) error {
 		args := ctx.Args()
 		if len(args) != 1 {
-			return fmt.Errorf("need to specify a single socket path")
+			return errors.New("need to specify a single socket path")
 		}
 		path := ctx.Args()[0]

 		pidPath := ctx.String("pid-file")
 		if pidPath != "" {
 			pid := fmt.Sprintf("%d\n", os.Getpid())
-			if err := ioutil.WriteFile(pidPath, []byte(pid), 0644); err != nil {
+			if err := os.WriteFile(pidPath, []byte(pid), 0o644); err != nil {
 				return err
 			}
 		}

+		noStdin := ctx.Bool("no-stdin")
 		switch ctx.String("mode") {
 		case "single":
-			if err := handleSingle(path); err != nil {
+			if err := handleSingle(path, noStdin); err != nil {
 				return err
 			}
 		case "null":
--- a/contrib/cmd/sd-helper/helper.go
+++ b/contrib/cmd/sd-helper/helper.go
@ -0,0 +1,86 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"os"
+
+	"github.com/sirupsen/logrus"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+func usage() {
+	fmt.Print(`Open Container Initiative contrib/cmd/sd-helper
+
+sd-helper is a tool that uses runc/libcontainer/cgroups/systemd package
+functionality to communicate to systemd in order to perform various operations.
+Currently this is limited to starting and stopping systemd transient slice
+units.
+
+Usage:
+	sd-helper [-debug] [-parent <pname>] {start|stop} <name>
+
+Example:
+	sd-helper -parent system.slice start system-pod123.slice
+`)
+	os.Exit(1)
+}
+
+var (
+	debug  = flag.Bool("debug", false, "enable debug output")
+	parent = flag.String("parent", "", "parent unit name")
+)
+
+func main() {
+	if !systemd.IsRunningSystemd() {
+		logrus.Fatal("systemd is required")
+	}
+
+	// Set the flags.
+	flag.Parse()
+	if *debug {
+		logrus.SetLevel(logrus.DebugLevel)
+	}
+	if flag.NArg() != 2 {
+		usage()
+	}
+
+	cmd := flag.Arg(0)
+	unit := flag.Arg(1)
+
+	err := unitCommand(cmd, unit, *parent)
+	if err != nil {
+		logrus.Fatal(err)
+	}
+}
+
+func newManager(config *configs.Cgroup) (cgroups.Manager, error) {
+	if cgroups.IsCgroup2UnifiedMode() {
+		return systemd.NewUnifiedManager(config, "")
+	}
+	return systemd.NewLegacyManager(config, nil)
+}
+
+func unitCommand(cmd, name, parent string) error {
+	podConfig := &configs.Cgroup{
+		Name:      name,
+		Parent:    parent,
+		Resources: &configs.Resources{},
+	}
+	pm, err := newManager(podConfig)
+	if err != nil {
+		return err
+	}
+
+	switch cmd {
+	case "start":
+		return pm.Apply(-1)
+	case "stop":
+		return pm.Destroy()
+	}
+
+	return fmt.Errorf("unknown command: %s", cmd)
+}
--- a/contrib/cmd/seccompagent/README.md
+++ b/contrib/cmd/seccompagent/README.md
@ -0,0 +1,70 @@
+# Seccomp Agent
+
+## Warning
+
+Please note this is an example agent, as such it is possible that specially
+crafted messages can produce bad behaviour. Please use it as an example only.
+
+Also, this agent is used for integration tests. Be aware that changing the
+behaviour can break the integration tests.
+
+## Get started
+
+Compile runc and seccompagent:
+```bash
+make all
+```
+
+Run the seccomp agent in the background:
+```bash
+sudo ./contrib/cmd/seccompagent/seccompagent &
+```
+
+Prepare a container:
+```bash
+mkdir container-seccomp-notify
+cd container-seccomp-notify
+mkdir rootfs
+docker export $(docker create busybox) | tar -C rootfs -xvf -
+```
+
+Then, generate a config.json by running the script gen-seccomp-example-cfg.sh
+from the directory where this README.md is in the container directory you
+prepared earlier (`container-seccomp-notify`).
+
+Then start the container:
+```bash
+runc run mycontainerid
+```
+
+The container will output something like this:
+```bash
+ cd /dev/shm
+ mkdir test-dir
+ touch test-file
+ chmod 777 test-file
+chmod: changing permissions of 'test-file': No medium found
+ stat /dev/shm/test-dir-foo
+  File: /dev/shm/test-dir-foo
+  Size: 40        	Blocks: 0          IO Block: 4096   directory
+Device: 3eh/62d	Inode: 2           Links: 2
+Access: (0755/drwxr-xr-x)  Uid: (    0/    root)   Gid: (    0/    root)
+Access: 2021-09-09 15:03:13.043716040 +0000
+Modify: 2021-09-09 15:03:13.043716040 +0000
+Change: 2021-09-09 15:03:13.043716040 +0000
+ Birth: -
+ ls -l /dev/shm
+total 0
+drwxr-xr-x 2 root root 40 Sep  9 15:03 test-dir-foo
+-rw-r--r-- 1 root root  0 Sep  9 15:03 test-file
+ echo Note the agent added a suffix for the directory name and chmod fails
+Note the agent added a suffix for the directory name and chmod fails
+```
+
+This shows a simple example that runs in /dev/shm just because it is a tmpfs in
+the example config.json.
+
+The agent makes all chmod calls fail with ENOMEDIUM, as the example output shows.
+
+For mkdir, the agent adds a "-foo" suffix: the container runs "mkdir test-dir"
+but the directory created is "test-dir-foo".
--- a/contrib/cmd/seccompagent/gen-seccomp-example-cfg.sh
+++ b/contrib/cmd/seccompagent/gen-seccomp-example-cfg.sh
@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Detect if we are running inside bats (i.e. inside integration tests) or just
+# called by an end-user
+# bats-core v1.2.1 defines BATS_RUN_TMPDIR
+if [ -z "$BATS_RUN_TMPDIR" ]; then
+	# When not running in bats, we create the config.json
+	set -e
+	runc spec
+fi
+
+# We can't source $(dirname $0)/../../../tests/integration/helpers.bash as that
+# exits when not running inside bats. We can do hacks, but just to redefine
+# update_config() seems clearer. We don't even really need to keep them in sync.
+function update_config() {
+        jq "$1" "./config.json" | awk 'BEGIN{RS="";getline<"-";print>ARGV[1]}' "./config.json"
+}
+
+update_config '.linux.seccomp = {
+                        "defaultAction": "SCMP_ACT_ALLOW",
+                        "listenerPath": "/run/seccomp-agent.socket",
+                        "listenerMetadata": "foo",
+                        "architectures": [ "SCMP_ARCH_X86", "SCMP_ARCH_X32", "SCMP_ARCH_X86_64" ],
+                        "syscalls": [
+                                {
+                                        "names": [ "chmod", "fchmod", "fchmodat", "mkdir" ],
+                                        "action": "SCMP_ACT_NOTIFY"
+                                }
+			]
+		}'
+
+update_config '.process.args = [
+				"sh",
+				"-c",
+				"set -x; cd /dev/shm; mkdir test-dir; touch test-file; chmod 777 test-file; stat /dev/shm/test-dir-foo && ls -l /dev/shm && echo \"Note the agent added a suffix for the directory name and chmod fails\" "
+				]'
--- a/contrib/cmd/seccompagent/seccompagent.go
+++ b/contrib/cmd/seccompagent/seccompagent.go
@ -0,0 +1,291 @@
+//go:build linux && seccomp
+// +build linux,seccomp
+
+package main
+
+import (
+	"bytes"
+	"encoding/json"
+	"errors"
+	"flag"
+	"fmt"
+	"net"
+	"os"
+	"path/filepath"
+	"strings"
+
+	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	libseccomp "github.com/seccomp/libseccomp-golang"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+var (
+	socketFile string
+	pidFile    string
+)
+
+func closeStateFds(recvFds []int) {
+	for i := range recvFds {
+		unix.Close(i)
+	}
+}
+
+// parseStateFds returns the seccomp-fd and closes the rest of the fds in recvFds.
+// In case of error, no fd is closed.
+// StateFds is assumed to be formatted as specs.ContainerProcessState.Fds and
+// recvFds the corresponding list of received fds in the same SCM_RIGHT message.
+func parseStateFds(stateFds []string, recvFds []int) (uintptr, error) {
+	// Let's find the index in stateFds of the seccomp-fd.
+	idx := -1
+	err := false
+
+	for i, name := range stateFds {
+		if name == specs.SeccompFdName && idx == -1 {
+			idx = i
+			continue
+		}
+
+		// We found the seccompFdName twice. Error out!
+		if name == specs.SeccompFdName && idx != -1 {
+			err = true
+		}
+	}
+
+	if idx == -1 || err {
+		return 0, errors.New("seccomp fd not found or malformed containerProcessState.Fds")
+	}
+
+	if idx >= len(recvFds) || idx < 0 {
+		return 0, errors.New("seccomp fd index out of range")
+	}
+
+	fd := uintptr(recvFds[idx])
+
+	for i := range recvFds {
+		if i == idx {
+			continue
+		}
+
+		unix.Close(recvFds[i])
+	}
+
+	return fd, nil
+}
+
+func handleNewMessage(sockfd int) (uintptr, string, error) {
+	const maxNameLen = 4096
+	stateBuf := make([]byte, maxNameLen)
+	oobSpace := unix.CmsgSpace(4)
+	oob := make([]byte, oobSpace)
+
+	n, oobn, _, _, err := unix.Recvmsg(sockfd, stateBuf, oob, 0)
+	if err != nil {
+		return 0, "", err
+	}
+	if n >= maxNameLen || oobn != oobSpace {
+		return 0, "", fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
+	}
+
+	// Truncate.
+	stateBuf = stateBuf[:n]
+	oob = oob[:oobn]
+
+	scms, err := unix.ParseSocketControlMessage(oob)
+	if err != nil {
+		return 0, "", err
+	}
+	if len(scms) != 1 {
+		return 0, "", fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
+	}
+	scm := scms[0]
+
+	fds, err := unix.ParseUnixRights(&scm)
+	if err != nil {
+		return 0, "", err
+	}
+
+	containerProcessState := &specs.ContainerProcessState{}
+	err = json.Unmarshal(stateBuf, containerProcessState)
+	if err != nil {
+		closeStateFds(fds)
+		return 0, "", fmt.Errorf("cannot parse OCI state: %w", err)
+	}
+
+	fd, err := parseStateFds(containerProcessState.Fds, fds)
+	if err != nil {
+		closeStateFds(fds)
+		return 0, "", err
+	}
+
+	return fd, containerProcessState.Metadata, nil
+}
+
+func readArgString(pid uint32, offset int64) (string, error) {
+	buffer := make([]byte, 4096) // PATH_MAX
+
+	memfd, err := unix.Open(fmt.Sprintf("/proc/%d/mem", pid), unix.O_RDONLY, 0o777)
+	if err != nil {
+		return "", err
+	}
+	defer unix.Close(memfd)
+
+	_, err = unix.Pread(memfd, buffer, offset)
+	if err != nil {
+		return "", err
+	}
+
+	buffer[len(buffer)-1] = 0
+	s := buffer[:bytes.IndexByte(buffer, 0)]
+	return string(s), nil
+}
+
+func runMkdirForContainer(pid uint32, fileName string, mode uint32, metadata string) error {
+	// We validated before that metadata is not a string that can make
+	// newFile a file in a different location other than root.
+	newFile := fmt.Sprintf("%s-%s", fileName, metadata)
+	root := fmt.Sprintf("/proc/%d/cwd/", pid)
+
+	if strings.HasPrefix(fileName, "/") {
+		// If it starts with /, use the rootfs as base
+		root = fmt.Sprintf("/proc/%d/root/", pid)
+	}
+
+	path, err := securejoin.SecureJoin(root, newFile)
+	if err != nil {
+		return err
+	}
+
+	return unix.Mkdir(path, mode)
+}
+
+// notifHandler handles seccomp notifications and responses
+func notifHandler(fd libseccomp.ScmpFd, metadata string) {
+	defer unix.Close(int(fd))
+	for {
+		req, err := libseccomp.NotifReceive(fd)
+		if err != nil {
+			logrus.Errorf("Error in NotifReceive(): %s", err)
+			continue
+		}
+		syscallName, err := req.Data.Syscall.GetName()
+		if err != nil {
+			logrus.Errorf("Error decoding syscall %v(): %s", req.Data.Syscall, err)
+			continue
+		}
+		logrus.Debugf("Received syscall %q, pid %v, arch %q, args %+v", syscallName, req.Pid, req.Data.Arch, req.Data.Args)
+
+		resp := &libseccomp.ScmpNotifResp{
+			ID:    req.ID,
+			Error: 0,
+			Val:   0,
+			Flags: libseccomp.NotifRespFlagContinue,
+		}
+
+		// TOCTOU check
+		if err := libseccomp.NotifIDValid(fd, req.ID); err != nil {
+			logrus.Errorf("TOCTOU check failed: req.ID is no longer valid: %s", err)
+			continue
+		}
+
+		switch syscallName {
+		case "mkdir":
+			fileName, err := readArgString(req.Pid, int64(req.Data.Args[0]))
+			if err != nil {
+				logrus.Errorf("Cannot read argument: %s", err)
+				resp.Error = int32(unix.ENOSYS)
+				resp.Val = ^uint64(0) // -1
+				goto sendResponse
+			}
+
+			logrus.Debugf("mkdir: %q", fileName)
+
+			// TOCTOU check
+			if err := libseccomp.NotifIDValid(fd, req.ID); err != nil {
+				logrus.Errorf("TOCTOU check failed: req.ID is no longer valid: %s", err)
+				continue
+			}
+
+			err = runMkdirForContainer(req.Pid, fileName, uint32(req.Data.Args[1]), metadata)
+			if err != nil {
+				resp.Error = int32(unix.ENOSYS)
+				resp.Val = ^uint64(0) // -1
+			}
+			resp.Flags = 0
+		case "chmod", "fchmod", "fchmodat":
+			resp.Error = int32(unix.ENOMEDIUM)
+			resp.Val = ^uint64(0) // -1
+			resp.Flags = 0
+		}
+
+	sendResponse:
+		if err = libseccomp.NotifRespond(fd, resp); err != nil {
+			logrus.Errorf("Error in notification response: %s", err)
+			continue
+		}
+	}
+}
+
+func main() {
+	flag.StringVar(&socketFile, "socketfile", "/run/seccomp-agent.socket", "Socket file")
+	flag.StringVar(&pidFile, "pid-file", "", "Pid file")
+	logrus.SetLevel(logrus.DebugLevel)
+
+	// Parse arguments
+	flag.Parse()
+	if flag.NArg() > 0 {
+		flag.PrintDefaults()
+		logrus.Fatal("Invalid command")
+	}
+
+	if err := os.Remove(socketFile); err != nil && !errors.Is(err, os.ErrNotExist) {
+		logrus.Fatalf("Cannot cleanup socket file: %v", err)
+	}
+
+	if pidFile != "" {
+		pid := fmt.Sprintf("%d", os.Getpid())
+		if err := os.WriteFile(pidFile, []byte(pid), 0o644); err != nil {
+			logrus.Fatalf("Cannot write pid file: %v", err)
+		}
+	}
+
+	logrus.Info("Waiting for seccomp file descriptors")
+	l, err := net.Listen("unix", socketFile)
+	if err != nil {
+		logrus.Fatalf("Cannot listen: %s", err)
+	}
+	defer l.Close()
+
+	for {
+		conn, err := l.Accept()
+		if err != nil {
+			logrus.Errorf("Cannot accept connection: %s", err)
+			continue
+		}
+		socket, err := conn.(*net.UnixConn).File()
+		conn.Close()
+		if err != nil {
+			logrus.Errorf("Cannot get socket: %v", err)
+			continue
+		}
+		newFd, metadata, err := handleNewMessage(int(socket.Fd()))
+		socket.Close()
+		if err != nil {
+			logrus.Errorf("Error receiving seccomp file descriptor: %v", err)
+			continue
+		}
+
+		// Make sure we don't allow strings like "/../p", as that means
+		// a file in a different location than expected. We just want
+		// safe things to use as a suffix for a file name.
+		metadata = filepath.Base(metadata)
+		if strings.Contains(metadata, "/") {
+			// Fallback to a safe string.
+			metadata = "agent-generated-suffix"
+		}
+
+		logrus.Infof("Received new seccomp fd: %v", newFd)
+		go notifHandler(libseccomp.ScmpFd(newFd), metadata)
+	}
+}
--- a/contrib/cmd/seccompagent/unsupported.go
+++ b/contrib/cmd/seccompagent/unsupported.go
@ -0,0 +1,10 @@
+//go:build !linux || !seccomp
+// +build !linux !seccomp
+
+package main
+
+import "fmt"
+
+func main() {
+	fmt.Println("Not supported, to use this compile with build tag: seccomp.")
+}
--- a/contrib/completions/bash/runc
+++ b/contrib/completions/bash/runc
@ -113,6 +113,8 @@ __runc_complete_capabilities() {
 		AUDIT_WRITE
 		AUDIT_READ
 		BLOCK_SUSPEND
+		BPF
+		CHECKPOINT_RESTORE
 		CHOWN
 		DAC_OVERRIDE
 		DAC_READ_SEARCH
@ -130,6 +132,7 @@ __runc_complete_capabilities() {
 		NET_BIND_SERVICE
 		NET_BROADCAST
 		NET_RAW
+		PERFMON
 		SETFCAP
 		SETGID
 		SETPCAP
@ -170,6 +173,7 @@ _runc_exec() {
 	   --apparmor
 	   --cap, -c
 	   --preserve-fds
+	   --ignore-paused
 	"

 	local all_options="$options_with_args $boolean_options"
@ -221,6 +225,7 @@ _runc_runc() {
 		--help
 		--version -v
 		--debug
+		--systemd-cgroup
 	"
 	local options_with_args="
 		--log
@ -733,8 +738,6 @@ _runc_update() {
 	   --cpu-share
 	   --cpuset-cpus
 	   --cpuset-mems
-	   --kernel-memory
-	   --kernel-memory-tcp
 	   --memory
 	   --memory-reservation
 	   --memory-swap
@ -769,7 +772,6 @@ _runc() {
 		delete
 		events
 		exec
-		init
 		kill
 		list
 		pause
--- a/create.go
+++ b/create.go
@ -1,6 +1,7 @@
 package main

 import (
+	"fmt"
 	"os"

 	"github.com/urfave/cli"
@ -55,20 +56,12 @@ command(s) that get executed on start, edit the args parameter of the spec. See
 		if err := checkArgs(context, 1, exactArgs); err != nil {
 			return err
 		}
-		if err := revisePidFile(context); err != nil {
-			return err
+		status, err := startContainer(context, CT_ACT_CREATE, nil)
+		if err == nil {
+			// exit with the container's exit status so any external supervisor
+			// is notified of the exit with the correct exit status.
+			os.Exit(status)
 		}
-		spec, err := setupSpec(context)
-		if err != nil {
-			return err
-		}
-		status, err := startContainer(context, spec, CT_ACT_CREATE, nil)
-		if err != nil {
-			return err
-		}
-		// exit with the container's exit status so any external supervisor is
-		// notified of the exit with the correct exit status.
-		os.Exit(status)
-		return nil
+		return fmt.Errorf("runc create failed: %w", err)
 	},
 }
--- a/debian/changelog
+++ b/debian/changelog
@ -1,3 +1,9 @@
+runc (1.1.0-ok1) yangtze; urgency=medium
+
+  * Merge new upstream version 1.1.0 
+
+ -- Luoyaoming <luoyaoming@kylinos.cn>  Fri, 30 Dec 2022 11:11:29 +0800
+
 runc (1.0.0~rc10-ok2) yangtze; urgency=medium

  * Update version.
--- a/debian/patches/test--fix_TestGetAdditionalGroups.patch
+++ b/debian/patches/test--fix_TestGetAdditionalGroups.patch
@ -0,0 +1,39 @@
+From: Dmitry Smirnov <onlyjob@debian.org>
+Date: Thu, 28 Jul 2022 16:28:22 +0800
+Subject: fix FTBFS on i686
+
+src/github.com/opencontainers/runc/libcontainer/user/user_test.go:448:36: constant 2147483648 overflows int
+Last-Update: 2018-06-16
+Forwarded: https://github.com/opencontainers/runc/pull/1821
+Bug-Upstream: https://github.com/opencontainers/runc/issues/941
+---
+ libcontainer/user/user.go      | 2 +-
+ libcontainer/user/user_test.go | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/libcontainer/user/user.go b/libcontainer/user/user.go
+index 7b912bb..38caded 100644
+--- a/libcontainer/user/user.go
+++ b/libcontainer/user/user.go
+@@ -473,7 +473,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
+ 				return nil, fmt.Errorf("Unable to find group %s", ag)
+ 			}
+ 			// Ensure gid is inside gid range.
+-			if gid < minId || gid > maxId {
+			if gid < minId || gid >= maxId {
+ 				return nil, ErrRange
+ 			}
+ 			gidMap[gid] = struct{}{}
+diff --git a/libcontainer/user/user_test.go b/libcontainer/user/user_test.go
+index 24ee559..a4aabdc 100644
+--- a/libcontainer/user/user_test.go
+++ b/libcontainer/user/user_test.go
+@@ -445,7 +445,7 @@ this is just some garbage data
+ 	if utils.GetIntSize() > 4 {
+ 		tests = append(tests, foo{
+ 			// groups with too large id
+-			groups:   []string{strconv.Itoa(1 << 31)},
+			groups:   []string{strconv.Itoa( 1<<31 -1 )},
+ 			expected: nil,
+ 			hasError: true,
+ 		})
--- a/debian/patches/test--skip-Hugetlb.patch
+++ b/debian/patches/test--skip-Hugetlb.patch
@ -0,0 +1,48 @@
+From: Dmitry Smirnov <onlyjob@debian.org>
+Date: Thu, 28 Jul 2022 16:28:22 +0800
+Subject: disabled unreliable tests due to random failures on [ppc64el,
+ s390x].
+
+Last-Update: 2018-09-27
+Forwarded: not-needed
+Bug-Upstream: https://github.com/opencontainers/runc/issues/1822
+---
+ libcontainer/cgroups/fs/hugetlb_test.go | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/libcontainer/cgroups/fs/hugetlb_test.go b/libcontainer/cgroups/fs/hugetlb_test.go
+index 9ddacfe..9b60650 100644
+--- a/libcontainer/cgroups/fs/hugetlb_test.go
+++ b/libcontainer/cgroups/fs/hugetlb_test.go
+@@ -89,6 +89,7 @@ func TestHugetlbStats(t *testing.T) {
+ }
+ 
+ func TestHugetlbStatsNoUsageFile(t *testing.T) {
+t.Skip("Disabled unreliable test")
+ 	helper := NewCgroupTestUtil("hugetlb", t)
+ 	defer helper.cleanup()
+ 	helper.writeFileContents(map[string]string{
+@@ -104,6 +105,7 @@ func TestHugetlbStatsNoUsageFile(t *testing.T) {
+ }
+ 
+ func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
+t.Skip("Disabled unreliable test")
+ 	helper := NewCgroupTestUtil("hugetlb", t)
+ 	defer helper.cleanup()
+ 	for _, pageSize := range HugePageSizes {
+@@ -121,6 +123,7 @@ func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
+ }
+ 
+ func TestHugetlbStatsBadUsageFile(t *testing.T) {
+t.Skip("Disabled unreliable test")
+ 	helper := NewCgroupTestUtil("hugetlb", t)
+ 	defer helper.cleanup()
+ 	for _, pageSize := range HugePageSizes {
+@@ -139,6 +142,7 @@ func TestHugetlbStatsBadUsageFile(t *testing.T) {
+ }
+ 
+ func TestHugetlbStatsBadMaxUsageFile(t *testing.T) {
+t.Skip("Disabled unreliable test")
+ 	helper := NewCgroupTestUtil("hugetlb", t)
+ 	defer helper.cleanup()
+ 	helper.writeFileContents(map[string]string{
--- a/debian/patches/test--skip_TestFactoryNewTmpfs.patch
+++ b/debian/patches/test--skip_TestFactoryNewTmpfs.patch
@ -0,0 +1,22 @@
+From: Dmitry Smirnov <onlyjob@debian.org>
+Date: Thu, 28 Jul 2022 16:28:22 +0800
+Subject: disable test (requires root)
+
+Last-Update: 2018-06-15
+Forwarded: not-needed
+---
+ libcontainer/factory_linux_test.go | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/libcontainer/factory_linux_test.go b/libcontainer/factory_linux_test.go
+index 8d0ca8a..1dc0180 100644
+--- a/libcontainer/factory_linux_test.go
+++ b/libcontainer/factory_linux_test.go
+@@ -78,6 +78,7 @@ func TestFactoryNewIntelRdt(t *testing.T) {
+ }
+ 
+ func TestFactoryNewTmpfs(t *testing.T) {
+t.Skip("DM - skipping privileged test")
+ 	root, rerr := newTestRoot()
+ 	if rerr != nil {
+ 		t.Fatal(rerr)
--- a/delete.go
+++ b/delete.go
@ -1,12 +1,10 @@
-// +build !solaris
-
 package main

 import (
+	"errors"
 	"fmt"
 	"os"
 	"path/filepath"
-	"syscall"
 	"time"

 	"github.com/opencontainers/runc/libcontainer"
@ -19,12 +17,12 @@ func killContainer(container libcontainer.Container) error {
 	_ = container.Signal(unix.SIGKILL, false)
 	for i := 0; i < 100; i++ {
 		time.Sleep(100 * time.Millisecond)
-		if err := container.Signal(syscall.Signal(0), false); err != nil {
+		if err := container.Signal(unix.Signal(0), false); err != nil {
 			destroy(container)
 			return nil
 		}
 	}
-	return fmt.Errorf("container init still running")
+	return errors.New("container init still running")
 }

 var deleteCommand = cli.Command{
@ -55,7 +53,7 @@ status of "ubuntu01" as "stopped" the following will delete resources held for
 		force := context.Bool("force")
 		container, err := getContainer(context)
 		if err != nil {
-			if lerr, ok := err.(libcontainer.Error); ok && lerr.Code() == libcontainer.ContainerNotExists {
+			if errors.Is(err, libcontainer.ErrNotExist) {
 				// if there was an aborted start or something of the sort then the container's directory could exist but
 				// libcontainer does not see it because the state.json file inside that directory was never created.
 				path := filepath.Join(context.GlobalString("root"), id)
@ -81,7 +79,7 @@ status of "ubuntu01" as "stopped" the following will delete resources held for
 			if force {
 				return killContainer(container)
 			}
-			return fmt.Errorf("cannot delete container %s that is not stopped: %s\n", id, s)
+			return fmt.Errorf("cannot delete container %s that is not stopped: %s", id, s)
 		}

 		return nil
--- a/docs/Security-Audit.pdf
+++ b/docs/Security-Audit.pdf
--- a/docs/cgroup-v2.md
+++ b/docs/cgroup-v2.md
@ -0,0 +1,62 @@
+# cgroup v2
+
+runc fully supports cgroup v2 (unified mode) since v1.0.0-rc93.
+
+To use cgroup v2, you might need to change the configuration of the host init system.
+Fedora (>= 31) uses cgroup v2 by default and no extra configuration is required.
+On other systemd-based distros, cgroup v2 can be enabled by adding `systemd.unified_cgroup_hierarchy=1` to the kernel cmdline.
+
+## Am I using cgroup v2?
+
+Yes if `/sys/fs/cgroup/cgroup.controllers` is present.
+
+## Host Requirements
+### Kernel
+* Recommended version: 5.2 or later
+* Minimum version: 4.15
+
+Kernel older than 5.2 is not recommended due to lack of freezer.
+
+Notably, kernel older than 4.15 MUST NOT be used (unless you are running containers with user namespaces), as it lacks support for controlling permissions of devices.
+
+### Systemd
+On cgroup v2 hosts, it is highly recommended to run runc with the systemd cgroup driver (`runc --systemd-cgroup`), though not mandatory.
+
+The recommended systemd version is 244 or later. Older systemd does not support delegation of `cpuset` controller.
+
+Make sure you also have the `dbus-user-session` (Debian/Ubuntu) or `dbus-daemon` (CentOS/Fedora) package installed, and that `dbus` is running. On Debian-flavored distros, this can be accomplished like so:
+
+```console
+$ sudo apt install -y dbus-user-session
+$ systemctl --user start dbus
+```
+
+## Rootless
+On cgroup v2 hosts, rootless runc can talk to systemd to get cgroup permissions to be delegated.
+
+```console
+$ runc spec --rootless
+$ jq '.linux.cgroupsPath="user.slice:runc:foo"' config.json | sponge config.json
+$ runc --systemd-cgroup run foo
+```
+
+The container processes are executed in a cgroup like `/user.slice/user-$(id -u).slice/user@$(id -u).service/user.slice/runc-foo.scope`.
+
+### Configuring delegation
+Typically, only `memory` and `pids` controllers are delegated to non-root users by default.
+
+```console
+$ cat /sys/fs/cgroup/user.slice/user-$(id -u).slice/user@$(id -u).service/cgroup.controllers
+memory pids
+```
+
+To allow delegation of other controllers, you need to change the systemd configuration as follows:
+
+```console
+# mkdir -p /etc/systemd/system/user@.service.d
+# cat > /etc/systemd/system/user@.service.d/delegate.conf << EOF
+[Service]
+Delegate=cpu cpuset io memory pids
+EOF
+# systemctl daemon-reload
+```
--- a/docs/experimental.md
+++ b/docs/experimental.md
@ -0,0 +1,11 @@
+# Experimental features
+
+The following features are experimental and subject to change:
+
+- The `runc features` command (since runc v1.1.0)
+
+The following features were experimental in the past:
+
+Feature                                  | Experimental release | Graduation release
+---------------------------------------- | -------------------- | ------------------
+cgroup v2                                | v1.0.0-rc91          | v1.0.0-rc93
--- a/docs/systemd.md
+++ b/docs/systemd.md
@ -0,0 +1,130 @@
+## systemd cgroup driver
+
+By default, runc creates cgroups and sets cgroup limits on its own (this mode
+is known as fs cgroup driver). When `--systemd-cgroup` global option is given
+(as in e.g. `runc --systemd-cgroup run ...`), runc switches to systemd cgroup
+driver. This document describes its features and peculiarities.
+
+### systemd unit name and placement
+
+When creating a container, runc requests systemd (over dbus) to create
+a transient unit for the container, and place it into a specified slice.
+
+The name of the unit and the containing slice is derived from the container
+runtime spec in the following way:
+
+1. If `Linux.CgroupsPath` is set, it is expected to be in the form
+   `[slice]:[prefix]:[name]`.
+
+   Here `slice` is a systemd slice under which the container is placed.
+   If empty, it defaults to `system.slice`, except when cgroup v2 is
+   used and rootless container is created, in which case it defaults
+   to `user.slice`.
+
+   Note that `slice` can contain dashes to denote a sub-slice
+   (e.g. `user-1000.slice` is a correct notation, meaning a subslice
+   of `user.slice`), but it must not contain slashes (e.g.
+   `user.slice/user-1000.slice` is invalid).
+
+   A `slice` of `-` represents a root slice.
+
+   Next, `prefix` and `name` are used to compose the  unit name, which
+   is `<prefix>-<name>.scope`, unless `name` has `.slice` suffix, in
+   which case `prefix` is ignored and the `name` is used as is.
+
+2. If `Linux.CgroupsPath` is not set or empty, it works the same way as if it
+   would be set to `:runc:<container-id>`. See the description above to see
+   what it transforms to.
+
+As described above, a unit being created can either be a scope or a slice.
+For a scope, runc specifies its parent slice via a _Slice=_ systemd property,
+and also sets _Delegate=true_. For a slice, runc specifies a weak dependency on
+the parent slice via a _Wants=_ property.
+
+### Resource limits
+
+runc always enables accounting for all controllers, regardless of any limits
+being set. This means it unconditionally sets the following properties for the
+systemd unit being created:
+
+ * _CPUAccounting=true_
+ * _IOAccounting=true_ (_BlockIOAccounting_ for cgroup v1)
+ * _MemoryAccounting=true_
+ * _TasksAccounting=true_
+
+The resource limits of the systemd unit are set by runc by translating the
+runtime spec resources to systemd unit properties.
+
+Such translation is by no means complete, as there are some cgroup properties
+that can not be set via systemd.  Therefore, runc systemd cgroup driver is
+backed by fs driver (in other words, cgroup limits are first set via systemd
+unit properties, and when by writing to cgroupfs files).
+
+The set of runtime spec resources which is translated by runc to systemd unit
+properties depends on kernel cgroup version being used (v1 or v2), and on the
+systemd version being run. If an older systemd version (which does not support
+some resources) is used, runc do not set those resources.
+
+The following tables summarize which properties are translated.
+
+#### cgroup v1
+
+| runtime spec resource | systemd property name | min systemd version |
+|-----------------------|-----------------------|---------------------|
+| memory.limit          | MemoryLimit           |                     |
+| cpu.shares            | CPUShares             |                     |
+| blockIO.weight        | BlockIOWeight         |                     |
+| pids.limit            | TasksMax              |                     |
+| cpu.cpus              | AllowedCPUs           | v244                |
+| cpu.mems              | AllowedMemoryNodes    | v244                |
+
+#### cgroup v2
+
+| runtime spec resource   | systemd property name | min systemd version |
+|-------------------------|-----------------------|---------------------|
+| memory.limit            | MemoryMax             |                     |
+| memory.reservation      | MemoryLow             |                     |
+| memory.swap             | MemorySwapMax         |                     |
+| cpu.shares              | CPUWeight             |                     |
+| pids.limit              | TasksMax              |                     |
+| cpu.cpus                | AllowedCPUs           | v244                |
+| cpu.mems                | AllowedMemoryNodes    | v244                |
+| unified.cpu.max         | CPUQuota, CPUQuotaPeriodSec | v242          |
+| unified.cpu.weight      | CPUWeight             |                     |
+| unified.cpuset.cpus     | AllowedCPUs           | v244                |
+| unified.cpuset.mems     | AllowedMemoryNodes    | v244                |
+| unified.memory.high     | MemoryHigh            |                     |
+| unified.memory.low      | MemoryLow             |                     |
+| unified.memory.min      | MemoryMin             |                     |
+| unified.memory.max      | MemoryMax             |                     |
+| unified.memory.swap.max | MemorySwapMax         |                     |
+| unified.pids.max        | TasksMax              |                     |
+
+For documentation on systemd unit resource properties, see
+`systemd.resource-control(5)` man page.
+
+### Auxiliary properties
+
+Auxiliary properties of a systemd unit (as shown by `systemctl show
+<unit-name>` after the container is created) can be set (or overwritten) by
+adding annotations to the container runtime spec (`config.json`).
+
+For example:
+
+```json
+        "annotations": {
+                "org.systemd.property.TimeoutStopUSec": "uint64 123456789",
+                "org.systemd.property.CollectMode":"'inactive-or-failed'"
+        },
+```
+
+The above will set the following properties:
+
+* `TimeoutStopSec` to 2 minutes and 3 seconds;
+* `CollectMode` to "inactive-or-failed".
+
+The values must be in the gvariant format (for details, see
+[gvariant documentation](https://developer.gnome.org/glib/stable/gvariant-text.html)).
+
+To find out which type systemd expects for a particular parameter, please
+consult systemd sources.
--- a/docs/terminals.md
+++ b/docs/terminals.md
@ -113,6 +113,33 @@ interact with pseudo-terminal `stdio`][tty_ioctl(4)].
 > means that it is not really possible to uniquely distinguish between `stdout`
 > and `stderr` from the caller's perspective.

+#### Issues
+
+If you see an error like
+
+```
+open /dev/tty: no such device or address
+```
+
+from runc, it means it can't open a terminal (because there isn't one). This
+can happen when stdin (and possibly also stdout and stderr) are redirected,
+or in some environments that lack a tty (such as GitHub Actions runners).
+
+The solution to this is to *not* use a terminal for the container, i.e. have
+`terminal: false` in `config.json`. If the container really needs a terminal
+(some programs require one), you can provide one, using one of the following
+methods.
+
+One way is to use `ssh` with the `-tt` flag. The second `t` forces a terminal
+allocation even if there's no local one -- and so it is required when stdin is
+not a terminal (some `ssh` implementations only look for a terminal on stdin).
+
+Another way is to run runc under the `script` utility, like this
+
+```console
+$ script -e -c 'runc run <container>'
+```
+
 [tty_ioctl(4)]: https://linux.die.net/man/4/tty_ioctl

 ### <a name="pass-through"> Pass-Through ###
@ -124,7 +151,7 @@ passing of file descriptors -- [details below](#runc-modes)). As an example
 (assuming that `terminal: false` is set in `config.json`):

 ```
-% echo input | runc run some_container > /tmp/log.out 2>& /tmp/log.err
+% echo input | runc run some_container > /tmp/log.out 2> /tmp/log.err
 ```

 Here the container's various `stdio` file descriptors will be substituted with
@ -228,6 +255,19 @@ Unfortunately using detached mode is a bit more complicated and requires more
 care than the foreground mode -- mainly because it is now up to the caller to
 handle the `stdio` of the container.

+Another complication is that the parent process is responsible for acting as
+the subreaper for the container. In short, you need to call
+`prctl(PR_SET_CHILD_SUBREAPER, 1, ...)` in the parent process and correctly
+handle the implications of being a subreaper. Failing to do so may result in
+zombie processes being accumulated on your host.
+
+These tasks are usually performed by a dedicated (and minimal) monitor process
+per-container. For the sake of comparison, other runtimes such as LXC do not
+have an equivalent detached mode and instead integrate this monitor process
+into the container runtime itself -- this has several tradeoffs, and runc has
+opted to support delegating the monitoring responsibility to the parent process
+through this detached mode.
+
 #### Detached Pass-Through ####

 In detached mode, pass-through actually does what it says on the tin -- the
--- a/events.go
+++ b/events.go
@ -1,9 +1,8 @@
-// +build linux
-
 package main

 import (
 	"encoding/json"
+	"errors"
 	"fmt"
 	"os"
 	"sync"
@ -40,7 +39,7 @@ information is displayed once every 5 seconds.`,
 		}
 		duration := context.Duration("interval")
 		if duration <= 0 {
-			return fmt.Errorf("duration interval must be greater than 0")
+			return errors.New("duration interval must be greater than 0")
 		}
 		status, err := container.Status()
 		if err != nil {
@ -125,10 +124,14 @@ func convertLibcontainerStats(ls *libcontainer.Stats) *types.Stats {
 	s.CPU.Usage.User = cg.CpuStats.CpuUsage.UsageInUsermode
 	s.CPU.Usage.Total = cg.CpuStats.CpuUsage.TotalUsage
 	s.CPU.Usage.Percpu = cg.CpuStats.CpuUsage.PercpuUsage
+	s.CPU.Usage.PercpuKernel = cg.CpuStats.CpuUsage.PercpuUsageInKernelmode
+	s.CPU.Usage.PercpuUser = cg.CpuStats.CpuUsage.PercpuUsageInUsermode
 	s.CPU.Throttling.Periods = cg.CpuStats.ThrottlingData.Periods
 	s.CPU.Throttling.ThrottledPeriods = cg.CpuStats.ThrottlingData.ThrottledPeriods
 	s.CPU.Throttling.ThrottledTime = cg.CpuStats.ThrottlingData.ThrottledTime

+	s.CPUSet = types.CPUSet(cg.CPUSetStats)
+
 	s.Memory.Cache = cg.MemoryStats.Cache
 	s.Memory.Kernel = convertMemoryEntry(cg.MemoryStats.KernelUsage)
 	s.Memory.KernelTCP = convertMemoryEntry(cg.MemoryStats.KernelTCPUsage)
@ -151,16 +154,22 @@ func convertLibcontainerStats(ls *libcontainer.Stats) *types.Stats {
 	}

 	if is := ls.IntelRdtStats; is != nil {
-		if intelrdt.IsCatEnabled() {
+		if intelrdt.IsCATEnabled() {
 			s.IntelRdt.L3CacheInfo = convertL3CacheInfo(is.L3CacheInfo)
 			s.IntelRdt.L3CacheSchemaRoot = is.L3CacheSchemaRoot
 			s.IntelRdt.L3CacheSchema = is.L3CacheSchema
 		}
-		if intelrdt.IsMbaEnabled() {
+		if intelrdt.IsMBAEnabled() {
 			s.IntelRdt.MemBwInfo = convertMemBwInfo(is.MemBwInfo)
 			s.IntelRdt.MemBwSchemaRoot = is.MemBwSchemaRoot
 			s.IntelRdt.MemBwSchema = is.MemBwSchema
 		}
+		if intelrdt.IsMBMEnabled() {
+			s.IntelRdt.MBMStats = is.MBMStats
+		}
+		if intelrdt.IsCMTEnabled() {
+			s.IntelRdt.CMTStats = is.CMTStats
+		}
 	}

 	s.NetworkInterfaces = ls.Interfaces
@ -187,29 +196,17 @@ func convertMemoryEntry(c cgroups.MemoryData) types.MemoryEntry {
 func convertBlkioEntry(c []cgroups.BlkioStatEntry) []types.BlkioEntry {
 	var out []types.BlkioEntry
 	for _, e := range c {
-		out = append(out, types.BlkioEntry{
-			Major: e.Major,
-			Minor: e.Minor,
-			Op:    e.Op,
-			Value: e.Value,
-		})
+		out = append(out, types.BlkioEntry(e))
 	}
 	return out
 }

 func convertL3CacheInfo(i *intelrdt.L3CacheInfo) *types.L3CacheInfo {
-	return &types.L3CacheInfo{
-		CbmMask:    i.CbmMask,
-		MinCbmBits: i.MinCbmBits,
-		NumClosids: i.NumClosids,
-	}
+	ci := types.L3CacheInfo(*i)
+	return &ci
 }

 func convertMemBwInfo(i *intelrdt.MemBwInfo) *types.MemBwInfo {
-	return &types.MemBwInfo{
-		BandwidthGran: i.BandwidthGran,
-		DelayLinear:   i.DelayLinear,
-		MinBandwidth:  i.MinBandwidth,
-		NumClosids:    i.NumClosids,
-	}
+	mi := types.MemBwInfo(*i)
+	return &mi
 }
--- a/exec.go
+++ b/exec.go
@ -1,9 +1,8 @@
-// +build linux
-
 package main

 import (
 	"encoding/json"
+	"errors"
 	"fmt"
 	"os"
 	"strconv"
@ -84,15 +83,18 @@ following will output a list of processes running in the container:
 			Value: &cli.StringSlice{},
 			Usage: "add a capability to the bounding set for the process",
 		},
-		cli.BoolFlag{
-			Name:   "no-subreaper",
-			Usage:  "disable the use of the subreaper used to reap reparented processes",
-			Hidden: true,
-		},
 		cli.IntFlag{
 			Name:  "preserve-fds",
 			Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)",
 		},
+		cli.StringSliceFlag{
+			Name:  "cgroup",
+			Usage: "run the process in an (existing) sub-cgroup(s). Format is [<controller>:]<cgroup>.",
+		},
+		cli.BoolFlag{
+			Name:  "ignore-paused",
+			Usage: "allow exec in a paused container",
+		},
 	},
 	Action: func(context *cli.Context) error {
 		if err := checkArgs(context, 1, minArgs); err != nil {
@ -105,11 +107,38 @@ following will output a list of processes running in the container:
 		if err == nil {
 			os.Exit(status)
 		}
-		return fmt.Errorf("exec failed: %v", err)
+		fatalWithCode(fmt.Errorf("exec failed: %w", err), 255)
+		return nil // to satisfy the linter
 	},
 	SkipArgReorder: true,
 }

+func getSubCgroupPaths(args []string) (map[string]string, error) {
+	if len(args) == 0 {
+		return nil, nil
+	}
+	paths := make(map[string]string, len(args))
+	for _, c := range args {
+		// Split into controller:path.
+		cs := strings.SplitN(c, ":", 3)
+		if len(cs) > 2 {
+			return nil, fmt.Errorf("invalid --cgroup argument: %s", c)
+		}
+		if len(cs) == 1 { // no controller: prefix
+			if len(args) != 1 {
+				return nil, fmt.Errorf("invalid --cgroup argument: %s (missing <controller>: prefix)", c)
+			}
+			paths[""] = c
+		} else {
+			// There may be a few comma-separated controllers.
+			for _, ctrl := range strings.Split(cs[0], ",") {
+				paths[ctrl] = cs[1]
+			}
+		}
+	}
+	return paths, nil
+}
+
 func execProcess(context *cli.Context) (int, error) {
 	container, err := getContainer(context)
 	if err != nil {
@ -120,13 +149,15 @@ func execProcess(context *cli.Context) (int, error) {
 		return -1, err
 	}
 	if status == libcontainer.Stopped {
-		return -1, fmt.Errorf("cannot exec a container that has stopped")
+		return -1, errors.New("cannot exec in a stopped container")
+	}
+	if status == libcontainer.Paused && !context.Bool("ignore-paused") {
+		return -1, errors.New("cannot exec in a paused container (use --ignore-paused to override)")
 	}
 	path := context.String("process")
 	if path == "" && len(context.Args()) == 1 {
-		return -1, fmt.Errorf("process args cannot be empty")
+		return -1, errors.New("process args cannot be empty")
 	}
-	detach := context.Bool("detach")
 	state, err := container.State()
 	if err != nil {
 		return -1, err
@ -137,9 +168,9 @@ func execProcess(context *cli.Context) (int, error) {
 		return -1, err
 	}

-	logLevel := "info"
-	if context.GlobalBool("debug") {
-		logLevel = "debug"
+	cgPaths, err := getSubCgroupPaths(context.StringSlice("cgroup"))
+	if err != nil {
+		return -1, err
 	}

 	r := &runner{
@ -147,12 +178,12 @@ func execProcess(context *cli.Context) (int, error) {
 		shouldDestroy:   false,
 		container:       container,
 		consoleSocket:   context.String("console-socket"),
-		detach:          detach,
+		detach:          context.Bool("detach"),
 		pidFile:         context.String("pid-file"),
 		action:          CT_ACT_RUN,
 		init:            false,
 		preserveFDs:     context.Int("preserve-fds"),
-		logLevel:        logLevel,
+		subCgroupPaths:  cgPaths,
 	}
 	return r.run(p)
 }
@ -203,6 +234,7 @@ func getProcess(context *cli.Context, bundle string) (*specs.Process, error) {
 	p.Env = append(p.Env, context.StringSlice("env")...)

 	// set the tty
+	p.Terminal = false
 	if context.IsSet("tty") {
 		p.Terminal = context.Bool("tty")
 	}
@ -215,13 +247,13 @@ func getProcess(context *cli.Context, bundle string) (*specs.Process, error) {
 		if len(u) > 1 {
 			gid, err := strconv.Atoi(u[1])
 			if err != nil {
-				return nil, fmt.Errorf("parsing %s as int for gid failed: %v", u[1], err)
+				return nil, fmt.Errorf("parsing %s as int for gid failed: %w", u[1], err)
 			}
 			p.User.GID = uint32(gid)
 		}
 		uid, err := strconv.Atoi(u[0])
 		if err != nil {
-			return nil, fmt.Errorf("parsing %s as int for uid failed: %v", u[0], err)
+			return nil, fmt.Errorf("parsing %s as int for uid failed: %w", u[0], err)
 		}
 		p.User.UID = uint32(uid)
 	}
--- a/features.go
+++ b/features.go
@ -0,0 +1,75 @@
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/opencontainers/runc/libcontainer/capabilities"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/seccomp"
+	"github.com/opencontainers/runc/libcontainer/specconv"
+	"github.com/opencontainers/runc/types/features"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/urfave/cli"
+)
+
+var featuresCommand = cli.Command{
+	Name:      "features",
+	Usage:     "show the enabled features",
+	ArgsUsage: "",
+	Description: `Show the enabled features.
+   The result is parsable as a JSON.
+   See https://pkg.go.dev/github.com/opencontainers/runc/types/features for the type definition.
+   The types are experimental and subject to change.
+`,
+	Action: func(context *cli.Context) error {
+		if err := checkArgs(context, 0, exactArgs); err != nil {
+			return err
+		}
+
+		tru := true
+
+		feat := features.Features{
+			OCIVersionMin: "1.0.0",
+			OCIVersionMax: specs.Version,
+			Annotations: map[string]string{
+				features.AnnotationRuncVersion:           version,
+				features.AnnotationRuncCommit:            gitCommit,
+				features.AnnotationRuncCheckpointEnabled: "true",
+			},
+			Hooks:        configs.KnownHookNames(),
+			MountOptions: specconv.KnownMountOptions(),
+			Linux: &features.Linux{
+				Namespaces:   specconv.KnownNamespaces(),
+				Capabilities: capabilities.KnownCapabilities(),
+				Cgroup: &features.Cgroup{
+					V1:          &tru,
+					V2:          &tru,
+					Systemd:     &tru,
+					SystemdUser: &tru,
+				},
+				Apparmor: &features.Apparmor{
+					Enabled: &tru,
+				},
+				Selinux: &features.Selinux{
+					Enabled: &tru,
+				},
+			},
+		}
+
+		if seccomp.Enabled {
+			feat.Linux.Seccomp = &features.Seccomp{
+				Enabled:   &tru,
+				Actions:   seccomp.KnownActions(),
+				Operators: seccomp.KnownOperators(),
+				Archs:     seccomp.KnownArchs(),
+			}
+			major, minor, patch := seccomp.Version()
+			feat.Annotations[features.AnnotationLibseccompVersion] = fmt.Sprintf("%d.%d.%d", major, minor, patch)
+		}
+
+		enc := json.NewEncoder(context.App.Writer)
+		enc.SetIndent("", "    ")
+		return enc.Encode(feat)
+	},
+}
--- a/go.mod
+++ b/go.mod
@ -0,0 +1,26 @@
+module github.com/opencontainers/runc
+
+go 1.16
+
+require (
+	github.com/checkpoint-restore/go-criu/v5 v5.3.0
+	github.com/cilium/ebpf v0.7.0
+	github.com/containerd/console v1.0.3
+	github.com/coreos/go-systemd/v22 v22.3.2
+	github.com/cyphar/filepath-securejoin v0.2.3
+	github.com/docker/go-units v0.4.0
+	github.com/godbus/dbus/v5 v5.0.6
+	github.com/moby/sys/mountinfo v0.5.0
+	github.com/mrunalp/fileutils v0.5.0
+	github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
+	github.com/opencontainers/selinux v1.10.0
+	github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921
+	github.com/sirupsen/logrus v1.8.1
+	github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
+	// NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092
+	github.com/urfave/cli v1.22.1
+	github.com/vishvananda/netlink v1.1.0
+	golang.org/x/net v0.0.0-20201224014010-6772e930b67b
+	golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c
+	google.golang.org/protobuf v1.27.1
+)
--- a/go.sum
+++ b/go.sum
@ -0,0 +1,80 @@
+github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
+github.com/checkpoint-restore/go-criu/v5 v5.3.0 h1:wpFFOoomK3389ue2lAb0Boag6XPht5QYpipxmSNL4d8=
+github.com/checkpoint-restore/go-criu/v5 v5.3.0/go.mod h1:E/eQpaFtUKGOOSEBZgmKAcn+zUUwWxqcaKZlF54wK8E=
+github.com/cilium/ebpf v0.7.0 h1:1k/q3ATgxSXRdrmPfH8d7YK0GfqVsEKZAX9dQZvs56k=
+github.com/cilium/ebpf v0.7.0/go.mod h1:/oI2+1shJiTGAMgl6/RgJr36Eo1jzrRcAWbcXO2usCA=
+github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw=
+github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U=
+github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI=
+github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
+github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d h1:U+s90UTSYgptZMwQh2aRr3LuazLJIa+Pg3Kc1ylSYVY=
+github.com/cpuguy83/go-md2man/v2 v2.0.0-20190314233015-f79a8a8ca69d/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU=
+github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI=
+github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw=
+github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
+github.com/frankban/quicktest v1.11.3 h1:8sXhOn0uLys67V8EsXLc6eszDs8VXWxL3iRvebPhedY=
+github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k=
+github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
+github.com/godbus/dbus/v5 v5.0.6 h1:mkgN1ofwASrYnJ5W6U/BxG15eXXXjirgZc7CLqkcaro=
+github.com/godbus/dbus/v5 v5.0.6/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
+github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
+github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
+github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
+github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
+github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
+github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
+github.com/moby/sys/mountinfo v0.5.0 h1:2Ks8/r6lopsxWi9m58nlwjaeSzUX9iiL1vj5qB/9ObI=
+github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU=
+github.com/mrunalp/fileutils v0.5.0 h1:NKzVxiH7eSk+OQ4M+ZYW1K6h27RUV3MI6NUTsHhU6Z4=
+github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
+github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 h1:3snG66yBm59tKhhSPQrQ/0bCrv1LQbKt40LnUPiUxdc=
+github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
+github.com/opencontainers/selinux v1.10.0 h1:rAiKF8hTcgLI3w0DHm6i0ylVVcOrlgR1kK99DRLDhyU=
+github.com/opencontainers/selinux v1.10.0/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/russross/blackfriday/v2 v2.0.1 h1:lPqVAte+HuHNfhJ/0LC98ESWRz8afy9tM/0RK8m9o+Q=
+github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
+github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921 h1:58EBmR2dMNL2n/FnbQewK3D14nXr0V9CObDSvMJLq+Y=
+github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg=
+github.com/shurcooL/sanitized_anchor_name v1.0.0 h1:PdmoCO6wvbs+7yrJyMORt4/BmY5IYyJwS/kOiWx8mHo=
+github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
+github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE=
+github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
+github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w=
+github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
+github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI=
+github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
+github.com/urfave/cli v1.22.1 h1:+mkCCcOFKPnCmVYVcURKps1Xe+3zP90gSYGNfRkjoIY=
+github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
+github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJH8j0=
+github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE=
+github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k=
+github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU=
+golang.org/x/net v0.0.0-20201224014010-6772e930b67b h1:iFwSg7t5GZmB/Q5TjiEAsdoLDrdJRC1RiF2WhuV29Qw=
+golang.org/x/net v0.0.0-20201224014010-6772e930b67b/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
+golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c h1:DHcbWVXeY+0Y8HHKR+rbLwnoh2F4tNCY7rTiHJ30RmA=
+golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
+golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
+google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
+google.golang.org/protobuf v1.27.1 h1:SnqbnDw1V7RiZcXPx5MEeqPv2s79L9i7BJUlG/+RurQ=
+google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
--- a/init.go
+++ b/init.go
@ -1,44 +1,37 @@
 package main

 import (
-	"fmt"
 	"os"
 	"runtime"
+	"strconv"

 	"github.com/opencontainers/runc/libcontainer"
-	"github.com/opencontainers/runc/libcontainer/logs"
 	_ "github.com/opencontainers/runc/libcontainer/nsenter"
 	"github.com/sirupsen/logrus"
-	"github.com/urfave/cli"
 )

 func init() {
 	if len(os.Args) > 1 && os.Args[1] == "init" {
+		// This is the golang entry point for runc init, executed
+		// before main() but after libcontainer/nsenter's nsexec().
 		runtime.GOMAXPROCS(1)
 		runtime.LockOSThread()

-		level := os.Getenv("_LIBCONTAINER_LOGLEVEL")
-		logLevel, err := logrus.ParseLevel(level)
+		level, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGLEVEL"))
 		if err != nil {
-			panic(fmt.Sprintf("libcontainer: failed to parse log level: %q: %v", level, err))
+			panic(err)
 		}

-		err = logs.ConfigureLogging(logs.Config{
-			LogPipeFd: os.Getenv("_LIBCONTAINER_LOGPIPE"),
-			LogFormat: "json",
-			LogLevel:  logLevel,
-		})
+		logPipeFd, err := strconv.Atoi(os.Getenv("_LIBCONTAINER_LOGPIPE"))
 		if err != nil {
-			panic(fmt.Sprintf("libcontainer: failed to configure logging: %v", err))
+			panic(err)
 		}
+
+		logrus.SetLevel(logrus.Level(level))
+		logrus.SetOutput(os.NewFile(uintptr(logPipeFd), "logpipe"))
+		logrus.SetFormatter(new(logrus.JSONFormatter))
 		logrus.Debug("child process in init()")
-	}
-}

-var initCommand = cli.Command{
-	Name:  "init",
-	Usage: `initialize the namespaces and launch the process (do not call it outside of runc)`,
-	Action: func(context *cli.Context) error {
 		factory, _ := libcontainer.New("")
 		if err := factory.StartInitialization(); err != nil {
 			// as the error is sent back to the parent there is no need to log
@ -46,5 +39,5 @@ var initCommand = cli.Command{
 			os.Exit(1)
 		}
 		panic("libcontainer: container init failed to exec")
-	},
+	}
 }
--- a/kill.go
+++ b/kill.go
@ -1,14 +1,12 @@
-// +build linux
-
 package main

 import (
 	"fmt"
 	"strconv"
 	"strings"
-	"syscall"

 	"github.com/urfave/cli"
+	"golang.org/x/sys/unix"
 )

 var killCommand = cli.Command{
@ -55,13 +53,17 @@ signal to the init process of the "ubuntu01" container:
 	},
 }

-func parseSignal(rawSignal string) (syscall.Signal, error) {
+func parseSignal(rawSignal string) (unix.Signal, error) {
 	s, err := strconv.Atoi(rawSignal)
 	if err == nil {
-		return syscall.Signal(s), nil
+		return unix.Signal(s), nil
 	}
-	signal, ok := signalMap[strings.TrimPrefix(strings.ToUpper(rawSignal), "SIG")]
-	if !ok {
+	sig := strings.ToUpper(rawSignal)
+	if !strings.HasPrefix(sig, "SIG") {
+		sig = "SIG" + sig
+	}
+	signal := unix.SignalNum(sig)
+	if signal == 0 {
 		return -1, fmt.Errorf("unknown signal %q", rawSignal)
 	}
 	return signal, nil
--- a/libcontainer/README.md
+++ b/libcontainer/README.md
@ -57,90 +57,94 @@ struct describing how the container is to be created. A sample would look simila

 ```go
 defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
+var devices []*configs.DeviceRule
+for _, device := range specconv.AllowedDevices {
+	devices = append(devices, &device.Rule)
+}
 config := &configs.Config{
 	Rootfs: "/your/path/to/rootfs",
 	Capabilities: &configs.Capabilities{
-                Bounding: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Effective: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Inheritable: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Permitted: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-                Ambient: []string{
-                        "CAP_CHOWN",
-                        "CAP_DAC_OVERRIDE",
-                        "CAP_FSETID",
-                        "CAP_FOWNER",
-                        "CAP_MKNOD",
-                        "CAP_NET_RAW",
-                        "CAP_SETGID",
-                        "CAP_SETUID",
-                        "CAP_SETFCAP",
-                        "CAP_SETPCAP",
-                        "CAP_NET_BIND_SERVICE",
-                        "CAP_SYS_CHROOT",
-                        "CAP_KILL",
-                        "CAP_AUDIT_WRITE",
-                },
-        },
+		Bounding: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Effective: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Inheritable: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Permitted: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+		Ambient: []string{
+			"CAP_CHOWN",
+			"CAP_DAC_OVERRIDE",
+			"CAP_FSETID",
+			"CAP_FOWNER",
+			"CAP_MKNOD",
+			"CAP_NET_RAW",
+			"CAP_SETGID",
+			"CAP_SETUID",
+			"CAP_SETFCAP",
+			"CAP_SETPCAP",
+			"CAP_NET_BIND_SERVICE",
+			"CAP_SYS_CHROOT",
+			"CAP_KILL",
+			"CAP_AUDIT_WRITE",
+		},
+	},
 	Namespaces: configs.Namespaces([]configs.Namespace{
 		{Type: configs.NEWNS},
 		{Type: configs.NEWUTS},
@ -155,8 +159,7 @@ config := &configs.Config{
 		Parent: "system",
 		Resources: &configs.Resources{
 			MemorySwappiness: nil,
-			AllowAllDevices:  nil,
-			AllowedDevices:   configs.DefaultAllowedDevices,
+			Devices:          devices,
 		},
 	},
 	MaskPaths: []string{
@ -166,7 +169,7 @@ config := &configs.Config{
 	ReadonlyPaths: []string{
 		"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
 	},
-	Devices:  configs.DefaultAutoCreatedDevices,
+	Devices:  specconv.AllowedDevices,
 	Hostname: "testing",
 	Mounts: []*configs.Mount{
 		{
@ -314,7 +317,7 @@ state, err := container.State()
 #### Checkpoint & Restore

 libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
-This let's you save the state of a process running inside a container to disk, and then restore
+This lets you save the state of a process running inside a container to disk, and then restore
 that state into a new process, on the same machine or on another machine.

 `criu` version 1.5.2 or higher is required to use checkpoint and restore.
--- a/libcontainer/apparmor/apparmor.go
+++ b/libcontainer/apparmor/apparmor.go
@ -1,60 +1,16 @@
-// +build apparmor,linux
-
 package apparmor

-import (
-	"fmt"
-	"io/ioutil"
-	"os"
+import "errors"

-	"github.com/opencontainers/runc/libcontainer/utils"
+var (
+	// IsEnabled returns true if apparmor is enabled for the host.
+	IsEnabled = isEnabled
+
+	// ApplyProfile will apply the profile with the specified name to the process after
+	// the next exec. It is only supported on Linux and produces an ErrApparmorNotEnabled
+	// on other platforms.
+	ApplyProfile = applyProfile
+
+	// ErrApparmorNotEnabled indicates that AppArmor is not enabled or not supported.
+	ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
 )
-
-// IsEnabled returns true if apparmor is enabled for the host.
-func IsEnabled() bool {
-	if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" {
-		if _, err = os.Stat("/sbin/apparmor_parser"); err == nil {
-			buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
-			return err == nil && len(buf) > 1 && buf[0] == 'Y'
-		}
-	}
-	return false
-}
-
-func setProcAttr(attr, value string) error {
-	// Under AppArmor you can only change your own attr, so use /proc/self/
-	// instead of /proc/<tid>/ like libapparmor does
-	path := fmt.Sprintf("/proc/self/attr/%s", attr)
-
-	f, err := os.OpenFile(path, os.O_WRONLY, 0)
-	if err != nil {
-		return err
-	}
-	defer f.Close()
-
-	if err := utils.EnsureProcHandle(f); err != nil {
-		return err
-	}
-
-	_, err = fmt.Fprintf(f, "%s", value)
-	return err
-}
-
-// changeOnExec reimplements aa_change_onexec from libapparmor in Go
-func changeOnExec(name string) error {
-	value := "exec " + name
-	if err := setProcAttr("exec", value); err != nil {
-		return fmt.Errorf("apparmor failed to apply profile: %s", err)
-	}
-	return nil
-}
-
-// ApplyProfile will apply the profile with the specified name to the process after
-// the next exec.
-func ApplyProfile(name string) error {
-	if name == "" {
-		return nil
-	}
-
-	return changeOnExec(name)
-}
--- a/libcontainer/apparmor/apparmor_disabled.go
+++ b/libcontainer/apparmor/apparmor_disabled.go
@ -1,20 +0,0 @@
-// +build !apparmor !linux
-
-package apparmor
-
-import (
-	"errors"
-)
-
-var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
-
-func IsEnabled() bool {
-	return false
-}
-
-func ApplyProfile(name string) error {
-	if name != "" {
-		return ErrApparmorNotEnabled
-	}
-	return nil
-}
--- a/libcontainer/apparmor/apparmor_linux.go
+++ b/libcontainer/apparmor/apparmor_linux.go
@ -0,0 +1,68 @@
+package apparmor
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"sync"
+
+	"github.com/opencontainers/runc/libcontainer/utils"
+)
+
+var (
+	appArmorEnabled bool
+	checkAppArmor   sync.Once
+)
+
+// isEnabled returns true if apparmor is enabled for the host.
+func isEnabled() bool {
+	checkAppArmor.Do(func() {
+		if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil {
+			buf, err := os.ReadFile("/sys/module/apparmor/parameters/enabled")
+			appArmorEnabled = err == nil && len(buf) > 1 && buf[0] == 'Y'
+		}
+	})
+	return appArmorEnabled
+}
+
+func setProcAttr(attr, value string) error {
+	// Under AppArmor you can only change your own attr, so use /proc/self/
+	// instead of /proc/<tid>/ like libapparmor does
+	attrPath := "/proc/self/attr/apparmor/" + attr
+	if _, err := os.Stat(attrPath); errors.Is(err, os.ErrNotExist) {
+		// fall back to the old convention
+		attrPath = "/proc/self/attr/" + attr
+	}
+
+	f, err := os.OpenFile(attrPath, os.O_WRONLY, 0)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	if err := utils.EnsureProcHandle(f); err != nil {
+		return err
+	}
+
+	_, err = f.WriteString(value)
+	return err
+}
+
+// changeOnExec reimplements aa_change_onexec from libapparmor in Go
+func changeOnExec(name string) error {
+	if err := setProcAttr("exec", "exec "+name); err != nil {
+		return fmt.Errorf("apparmor failed to apply profile: %w", err)
+	}
+	return nil
+}
+
+// applyProfile will apply the profile with the specified name to the process after
+// the next exec. It is only supported on Linux and produces an error on other
+// platforms.
+func applyProfile(name string) error {
+	if name == "" {
+		return nil
+	}
+
+	return changeOnExec(name)
+}
--- a/libcontainer/apparmor/apparmor_unsupported.go
+++ b/libcontainer/apparmor/apparmor_unsupported.go
@ -0,0 +1,15 @@
+//go:build !linux
+// +build !linux
+
+package apparmor
+
+func isEnabled() bool {
+	return false
+}
+
+func applyProfile(name string) error {
+	if name != "" {
+		return ErrApparmorNotEnabled
+	}
+	return nil
+}
--- a/libcontainer/capabilities/capabilities.go
+++ b/libcontainer/capabilities/capabilities.go
@ -0,0 +1,123 @@
+//go:build linux
+// +build linux
+
+package capabilities
+
+import (
+	"sort"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/sirupsen/logrus"
+	"github.com/syndtr/gocapability/capability"
+)
+
+const allCapabilityTypes = capability.CAPS | capability.BOUNDING | capability.AMBIENT
+
+var (
+	capabilityMap map[string]capability.Cap
+	capTypes      = []capability.CapType{
+		capability.BOUNDING,
+		capability.PERMITTED,
+		capability.INHERITABLE,
+		capability.EFFECTIVE,
+		capability.AMBIENT,
+	}
+)
+
+func init() {
+	capabilityMap = make(map[string]capability.Cap, capability.CAP_LAST_CAP+1)
+	for _, c := range capability.List() {
+		if c > capability.CAP_LAST_CAP {
+			continue
+		}
+		capabilityMap["CAP_"+strings.ToUpper(c.String())] = c
+	}
+}
+
+// KnownCapabilities returns the list of the known capabilities.
+// Used by `runc features`.
+func KnownCapabilities() []string {
+	list := capability.List()
+	res := make([]string, len(list))
+	for i, c := range list {
+		res[i] = "CAP_" + strings.ToUpper(c.String())
+	}
+	return res
+}
+
+// New creates a new Caps from the given Capabilities config. Unknown Capabilities
+// or Capabilities that are unavailable in the current environment are ignored,
+// printing a warning instead.
+func New(capConfig *configs.Capabilities) (*Caps, error) {
+	var (
+		err error
+		c   Caps
+	)
+
+	unknownCaps := make(map[string]struct{})
+	c.caps = map[capability.CapType][]capability.Cap{
+		capability.BOUNDING:    capSlice(capConfig.Bounding, unknownCaps),
+		capability.EFFECTIVE:   capSlice(capConfig.Effective, unknownCaps),
+		capability.INHERITABLE: capSlice(capConfig.Inheritable, unknownCaps),
+		capability.PERMITTED:   capSlice(capConfig.Permitted, unknownCaps),
+		capability.AMBIENT:     capSlice(capConfig.Ambient, unknownCaps),
+	}
+	if c.pid, err = capability.NewPid2(0); err != nil {
+		return nil, err
+	}
+	if err = c.pid.Load(); err != nil {
+		return nil, err
+	}
+	if len(unknownCaps) > 0 {
+		logrus.Warn("ignoring unknown or unavailable capabilities: ", mapKeys(unknownCaps))
+	}
+	return &c, nil
+}
+
+// capSlice converts the slice of capability names in caps, to their numeric
+// equivalent, and returns them as a slice. Unknown or unavailable capabilities
+// are not returned, but appended to unknownCaps.
+func capSlice(caps []string, unknownCaps map[string]struct{}) []capability.Cap {
+	var out []capability.Cap
+	for _, c := range caps {
+		if v, ok := capabilityMap[c]; !ok {
+			unknownCaps[c] = struct{}{}
+		} else {
+			out = append(out, v)
+		}
+	}
+	return out
+}
+
+// mapKeys returns the keys of input in sorted order
+func mapKeys(input map[string]struct{}) []string {
+	var keys []string
+	for c := range input {
+		keys = append(keys, c)
+	}
+	sort.Strings(keys)
+	return keys
+}
+
+// Caps holds the capabilities for a container.
+type Caps struct {
+	pid  capability.Capabilities
+	caps map[capability.CapType][]capability.Cap
+}
+
+// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
+func (c *Caps) ApplyBoundingSet() error {
+	c.pid.Clear(capability.BOUNDING)
+	c.pid.Set(capability.BOUNDING, c.caps[capability.BOUNDING]...)
+	return c.pid.Apply(capability.BOUNDING)
+}
+
+// Apply sets all the capabilities for the current process in the config.
+func (c *Caps) ApplyCaps() error {
+	c.pid.Clear(allCapabilityTypes)
+	for _, g := range capTypes {
+		c.pid.Set(g, c.caps[g]...)
+	}
+	return c.pid.Apply(allCapabilityTypes)
+}
--- a/libcontainer/capabilities/capabilities_linux_test.go
+++ b/libcontainer/capabilities/capabilities_linux_test.go
@ -0,0 +1,71 @@
+package capabilities
+
+import (
+	"io"
+	"os"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/sirupsen/logrus"
+	"github.com/sirupsen/logrus/hooks/test"
+	"github.com/syndtr/gocapability/capability"
+)
+
+func TestNew(t *testing.T) {
+	cs := []string{"CAP_CHOWN", "CAP_UNKNOWN", "CAP_UNKNOWN2"}
+	conf := configs.Capabilities{
+		Bounding:    cs,
+		Effective:   cs,
+		Inheritable: cs,
+		Permitted:   cs,
+		Ambient:     cs,
+	}
+
+	hook := test.NewGlobal()
+	defer hook.Reset()
+
+	logrus.SetOutput(io.Discard)
+	caps, err := New(&conf)
+	logrus.SetOutput(os.Stderr)
+
+	if err != nil {
+		t.Error(err)
+	}
+	e := hook.AllEntries()
+	if len(e) != 1 {
+		t.Errorf("expected 1 warning, got %d", len(e))
+	}
+
+	expectedLogs := logrus.Entry{
+		Level:   logrus.WarnLevel,
+		Message: "ignoring unknown or unavailable capabilities: [CAP_UNKNOWN CAP_UNKNOWN2]",
+	}
+
+	l := hook.LastEntry()
+	if l == nil {
+		t.Fatal("expected a warning, but got none")
+	}
+	if l.Level != expectedLogs.Level {
+		t.Errorf("expected %q, got %q", expectedLogs.Level, l.Level)
+	}
+	if l.Message != expectedLogs.Message {
+		t.Errorf("expected %q, got %q", expectedLogs.Message, l.Message)
+	}
+
+	if len(caps.caps) != len(capTypes) {
+		t.Errorf("expected %d capability types, got %d: %v", len(capTypes), len(caps.caps), caps.caps)
+	}
+
+	for _, cType := range capTypes {
+		if i := len(caps.caps[cType]); i != 1 {
+			t.Errorf("expected 1 capability for %s, got %d: %v", cType, i, caps.caps[cType])
+			continue
+		}
+		if caps.caps[cType][0] != capability.CAP_CHOWN {
+			t.Errorf("expected CAP_CHOWN, got %s: ", caps.caps[cType][0])
+			continue
+		}
+	}
+
+	hook.Reset()
+}
--- a/libcontainer/capabilities/capabilities_unsupported.go
+++ b/libcontainer/capabilities/capabilities_unsupported.go
@ -0,0 +1,4 @@
+//go:build !linux
+// +build !linux
+
+package capabilities
--- a/libcontainer/capabilities_linux.go
+++ b/libcontainer/capabilities_linux.go
@ -1,117 +0,0 @@
-// +build linux
-
-package libcontainer
-
-import (
-	"fmt"
-	"strings"
-
-	"github.com/opencontainers/runc/libcontainer/configs"
-	"github.com/syndtr/gocapability/capability"
-)
-
-const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS
-
-var capabilityMap map[string]capability.Cap
-
-func init() {
-	capabilityMap = make(map[string]capability.Cap)
-	last := capability.CAP_LAST_CAP
-	// workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
-	if last == capability.Cap(63) {
-		last = capability.CAP_BLOCK_SUSPEND
-	}
-	for _, cap := range capability.List() {
-		if cap > last {
-			continue
-		}
-		capKey := fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String()))
-		capabilityMap[capKey] = cap
-	}
-}
-
-func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) {
-	bounding := []capability.Cap{}
-	for _, c := range capConfig.Bounding {
-		v, ok := capabilityMap[c]
-		if !ok {
-			return nil, fmt.Errorf("unknown capability %q", c)
-		}
-		bounding = append(bounding, v)
-	}
-	effective := []capability.Cap{}
-	for _, c := range capConfig.Effective {
-		v, ok := capabilityMap[c]
-		if !ok {
-			return nil, fmt.Errorf("unknown capability %q", c)
-		}
-		effective = append(effective, v)
-	}
-	inheritable := []capability.Cap{}
-	for _, c := range capConfig.Inheritable {
-		v, ok := capabilityMap[c]
-		if !ok {
-			return nil, fmt.Errorf("unknown capability %q", c)
-		}
-		inheritable = append(inheritable, v)
-	}
-	permitted := []capability.Cap{}
-	for _, c := range capConfig.Permitted {
-		v, ok := capabilityMap[c]
-		if !ok {
-			return nil, fmt.Errorf("unknown capability %q", c)
-		}
-		permitted = append(permitted, v)
-	}
-	ambient := []capability.Cap{}
-	for _, c := range capConfig.Ambient {
-		v, ok := capabilityMap[c]
-		if !ok {
-			return nil, fmt.Errorf("unknown capability %q", c)
-		}
-		ambient = append(ambient, v)
-	}
-	pid, err := capability.NewPid2(0)
-	if err != nil {
-		return nil, err
-	}
-	err = pid.Load()
-	if err != nil {
-		return nil, err
-	}
-	return &containerCapabilities{
-		bounding:    bounding,
-		effective:   effective,
-		inheritable: inheritable,
-		permitted:   permitted,
-		ambient:     ambient,
-		pid:         pid,
-	}, nil
-}
-
-type containerCapabilities struct {
-	pid         capability.Capabilities
-	bounding    []capability.Cap
-	effective   []capability.Cap
-	inheritable []capability.Cap
-	permitted   []capability.Cap
-	ambient     []capability.Cap
-}
-
-// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
-func (c *containerCapabilities) ApplyBoundingSet() error {
-	c.pid.Clear(capability.BOUNDS)
-	c.pid.Set(capability.BOUNDS, c.bounding...)
-	return c.pid.Apply(capability.BOUNDS)
-}
-
-// Apply sets all the capabilities for the current process in the config.
-func (c *containerCapabilities) ApplyCaps() error {
-	c.pid.Clear(allCapabilityTypes)
-	c.pid.Set(capability.BOUNDS, c.bounding...)
-	c.pid.Set(capability.PERMITTED, c.permitted...)
-	c.pid.Set(capability.INHERITABLE, c.inheritable...)
-	c.pid.Set(capability.EFFECTIVE, c.effective...)
-	c.pid.Set(capability.AMBIENT, c.ambient...)
-	return c.pid.Apply(allCapabilityTypes)
-}
--- a/libcontainer/cgroups/cgroups.go
+++ b/libcontainer/cgroups/cgroups.go
@ -1,74 +1,59 @@
-// +build linux
-
 package cgroups

 import (
-	"fmt"
-
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

 type Manager interface {
-	// Applies cgroup configuration to the process with the specified pid
+	// Apply creates a cgroup, if not yet created, and adds a process
+	// with the specified pid into that cgroup.  A special value of -1
+	// can be used to merely create a cgroup.
 	Apply(pid int) error

-	// Returns the PIDs inside the cgroup set
+	// GetPids returns the PIDs of all processes inside the cgroup.
 	GetPids() ([]int, error)

-	// Returns the PIDs inside the cgroup set & all sub-cgroups
+	// GetAllPids returns the PIDs of all processes inside the cgroup
+	// any all its sub-cgroups.
 	GetAllPids() ([]int, error)

-	// Returns statistics for the cgroup set
+	// GetStats returns cgroups statistics.
 	GetStats() (*Stats, error)

-	// Toggles the freezer cgroup according with specified state
+	// Freeze sets the freezer cgroup to the specified state.
 	Freeze(state configs.FreezerState) error

-	// Destroys the cgroup set
+	// Destroy removes cgroup.
 	Destroy() error

-	// The option func SystemdCgroups() and Cgroupfs() require following attributes:
-	// 	Paths   map[string]string
-	// 	Cgroups *configs.Cgroup
-	// Paths maps cgroup subsystem to path at which it is mounted.
-	// Cgroups specifies specific cgroup settings for the various subsystems
+	// Path returns a cgroup path to the specified controller/subsystem.
+	// For cgroupv2, the argument is unused and can be empty.
+	Path(string) string

-	// Returns cgroup paths to save in a state file and to be able to
-	// restore the object later.
+	// Set sets cgroup resources parameters/limits. If the argument is nil,
+	// the resources specified during Manager creation (or the previous call
+	// to Set) are used.
+	Set(r *configs.Resources) error
+
+	// GetPaths returns cgroup path(s) to save in a state file in order to
+	// restore later.
+	//
+	// For cgroup v1, a key is cgroup subsystem name, and the value is the
+	// path to the cgroup for this subsystem.
+	//
+	// For cgroup v2 unified hierarchy, a key is "", and the value is the
+	// unified path.
 	GetPaths() map[string]string

-	// GetUnifiedPath returns the unified path when running in unified mode.
-	// The value corresponds to the all values of GetPaths() map.
-	//
-	// GetUnifiedPath returns error when running in hybrid mode as well as
-	// in legacy mode.
-	GetUnifiedPath() (string, error)
-
-	// Sets the cgroup as configured.
-	Set(container *configs.Config) error
-
-	// Gets the cgroup as configured.
+	// GetCgroups returns the cgroup data as configured.
 	GetCgroups() (*configs.Cgroup, error)
-}

-type NotFoundError struct {
-	Subsystem string
-}
+	// GetFreezerState retrieves the current FreezerState of the cgroup.
+	GetFreezerState() (configs.FreezerState, error)

-func (e *NotFoundError) Error() string {
-	return fmt.Sprintf("mountpoint for %s not found", e.Subsystem)
-}
+	// Exists returns whether the cgroup path exists or not.
+	Exists() bool

-func NewNotFoundError(sub string) error {
-	return &NotFoundError{
-		Subsystem: sub,
-	}
-}
-
-func IsNotFound(err error) bool {
-	if err == nil {
-		return false
-	}
-	_, ok := err.(*NotFoundError)
-	return ok
+	// OOMKillCount reports OOM kill count for the cgroup.
+	OOMKillCount() (uint64, error)
 }
--- a/libcontainer/cgroups/cgroups_test.go
+++ b/libcontainer/cgroups/cgroups_test.go
@ -1,5 +1,3 @@
-// +build linux
-
 package cgroups

 import (
--- a/libcontainer/cgroups/cgroups_unsupported.go
+++ b/libcontainer/cgroups/cgroups_unsupported.go
@ -1,3 +0,0 @@
-// +build !linux
-
-package cgroups
--- a/libcontainer/cgroups/devices/devices_emulator.go
+++ b/libcontainer/cgroups/devices/devices_emulator.go
@ -0,0 +1,386 @@
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2020 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package devices
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"sort"
+	"strconv"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/devices"
+)
+
+// deviceMeta is a Rule without the Allow or Permissions fields, and no
+// wildcard-type support. It's effectively the "match" portion of a metadata
+// rule, for the purposes of our emulation.
+type deviceMeta struct {
+	node  devices.Type
+	major int64
+	minor int64
+}
+
+// deviceRule is effectively the tuple (deviceMeta, Permissions).
+type deviceRule struct {
+	meta  deviceMeta
+	perms devices.Permissions
+}
+
+// deviceRules is a mapping of device metadata rules to the associated
+// permissions in the ruleset.
+type deviceRules map[deviceMeta]devices.Permissions
+
+func (r deviceRules) orderedEntries() []deviceRule {
+	var rules []deviceRule
+	for meta, perms := range r {
+		rules = append(rules, deviceRule{meta: meta, perms: perms})
+	}
+	sort.Slice(rules, func(i, j int) bool {
+		// Sort by (major, minor, type).
+		a, b := rules[i].meta, rules[j].meta
+		return a.major < b.major ||
+			(a.major == b.major && a.minor < b.minor) ||
+			(a.major == b.major && a.minor == b.minor && a.node < b.node)
+	})
+	return rules
+}
+
+type Emulator struct {
+	defaultAllow bool
+	rules        deviceRules
+}
+
+func (e *Emulator) IsBlacklist() bool {
+	return e.defaultAllow
+}
+
+func (e *Emulator) IsAllowAll() bool {
+	return e.IsBlacklist() && len(e.rules) == 0
+}
+
+func parseLine(line string) (*deviceRule, error) {
+	// Input: node major:minor perms.
+	fields := strings.FieldsFunc(line, func(r rune) bool {
+		return r == ' ' || r == ':'
+	})
+	if len(fields) != 4 {
+		return nil, fmt.Errorf("malformed devices.list rule %s", line)
+	}
+
+	var (
+		rule  deviceRule
+		node  = fields[0]
+		major = fields[1]
+		minor = fields[2]
+		perms = fields[3]
+	)
+
+	// Parse the node type.
+	switch node {
+	case "a":
+		// Super-special case -- "a" always means every device with every
+		// access mode. In fact, for devices.list this actually indicates that
+		// the cgroup is in black-list mode.
+		// TODO: Double-check that the entire file is "a *:* rwm".
+		return nil, nil
+	case "b":
+		rule.meta.node = devices.BlockDevice
+	case "c":
+		rule.meta.node = devices.CharDevice
+	default:
+		return nil, fmt.Errorf("unknown device type %q", node)
+	}
+
+	// Parse the major number.
+	if major == "*" {
+		rule.meta.major = devices.Wildcard
+	} else {
+		val, err := strconv.ParseUint(major, 10, 32)
+		if err != nil {
+			return nil, fmt.Errorf("invalid major number: %w", err)
+		}
+		rule.meta.major = int64(val)
+	}
+
+	// Parse the minor number.
+	if minor == "*" {
+		rule.meta.minor = devices.Wildcard
+	} else {
+		val, err := strconv.ParseUint(minor, 10, 32)
+		if err != nil {
+			return nil, fmt.Errorf("invalid minor number: %w", err)
+		}
+		rule.meta.minor = int64(val)
+	}
+
+	// Parse the access permissions.
+	rule.perms = devices.Permissions(perms)
+	if !rule.perms.IsValid() || rule.perms.IsEmpty() {
+		return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
+	}
+	return &rule, nil
+}
+
+func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam
+	if e.rules == nil {
+		e.rules = make(map[deviceMeta]devices.Permissions)
+	}
+
+	// Merge with any pre-existing permissions.
+	oldPerms := e.rules[rule.meta]
+	newPerms := rule.perms.Union(oldPerms)
+	e.rules[rule.meta] = newPerms
+	return nil
+}
+
+func (e *Emulator) rmRule(rule deviceRule) error {
+	// Give an error if any of the permissions requested to be removed are
+	// present in a partially-matching wildcard rule, because such rules will
+	// be ignored by cgroupv1.
+	//
+	// This is a diversion from cgroupv1, but is necessary to avoid leading
+	// users into a false sense of security. cgroupv1 will silently(!) ignore
+	// requests to remove partial exceptions, but we really shouldn't do that.
+	//
+	// It may seem like we could just "split" wildcard rules which hit this
+	// issue, but unfortunately there are 2^32 possible major and minor
+	// numbers, which would exhaust kernel memory quickly if we did this. Not
+	// to mention it'd be really slow (the kernel side is implemented as a
+	// linked-list of exceptions).
+	for _, partialMeta := range []deviceMeta{
+		{node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor},
+		{node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard},
+		{node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard},
+	} {
+		// This wildcard rule is equivalent to the requested rule, so skip it.
+		if rule.meta == partialMeta {
+			continue
+		}
+		// Only give an error if the set of permissions overlap.
+		partialPerms := e.rules[partialMeta]
+		if !partialPerms.Intersection(rule.perms).IsEmpty() {
+			return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
+		}
+	}
+
+	// Subtract all of the permissions listed from the full match rule. If the
+	// rule didn't exist, all of this is a no-op.
+	newPerms := e.rules[rule.meta].Difference(rule.perms)
+	if newPerms.IsEmpty() {
+		delete(e.rules, rule.meta)
+	} else {
+		e.rules[rule.meta] = newPerms
+	}
+	// TODO: The actual cgroup code doesn't care if an exception didn't exist
+	//       during removal, so not erroring out here is /accurate/ but quite
+	//       worrying. Maybe we should do additional validation, but again we
+	//       have to worry about backwards-compatibility.
+	return nil
+}
+
+func (e *Emulator) allow(rule *deviceRule) error {
+	// This cgroup is configured as a black-list. Reset the entire emulator,
+	// and put is into black-list mode.
+	if rule == nil || rule.meta.node == devices.WildcardDevice {
+		*e = Emulator{
+			defaultAllow: true,
+			rules:        nil,
+		}
+		return nil
+	}
+
+	var err error
+	if e.defaultAllow {
+		err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception")
+	} else {
+		err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception")
+	}
+	return err
+}
+
+func (e *Emulator) deny(rule *deviceRule) error {
+	// This cgroup is configured as a white-list. Reset the entire emulator,
+	// and put is into white-list mode.
+	if rule == nil || rule.meta.node == devices.WildcardDevice {
+		*e = Emulator{
+			defaultAllow: false,
+			rules:        nil,
+		}
+		return nil
+	}
+
+	var err error
+	if e.defaultAllow {
+		err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception")
+	} else {
+		err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception")
+	}
+	return err
+}
+
+func (e *Emulator) Apply(rule devices.Rule) error {
+	if !rule.Type.CanCgroup() {
+		return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
+	}
+
+	innerRule := &deviceRule{
+		meta: deviceMeta{
+			node:  rule.Type,
+			major: rule.Major,
+			minor: rule.Minor,
+		},
+		perms: rule.Permissions,
+	}
+	if innerRule.meta.node == devices.WildcardDevice {
+		innerRule = nil
+	}
+
+	if rule.Allow {
+		return e.allow(innerRule)
+	}
+
+	return e.deny(innerRule)
+}
+
+// EmulatorFromList takes a reader to a "devices.list"-like source, and returns
+// a new Emulator that represents the state of the devices cgroup. Note that
+// black-list devices cgroups cannot be fully reconstructed, due to limitations
+// in the devices cgroup API. Instead, such cgroups are always treated as
+// "allow all" cgroups.
+func EmulatorFromList(list io.Reader) (*Emulator, error) {
+	// Normally cgroups are in black-list mode by default, but the way we
+	// figure out the current mode is whether or not devices.list has an
+	// allow-all rule. So we default to a white-list, and the existence of an
+	// "a *:* rwm" entry will tell us otherwise.
+	e := &Emulator{
+		defaultAllow: false,
+	}
+
+	// Parse the "devices.list".
+	s := bufio.NewScanner(list)
+	for s.Scan() {
+		line := s.Text()
+		deviceRule, err := parseLine(line)
+		if err != nil {
+			return nil, fmt.Errorf("error parsing line %q: %w", line, err)
+		}
+		// "devices.list" is an allow list. Note that this means that in
+		// black-list mode, we have no idea what rules are in play. As a
+		// result, we need to be very careful in Transition().
+		if err := e.allow(deviceRule); err != nil {
+			return nil, fmt.Errorf("error adding devices.list rule: %w", err)
+		}
+	}
+	if err := s.Err(); err != nil {
+		return nil, fmt.Errorf("error reading devices.list lines: %w", err)
+	}
+	return e, nil
+}
+
+// Transition calculates what is the minimally-disruptive set of rules need to
+// be applied to a devices cgroup in order to transition to the given target.
+// This means that any already-existing rules will not be applied, and
+// disruptive rules (like denying all device access) will only be applied if
+// necessary.
+//
+// This function is the sole reason for all of Emulator -- to allow us
+// to figure out how to update a containers' cgroups without causing spurious
+// device errors (if possible).
+func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
+	var transitionRules []*devices.Rule
+	oldRules := source.rules
+
+	// If the default policy doesn't match, we need to include a "disruptive"
+	// rule (either allow-all or deny-all) in order to switch the cgroup to the
+	// correct default policy.
+	//
+	// However, due to a limitation in "devices.list" we cannot be sure what
+	// deny rules are in place in a black-list cgroup. Thus if the source is a
+	// black-list we also have to include a disruptive rule.
+	if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
+		transitionRules = append(transitionRules, &devices.Rule{
+			Type:        'a',
+			Major:       -1,
+			Minor:       -1,
+			Permissions: devices.Permissions("rwm"),
+			Allow:       target.defaultAllow,
+		})
+		// The old rules are only relevant if we aren't starting out with a
+		// disruptive rule.
+		oldRules = nil
+	}
+
+	// NOTE: We traverse through the rules in a sorted order so we always write
+	//       the same set of rules (this is to aid testing).
+
+	// First, we create inverse rules for any old rules not in the new set.
+	// This includes partial-inverse rules for specific permissions. This is a
+	// no-op if we added a disruptive rule, since oldRules will be empty.
+	for _, rule := range oldRules.orderedEntries() {
+		meta, oldPerms := rule.meta, rule.perms
+		newPerms := target.rules[meta]
+		droppedPerms := oldPerms.Difference(newPerms)
+		if !droppedPerms.IsEmpty() {
+			transitionRules = append(transitionRules, &devices.Rule{
+				Type:        meta.node,
+				Major:       meta.major,
+				Minor:       meta.minor,
+				Permissions: droppedPerms,
+				Allow:       target.defaultAllow,
+			})
+		}
+	}
+
+	// Add any additional rules which weren't in the old set. We happen to
+	// filter out rules which are present in both sets, though this isn't
+	// strictly necessary.
+	for _, rule := range target.rules.orderedEntries() {
+		meta, newPerms := rule.meta, rule.perms
+		oldPerms := oldRules[meta]
+		gainedPerms := newPerms.Difference(oldPerms)
+		if !gainedPerms.IsEmpty() {
+			transitionRules = append(transitionRules, &devices.Rule{
+				Type:        meta.node,
+				Major:       meta.major,
+				Minor:       meta.minor,
+				Permissions: gainedPerms,
+				Allow:       !target.defaultAllow,
+			})
+		}
+	}
+	return transitionRules, nil
+}
+
+// Rules returns the minimum set of rules necessary to convert a *deny-all*
+// cgroup to the emulated filter state (note that this is not the same as a
+// default cgroupv1 cgroup -- which is allow-all). This is effectively just a
+// wrapper around Transition() with the source emulator being an empty cgroup.
+func (e *Emulator) Rules() ([]*devices.Rule, error) {
+	defaultCgroup := &Emulator{defaultAllow: false}
+	return defaultCgroup.Transition(e)
+}
+
+func wrapErr(err error, text string) error {
+	if err == nil {
+		return nil
+	}
+	return fmt.Errorf(text+": %w", err)
+}
--- a/libcontainer/cgroups/devices/devices_emulator_test.go
+++ b/libcontainer/cgroups/devices/devices_emulator_test.go
--- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
+++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
@ -1,4 +1,4 @@
-// Package devicefilter containes eBPF device filter program
+// Package devicefilter contains eBPF device filter program
 //
 // The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
 //
@ -7,12 +7,14 @@
 package devicefilter

 import (
+	"errors"
 	"fmt"
 	"math"
+	"strconv"

 	"github.com/cilium/ebpf/asm"
-	"github.com/opencontainers/runc/libcontainer/configs"
-	"github.com/pkg/errors"
+	devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
+	"github.com/opencontainers/runc/libcontainer/devices"
 	"golang.org/x/sys/unix"
 )

@ -22,22 +24,54 @@ const (
 )

 // DeviceFilter returns eBPF device filter program and its license string
-func DeviceFilter(devices []*configs.Device) (asm.Instructions, string, error) {
-	p := &program{}
-	p.init()
-	for i := len(devices) - 1; i >= 0; i-- {
-		if err := p.appendDevice(devices[i]); err != nil {
+func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
+	// Generate the minimum ruleset for the device rules we are given. While we
+	// don't care about minimum transitions in cgroupv2, using the emulator
+	// gives us a guarantee that the behaviour of devices filtering is the same
+	// as cgroupv1, including security hardenings to avoid misconfiguration
+	// (such as punching holes in wildcard rules).
+	emu := new(devicesemulator.Emulator)
+	for _, rule := range rules {
+		if err := emu.Apply(*rule); err != nil {
 			return nil, "", err
 		}
 	}
-	insts, err := p.finalize()
-	return insts, license, err
+	cleanRules, err := emu.Rules()
+	if err != nil {
+		return nil, "", err
+	}
+
+	p := &program{
+		defaultAllow: emu.IsBlacklist(),
+	}
+	p.init()
+
+	for idx, rule := range cleanRules {
+		if rule.Type == devices.WildcardDevice {
+			// We can safely skip over wildcard entries because there should
+			// only be one (at most) at the very start to instruct cgroupv1 to
+			// go into allow-list mode. However we do double-check this here.
+			if idx != 0 || rule.Allow != emu.IsBlacklist() {
+				return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
+			}
+			continue
+		}
+		if rule.Allow == p.defaultAllow {
+			// There should be no rules which have an action equal to the
+			// default action, the emulator removes those.
+			return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
+		}
+		if err := p.appendRule(rule); err != nil {
+			return nil, "", err
+		}
+	}
+	return p.finalize(), license, nil
 }

 type program struct {
-	insts       asm.Instructions
-	hasWildCard bool
-	blockID     int
+	insts        asm.Instructions
+	defaultAllow bool
+	blockID      int
 }

 func (p *program) init() {
@ -49,7 +83,8 @@ func (p *program) init() {
 	*/
 	// R2 <- type (lower 16 bit of u32 access_type at R1[0])
 	p.insts = append(p.insts,
-		asm.LoadMem(asm.R2, asm.R1, 0, asm.Half))
+		asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
+		asm.And.Imm32(asm.R2, 0xFFFF))

 	// R3 <- access (upper 16 bit of u32 access_type at R1[0])
 	p.insts = append(p.insts,
@ -66,39 +101,35 @@ func (p *program) init() {
 		asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
 }

-// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
-func (p *program) appendDevice(dev *configs.Device) error {
+// appendRule rule converts an OCI rule to the relevant eBPF block and adds it
+// to the in-progress filter program. In order to operate properly, it must be
+// called with a "clean" rule list (generated by devices.Emulator.Rules() --
+// with any "a" rules removed).
+func (p *program) appendRule(rule *devices.Rule) error {
 	if p.blockID < 0 {
 		return errors.New("the program is finalized")
 	}
-	if p.hasWildCard {
-		// All entries after wildcard entry are ignored
-		return nil
-	}

-	bpfType := int32(-1)
-	hasType := true
-	switch dev.Type {
-	case 'c':
+	var bpfType int32
+	switch rule.Type {
+	case devices.CharDevice:
 		bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
-	case 'b':
+	case devices.BlockDevice:
 		bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
-	case 'a':
-		hasType = false
 	default:
-		// if not specified in OCI json, typ is set to DeviceTypeAll
-		return errors.Errorf("invalid DeviceType %q", string(dev.Type))
+		// We do not permit 'a', nor any other types we don't know about.
+		return fmt.Errorf("invalid type %q", string(rule.Type))
 	}
-	if dev.Major > math.MaxUint32 {
-		return errors.Errorf("invalid major %d", dev.Major)
+	if rule.Major > math.MaxUint32 {
+		return fmt.Errorf("invalid major %d", rule.Major)
 	}
-	if dev.Minor > math.MaxUint32 {
-		return errors.Errorf("invalid minor %d", dev.Major)
+	if rule.Minor > math.MaxUint32 {
+		return fmt.Errorf("invalid minor %d", rule.Major)
 	}
-	hasMajor := dev.Major >= 0 // if not specified in OCI json, major is set to -1
-	hasMinor := dev.Minor >= 0
+	hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
+	hasMinor := rule.Minor >= 0
 	bpfAccess := int32(0)
-	for _, r := range dev.Permissions {
+	for _, r := range rule.Permissions {
 		switch r {
 		case 'r':
 			bpfAccess |= unix.BPF_DEVCG_ACC_READ
@ -107,68 +138,65 @@ func (p *program) appendDevice(dev *configs.Device) error {
 		case 'm':
 			bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
 		default:
-			return errors.Errorf("unknown device access %v", r)
+			return fmt.Errorf("unknown device access %v", r)
 		}
 	}
 	// If the access is rwm, skip the check.
 	hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)

-	blockSym := fmt.Sprintf("block-%d", p.blockID)
-	nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1)
-	prevBlockLastIdx := len(p.insts) - 1
-	if hasType {
-		p.insts = append(p.insts,
-			// if (R2 != bpfType) goto next
-			asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
-		)
-	}
+	var (
+		blockSym         = "block-" + strconv.Itoa(p.blockID)
+		nextBlockSym     = "block-" + strconv.Itoa(p.blockID+1)
+		prevBlockLastIdx = len(p.insts) - 1
+	)
+	p.insts = append(p.insts,
+		// if (R2 != bpfType) goto next
+		asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
+	)
 	if hasAccess {
 		p.insts = append(p.insts,
-			// if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next
+			// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
 			asm.Mov.Reg32(asm.R1, asm.R3),
 			asm.And.Imm32(asm.R1, bpfAccess),
-			asm.JEq.Imm(asm.R1, 0, nextBlockSym),
+			asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
 		)
 	}
 	if hasMajor {
 		p.insts = append(p.insts,
 			// if (R4 != major) goto next
-			asm.JNE.Imm(asm.R4, int32(dev.Major), nextBlockSym),
+			asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
 		)
 	}
 	if hasMinor {
 		p.insts = append(p.insts,
 			// if (R5 != minor) goto next
-			asm.JNE.Imm(asm.R5, int32(dev.Minor), nextBlockSym),
+			asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
 		)
 	}
-	if !hasType && !hasAccess && !hasMajor && !hasMinor {
-		p.hasWildCard = true
-	}
-	p.insts = append(p.insts, acceptBlock(dev.Allow)...)
+	p.insts = append(p.insts, acceptBlock(rule.Allow)...)
 	// set blockSym to the first instruction we added in this iteration
 	p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
 	p.blockID++
 	return nil
 }

-func (p *program) finalize() (asm.Instructions, error) {
-	if p.hasWildCard {
-		// acceptBlock with asm.Return() is already inserted
-		return p.insts, nil
+func (p *program) finalize() asm.Instructions {
+	var v int32
+	if p.defaultAllow {
+		v = 1
 	}
-	blockSym := fmt.Sprintf("block-%d", p.blockID)
+	blockSym := "block-" + strconv.Itoa(p.blockID)
 	p.insts = append(p.insts,
-		// R0 <- 0
-		asm.Mov.Imm32(asm.R0, 0).Sym(blockSym),
+		// R0 <- v
+		asm.Mov.Imm32(asm.R0, v).Sym(blockSym),
 		asm.Return(),
 	)
 	p.blockID = -1
-	return p.insts, nil
+	return p.insts
 }

 func acceptBlock(accept bool) asm.Instructions {
-	v := int32(0)
+	var v int32
 	if accept {
 		v = 1
 	}
--- a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go
+++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go
@ -4,7 +4,7 @@ import (
 	"strings"
 	"testing"

-	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runc/libcontainer/specconv"
 )

@ -20,13 +20,12 @@ func hash(s, comm string) string {
 	return strings.Join(res, "\n")
 }

-func testDeviceFilter(t testing.TB, devices []*configs.Device, expectedStr string) {
+func testDeviceFilter(t testing.TB, devices []*devices.Rule, expectedStr string) {
 	insts, _, err := DeviceFilter(devices)
 	if err != nil {
 		t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices)
 	}
 	s := insts.String()
-	t.Logf("%s: devices: %+v\n%s", t.Name(), devices, s)
 	if expectedStr != "" {
 		hashed := hash(s, "//")
 		expectedHashed := hash(expectedStr, "//")
@ -39,15 +38,16 @@ func testDeviceFilter(t testing.TB, devices []*configs.Device, expectedStr strin
 func TestDeviceFilter_Nil(t *testing.T) {
 	expected := `
 // load parameters into registers
-        0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
-        1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
-        2: RSh32Imm dst: r3 imm: 16
-        3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
-        4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+        0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
+        1: And32Imm dst: r2 imm: 65535
+        2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+        3: RSh32Imm dst: r3 imm: 16
+        4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+        5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
 block-0:
 // return 0 (reject)
-        5: Mov32Imm dst: r0 imm: 0
-        6: Exit
+        6: Mov32Imm dst: r0 imm: 0
+        7: Exit
 	`
 	testDeviceFilter(t, nil, expected)
 }
@ -55,97 +55,96 @@ block-0:
 func TestDeviceFilter_BuiltInAllowList(t *testing.T) {
 	expected := `
 // load parameters into registers
-         0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
-         1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
-         2: RSh32Imm dst: r3 imm: 16
-         3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
-         4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+        0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
+        1: And32Imm dst: r2 imm: 65535
+        2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+        3: RSh32Imm dst: r3 imm: 16
+        4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+        5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
 block-0:
-// tuntap (c, 10, 200, rwm, allow)
-         5: JNEImm dst: r2 off: -1 imm: 2 <block-1>
-         6: JNEImm dst: r4 off: -1 imm: 10 <block-1>
-         7: JNEImm dst: r5 off: -1 imm: 200 <block-1>
-         8: Mov32Imm dst: r0 imm: 1
-         9: Exit
-block-1:
-        10: JNEImm dst: r2 off: -1 imm: 2 <block-2>
-        11: JNEImm dst: r4 off: -1 imm: 5 <block-2>
-        12: JNEImm dst: r5 off: -1 imm: 2 <block-2>
-        13: Mov32Imm dst: r0 imm: 1
-        14: Exit
-block-2:
-// /dev/pts (c, 136, wildcard, rwm, true)
-        15: JNEImm dst: r2 off: -1 imm: 2 <block-3>
-        16: JNEImm dst: r4 off: -1 imm: 136 <block-3>
-        17: Mov32Imm dst: r0 imm: 1
-        18: Exit
-block-3:
-        19: JNEImm dst: r2 off: -1 imm: 2 <block-4>
-        20: JNEImm dst: r4 off: -1 imm: 5 <block-4>
-        21: JNEImm dst: r5 off: -1 imm: 1 <block-4>
-        22: Mov32Imm dst: r0 imm: 1
-        23: Exit
-block-4:
-        24: JNEImm dst: r2 off: -1 imm: 2 <block-5>
-        25: JNEImm dst: r4 off: -1 imm: 1 <block-5>
-        26: JNEImm dst: r5 off: -1 imm: 9 <block-5>
-        27: Mov32Imm dst: r0 imm: 1
-        28: Exit
-block-5:
-        29: JNEImm dst: r2 off: -1 imm: 2 <block-6>
-        30: JNEImm dst: r4 off: -1 imm: 1 <block-6>
-        31: JNEImm dst: r5 off: -1 imm: 5 <block-6>
-        32: Mov32Imm dst: r0 imm: 1
-        33: Exit
-block-6:
-        34: JNEImm dst: r2 off: -1 imm: 2 <block-7>
-        35: JNEImm dst: r4 off: -1 imm: 5 <block-7>
-        36: JNEImm dst: r5 off: -1 imm: 0 <block-7>
-        37: Mov32Imm dst: r0 imm: 1
-        38: Exit
-block-7:
-        39: JNEImm dst: r2 off: -1 imm: 2 <block-8>
-        40: JNEImm dst: r4 off: -1 imm: 1 <block-8>
-        41: JNEImm dst: r5 off: -1 imm: 7 <block-8>
-        42: Mov32Imm dst: r0 imm: 1
-        43: Exit
-block-8:
-        44: JNEImm dst: r2 off: -1 imm: 2 <block-9>
-        45: JNEImm dst: r4 off: -1 imm: 1 <block-9>
-        46: JNEImm dst: r5 off: -1 imm: 8 <block-9>
-        47: Mov32Imm dst: r0 imm: 1
-        48: Exit
-block-9:
-        49: JNEImm dst: r2 off: -1 imm: 2 <block-10>
-        50: JNEImm dst: r4 off: -1 imm: 1 <block-10>
-        51: JNEImm dst: r5 off: -1 imm: 3 <block-10>
-        52: Mov32Imm dst: r0 imm: 1
-        53: Exit
-block-10:
 // (b, wildcard, wildcard, m, true)
-        54: JNEImm dst: r2 off: -1 imm: 1 <block-11>
-        55: Mov32Reg dst: r1 src: r3
-        56: And32Imm dst: r1 imm: 1
-        57: JEqImm dst: r1 off: -1 imm: 0 <block-11>
-        58: Mov32Imm dst: r0 imm: 1
-        59: Exit
-block-11:
+        6: JNEImm dst: r2 off: -1 imm: 1 <block-1>
+        7: Mov32Reg dst: r1 src: r3
+        8: And32Imm dst: r1 imm: 1
+        9: JNEReg dst: r1 off: -1 src: r3 <block-1>
+        10: Mov32Imm dst: r0 imm: 1
+        11: Exit
+block-1:
 // (c, wildcard, wildcard, m, true)
-        60: JNEImm dst: r2 off: -1 imm: 2 <block-12>
-        61: Mov32Reg dst: r1 src: r3
-        62: And32Imm dst: r1 imm: 1
-        63: JEqImm dst: r1 off: -1 imm: 0 <block-12>
-        64: Mov32Imm dst: r0 imm: 1
-        65: Exit
-block-12:
-        66: Mov32Imm dst: r0 imm: 0
-        67: Exit
+        12: JNEImm dst: r2 off: -1 imm: 2 <block-2>
+        13: Mov32Reg dst: r1 src: r3
+        14: And32Imm dst: r1 imm: 1
+        15: JNEReg dst: r1 off: -1 src: r3 <block-2>
+        16: Mov32Imm dst: r0 imm: 1
+        17: Exit
+block-2:
+        18: JNEImm dst: r2 off: -1 imm: 2 <block-3>
+        19: JNEImm dst: r4 off: -1 imm: 1 <block-3>
+        20: JNEImm dst: r5 off: -1 imm: 3 <block-3>
+        21: Mov32Imm dst: r0 imm: 1
+        22: Exit
+block-3:
+        23: JNEImm dst: r2 off: -1 imm: 2 <block-4>
+        24: JNEImm dst: r4 off: -1 imm: 1 <block-4>
+        25: JNEImm dst: r5 off: -1 imm: 5 <block-4>
+        26: Mov32Imm dst: r0 imm: 1
+        27: Exit
+block-4:
+        28: JNEImm dst: r2 off: -1 imm: 2 <block-5>
+        29: JNEImm dst: r4 off: -1 imm: 1 <block-5>
+        30: JNEImm dst: r5 off: -1 imm: 7 <block-5>
+        31: Mov32Imm dst: r0 imm: 1
+        32: Exit
+block-5:
+        33: JNEImm dst: r2 off: -1 imm: 2 <block-6>
+        34: JNEImm dst: r4 off: -1 imm: 1 <block-6>
+        35: JNEImm dst: r5 off: -1 imm: 8 <block-6>
+        36: Mov32Imm dst: r0 imm: 1
+        37: Exit
+block-6:
+        38: JNEImm dst: r2 off: -1 imm: 2 <block-7>
+        39: JNEImm dst: r4 off: -1 imm: 1 <block-7>
+        40: JNEImm dst: r5 off: -1 imm: 9 <block-7>
+        41: Mov32Imm dst: r0 imm: 1
+        42: Exit
+block-7:
+        43: JNEImm dst: r2 off: -1 imm: 2 <block-8>
+        44: JNEImm dst: r4 off: -1 imm: 5 <block-8>
+        45: JNEImm dst: r5 off: -1 imm: 0 <block-8>
+        46: Mov32Imm dst: r0 imm: 1
+        47: Exit
+block-8:
+        48: JNEImm dst: r2 off: -1 imm: 2 <block-9>
+        49: JNEImm dst: r4 off: -1 imm: 5 <block-9>
+        50: JNEImm dst: r5 off: -1 imm: 2 <block-9>
+        51: Mov32Imm dst: r0 imm: 1
+        52: Exit
+block-9:
+// tuntap (c, 10, 200, rwm, allow)
+        53: JNEImm dst: r2 off: -1 imm: 2 <block-10>
+        54: JNEImm dst: r4 off: -1 imm: 10 <block-10>
+        55: JNEImm dst: r5 off: -1 imm: 200 <block-10>
+        56: Mov32Imm dst: r0 imm: 1
+        57: Exit
+block-10:
+// /dev/pts (c, 136, wildcard, rwm, true)
+        58: JNEImm dst: r2 off: -1 imm: 2 <block-11>
+        59: JNEImm dst: r4 off: -1 imm: 136 <block-11>
+        60: Mov32Imm dst: r0 imm: 1
+        61: Exit
+block-11:
+        62: Mov32Imm dst: r0 imm: 0
+        63: Exit
 `
-	testDeviceFilter(t, specconv.AllowedDevices, expected)
+	var devices []*devices.Rule
+	for _, device := range specconv.AllowedDevices {
+		devices = append(devices, &device.Rule)
+	}
+	testDeviceFilter(t, devices, expected)
 }

 func TestDeviceFilter_Privileged(t *testing.T) {
-	devices := []*configs.Device{
+	devices := []*devices.Rule{
 		{
 			Type:        'a',
 			Major:       -1,
@ -157,21 +156,22 @@ func TestDeviceFilter_Privileged(t *testing.T) {
 	expected :=
 		`
 // load parameters into registers
-        0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
-        1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
-        2: RSh32Imm dst: r3 imm: 16
-        3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
-        4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+        0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
+        1: And32Imm dst: r2 imm: 65535
+        2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+        3: RSh32Imm dst: r3 imm: 16
+        4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+        5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
 block-0:
 // return 1 (accept)
-        5: Mov32Imm dst: r0 imm: 1
-        6: Exit
+        6: Mov32Imm dst: r0 imm: 1
+        7: Exit
 	`
 	testDeviceFilter(t, devices, expected)
 }

 func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) {
-	devices := []*configs.Device{
+	devices := []*devices.Rule{
 		{
 			Type:        'a',
 			Major:       -1,
@ -189,28 +189,29 @@ func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) {
 	}
 	expected := `
 // load parameters into registers
-         0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
-         1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
-         2: RSh32Imm dst: r3 imm: 16
-         3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
-         4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+         0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
+         1: And32Imm dst: r2 imm: 65535
+         2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+         3: RSh32Imm dst: r3 imm: 16
+         4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+         5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
 block-0:
 // return 0 (reject) if type==b && major == 8 && minor == 0
-         5: JNEImm dst: r2 off: -1 imm: 1 <block-1>
-         6: JNEImm dst: r4 off: -1 imm: 8 <block-1>
-         7: JNEImm dst: r5 off: -1 imm: 0 <block-1>
-         8: Mov32Imm dst: r0 imm: 0
-         9: Exit
+         6: JNEImm dst: r2 off: -1 imm: 1 <block-1>
+         7: JNEImm dst: r4 off: -1 imm: 8 <block-1>
+         8: JNEImm dst: r5 off: -1 imm: 0 <block-1>
+         9: Mov32Imm dst: r0 imm: 0
+        10: Exit
 block-1:
 // return 1 (accept)
-        10: Mov32Imm dst: r0 imm: 1
-        11: Exit
+        11: Mov32Imm dst: r0 imm: 1
+        12: Exit
 `
 	testDeviceFilter(t, devices, expected)
 }

 func TestDeviceFilter_Weird(t *testing.T) {
-	devices := []*configs.Device{
+	devices := []*devices.Rule{
 		{
 			Type:        'b',
 			Major:       8,
@ -237,22 +238,23 @@ func TestDeviceFilter_Weird(t *testing.T) {
 	// This conforms to runc v1.0.0-rc.9 (cgroup1) behavior.
 	expected := `
 // load parameters into registers
-         0: LdXMemH dst: r2 src: r1 off: 0 imm: 0
-         1: LdXMemW dst: r3 src: r1 off: 0 imm: 0
-         2: RSh32Imm dst: r3 imm: 16
-         3: LdXMemW dst: r4 src: r1 off: 4 imm: 0
-         4: LdXMemW dst: r5 src: r1 off: 8 imm: 0
+         0: LdXMemW dst: r2 src: r1 off: 0 imm: 0
+         1: And32Imm dst: r2 imm: 65535
+         2: LdXMemW dst: r3 src: r1 off: 0 imm: 0
+         3: RSh32Imm dst: r3 imm: 16
+         4: LdXMemW dst: r4 src: r1 off: 4 imm: 0
+         5: LdXMemW dst: r5 src: r1 off: 8 imm: 0
 block-0:
 // return 0 (reject) if type==b && major == 8 && minor == 2
-         5: JNEImm dst: r2 off: -1 imm: 1 <block-1>
-         6: JNEImm dst: r4 off: -1 imm: 8 <block-1>
-         7: JNEImm dst: r5 off: -1 imm: 2 <block-1>
-         8: Mov32Imm dst: r0 imm: 0
-         9: Exit
+         6: JNEImm dst: r2 off: -1 imm: 1 <block-1>
+         7: JNEImm dst: r4 off: -1 imm: 8 <block-1>
+         8: JNEImm dst: r5 off: -1 imm: 2 <block-1>
+         9: Mov32Imm dst: r0 imm: 0
+        10: Exit
 block-1:
 // return 1 (accept)
-        10: Mov32Imm dst: r0 imm: 1
-        11: Exit
+        11: Mov32Imm dst: r0 imm: 1
+        12: Exit
 `
 	testDeviceFilter(t, devices, expected)
 }
--- a/libcontainer/cgroups/ebpf/ebpf.go
+++ b/libcontainer/cgroups/ebpf/ebpf.go
@ -1,45 +0,0 @@
-package ebpf
-
-import (
-	"github.com/cilium/ebpf"
-	"github.com/cilium/ebpf/asm"
-	"github.com/pkg/errors"
-	"golang.org/x/sys/unix"
-)
-
-// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
-//
-// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
-//
-// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
-func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) {
-	nilCloser := func() error {
-		return nil
-	}
-	// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
-	// This limit is not inherited into the container.
-	memlockLimit := &unix.Rlimit{
-		Cur: unix.RLIM_INFINITY,
-		Max: unix.RLIM_INFINITY,
-	}
-	_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
-	spec := &ebpf.ProgramSpec{
-		Type:         ebpf.CGroupDevice,
-		Instructions: insts,
-		License:      license,
-	}
-	prog, err := ebpf.NewProgram(spec)
-	if err != nil {
-		return nilCloser, err
-	}
-	if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
-		return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
-	}
-	closer := func() error {
-		if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
-			return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
-		}
-		return nil
-	}
-	return closer, nil
-}
--- a/libcontainer/cgroups/ebpf/ebpf_linux.go
+++ b/libcontainer/cgroups/ebpf/ebpf_linux.go
@ -0,0 +1,253 @@
+package ebpf
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"runtime"
+	"sync"
+	"unsafe"
+
+	"github.com/cilium/ebpf"
+	"github.com/cilium/ebpf/asm"
+	"github.com/cilium/ebpf/link"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+func nilCloser() error {
+	return nil
+}
+
+func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
+	type bpfAttrQuery struct {
+		TargetFd    uint32
+		AttachType  uint32
+		QueryType   uint32
+		AttachFlags uint32
+		ProgIds     uint64 // __aligned_u64
+		ProgCnt     uint32
+	}
+
+	// Currently you can only have 64 eBPF programs attached to a cgroup.
+	size := 64
+	retries := 0
+	for retries < 10 {
+		progIds := make([]uint32, size)
+		query := bpfAttrQuery{
+			TargetFd:   uint32(dirFd),
+			AttachType: uint32(unix.BPF_CGROUP_DEVICE),
+			ProgIds:    uint64(uintptr(unsafe.Pointer(&progIds[0]))),
+			ProgCnt:    uint32(len(progIds)),
+		}
+
+		// Fetch the list of program ids.
+		_, _, errno := unix.Syscall(unix.SYS_BPF,
+			uintptr(unix.BPF_PROG_QUERY),
+			uintptr(unsafe.Pointer(&query)),
+			unsafe.Sizeof(query))
+		size = int(query.ProgCnt)
+		runtime.KeepAlive(query)
+		if errno != 0 {
+			// On ENOSPC we get the correct number of programs.
+			if errno == unix.ENOSPC {
+				retries++
+				continue
+			}
+			return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
+		}
+
+		// Convert the ids to program handles.
+		progIds = progIds[:size]
+		programs := make([]*ebpf.Program, 0, len(progIds))
+		for _, progId := range progIds {
+			program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
+			if err != nil {
+				// We skip over programs that give us -EACCES or -EPERM. This
+				// is necessary because there may be BPF programs that have
+				// been attached (such as with --systemd-cgroup) which have an
+				// LSM label that blocks us from interacting with the program.
+				//
+				// Because additional BPF_CGROUP_DEVICE programs only can add
+				// restrictions, there's no real issue with just ignoring these
+				// programs (and stops runc from breaking on distributions with
+				// very strict SELinux policies).
+				if errors.Is(err, os.ErrPermission) {
+					logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
+					continue
+				}
+				return nil, fmt.Errorf("cannot fetch program from id: %w", err)
+			}
+			programs = append(programs, program)
+		}
+		runtime.KeepAlive(progIds)
+		return programs, nil
+	}
+
+	return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
+}
+
+var (
+	haveBpfProgReplaceBool bool
+	haveBpfProgReplaceOnce sync.Once
+)
+
+// Loosely based on the BPF_F_REPLACE support check in
+//   <https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go>.
+//
+// TODO: move this logic to cilium/ebpf
+func haveBpfProgReplace() bool {
+	haveBpfProgReplaceOnce.Do(func() {
+		prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
+			Type:    ebpf.CGroupDevice,
+			License: "MIT",
+			Instructions: asm.Instructions{
+				asm.Mov.Imm(asm.R0, 0),
+				asm.Return(),
+			},
+		})
+		if err != nil {
+			logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
+			return
+		}
+		defer prog.Close()
+
+		devnull, err := os.Open("/dev/null")
+		if err != nil {
+			logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
+			return
+		}
+		defer devnull.Close()
+
+		// We know that we have BPF_PROG_ATTACH since we can load
+		// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
+		// we know that the feature isn't present.
+		err = link.RawAttachProgram(link.RawAttachProgramOptions{
+			// We rely on this fd being checked after attachFlags.
+			Target: int(devnull.Fd()),
+			// Attempt to "replace" bad fds with this program.
+			Program: prog,
+			Attach:  ebpf.AttachCGroupDevice,
+			Flags:   unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE,
+		})
+		if errors.Is(err, unix.EINVAL) {
+			// not supported
+			return
+		}
+		// attach_flags test succeeded.
+		if !errors.Is(err, unix.EBADF) {
+			logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
+		}
+		haveBpfProgReplaceBool = true
+	})
+	return haveBpfProgReplaceBool
+}
+
+// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
+//
+// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
+//
+// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
+func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
+	// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
+	// This limit is not inherited into the container.
+	memlockLimit := &unix.Rlimit{
+		Cur: unix.RLIM_INFINITY,
+		Max: unix.RLIM_INFINITY,
+	}
+	_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
+
+	// Get the list of existing programs.
+	oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
+	if err != nil {
+		return nilCloser, err
+	}
+	useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
+
+	// Generate new program.
+	spec := &ebpf.ProgramSpec{
+		Type:         ebpf.CGroupDevice,
+		Instructions: insts,
+		License:      license,
+	}
+	prog, err := ebpf.NewProgram(spec)
+	if err != nil {
+		return nilCloser, err
+	}
+
+	// If there is only one old program, we can just replace it directly.
+	var (
+		replaceProg *ebpf.Program
+		attachFlags uint32 = unix.BPF_F_ALLOW_MULTI
+	)
+	if useReplaceProg {
+		replaceProg = oldProgs[0]
+		attachFlags |= unix.BPF_F_REPLACE
+	}
+	err = link.RawAttachProgram(link.RawAttachProgramOptions{
+		Target:  dirFd,
+		Program: prog,
+		Replace: replaceProg,
+		Attach:  ebpf.AttachCGroupDevice,
+		Flags:   attachFlags,
+	})
+	if err != nil {
+		return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
+	}
+	closer := func() error {
+		err = link.RawDetachProgram(link.RawDetachProgramOptions{
+			Target:  dirFd,
+			Program: prog,
+			Attach:  ebpf.AttachCGroupDevice,
+		})
+		if err != nil {
+			return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
+		}
+		// TODO: Should we attach the old filters back in this case? Otherwise
+		//       we fail-open on a security feature, which is a bit scary.
+		return nil
+	}
+	if !useReplaceProg {
+		logLevel := logrus.DebugLevel
+		// If there was more than one old program, give a warning (since this
+		// really shouldn't happen with runc-managed cgroups) and then detach
+		// all the old programs.
+		if len(oldProgs) > 1 {
+			// NOTE: Ideally this should be a warning but it turns out that
+			//       systemd-managed cgroups trigger this warning (apparently
+			//       systemd doesn't delete old non-systemd programs when
+			//       setting properties).
+			logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
+			logLevel = logrus.InfoLevel
+		}
+		for idx, oldProg := range oldProgs {
+			// Output some extra debug info.
+			if info, err := oldProg.Info(); err == nil {
+				fields := logrus.Fields{
+					"type": info.Type.String(),
+					"tag":  info.Tag,
+					"name": info.Name,
+				}
+				if id, ok := info.ID(); ok {
+					fields["id"] = id
+				}
+				if runCount, ok := info.RunCount(); ok {
+					fields["run_count"] = runCount
+				}
+				if runtime, ok := info.Runtime(); ok {
+					fields["runtime"] = runtime.String()
+				}
+				logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
+			}
+			err = link.RawDetachProgram(link.RawDetachProgramOptions{
+				Target:  dirFd,
+				Program: oldProg,
+				Attach:  ebpf.AttachCGroupDevice,
+			})
+			if err != nil {
+				return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
+			}
+		}
+	}
+	return closer, nil
+}
--- a/libcontainer/cgroups/file.go
+++ b/libcontainer/cgroups/file.go
@ -0,0 +1,190 @@
+package cgroups
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"os"
+	"path"
+	"strconv"
+	"strings"
+	"sync"
+
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+// OpenFile opens a cgroup file in a given dir with given flags.
+// It is supposed to be used for cgroup files only, and returns
+// an error if the file is not a cgroup file.
+//
+// Arguments dir and file are joined together to form an absolute path
+// to a file being opened.
+func OpenFile(dir, file string, flags int) (*os.File, error) {
+	if dir == "" {
+		return nil, fmt.Errorf("no directory specified for %s", file)
+	}
+	return openFile(dir, file, flags)
+}
+
+// ReadFile reads data from a cgroup file in dir.
+// It is supposed to be used for cgroup files only.
+func ReadFile(dir, file string) (string, error) {
+	fd, err := OpenFile(dir, file, unix.O_RDONLY)
+	if err != nil {
+		return "", err
+	}
+	defer fd.Close()
+	var buf bytes.Buffer
+
+	_, err = buf.ReadFrom(fd)
+	return buf.String(), err
+}
+
+// WriteFile writes data to a cgroup file in dir.
+// It is supposed to be used for cgroup files only.
+func WriteFile(dir, file, data string) error {
+	fd, err := OpenFile(dir, file, unix.O_WRONLY)
+	if err != nil {
+		return err
+	}
+	defer fd.Close()
+	if err := retryingWriteFile(fd, data); err != nil {
+		// Having data in the error message helps in debugging.
+		return fmt.Errorf("failed to write %q: %w", data, err)
+	}
+	return nil
+}
+
+func retryingWriteFile(fd *os.File, data string) error {
+	for {
+		_, err := fd.Write([]byte(data))
+		if errors.Is(err, unix.EINTR) {
+			logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
+			continue
+		}
+		return err
+	}
+}
+
+const (
+	cgroupfsDir    = "/sys/fs/cgroup"
+	cgroupfsPrefix = cgroupfsDir + "/"
+)
+
+var (
+	// TestMode is set to true by unit tests that need "fake" cgroupfs.
+	TestMode bool
+
+	cgroupFd     int = -1
+	prepOnce     sync.Once
+	prepErr      error
+	resolveFlags uint64
+)
+
+func prepareOpenat2() error {
+	prepOnce.Do(func() {
+		fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
+			Flags: unix.O_DIRECTORY | unix.O_PATH,
+		})
+		if err != nil {
+			prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
+			if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare
+				logrus.Warnf("falling back to securejoin: %s", prepErr)
+			} else {
+				logrus.Debug("openat2 not available, falling back to securejoin")
+			}
+			return
+		}
+		var st unix.Statfs_t
+		if err = unix.Fstatfs(fd, &st); err != nil {
+			prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
+			logrus.Warnf("falling back to securejoin: %s", prepErr)
+			return
+		}
+
+		cgroupFd = fd
+
+		resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
+		if st.Type == unix.CGROUP2_SUPER_MAGIC {
+			// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
+			resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
+		}
+	})
+
+	return prepErr
+}
+
+func openFile(dir, file string, flags int) (*os.File, error) {
+	mode := os.FileMode(0)
+	if TestMode && flags&os.O_WRONLY != 0 {
+		// "emulate" cgroup fs for unit tests
+		flags |= os.O_TRUNC | os.O_CREATE
+		mode = 0o600
+	}
+	path := path.Join(dir, file)
+	if prepareOpenat2() != nil {
+		return openFallback(path, flags, mode)
+	}
+	relPath := strings.TrimPrefix(path, cgroupfsPrefix)
+	if len(relPath) == len(path) { // non-standard path, old system?
+		return openFallback(path, flags, mode)
+	}
+
+	fd, err := unix.Openat2(cgroupFd, relPath,
+		&unix.OpenHow{
+			Resolve: resolveFlags,
+			Flags:   uint64(flags) | unix.O_CLOEXEC,
+			Mode:    uint64(mode),
+		})
+	if err != nil {
+		err = &os.PathError{Op: "openat2", Path: path, Err: err}
+		// Check if cgroupFd is still opened to cgroupfsDir
+		// (happens when this package is incorrectly used
+		// across the chroot/pivot_root/mntns boundary, or
+		// when /sys/fs/cgroup is remounted).
+		//
+		// TODO: if such usage will ever be common, amend this
+		// to reopen cgroupFd and retry openat2.
+		fdStr := strconv.Itoa(cgroupFd)
+		fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
+		if fdDest != cgroupfsDir {
+			// Wrap the error so it is clear that cgroupFd
+			// is opened to an unexpected/wrong directory.
+			err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w",
+				fdStr, fdDest, cgroupfsDir, err)
+		}
+		return nil, err
+	}
+
+	return os.NewFile(uintptr(fd), path), nil
+}
+
+var errNotCgroupfs = errors.New("not a cgroup file")
+
+// Can be changed by unit tests.
+var openFallback = openAndCheck
+
+// openAndCheck is used when openat2(2) is not available. It checks the opened
+// file is on cgroupfs, returning an error otherwise.
+func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) {
+	fd, err := os.OpenFile(path, flags, mode)
+	if err != nil {
+		return nil, err
+	}
+	if TestMode {
+		return fd, nil
+	}
+	// Check this is a cgroupfs file.
+	var st unix.Statfs_t
+	if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
+		_ = fd.Close()
+		return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
+	}
+	if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
+		_ = fd.Close()
+		return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
+	}
+
+	return fd, nil
+}
--- a/libcontainer/cgroups/file_test.go
+++ b/libcontainer/cgroups/file_test.go
@ -0,0 +1,73 @@
+package cgroups
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"testing"
+	"time"
+)
+
+func TestWriteCgroupFileHandlesInterrupt(t *testing.T) {
+	const (
+		memoryCgroupMount = "/sys/fs/cgroup/memory"
+		memoryLimit       = "memory.limit_in_bytes"
+	)
+	if _, err := os.Stat(memoryCgroupMount); err != nil {
+		// most probably cgroupv2
+		t.Skip(err)
+	}
+
+	cgroupName := fmt.Sprintf("test-eint-%d", time.Now().Nanosecond())
+	cgroupPath := filepath.Join(memoryCgroupMount, cgroupName)
+	if err := os.MkdirAll(cgroupPath, 0o755); err != nil {
+		t.Fatal(err)
+	}
+	defer os.RemoveAll(cgroupPath)
+
+	if _, err := os.Stat(filepath.Join(cgroupPath, memoryLimit)); err != nil {
+		// either cgroupv2, or memory controller is not available
+		t.Skip(err)
+	}
+
+	for i := 0; i < 100000; i++ {
+		limit := 1024*1024 + i
+		if err := WriteFile(cgroupPath, memoryLimit, strconv.Itoa(limit)); err != nil {
+			t.Fatalf("Failed to write %d on attempt %d: %+v", limit, i, err)
+		}
+	}
+}
+
+func TestOpenat2(t *testing.T) {
+	if !IsCgroup2UnifiedMode() {
+		// The reason is many test cases below test opening files from
+		// the top-level directory, where cgroup v1 has no files.
+		t.Skip("test requires cgroup v2")
+	}
+
+	// Make sure we test openat2, not its fallback.
+	openFallback = func(_ string, _ int, _ os.FileMode) (*os.File, error) {
+		return nil, errors.New("fallback")
+	}
+	defer func() { openFallback = openAndCheck }()
+
+	for _, tc := range []struct{ dir, file string }{
+		{"/sys/fs/cgroup", "cgroup.controllers"},
+		{"/sys/fs/cgroup", "/cgroup.controllers"},
+		{"/sys/fs/cgroup/", "cgroup.controllers"},
+		{"/sys/fs/cgroup/", "/cgroup.controllers"},
+		{"/sys/fs/cgroup/user.slice", "cgroup.controllers"},
+		{"/sys/fs/cgroup/user.slice/", "/cgroup.controllers"},
+		{"/", "/sys/fs/cgroup/cgroup.controllers"},
+		{"/", "sys/fs/cgroup/cgroup.controllers"},
+		{"/sys/fs/cgroup/cgroup.controllers", ""},
+	} {
+		fd, err := OpenFile(tc.dir, tc.file, os.O_RDONLY)
+		if err != nil {
+			t.Errorf("case %+v: %v", tc, err)
+		}
+		fd.Close()
+	}
+}
--- a/libcontainer/cgroups/fs/apply_raw.go
+++ b/libcontainer/cgroups/fs/apply_raw.go
@ -1,411 +0,0 @@
-// +build linux
-
-package fs
-
-import (
-	"fmt"
-	"io"
-	"os"
-	"path/filepath"
-	"sync"
-
-	"github.com/opencontainers/runc/libcontainer/cgroups"
-	"github.com/opencontainers/runc/libcontainer/configs"
-	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
-	"github.com/pkg/errors"
-	"golang.org/x/sys/unix"
-)
-
-var (
-	subsystemsLegacy = subsystemSet{
-		&CpusetGroup{},
-		&DevicesGroup{},
-		&MemoryGroup{},
-		&CpuGroup{},
-		&CpuacctGroup{},
-		&PidsGroup{},
-		&BlkioGroup{},
-		&HugetlbGroup{},
-		&NetClsGroup{},
-		&NetPrioGroup{},
-		&PerfEventGroup{},
-		&FreezerGroup{},
-		&NameGroup{GroupName: "name=systemd", Join: true},
-	}
-	HugePageSizes, _ = cgroups.GetHugePageSize()
-)
-
-var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist")
-
-type subsystemSet []subsystem
-
-func (s subsystemSet) Get(name string) (subsystem, error) {
-	for _, ss := range s {
-		if ss.Name() == name {
-			return ss, nil
-		}
-	}
-	return nil, errSubsystemDoesNotExist
-}
-
-type subsystem interface {
-	// Name returns the name of the subsystem.
-	Name() string
-	// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
-	GetStats(path string, stats *cgroups.Stats) error
-	// Removes the cgroup represented by 'cgroupData'.
-	Remove(*cgroupData) error
-	// Creates and joins the cgroup represented by 'cgroupData'.
-	Apply(*cgroupData) error
-	// Set the cgroup represented by cgroup.
-	Set(path string, cgroup *configs.Cgroup) error
-}
-
-type Manager struct {
-	mu       sync.Mutex
-	Cgroups  *configs.Cgroup
-	Rootless bool // ignore permission-related errors
-	Paths    map[string]string
-}
-
-// The absolute path to the root of the cgroup hierarchies.
-var cgroupRootLock sync.Mutex
-var cgroupRoot string
-
-// Gets the cgroupRoot.
-func getCgroupRoot() (string, error) {
-	cgroupRootLock.Lock()
-	defer cgroupRootLock.Unlock()
-
-	if cgroupRoot != "" {
-		return cgroupRoot, nil
-	}
-
-	root, err := cgroups.FindCgroupMountpointDir()
-	if err != nil {
-		return "", err
-	}
-
-	if _, err := os.Stat(root); err != nil {
-		return "", err
-	}
-
-	cgroupRoot = root
-	return cgroupRoot, nil
-}
-
-type cgroupData struct {
-	root      string
-	innerPath string
-	config    *configs.Cgroup
-	pid       int
-}
-
-// isIgnorableError returns whether err is a permission error (in the loose
-// sense of the word). This includes EROFS (which for an unprivileged user is
-// basically a permission error) and EACCES (for similar reasons) as well as
-// the normal EPERM.
-func isIgnorableError(rootless bool, err error) bool {
-	// We do not ignore errors if we are root.
-	if !rootless {
-		return false
-	}
-	// Is it an ordinary EPERM?
-	if os.IsPermission(errors.Cause(err)) {
-		return true
-	}
-
-	// Try to handle other errnos.
-	var errno error
-	switch err := errors.Cause(err).(type) {
-	case *os.PathError:
-		errno = err.Err
-	case *os.LinkError:
-		errno = err.Err
-	case *os.SyscallError:
-		errno = err.Err
-	}
-	return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
-}
-
-func (m *Manager) getSubsystems() subsystemSet {
-	return subsystemsLegacy
-}
-
-func (m *Manager) Apply(pid int) (err error) {
-	if m.Cgroups == nil {
-		return nil
-	}
-	m.mu.Lock()
-	defer m.mu.Unlock()
-
-	var c = m.Cgroups
-
-	d, err := getCgroupData(m.Cgroups, pid)
-	if err != nil {
-		return err
-	}
-
-	m.Paths = make(map[string]string)
-	if c.Paths != nil {
-		for name, path := range c.Paths {
-			_, err := d.path(name)
-			if err != nil {
-				if cgroups.IsNotFound(err) {
-					continue
-				}
-				return err
-			}
-			m.Paths[name] = path
-		}
-		return cgroups.EnterPid(m.Paths, pid)
-	}
-
-	for _, sys := range m.getSubsystems() {
-		// TODO: Apply should, ideally, be reentrant or be broken up into a separate
-		// create and join phase so that the cgroup hierarchy for a container can be
-		// created then join consists of writing the process pids to cgroup.procs
-		p, err := d.path(sys.Name())
-		if err != nil {
-			// The non-presence of the devices subsystem is
-			// considered fatal for security reasons.
-			if cgroups.IsNotFound(err) && sys.Name() != "devices" {
-				continue
-			}
-			return err
-		}
-		m.Paths[sys.Name()] = p
-
-		if err := sys.Apply(d); err != nil {
-			// In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't
-			// been set, we don't bail on error in case of permission problems.
-			// Cases where limits have been set (and we couldn't create our own
-			// cgroup) are handled by Set.
-			if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" {
-				delete(m.Paths, sys.Name())
-				continue
-			}
-			return err
-		}
-
-	}
-	return nil
-}
-
-func (m *Manager) Destroy() error {
-	if m.Cgroups == nil || m.Cgroups.Paths != nil {
-		return nil
-	}
-	m.mu.Lock()
-	defer m.mu.Unlock()
-	if err := cgroups.RemovePaths(m.Paths); err != nil {
-		return err
-	}
-	m.Paths = make(map[string]string)
-	return nil
-}
-
-func (m *Manager) GetPaths() map[string]string {
-	m.mu.Lock()
-	paths := m.Paths
-	m.mu.Unlock()
-	return paths
-}
-
-func (m *Manager) GetUnifiedPath() (string, error) {
-	return "", errors.New("unified path is only supported when running in unified mode")
-}
-
-func (m *Manager) GetStats() (*cgroups.Stats, error) {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-	stats := cgroups.NewStats()
-	for name, path := range m.Paths {
-		sys, err := m.getSubsystems().Get(name)
-		if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
-			continue
-		}
-		if err := sys.GetStats(path, stats); err != nil {
-			return nil, err
-		}
-	}
-	return stats, nil
-}
-
-func (m *Manager) Set(container *configs.Config) error {
-	if container.Cgroups == nil {
-		return nil
-	}
-
-	// If Paths are set, then we are just joining cgroups paths
-	// and there is no need to set any values.
-	if m.Cgroups != nil && m.Cgroups.Paths != nil {
-		return nil
-	}
-
-	paths := m.GetPaths()
-	for _, sys := range m.getSubsystems() {
-		path := paths[sys.Name()]
-		if err := sys.Set(path, container.Cgroups); err != nil {
-			if m.Rootless && sys.Name() == "devices" {
-				continue
-			}
-			// When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
-			// However, errors from other subsystems are not ignored.
-			// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
-			if path == "" {
-				// We never created a path for this cgroup, so we cannot set
-				// limits for it (though we have already tried at this point).
-				return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
-			}
-			return err
-		}
-	}
-
-	if m.Paths["cpu"] != "" {
-		if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-// Freeze toggles the container's freezer cgroup depending on the state
-// provided
-func (m *Manager) Freeze(state configs.FreezerState) error {
-	if m.Cgroups == nil {
-		return errors.New("cannot toggle freezer: cgroups not configured for container")
-	}
-
-	paths := m.GetPaths()
-	dir := paths["freezer"]
-	prevState := m.Cgroups.Resources.Freezer
-	m.Cgroups.Resources.Freezer = state
-	freezer, err := m.getSubsystems().Get("freezer")
-	if err != nil {
-		return err
-	}
-	err = freezer.Set(dir, m.Cgroups)
-	if err != nil {
-		m.Cgroups.Resources.Freezer = prevState
-		return err
-	}
-	return nil
-}
-
-func (m *Manager) GetPids() ([]int, error) {
-	paths := m.GetPaths()
-	return cgroups.GetPids(paths["devices"])
-}
-
-func (m *Manager) GetAllPids() ([]int, error) {
-	paths := m.GetPaths()
-	return cgroups.GetAllPids(paths["devices"])
-}
-
-func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
-	root, err := getCgroupRoot()
-	if err != nil {
-		return nil, err
-	}
-
-	if (c.Name != "" || c.Parent != "") && c.Path != "" {
-		return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
-	}
-
-	// XXX: Do not remove this code. Path safety is important! -- cyphar
-	cgPath := libcontainerUtils.CleanPath(c.Path)
-	cgParent := libcontainerUtils.CleanPath(c.Parent)
-	cgName := libcontainerUtils.CleanPath(c.Name)
-
-	innerPath := cgPath
-	if innerPath == "" {
-		innerPath = filepath.Join(cgParent, cgName)
-	}
-
-	return &cgroupData{
-		root:      root,
-		innerPath: innerPath,
-		config:    c,
-		pid:       pid,
-	}, nil
-}
-
-func (raw *cgroupData) path(subsystem string) (string, error) {
-	mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem)
-	// If we didn't mount the subsystem, there is no point we make the path.
-	if err != nil {
-		return "", err
-	}
-
-	// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
-	if filepath.IsAbs(raw.innerPath) {
-		// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
-		return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
-	}
-
-	// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
-	// process could in container and shared pid namespace with host, and
-	// /proc/1/cgroup could point to whole other world of cgroups.
-	parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
-	if err != nil {
-		return "", err
-	}
-
-	return filepath.Join(parentPath, raw.innerPath), nil
-}
-
-func (raw *cgroupData) join(subsystem string) (string, error) {
-	path, err := raw.path(subsystem)
-	if err != nil {
-		return "", err
-	}
-	if err := os.MkdirAll(path, 0755); err != nil {
-		return "", err
-	}
-	if err := cgroups.WriteCgroupProc(path, raw.pid); err != nil {
-		return "", err
-	}
-	return path, nil
-}
-
-func removePath(p string, err error) error {
-	if err != nil {
-		return err
-	}
-	if p != "" {
-		return os.RemoveAll(p)
-	}
-	return nil
-}
-
-func CheckCpushares(path string, c uint64) error {
-	var cpuShares uint64
-
-	if c == 0 {
-		return nil
-	}
-
-	fd, err := os.Open(filepath.Join(path, "cpu.shares"))
-	if err != nil {
-		return err
-	}
-	defer fd.Close()
-
-	_, err = fmt.Fscanf(fd, "%d", &cpuShares)
-	if err != nil && err != io.EOF {
-		return err
-	}
-
-	if c > cpuShares {
-		return fmt.Errorf("The maximum allowed cpu-shares is %d", cpuShares)
-	} else if c < cpuShares {
-		return fmt.Errorf("The minimum allowed cpu-shares is %d", cpuShares)
-	}
-
-	return nil
-}
-
-func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
-	return m.Cgroups, nil
-}
--- a/libcontainer/cgroups/fs/apply_raw_test.go
+++ b/libcontainer/cgroups/fs/apply_raw_test.go
@ -1,297 +0,0 @@
-// +build linux
-
-package fs
-
-import (
-	"path/filepath"
-	"strings"
-	"testing"
-
-	"github.com/opencontainers/runc/libcontainer/cgroups"
-	"github.com/opencontainers/runc/libcontainer/configs"
-)
-
-func TestInvalidCgroupPath(t *testing.T) {
-	if cgroups.IsCgroup2UnifiedMode() {
-		t.Skip("cgroup v1 is not supported")
-	}
-	root, err := getCgroupRoot()
-	if err != nil {
-		t.Errorf("couldn't get cgroup root: %v", err)
-	}
-
-	config := &configs.Cgroup{
-		Path: "../../../../../../../../../../some/path",
-	}
-
-	data, err := getCgroupData(config, 0)
-	if err != nil {
-		t.Errorf("couldn't get cgroup data: %v", err)
-	}
-
-	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
-	if strings.HasPrefix(data.innerPath, "..") {
-		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
-	}
-
-	// Double-check, using an actual cgroup.
-	deviceRoot := filepath.Join(root, "devices")
-	devicePath, err := data.path("devices")
-	if err != nil {
-		t.Errorf("couldn't get cgroup path: %v", err)
-	}
-	if !strings.HasPrefix(devicePath, deviceRoot) {
-		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
-	}
-}
-
-func TestInvalidAbsoluteCgroupPath(t *testing.T) {
-	if cgroups.IsCgroup2UnifiedMode() {
-		t.Skip("cgroup v1 is not supported")
-	}
-	root, err := getCgroupRoot()
-	if err != nil {
-		t.Errorf("couldn't get cgroup root: %v", err)
-	}
-
-	config := &configs.Cgroup{
-		Path: "/../../../../../../../../../../some/path",
-	}
-
-	data, err := getCgroupData(config, 0)
-	if err != nil {
-		t.Errorf("couldn't get cgroup data: %v", err)
-	}
-
-	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
-	if strings.HasPrefix(data.innerPath, "..") {
-		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
-	}
-
-	// Double-check, using an actual cgroup.
-	deviceRoot := filepath.Join(root, "devices")
-	devicePath, err := data.path("devices")
-	if err != nil {
-		t.Errorf("couldn't get cgroup path: %v", err)
-	}
-	if !strings.HasPrefix(devicePath, deviceRoot) {
-		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
-	}
-}
-
-// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
-func TestInvalidCgroupParent(t *testing.T) {
-	if cgroups.IsCgroup2UnifiedMode() {
-		t.Skip("cgroup v1 is not supported")
-	}
-	root, err := getCgroupRoot()
-	if err != nil {
-		t.Errorf("couldn't get cgroup root: %v", err)
-	}
-
-	config := &configs.Cgroup{
-		Parent: "../../../../../../../../../../some/path",
-		Name:   "name",
-	}
-
-	data, err := getCgroupData(config, 0)
-	if err != nil {
-		t.Errorf("couldn't get cgroup data: %v", err)
-	}
-
-	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
-	if strings.HasPrefix(data.innerPath, "..") {
-		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
-	}
-
-	// Double-check, using an actual cgroup.
-	deviceRoot := filepath.Join(root, "devices")
-	devicePath, err := data.path("devices")
-	if err != nil {
-		t.Errorf("couldn't get cgroup path: %v", err)
-	}
-	if !strings.HasPrefix(devicePath, deviceRoot) {
-		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
-	}
-}
-
-// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
-func TestInvalidAbsoluteCgroupParent(t *testing.T) {
-	if cgroups.IsCgroup2UnifiedMode() {
-		t.Skip("cgroup v1 is not supported")
-	}
-	root, err := getCgroupRoot()
-	if err != nil {
-		t.Errorf("couldn't get cgroup root: %v", err)
-	}
-
-	config := &configs.Cgroup{
-		Parent: "/../../../../../../../../../../some/path",
-		Name:   "name",
-	}
-
-	data, err := getCgroupData(config, 0)
-	if err != nil {
-		t.Errorf("couldn't get cgroup data: %v", err)
-	}
-
-	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
-	if strings.HasPrefix(data.innerPath, "..") {
-		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
-	}
-
-	// Double-check, using an actual cgroup.
-	deviceRoot := filepath.Join(root, "devices")
-	devicePath, err := data.path("devices")
-	if err != nil {
-		t.Errorf("couldn't get cgroup path: %v", err)
-	}
-	if !strings.HasPrefix(devicePath, deviceRoot) {
-		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
-	}
-}
-
-// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
-func TestInvalidCgroupName(t *testing.T) {
-	if cgroups.IsCgroup2UnifiedMode() {
-		t.Skip("cgroup v1 is not supported")
-	}
-	root, err := getCgroupRoot()
-	if err != nil {
-		t.Errorf("couldn't get cgroup root: %v", err)
-	}
-
-	config := &configs.Cgroup{
-		Parent: "parent",
-		Name:   "../../../../../../../../../../some/path",
-	}
-
-	data, err := getCgroupData(config, 0)
-	if err != nil {
-		t.Errorf("couldn't get cgroup data: %v", err)
-	}
-
-	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
-	if strings.HasPrefix(data.innerPath, "..") {
-		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
-	}
-
-	// Double-check, using an actual cgroup.
-	deviceRoot := filepath.Join(root, "devices")
-	devicePath, err := data.path("devices")
-	if err != nil {
-		t.Errorf("couldn't get cgroup path: %v", err)
-	}
-	if !strings.HasPrefix(devicePath, deviceRoot) {
-		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
-	}
-
-}
-
-// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
-func TestInvalidAbsoluteCgroupName(t *testing.T) {
-	if cgroups.IsCgroup2UnifiedMode() {
-		t.Skip("cgroup v1 is not supported")
-	}
-	root, err := getCgroupRoot()
-	if err != nil {
-		t.Errorf("couldn't get cgroup root: %v", err)
-	}
-
-	config := &configs.Cgroup{
-		Parent: "parent",
-		Name:   "/../../../../../../../../../../some/path",
-	}
-
-	data, err := getCgroupData(config, 0)
-	if err != nil {
-		t.Errorf("couldn't get cgroup data: %v", err)
-	}
-
-	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
-	if strings.HasPrefix(data.innerPath, "..") {
-		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
-	}
-
-	// Double-check, using an actual cgroup.
-	deviceRoot := filepath.Join(root, "devices")
-	devicePath, err := data.path("devices")
-	if err != nil {
-		t.Errorf("couldn't get cgroup path: %v", err)
-	}
-	if !strings.HasPrefix(devicePath, deviceRoot) {
-		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
-	}
-}
-
-// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
-func TestInvalidCgroupNameAndParent(t *testing.T) {
-	if cgroups.IsCgroup2UnifiedMode() {
-		t.Skip("cgroup v1 is not supported")
-	}
-	root, err := getCgroupRoot()
-	if err != nil {
-		t.Errorf("couldn't get cgroup root: %v", err)
-	}
-
-	config := &configs.Cgroup{
-		Parent: "../../../../../../../../../../some/path",
-		Name:   "../../../../../../../../../../some/path",
-	}
-
-	data, err := getCgroupData(config, 0)
-	if err != nil {
-		t.Errorf("couldn't get cgroup data: %v", err)
-	}
-
-	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
-	if strings.HasPrefix(data.innerPath, "..") {
-		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
-	}
-
-	// Double-check, using an actual cgroup.
-	deviceRoot := filepath.Join(root, "devices")
-	devicePath, err := data.path("devices")
-	if err != nil {
-		t.Errorf("couldn't get cgroup path: %v", err)
-	}
-	if !strings.HasPrefix(devicePath, deviceRoot) {
-		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
-	}
-}
-
-// XXX: Remove me after we get rid of configs.Cgroup.Name and configs.Cgroup.Parent.
-func TestInvalidAbsoluteCgroupNameAndParent(t *testing.T) {
-	if cgroups.IsCgroup2UnifiedMode() {
-		t.Skip("cgroup v1 is not supported")
-	}
-	root, err := getCgroupRoot()
-	if err != nil {
-		t.Errorf("couldn't get cgroup root: %v", err)
-	}
-
-	config := &configs.Cgroup{
-		Parent: "/../../../../../../../../../../some/path",
-		Name:   "/../../../../../../../../../../some/path",
-	}
-
-	data, err := getCgroupData(config, 0)
-	if err != nil {
-		t.Errorf("couldn't get cgroup data: %v", err)
-	}
-
-	// Make sure the final innerPath doesn't go outside the cgroup mountpoint.
-	if strings.HasPrefix(data.innerPath, "..") {
-		t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
-	}
-
-	// Double-check, using an actual cgroup.
-	deviceRoot := filepath.Join(root, "devices")
-	devicePath, err := data.path("devices")
-	if err != nil {
-		t.Errorf("couldn't get cgroup path: %v", err)
-	}
-	if !strings.HasPrefix(devicePath, deviceRoot) {
-		t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
-	}
-}
--- a/libcontainer/cgroups/fs/blkio.go
+++ b/libcontainer/cgroups/fs/blkio.go
@ -1,72 +1,71 @@
-// +build linux
-
 package fs

 import (
 	"bufio"
-	"fmt"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
-	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

 type BlkioGroup struct {
+	weightFilename       string
+	weightDeviceFilename string
 }

 func (s *BlkioGroup) Name() string {
 	return "blkio"
 }

-func (s *BlkioGroup) Apply(d *cgroupData) error {
-	_, err := d.join("blkio")
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
+func (s *BlkioGroup) Apply(path string, _ *configs.Resources, pid int) error {
+	return apply(path, pid)
 }

-func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
-	if cgroup.Resources.BlkioWeight != 0 {
-		if err := fscommon.WriteFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
+func (s *BlkioGroup) Set(path string, r *configs.Resources) error {
+	s.detectWeightFilenames(path)
+	if r.BlkioWeight != 0 {
+		if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
 			return err
 		}
 	}

-	if cgroup.Resources.BlkioLeafWeight != 0 {
-		if err := fscommon.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil {
+	if r.BlkioLeafWeight != 0 {
+		if err := cgroups.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil {
 			return err
 		}
 	}
-	for _, wd := range cgroup.Resources.BlkioWeightDevice {
-		if err := fscommon.WriteFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
-			return err
+	for _, wd := range r.BlkioWeightDevice {
+		if wd.Weight != 0 {
+			if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil {
+				return err
+			}
 		}
-		if err := fscommon.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
+		if wd.LeafWeight != 0 {
+			if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
+				return err
+			}
+		}
+	}
+	for _, td := range r.BlkioThrottleReadBpsDevice {
+		if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
 			return err
 		}
 	}
-	for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
-		if err := fscommon.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
+	for _, td := range r.BlkioThrottleWriteBpsDevice {
+		if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
 			return err
 		}
 	}
-	for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
-		if err := fscommon.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
+	for _, td := range r.BlkioThrottleReadIOPSDevice {
+		if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
 			return err
 		}
 	}
-	for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
-		if err := fscommon.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
-			return err
-		}
-	}
-	for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
-		if err := fscommon.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
+	for _, td := range r.BlkioThrottleWriteIOPSDevice {
+		if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
 			return err
 		}
 	}
@ -74,10 +73,6 @@ func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
 	return nil
 }

-func (s *BlkioGroup) Remove(d *cgroupData) error {
-	return removePath(d.path("blkio"))
-}
-
 /*
 examples:

@ -113,9 +108,9 @@ func splitBlkioStatLine(r rune) bool {
 	return r == ' ' || r == ':'
 }

-func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
+func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) {
 	var blkioStats []cgroups.BlkioStatEntry
-	f, err := os.Open(path)
+	f, err := cgroups.OpenFile(dir, file, os.O_RDONLY)
 	if err != nil {
 		if os.IsNotExist(err) {
 			return blkioStats, nil
@ -133,19 +128,19 @@ func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
 				// skip total line
 				continue
 			} else {
-				return nil, fmt.Errorf("Invalid line found while parsing %s: %s", path, sc.Text())
+				return nil, malformedLine(dir, file, sc.Text())
 			}
 		}

 		v, err := strconv.ParseUint(fields[0], 10, 64)
 		if err != nil {
-			return nil, err
+			return nil, &parseError{Path: dir, File: file, Err: err}
 		}
 		major := v

 		v, err = strconv.ParseUint(fields[1], 10, 64)
 		if err != nil {
-			return nil, err
+			return nil, &parseError{Path: dir, File: file, Err: err}
 		}
 		minor := v

@ -157,82 +152,160 @@ func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
 		}
 		v, err = strconv.ParseUint(fields[valueField], 10, 64)
 		if err != nil {
-			return nil, err
+			return nil, &parseError{Path: dir, File: file, Err: err}
 		}
 		blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v})
 	}
+	if err := sc.Err(); err != nil {
+		return nil, &parseError{Path: dir, File: file, Err: err}
+	}

 	return blkioStats, nil
 }

 func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
-	// Try to read CFQ stats available on all CFQ enabled kernels first
-	if blkioStats, err := getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err == nil && blkioStats != nil {
-		return getCFQStats(path, stats)
+	type blkioStatInfo struct {
+		filename            string
+		blkioStatEntriesPtr *[]cgroups.BlkioStatEntry
+	}
+	bfqDebugStats := []blkioStatInfo{
+		{
+			filename:            "blkio.bfq.sectors_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
+		},
+		{
+			filename:            "blkio.bfq.io_service_time_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
+		},
+		{
+			filename:            "blkio.bfq.io_wait_time_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
+		},
+		{
+			filename:            "blkio.bfq.io_merged_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
+		},
+		{
+			filename:            "blkio.bfq.io_queued_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
+		},
+		{
+			filename:            "blkio.bfq.time_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
+		},
+		{
+			filename:            "blkio.bfq.io_serviced_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
+		},
+		{
+			filename:            "blkio.bfq.io_service_bytes_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
+		},
+	}
+	bfqStats := []blkioStatInfo{
+		{
+			filename:            "blkio.bfq.io_serviced_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
+		},
+		{
+			filename:            "blkio.bfq.io_service_bytes_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
+		},
+	}
+	cfqStats := []blkioStatInfo{
+		{
+			filename:            "blkio.sectors_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
+		},
+		{
+			filename:            "blkio.io_service_time_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
+		},
+		{
+			filename:            "blkio.io_wait_time_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
+		},
+		{
+			filename:            "blkio.io_merged_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
+		},
+		{
+			filename:            "blkio.io_queued_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
+		},
+		{
+			filename:            "blkio.time_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
+		},
+		{
+			filename:            "blkio.io_serviced_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
+		},
+		{
+			filename:            "blkio.io_service_bytes_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
+		},
+	}
+	throttleRecursiveStats := []blkioStatInfo{
+		{
+			filename:            "blkio.throttle.io_serviced_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
+		},
+		{
+			filename:            "blkio.throttle.io_service_bytes_recursive",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
+		},
+	}
+	baseStats := []blkioStatInfo{
+		{
+			filename:            "blkio.throttle.io_serviced",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
+		},
+		{
+			filename:            "blkio.throttle.io_service_bytes",
+			blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
+		},
+	}
+	orderedStats := [][]blkioStatInfo{
+		bfqDebugStats,
+		bfqStats,
+		cfqStats,
+		throttleRecursiveStats,
+		baseStats,
 	}
-	return getStats(path, stats) // Use generic stats as fallback
-}

-func getCFQStats(path string, stats *cgroups.Stats) error {
 	var blkioStats []cgroups.BlkioStatEntry
 	var err error

-	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.sectors_recursive")); err != nil {
-		return err
+	for _, statGroup := range orderedStats {
+		for i, statInfo := range statGroup {
+			if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil {
+				// if error occurs on first file, move to next group
+				if i == 0 {
+					break
+				}
+				return err
+			}
+			*statInfo.blkioStatEntriesPtr = blkioStats
+			// finish if all stats are gathered
+			if i == len(statGroup)-1 {
+				return nil
+			}
+		}
 	}
-	stats.BlkioStats.SectorsRecursive = blkioStats
-
-	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_bytes_recursive")); err != nil {
-		return err
-	}
-	stats.BlkioStats.IoServiceBytesRecursive = blkioStats
-
-	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err != nil {
-		return err
-	}
-	stats.BlkioStats.IoServicedRecursive = blkioStats
-
-	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_queued_recursive")); err != nil {
-		return err
-	}
-	stats.BlkioStats.IoQueuedRecursive = blkioStats
-
-	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_time_recursive")); err != nil {
-		return err
-	}
-	stats.BlkioStats.IoServiceTimeRecursive = blkioStats
-
-	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_wait_time_recursive")); err != nil {
-		return err
-	}
-	stats.BlkioStats.IoWaitTimeRecursive = blkioStats
-
-	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_merged_recursive")); err != nil {
-		return err
-	}
-	stats.BlkioStats.IoMergedRecursive = blkioStats
-
-	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.time_recursive")); err != nil {
-		return err
-	}
-	stats.BlkioStats.IoTimeRecursive = blkioStats
-
 	return nil
 }

-func getStats(path string, stats *cgroups.Stats) error {
-	var blkioStats []cgroups.BlkioStatEntry
-	var err error
-
-	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_service_bytes")); err != nil {
-		return err
+func (s *BlkioGroup) detectWeightFilenames(path string) {
+	if s.weightFilename != "" {
+		// Already detected.
+		return
 	}
-	stats.BlkioStats.IoServiceBytesRecursive = blkioStats
-
-	if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_serviced")); err != nil {
-		return err
+	if cgroups.PathExists(filepath.Join(path, "blkio.weight")) {
+		s.weightFilename = "blkio.weight"
+		s.weightDeviceFilename = "blkio.weight_device"
+	} else {
+		s.weightFilename = "blkio.bfq.weight"
+		s.weightDeviceFilename = "blkio.bfq.weight_device"
 	}
-	stats.BlkioStats.IoServicedRecursive = blkioStats
-
-	return nil
 }
--- a/libcontainer/cgroups/fs/blkio_test.go
+++ b/libcontainer/cgroups/fs/blkio_test.go
--- a/libcontainer/cgroups/fs/cpu.go
+++ b/libcontainer/cgroups/fs/cpu.go
@ -1,94 +1,105 @@
-// +build linux
-
 package fs

 import (
 	"bufio"
+	"errors"
+	"fmt"
 	"os"
-	"path/filepath"
 	"strconv"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	"golang.org/x/sys/unix"
 )

-type CpuGroup struct {
-}
+type CpuGroup struct{}

 func (s *CpuGroup) Name() string {
 	return "cpu"
 }

-func (s *CpuGroup) Apply(d *cgroupData) error {
-	// We always want to join the cpu group, to allow fair cpu scheduling
-	// on a container basis
-	path, err := d.path("cpu")
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return s.ApplyDir(path, d.config, d.pid)
-}
-
-func (s *CpuGroup) ApplyDir(path string, cgroup *configs.Cgroup, pid int) error {
-	// This might happen if we have no cpu cgroup mounted.
-	// Just do nothing and don't fail.
-	if path == "" {
-		return nil
-	}
-	if err := os.MkdirAll(path, 0755); err != nil {
+func (s *CpuGroup) Apply(path string, r *configs.Resources, pid int) error {
+	if err := os.MkdirAll(path, 0o755); err != nil {
 		return err
 	}
 	// We should set the real-Time group scheduling settings before moving
 	// in the process because if the process is already in SCHED_RR mode
 	// and no RT bandwidth is set, adding it will fail.
-	if err := s.SetRtSched(path, cgroup); err != nil {
+	if err := s.SetRtSched(path, r); err != nil {
 		return err
 	}
-	// because we are not using d.join we need to place the pid into the procs file
-	// unlike the other subsystems
+	// Since we are not using apply(), we need to place the pid
+	// into the procs file.
 	return cgroups.WriteCgroupProc(path, pid)
 }

-func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
-	if cgroup.Resources.CpuRtPeriod != 0 {
-		if err := fscommon.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
+func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error {
+	if r.CpuRtPeriod != 0 {
+		if err := cgroups.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil {
 			return err
 		}
 	}
-	if cgroup.Resources.CpuRtRuntime != 0 {
-		if err := fscommon.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
+	if r.CpuRtRuntime != 0 {
+		if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil {
 			return err
 		}
 	}
 	return nil
 }

-func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
-	if cgroup.Resources.CpuShares != 0 {
-		if err := fscommon.WriteFile(path, "cpu.shares", strconv.FormatUint(cgroup.Resources.CpuShares, 10)); err != nil {
+func (s *CpuGroup) Set(path string, r *configs.Resources) error {
+	if r.CpuShares != 0 {
+		shares := r.CpuShares
+		if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil {
 			return err
 		}
-	}
-	if cgroup.Resources.CpuPeriod != 0 {
-		if err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil {
+		// read it back
+		sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares")
+		if err != nil {
 			return err
 		}
-	}
-	if cgroup.Resources.CpuQuota != 0 {
-		if err := fscommon.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil {
-			return err
+		// ... and check
+		if shares > sharesRead {
+			return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead)
+		} else if shares < sharesRead {
+			return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead)
 		}
 	}
-	return s.SetRtSched(path, cgroup)
-}

-func (s *CpuGroup) Remove(d *cgroupData) error {
-	return removePath(d.path("cpu"))
+	var period string
+	if r.CpuPeriod != 0 {
+		period = strconv.FormatUint(r.CpuPeriod, 10)
+		if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil {
+			// Sometimes when the period to be set is smaller
+			// than the current one, it is rejected by the kernel
+			// (EINVAL) as old_quota/new_period exceeds the parent
+			// cgroup quota limit. If this happens and the quota is
+			// going to be set, ignore the error for now and retry
+			// after setting the quota.
+			if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 {
+				return err
+			}
+		} else {
+			period = ""
+		}
+	}
+	if r.CpuQuota != 0 {
+		if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil {
+			return err
+		}
+		if period != "" {
+			if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil {
+				return err
+			}
+		}
+	}
+	return s.SetRtSched(path, r)
 }

 func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
-	f, err := os.Open(filepath.Join(path, "cpu.stat"))
+	const file = "cpu.stat"
+	f, err := cgroups.OpenFile(path, file, os.O_RDONLY)
 	if err != nil {
 		if os.IsNotExist(err) {
 			return nil
@ -99,9 +110,9 @@ func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {

 	sc := bufio.NewScanner(f)
 	for sc.Scan() {
-		t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
+		t, v, err := fscommon.ParseKeyValue(sc.Text())
 		if err != nil {
-			return err
+			return &parseError{Path: path, File: file, Err: err}
 		}
 		switch t {
 		case "nr_periods":
--- a/libcontainer/cgroups/fs/cpu_test.go
+++ b/libcontainer/cgroups/fs/cpu_test.go
@ -1,5 +1,3 @@
-// +build linux
-
 package fs

 import (
@ -9,40 +7,40 @@ import (

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
 )

 func TestCpuSetShares(t *testing.T) {
-	helper := NewCgroupTestUtil("cpu", t)
-	defer helper.cleanup()
+	path := tempDir(t, "cpu")

 	const (
 		sharesBefore = 1024
 		sharesAfter  = 512
 	)

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"cpu.shares": strconv.Itoa(sharesBefore),
 	})

-	helper.CgroupData.config.Resources.CpuShares = sharesAfter
+	r := &configs.Resources{
+		CpuShares: sharesAfter,
+	}
 	cpu := &CpuGroup{}
-	if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := cpu.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.shares")
+	value, err := fscommon.GetCgroupParamUint(path, "cpu.shares")
 	if err != nil {
-		t.Fatalf("Failed to parse cpu.shares - %s", err)
+		t.Fatal(err)
 	}
-
 	if value != sharesAfter {
 		t.Fatal("Got the wrong value, set cpu.shares failed.")
 	}
 }

 func TestCpuSetBandWidth(t *testing.T) {
-	helper := NewCgroupTestUtil("cpu", t)
-	defer helper.cleanup()
+	path := tempDir(t, "cpu")

 	const (
 		quotaBefore     = 8000
@ -55,47 +53,51 @@ func TestCpuSetBandWidth(t *testing.T) {
 		rtPeriodAfter   = 7000
 	)

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"cpu.cfs_quota_us":  strconv.Itoa(quotaBefore),
 		"cpu.cfs_period_us": strconv.Itoa(periodBefore),
 		"cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore),
 		"cpu.rt_period_us":  strconv.Itoa(rtPeriodBefore),
 	})

-	helper.CgroupData.config.Resources.CpuQuota = quotaAfter
-	helper.CgroupData.config.Resources.CpuPeriod = periodAfter
-	helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter
-	helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter
+	r := &configs.Resources{
+		CpuQuota:     quotaAfter,
+		CpuPeriod:    periodAfter,
+		CpuRtRuntime: rtRuntimeAfter,
+		CpuRtPeriod:  rtPeriodAfter,
+	}
 	cpu := &CpuGroup{}
-	if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := cpu.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	quota, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.cfs_quota_us")
+	quota, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_quota_us")
 	if err != nil {
-		t.Fatalf("Failed to parse cpu.cfs_quota_us - %s", err)
+		t.Fatal(err)
 	}
 	if quota != quotaAfter {
 		t.Fatal("Got the wrong value, set cpu.cfs_quota_us failed.")
 	}

-	period, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.cfs_period_us")
+	period, err := fscommon.GetCgroupParamUint(path, "cpu.cfs_period_us")
 	if err != nil {
-		t.Fatalf("Failed to parse cpu.cfs_period_us - %s", err)
+		t.Fatal(err)
 	}
 	if period != periodAfter {
 		t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.")
 	}
-	rtRuntime, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us")
+
+	rtRuntime, err := fscommon.GetCgroupParamUint(path, "cpu.rt_runtime_us")
 	if err != nil {
-		t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err)
+		t.Fatal(err)
 	}
 	if rtRuntime != rtRuntimeAfter {
 		t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.")
 	}
-	rtPeriod, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us")
+
+	rtPeriod, err := fscommon.GetCgroupParamUint(path, "cpu.rt_period_us")
 	if err != nil {
-		t.Fatalf("Failed to parse cpu.rt_period_us - %s", err)
+		t.Fatal(err)
 	}
 	if rtPeriod != rtPeriodAfter {
 		t.Fatal("Got the wrong value, set cpu.rt_period_us failed.")
@ -103,8 +105,7 @@ func TestCpuSetBandWidth(t *testing.T) {
 }

 func TestCpuStats(t *testing.T) {
-	helper := NewCgroupTestUtil("cpu", t)
-	defer helper.cleanup()
+	path := tempDir(t, "cpu")

 	const (
 		nrPeriods     = 2000
@ -112,15 +113,15 @@ func TestCpuStats(t *testing.T) {
 		throttledTime = uint64(18446744073709551615)
 	)

-	cpuStatContent := fmt.Sprintf("nr_periods %d\n nr_throttled %d\n throttled_time %d\n",
+	cpuStatContent := fmt.Sprintf("nr_periods %d\nnr_throttled %d\nthrottled_time %d\n",
 		nrPeriods, nrThrottled, throttledTime)
-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"cpu.stat": cpuStatContent,
 	})

 	cpu := &CpuGroup{}
 	actualStats := *cgroups.NewStats()
-	err := cpu.GetStats(helper.CgroupPath, &actualStats)
+	err := cpu.GetStats(path, &actualStats)
 	if err != nil {
 		t.Fatal(err)
 	}
@ -128,44 +129,43 @@ func TestCpuStats(t *testing.T) {
 	expectedStats := cgroups.ThrottlingData{
 		Periods:          nrPeriods,
 		ThrottledPeriods: nrThrottled,
-		ThrottledTime:    throttledTime}
+		ThrottledTime:    throttledTime,
+	}

 	expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData)
 }

 func TestNoCpuStatFile(t *testing.T) {
-	helper := NewCgroupTestUtil("cpu", t)
-	defer helper.cleanup()
+	path := tempDir(t, "cpu")

 	cpu := &CpuGroup{}
 	actualStats := *cgroups.NewStats()
-	err := cpu.GetStats(helper.CgroupPath, &actualStats)
+	err := cpu.GetStats(path, &actualStats)
 	if err != nil {
 		t.Fatal("Expected not to fail, but did")
 	}
 }

 func TestInvalidCpuStat(t *testing.T) {
-	helper := NewCgroupTestUtil("cpu", t)
-	defer helper.cleanup()
+	path := tempDir(t, "cpu")
+
 	cpuStatContent := `nr_periods 2000
 	nr_throttled 200
 	throttled_time fortytwo`
-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"cpu.stat": cpuStatContent,
 	})

 	cpu := &CpuGroup{}
 	actualStats := *cgroups.NewStats()
-	err := cpu.GetStats(helper.CgroupPath, &actualStats)
+	err := cpu.GetStats(path, &actualStats)
 	if err == nil {
 		t.Fatal("Expected failed stat parsing.")
 	}
 }

 func TestCpuSetRtSchedAtApply(t *testing.T) {
-	helper := NewCgroupTestUtil("cpu", t)
-	defer helper.cleanup()
+	path := tempDir(t, "cpu")

 	const (
 		rtRuntimeBefore = 0
@ -174,35 +174,40 @@ func TestCpuSetRtSchedAtApply(t *testing.T) {
 		rtPeriodAfter   = 7000
 	)

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore),
 		"cpu.rt_period_us":  strconv.Itoa(rtPeriodBefore),
 	})

-	helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter
-	helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter
+	r := &configs.Resources{
+		CpuRtRuntime: rtRuntimeAfter,
+		CpuRtPeriod:  rtPeriodAfter,
+	}
 	cpu := &CpuGroup{}
-	if err := cpu.ApplyDir(helper.CgroupPath, helper.CgroupData.config, 1234); err != nil {
+
+	if err := cpu.Apply(path, r, 1234); err != nil {
 		t.Fatal(err)
 	}

-	rtRuntime, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us")
+	rtRuntime, err := fscommon.GetCgroupParamUint(path, "cpu.rt_runtime_us")
 	if err != nil {
-		t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err)
+		t.Fatal(err)
 	}
 	if rtRuntime != rtRuntimeAfter {
 		t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.")
 	}
-	rtPeriod, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us")
+
+	rtPeriod, err := fscommon.GetCgroupParamUint(path, "cpu.rt_period_us")
 	if err != nil {
-		t.Fatalf("Failed to parse cpu.rt_period_us - %s", err)
+		t.Fatal(err)
 	}
 	if rtPeriod != rtPeriodAfter {
 		t.Fatal("Got the wrong value, set cpu.rt_period_us failed.")
 	}
-	pid, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cgroup.procs")
+
+	pid, err := fscommon.GetCgroupParamUint(path, "cgroup.procs")
 	if err != nil {
-		t.Fatalf("Failed to parse cgroup.procs - %s", err)
+		t.Fatal(err)
 	}
 	if pid != 1234 {
 		t.Fatal("Got the wrong value, set cgroup.procs failed.")
--- a/libcontainer/cgroups/fs/cpuacct.go
+++ b/libcontainer/cgroups/fs/cpuacct.go
@ -1,52 +1,51 @@
-// +build linux
-
 package fs

 import (
-	"fmt"
-	"io/ioutil"
-	"path/filepath"
+	"bufio"
+	"os"
 	"strconv"
 	"strings"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
-	"github.com/opencontainers/runc/libcontainer/system"
 )

 const (
-	cgroupCpuacctStat   = "cpuacct.stat"
+	cgroupCpuacctStat     = "cpuacct.stat"
+	cgroupCpuacctUsageAll = "cpuacct.usage_all"
+
 	nanosecondsInSecond = 1000000000
+
+	userModeColumn              = 1
+	kernelModeColumn            = 2
+	cuacctUsageAllColumnsNumber = 3
+
+	// The value comes from `C.sysconf(C._SC_CLK_TCK)`, and
+	// on Linux it's a constant which is safe to be hard coded,
+	// so we can avoid using cgo here. For details, see:
+	// https://github.com/containerd/cgroups/pull/12
+	clockTicks uint64 = 100
 )

-var clockTicks = uint64(system.GetClockTicks())
-
-type CpuacctGroup struct {
-}
+type CpuacctGroup struct{}

 func (s *CpuacctGroup) Name() string {
 	return "cpuacct"
 }

-func (s *CpuacctGroup) Apply(d *cgroupData) error {
-	// we just want to join this group even though we don't set anything
-	if _, err := d.join("cpuacct"); err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-
-	return nil
+func (s *CpuacctGroup) Apply(path string, _ *configs.Resources, pid int) error {
+	return apply(path, pid)
 }

-func (s *CpuacctGroup) Set(path string, cgroup *configs.Cgroup) error {
+func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error {
 	return nil
 }

-func (s *CpuacctGroup) Remove(d *cgroupData) error {
-	return removePath(d.path("cpuacct"))
-}
-
 func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
+	if !cgroups.PathExists(path) {
+		return nil
+	}
 	userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path)
 	if err != nil {
 		return err
@ -62,8 +61,15 @@ func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
 		return err
 	}

+	percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path)
+	if err != nil {
+		return err
+	}
+
 	stats.CpuStats.CpuUsage.TotalUsage = totalUsage
 	stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage
+	stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode
+	stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode
 	stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage
 	stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage
 	return nil
@ -71,52 +77,90 @@ func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {

 // Returns user and kernel usage breakdown in nanoseconds.
 func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
-	userModeUsage := uint64(0)
-	kernelModeUsage := uint64(0)
+	var userModeUsage, kernelModeUsage uint64
 	const (
 		userField   = "user"
 		systemField = "system"
+		file        = cgroupCpuacctStat
 	)

 	// Expected format:
 	// user <usage in ticks>
 	// system <usage in ticks>
-	data, err := ioutil.ReadFile(filepath.Join(path, cgroupCpuacctStat))
+	data, err := cgroups.ReadFile(path, file)
 	if err != nil {
 		return 0, 0, err
 	}
-	fields := strings.Fields(string(data))
-	if len(fields) < 4 {
-		return 0, 0, fmt.Errorf("failure - %s is expected to have at least 4 fields", filepath.Join(path, cgroupCpuacctStat))
-	}
-	if fields[0] != userField {
-		return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField)
-	}
-	if fields[2] != systemField {
-		return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField)
+	// TODO: use strings.SplitN instead.
+	fields := strings.Fields(data)
+	if len(fields) < 4 || fields[0] != userField || fields[2] != systemField {
+		return 0, 0, malformedLine(path, file, data)
 	}
 	if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil {
-		return 0, 0, err
+		return 0, 0, &parseError{Path: path, File: file, Err: err}
 	}
 	if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil {
-		return 0, 0, err
+		return 0, 0, &parseError{Path: path, File: file, Err: err}
 	}

 	return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil
 }

 func getPercpuUsage(path string) ([]uint64, error) {
+	const file = "cpuacct.usage_percpu"
 	percpuUsage := []uint64{}
-	data, err := ioutil.ReadFile(filepath.Join(path, "cpuacct.usage_percpu"))
+	data, err := cgroups.ReadFile(path, file)
 	if err != nil {
 		return percpuUsage, err
 	}
-	for _, value := range strings.Fields(string(data)) {
+	// TODO: use strings.SplitN instead.
+	for _, value := range strings.Fields(data) {
 		value, err := strconv.ParseUint(value, 10, 64)
 		if err != nil {
-			return percpuUsage, fmt.Errorf("Unable to convert param value to uint64: %s", err)
+			return percpuUsage, &parseError{Path: path, File: file, Err: err}
 		}
 		percpuUsage = append(percpuUsage, value)
 	}
 	return percpuUsage, nil
 }
+
+func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
+	usageKernelMode := []uint64{}
+	usageUserMode := []uint64{}
+	const file = cgroupCpuacctUsageAll
+
+	fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
+	if os.IsNotExist(err) {
+		return usageKernelMode, usageUserMode, nil
+	} else if err != nil {
+		return nil, nil, err
+	}
+	defer fd.Close()
+
+	scanner := bufio.NewScanner(fd)
+	scanner.Scan() // skipping header line
+
+	for scanner.Scan() {
+		lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1)
+		if len(lineFields) != cuacctUsageAllColumnsNumber {
+			continue
+		}
+
+		usageInKernelMode, err := strconv.ParseUint(lineFields[kernelModeColumn], 10, 64)
+		if err != nil {
+			return nil, nil, &parseError{Path: path, File: file, Err: err}
+		}
+		usageKernelMode = append(usageKernelMode, usageInKernelMode)
+
+		usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64)
+		if err != nil {
+			return nil, nil, &parseError{Path: path, File: file, Err: err}
+		}
+		usageUserMode = append(usageUserMode, usageInUserMode)
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, nil, &parseError{Path: path, File: file, Err: err}
+	}
+
+	return usageKernelMode, usageUserMode, nil
+}
--- a/libcontainer/cgroups/fs/cpuacct_test.go
+++ b/libcontainer/cgroups/fs/cpuacct_test.go
@ -0,0 +1,97 @@
+package fs
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+)
+
+const (
+	cpuAcctUsageContents       = "12262454190222160"
+	cpuAcctUsagePerCPUContents = "1564936537989058 1583937096487821 1604195415465681 1596445226820187 1481069084155629 1478735613864327 1477610593414743 1476362015778086"
+	cpuAcctStatContents        = "user 452278264\nsystem 291429664"
+	cpuAcctUsageAll            = `cpu user system
+	0 962250696038415 637727786389114
+	1 981956408513304 638197595421064
+	2 1002658817529022 638956774598358
+	3 994937703492523 637985531181620
+	4 874843781648690 638837766495476
+	5 872544369885276 638763309884944
+	6 870104915696359 640081778921247
+	7 870202363887496 638716766259495
+	`
+)
+
+func TestCpuacctStats(t *testing.T) {
+	path := tempDir(t, "cpuacct")
+	writeFileContents(t, path, map[string]string{
+		"cpuacct.usage":        cpuAcctUsageContents,
+		"cpuacct.usage_percpu": cpuAcctUsagePerCPUContents,
+		"cpuacct.stat":         cpuAcctStatContents,
+		"cpuacct.usage_all":    cpuAcctUsageAll,
+	})
+
+	cpuacct := &CpuacctGroup{}
+	actualStats := *cgroups.NewStats()
+	err := cpuacct.GetStats(path, &actualStats)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	expectedStats := cgroups.CpuUsage{
+		TotalUsage: uint64(12262454190222160),
+		PercpuUsage: []uint64{
+			1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187,
+			1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086,
+		},
+		PercpuUsageInKernelmode: []uint64{
+			637727786389114, 638197595421064, 638956774598358, 637985531181620,
+			638837766495476, 638763309884944, 640081778921247, 638716766259495,
+		},
+		PercpuUsageInUsermode: []uint64{
+			962250696038415, 981956408513304, 1002658817529022, 994937703492523,
+			874843781648690, 872544369885276, 870104915696359, 870202363887496,
+		},
+		UsageInKernelmode: (uint64(291429664) * nanosecondsInSecond) / clockTicks,
+		UsageInUsermode:   (uint64(452278264) * nanosecondsInSecond) / clockTicks,
+	}
+
+	if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) {
+		t.Errorf("Expected CPU usage %#v but found %#v\n",
+			expectedStats, actualStats.CpuStats.CpuUsage)
+	}
+}
+
+func TestCpuacctStatsWithoutUsageAll(t *testing.T) {
+	path := tempDir(t, "cpuacct")
+	writeFileContents(t, path, map[string]string{
+		"cpuacct.usage":        cpuAcctUsageContents,
+		"cpuacct.usage_percpu": cpuAcctUsagePerCPUContents,
+		"cpuacct.stat":         cpuAcctStatContents,
+	})
+
+	cpuacct := &CpuacctGroup{}
+	actualStats := *cgroups.NewStats()
+	err := cpuacct.GetStats(path, &actualStats)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	expectedStats := cgroups.CpuUsage{
+		TotalUsage: uint64(12262454190222160),
+		PercpuUsage: []uint64{
+			1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187,
+			1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086,
+		},
+		PercpuUsageInKernelmode: []uint64{},
+		PercpuUsageInUsermode:   []uint64{},
+		UsageInKernelmode:       (uint64(291429664) * nanosecondsInSecond) / clockTicks,
+		UsageInUsermode:         (uint64(452278264) * nanosecondsInSecond) / clockTicks,
+	}
+
+	if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) {
+		t.Errorf("Expected CPU usage %#v but found %#v\n",
+			expectedStats, actualStats.CpuStats.CpuUsage)
+	}
+}
--- a/libcontainer/cgroups/fs/cpuset.go
+++ b/libcontainer/cgroups/fs/cpuset.go
@ -1,75 +1,159 @@
-// +build linux
-
 package fs

 import (
-	"bytes"
-	"fmt"
-	"io/ioutil"
+	"errors"
 	"os"
 	"path/filepath"
+	"strconv"
+	"strings"
+
+	"golang.org/x/sys/unix"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
-	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
 )

-type CpusetGroup struct {
-}
+type CpusetGroup struct{}

 func (s *CpusetGroup) Name() string {
 	return "cpuset"
 }

-func (s *CpusetGroup) Apply(d *cgroupData) error {
-	dir, err := d.path("cpuset")
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return s.ApplyDir(dir, d.config, d.pid)
+func (s *CpusetGroup) Apply(path string, r *configs.Resources, pid int) error {
+	return s.ApplyDir(path, r, pid)
 }

-func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error {
-	if cgroup.Resources.CpusetCpus != "" {
-		if err := fscommon.WriteFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
+func (s *CpusetGroup) Set(path string, r *configs.Resources) error {
+	if r.CpusetCpus != "" {
+		if err := cgroups.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil {
 			return err
 		}
 	}
-	if cgroup.Resources.CpusetMems != "" {
-		if err := fscommon.WriteFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
+	if r.CpusetMems != "" {
+		if err := cgroups.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil {
 			return err
 		}
 	}
 	return nil
 }

-func (s *CpusetGroup) Remove(d *cgroupData) error {
-	return removePath(d.path("cpuset"))
+func getCpusetStat(path string, file string) ([]uint16, error) {
+	var extracted []uint16
+	fileContent, err := fscommon.GetCgroupParamString(path, file)
+	if err != nil {
+		return extracted, err
+	}
+	if len(fileContent) == 0 {
+		return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")}
+	}
+
+	for _, s := range strings.Split(fileContent, ",") {
+		sp := strings.SplitN(s, "-", 3)
+		switch len(sp) {
+		case 3:
+			return extracted, &parseError{Path: path, File: file, Err: errors.New("extra dash")}
+		case 2:
+			min, err := strconv.ParseUint(sp[0], 10, 16)
+			if err != nil {
+				return extracted, &parseError{Path: path, File: file, Err: err}
+			}
+			max, err := strconv.ParseUint(sp[1], 10, 16)
+			if err != nil {
+				return extracted, &parseError{Path: path, File: file, Err: err}
+			}
+			if min > max {
+				return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, min > max")}
+			}
+			for i := min; i <= max; i++ {
+				extracted = append(extracted, uint16(i))
+			}
+		case 1:
+			value, err := strconv.ParseUint(s, 10, 16)
+			if err != nil {
+				return extracted, &parseError{Path: path, File: file, Err: err}
+			}
+			extracted = append(extracted, uint16(value))
+		}
+	}
+
+	return extracted, nil
 }

 func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
+	var err error
+
+	stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus")
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	}
+
+	stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive")
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	}
+
+	stats.CPUSetStats.Mems, err = getCpusetStat(path, "cpuset.mems")
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	}
+
+	stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall")
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	}
+
+	stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive")
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	}
+
+	stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate")
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	}
+
+	stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page")
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	}
+
+	stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab")
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	}
+
+	stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure")
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	}
+
+	stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance")
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	}
+
+	stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level")
+	if err != nil && !errors.Is(err, os.ErrNotExist) {
+		return err
+	}
+
 	return nil
 }

-func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
+func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error {
 	// This might happen if we have no cpuset cgroup mounted.
 	// Just do nothing and don't fail.
 	if dir == "" {
 		return nil
 	}
-	mountInfo, err := ioutil.ReadFile("/proc/self/mountinfo")
-	if err != nil {
-		return err
-	}
-	root := filepath.Dir(cgroups.GetClosestMountpointAncestor(dir, string(mountInfo)))
 	// 'ensureParent' start with parent because we don't want to
 	// explicitly inherit from parent, it could conflict with
 	// 'cpuset.cpu_exclusive'.
-	if err := s.ensureParent(filepath.Dir(dir), root); err != nil {
+	if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil {
 		return err
 	}
-	if err := os.MkdirAll(dir, 0755); err != nil {
+	if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) {
 		return err
 	}
 	// We didn't inherit cpuset configs from parent, but we have
@ -79,82 +163,83 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
 	// specified configs, otherwise, inherit from parent. This makes
 	// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
 	// keep backward compatibility.
-	if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
+	if err := s.ensureCpusAndMems(dir, r); err != nil {
 		return err
 	}
-
-	// because we are not using d.join we need to place the pid into the procs file
-	// unlike the other subsystems
+	// Since we are not using apply(), we need to place the pid
+	// into the procs file.
 	return cgroups.WriteCgroupProc(dir, pid)
 }

-func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) {
-	if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil {
+func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) {
+	if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil {
 		return
 	}
-	if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil {
+	if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil {
 		return
 	}
 	return cpus, mems, nil
 }

-// ensureParent makes sure that the parent directory of current is created
-// and populated with the proper cpus and mems files copied from
-// it's parent.
-func (s *CpusetGroup) ensureParent(current, root string) error {
+// cpusetEnsureParent makes sure that the parent directories of current
+// are created and populated with the proper cpus and mems files copied
+// from their respective parent. It does that recursively, starting from
+// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point).
+func cpusetEnsureParent(current string) error {
+	var st unix.Statfs_t
+
 	parent := filepath.Dir(current)
-	if libcontainerUtils.CleanPath(parent) == root {
+	err := unix.Statfs(parent, &st)
+	if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC {
 		return nil
 	}
-	// Avoid infinite recursion.
-	if parent == current {
-		return fmt.Errorf("cpuset: cgroup parent path outside cgroup root")
+	// Treat non-existing directory as cgroupfs as it will be created,
+	// and the root cpuset directory obviously exists.
+	if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare
+		return &os.PathError{Op: "statfs", Path: parent, Err: err}
 	}
-	if err := s.ensureParent(parent, root); err != nil {
+
+	if err := cpusetEnsureParent(parent); err != nil {
 		return err
 	}
-	if err := os.MkdirAll(current, 0755); err != nil {
+	if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) {
 		return err
 	}
-	return s.copyIfNeeded(current, parent)
+	return cpusetCopyIfNeeded(current, parent)
 }

-// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
+// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
 // directory to the current directory if the file's contents are 0
-func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
-	var (
-		err                      error
-		currentCpus, currentMems []byte
-		parentCpus, parentMems   []byte
-	)
-
-	if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil {
+func cpusetCopyIfNeeded(current, parent string) error {
+	currentCpus, currentMems, err := getCpusetSubsystemSettings(current)
+	if err != nil {
 		return err
 	}
-	if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil {
+	parentCpus, parentMems, err := getCpusetSubsystemSettings(parent)
+	if err != nil {
 		return err
 	}

-	if s.isEmpty(currentCpus) {
-		if err := fscommon.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil {
+	if isEmptyCpuset(currentCpus) {
+		if err := cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil {
 			return err
 		}
 	}
-	if s.isEmpty(currentMems) {
-		if err := fscommon.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil {
+	if isEmptyCpuset(currentMems) {
+		if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil {
 			return err
 		}
 	}
 	return nil
 }

-func (s *CpusetGroup) isEmpty(b []byte) bool {
-	return len(bytes.Trim(b, "\n")) == 0
+func isEmptyCpuset(str string) bool {
+	return str == "" || str == "\n"
 }

-func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
-	if err := s.Set(path, cgroup); err != nil {
+func (s *CpusetGroup) ensureCpusAndMems(path string, r *configs.Resources) error {
+	if err := s.Set(path, r); err != nil {
 		return err
 	}
-	return s.copyIfNeeded(path, filepath.Dir(path))
+	return cpusetCopyIfNeeded(path, filepath.Dir(path))
 }
--- a/libcontainer/cgroups/fs/cpuset_test.go
+++ b/libcontainer/cgroups/fs/cpuset_test.go
@ -1,67 +1,242 @@
-// +build linux
-
 package fs

 import (
+	"reflect"
 	"testing"

+	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
 )

-func TestCpusetSetCpus(t *testing.T) {
-	helper := NewCgroupTestUtil("cpuset", t)
-	defer helper.cleanup()
+const (
+	cpus                  = "0-2,7,12-14\n"
+	cpuExclusive          = "1\n"
+	mems                  = "1-4,6,9\n"
+	memHardwall           = "0\n"
+	memExclusive          = "0\n"
+	memoryMigrate         = "1\n"
+	memorySpreadPage      = "0\n"
+	memorySpeadSlab       = "1\n"
+	memoryPressure        = "34377\n"
+	schedLoadBalance      = "1\n"
+	schedRelaxDomainLevel = "-1\n"
+)
+
+var cpusetTestFiles = map[string]string{
+	"cpuset.cpus":                     cpus,
+	"cpuset.cpu_exclusive":            cpuExclusive,
+	"cpuset.mems":                     mems,
+	"cpuset.mem_hardwall":             memHardwall,
+	"cpuset.mem_exclusive":            memExclusive,
+	"cpuset.memory_migrate":           memoryMigrate,
+	"cpuset.memory_spread_page":       memorySpreadPage,
+	"cpuset.memory_spread_slab":       memorySpeadSlab,
+	"cpuset.memory_pressure":          memoryPressure,
+	"cpuset.sched_load_balance":       schedLoadBalance,
+	"cpuset.sched_relax_domain_level": schedRelaxDomainLevel,
+}
+
+func TestCPUSetSetCpus(t *testing.T) {
+	path := tempDir(t, "cpuset")

 	const (
 		cpusBefore = "0"
 		cpusAfter  = "1-3"
 	)

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"cpuset.cpus": cpusBefore,
 	})

-	helper.CgroupData.config.Resources.CpusetCpus = cpusAfter
+	r := &configs.Resources{
+		CpusetCpus: cpusAfter,
+	}
 	cpuset := &CpusetGroup{}
-	if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := cpuset.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.cpus")
+	value, err := fscommon.GetCgroupParamString(path, "cpuset.cpus")
 	if err != nil {
-		t.Fatalf("Failed to parse cpuset.cpus - %s", err)
+		t.Fatal(err)
 	}
-
 	if value != cpusAfter {
 		t.Fatal("Got the wrong value, set cpuset.cpus failed.")
 	}
 }

-func TestCpusetSetMems(t *testing.T) {
-	helper := NewCgroupTestUtil("cpuset", t)
-	defer helper.cleanup()
+func TestCPUSetSetMems(t *testing.T) {
+	path := tempDir(t, "cpuset")

 	const (
 		memsBefore = "0"
 		memsAfter  = "1"
 	)

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"cpuset.mems": memsBefore,
 	})

-	helper.CgroupData.config.Resources.CpusetMems = memsAfter
+	r := &configs.Resources{
+		CpusetMems: memsAfter,
+	}
 	cpuset := &CpusetGroup{}
-	if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := cpuset.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.mems")
+	value, err := fscommon.GetCgroupParamString(path, "cpuset.mems")
 	if err != nil {
-		t.Fatalf("Failed to parse cpuset.mems - %s", err)
+		t.Fatal(err)
 	}
-
 	if value != memsAfter {
 		t.Fatal("Got the wrong value, set cpuset.mems failed.")
 	}
 }
+
+func TestCPUSetStatsCorrect(t *testing.T) {
+	path := tempDir(t, "cpuset")
+	writeFileContents(t, path, cpusetTestFiles)
+
+	cpuset := &CpusetGroup{}
+	actualStats := *cgroups.NewStats()
+	err := cpuset.GetStats(path, &actualStats)
+	if err != nil {
+		t.Fatal(err)
+	}
+	expectedStats := cgroups.CPUSetStats{
+		CPUs:                  []uint16{0, 1, 2, 7, 12, 13, 14},
+		CPUExclusive:          1,
+		Mems:                  []uint16{1, 2, 3, 4, 6, 9},
+		MemoryMigrate:         1,
+		MemHardwall:           0,
+		MemExclusive:          0,
+		MemorySpreadPage:      0,
+		MemorySpreadSlab:      1,
+		MemoryPressure:        34377,
+		SchedLoadBalance:      1,
+		SchedRelaxDomainLevel: -1,
+	}
+	if !reflect.DeepEqual(expectedStats, actualStats.CPUSetStats) {
+		t.Fatalf("Expected Cpuset stats usage %#v but found %#v",
+			expectedStats, actualStats.CPUSetStats)
+	}
+}
+
+func TestCPUSetStatsMissingFiles(t *testing.T) {
+	for _, testCase := range []struct {
+		desc               string
+		filename, contents string
+		removeFile         bool
+	}{
+		{
+			desc:       "empty cpus file",
+			filename:   "cpuset.cpus",
+			contents:   "",
+			removeFile: false,
+		},
+		{
+			desc:       "empty mems file",
+			filename:   "cpuset.mems",
+			contents:   "",
+			removeFile: false,
+		},
+		{
+			desc:       "corrupted cpus file",
+			filename:   "cpuset.cpus",
+			contents:   "0-3,*4^2",
+			removeFile: false,
+		},
+		{
+			desc:       "corrupted mems file",
+			filename:   "cpuset.mems",
+			contents:   "0,1,2-5,8-7",
+			removeFile: false,
+		},
+		{
+			desc:       "missing cpu_exclusive file",
+			filename:   "cpuset.cpu_exclusive",
+			contents:   "",
+			removeFile: true,
+		},
+		{
+			desc:       "missing memory_migrate file",
+			filename:   "cpuset.memory_migrate",
+			contents:   "",
+			removeFile: true,
+		},
+		{
+			desc:       "missing mem_hardwall file",
+			filename:   "cpuset.mem_hardwall",
+			contents:   "",
+			removeFile: true,
+		},
+		{
+			desc:       "missing mem_exclusive file",
+			filename:   "cpuset.mem_exclusive",
+			contents:   "",
+			removeFile: true,
+		},
+		{
+			desc:       "missing memory_spread_page file",
+			filename:   "cpuset.memory_spread_page",
+			contents:   "",
+			removeFile: true,
+		},
+		{
+			desc:       "missing memory_spread_slab file",
+			filename:   "cpuset.memory_spread_slab",
+			contents:   "",
+			removeFile: true,
+		},
+		{
+			desc:       "missing memory_pressure file",
+			filename:   "cpuset.memory_pressure",
+			contents:   "",
+			removeFile: true,
+		},
+		{
+			desc:       "missing sched_load_balance file",
+			filename:   "cpuset.sched_load_balance",
+			contents:   "",
+			removeFile: true,
+		},
+		{
+			desc:       "missing sched_relax_domain_level file",
+			filename:   "cpuset.sched_relax_domain_level",
+			contents:   "",
+			removeFile: true,
+		},
+	} {
+		t.Run(testCase.desc, func(t *testing.T) {
+			path := tempDir(t, "cpuset")
+
+			tempCpusetTestFiles := map[string]string{}
+			for i, v := range cpusetTestFiles {
+				tempCpusetTestFiles[i] = v
+			}
+
+			if testCase.removeFile {
+				delete(tempCpusetTestFiles, testCase.filename)
+				writeFileContents(t, path, tempCpusetTestFiles)
+				cpuset := &CpusetGroup{}
+				actualStats := *cgroups.NewStats()
+				err := cpuset.GetStats(path, &actualStats)
+				if err != nil {
+					t.Errorf("failed unexpectedly: %q", err)
+				}
+			} else {
+				tempCpusetTestFiles[testCase.filename] = testCase.contents
+				writeFileContents(t, path, tempCpusetTestFiles)
+				cpuset := &CpusetGroup{}
+				actualStats := *cgroups.NewStats()
+				err := cpuset.GetStats(path, &actualStats)
+
+				if err == nil {
+					t.Error("failed to return expected error")
+				}
+			}
+		})
+	}
+}
--- a/libcontainer/cgroups/fs/devices.go
+++ b/libcontainer/cgroups/fs/devices.go
@ -1,81 +1,109 @@
-// +build linux
-
 package fs

 import (
+	"bytes"
+	"errors"
+	"reflect"
+
 	"github.com/opencontainers/runc/libcontainer/cgroups"
-	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
 	"github.com/opencontainers/runc/libcontainer/configs"
-	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/devices"
+	"github.com/opencontainers/runc/libcontainer/userns"
 )

 type DevicesGroup struct {
+	TestingSkipFinalCheck bool
 }

 func (s *DevicesGroup) Name() string {
 	return "devices"
 }

-func (s *DevicesGroup) Apply(d *cgroupData) error {
-	_, err := d.join("devices")
+func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error {
+	if r.SkipDevices {
+		return nil
+	}
+	if path == "" {
+		// Return error here, since devices cgroup
+		// is a hard requirement for container's security.
+		return errSubsystemDoesNotExist
+	}
+
+	return apply(path, pid)
+}
+
+func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
+	list, err := cgroups.ReadFile(path, "devices.list")
+	if err != nil {
+		return nil, err
+	}
+	return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list))
+}
+
+func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
+	// This defaults to a white-list -- which is what we want!
+	emu := &cgroupdevices.Emulator{}
+	for _, rule := range rules {
+		if err := emu.Apply(*rule); err != nil {
+			return nil, err
+		}
+	}
+	return emu, nil
+}
+
+func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
+	if userns.RunningInUserNS() || r.SkipDevices {
+		return nil
+	}
+
+	// Generate two emulators, one for the current state of the cgroup and one
+	// for the requested state by the user.
+	current, err := loadEmulator(path)
 	if err != nil {
-		// We will return error even it's `not found` error, devices
-		// cgroup is hard requirement for container's security.
 		return err
 	}
-	return nil
-}
-
-func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
-	if system.RunningInUserNS() {
-		return nil
+	target, err := buildEmulator(r.Devices)
+	if err != nil {
+		return err
 	}

-	devices := cgroup.Resources.Devices
-	if len(devices) > 0 {
-		for _, dev := range devices {
-			file := "devices.deny"
-			if dev.Allow {
-				file = "devices.allow"
-			}
-			if err := fscommon.WriteFile(path, file, dev.CgroupString()); err != nil {
-				return err
-			}
-		}
-		return nil
+	// Compute the minimal set of transition rules needed to achieve the
+	// requested state.
+	transitionRules, err := current.Transition(target)
+	if err != nil {
+		return err
 	}
-	if cgroup.Resources.AllowAllDevices != nil {
-		if *cgroup.Resources.AllowAllDevices == false {
-			if err := fscommon.WriteFile(path, "devices.deny", "a"); err != nil {
-				return err
-			}
-
-			for _, dev := range cgroup.Resources.AllowedDevices {
-				if err := fscommon.WriteFile(path, "devices.allow", dev.CgroupString()); err != nil {
-					return err
-				}
-			}
-			return nil
+	for _, rule := range transitionRules {
+		file := "devices.deny"
+		if rule.Allow {
+			file = "devices.allow"
 		}
-
-		if err := fscommon.WriteFile(path, "devices.allow", "a"); err != nil {
+		if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil {
 			return err
 		}
 	}

-	for _, dev := range cgroup.Resources.DeniedDevices {
-		if err := fscommon.WriteFile(path, "devices.deny", dev.CgroupString()); err != nil {
+	// Final safety check -- ensure that the resulting state is what was
+	// requested. This is only really correct for white-lists, but for
+	// black-lists we can at least check that the cgroup is in the right mode.
+	//
+	// This safety-check is skipped for the unit tests because we cannot
+	// currently mock devices.list correctly.
+	if !s.TestingSkipFinalCheck {
+		currentAfter, err := loadEmulator(path)
+		if err != nil {
 			return err
 		}
+		if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
+			return errors.New("resulting devices cgroup doesn't precisely match target")
+		} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
+			return errors.New("resulting devices cgroup doesn't match target mode")
+		}
 	}
-
 	return nil
 }

-func (s *DevicesGroup) Remove(d *cgroupData) error {
-	return removePath(d.path("devices"))
-}
-
 func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {
 	return nil
 }
--- a/libcontainer/cgroups/fs/devices_test.go
+++ b/libcontainer/cgroups/fs/devices_test.go
@ -1,5 +1,3 @@
-// +build linux
-
 package fs

 import (
@ -7,93 +5,48 @@ import (

 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
-)
-
-var (
-	allowedDevices = []*configs.Device{
-		{
-			Path:        "/dev/zero",
-			Type:        'c',
-			Major:       1,
-			Minor:       5,
-			Permissions: "rwm",
-			FileMode:    0666,
-		},
-	}
-	allowedList   = "c 1:5 rwm"
-	deniedDevices = []*configs.Device{
-		{
-			Path:        "/dev/null",
-			Type:        'c',
-			Major:       1,
-			Minor:       3,
-			Permissions: "rwm",
-			FileMode:    0666,
-		},
-	}
-	deniedList = "c 1:3 rwm"
+	"github.com/opencontainers/runc/libcontainer/devices"
 )

 func TestDevicesSetAllow(t *testing.T) {
-	helper := NewCgroupTestUtil("devices", t)
-	defer helper.cleanup()
+	path := tempDir(t, "devices")

-	helper.writeFileContents(map[string]string{
-		"devices.deny": "a",
-	})
-	allowAllDevices := false
-	helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices
-	helper.CgroupData.config.Resources.AllowedDevices = allowedDevices
-	devices := &DevicesGroup{}
-	if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
-		t.Fatal(err)
-	}
-
-	value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow")
-	if err != nil {
-		t.Fatalf("Failed to parse devices.allow - %s", err)
-	}
-
-	if value != allowedList {
-		t.Fatal("Got the wrong value, set devices.allow failed.")
-	}
-
-	// When AllowAllDevices is nil, devices.allow file should not be modified.
-	helper.CgroupData.config.Resources.AllowAllDevices = nil
-	if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
-		t.Fatal(err)
-	}
-	value, err = fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow")
-	if err != nil {
-		t.Fatalf("Failed to parse devices.allow - %s", err)
-	}
-	if value != allowedList {
-		t.Fatal("devices policy shouldn't have changed on AllowedAllDevices=nil.")
-	}
-}
-
-func TestDevicesSetDeny(t *testing.T) {
-	helper := NewCgroupTestUtil("devices", t)
-	defer helper.cleanup()
-
-	helper.writeFileContents(map[string]string{
-		"devices.allow": "a",
+	writeFileContents(t, path, map[string]string{
+		"devices.allow": "",
+		"devices.deny":  "",
+		"devices.list":  "a *:* rwm",
 	})

-	allowAllDevices := true
-	helper.CgroupData.config.Resources.AllowAllDevices = &allowAllDevices
-	helper.CgroupData.config.Resources.DeniedDevices = deniedDevices
-	devices := &DevicesGroup{}
-	if err := devices.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	r := &configs.Resources{
+		Devices: []*devices.Rule{
+			{
+				Type:        devices.CharDevice,
+				Major:       1,
+				Minor:       5,
+				Permissions: devices.Permissions("rwm"),
+				Allow:       true,
+			},
+		},
+	}
+
+	d := &DevicesGroup{TestingSkipFinalCheck: true}
+	if err := d.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.deny")
+	// The default deny rule must be written.
+	value, err := fscommon.GetCgroupParamString(path, "devices.deny")
 	if err != nil {
-		t.Fatalf("Failed to parse devices.deny - %s", err)
+		t.Fatal(err)
+	}
+	if value[0] != 'a' {
+		t.Errorf("Got the wrong value (%q), set devices.deny failed.", value)
 	}

-	if value != deniedList {
-		t.Fatal("Got the wrong value, set devices.deny failed.")
+	// Permitted rule must be written.
+	if value, err := fscommon.GetCgroupParamString(path, "devices.allow"); err != nil {
+		t.Fatal(err)
+	} else if value != "c 1:5 rwm" {
+		t.Errorf("Got the wrong value (%q), set devices.allow failed.", value)
 	}
 }
--- a/libcontainer/cgroups/fs/error.go
+++ b/libcontainer/cgroups/fs/error.go
@ -0,0 +1,15 @@
+package fs
+
+import (
+	"fmt"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+)
+
+type parseError = fscommon.ParseError
+
+// malformedLine is used by all cgroupfs file parsers that expect a line
+// in a particular format but get some garbage instead.
+func malformedLine(path, file, line string) error {
+	return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)}
+}
--- a/libcontainer/cgroups/fs/freezer.go
+++ b/libcontainer/cgroups/fs/freezer.go
@ -1,67 +1,158 @@
-// +build linux
-
 package fs

 import (
+	"errors"
 	"fmt"
+	"os"
 	"strings"
 	"time"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
-	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
 )

-type FreezerGroup struct {
-}
+type FreezerGroup struct{}

 func (s *FreezerGroup) Name() string {
 	return "freezer"
 }

-func (s *FreezerGroup) Apply(d *cgroupData) error {
-	_, err := d.join("freezer")
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
+func (s *FreezerGroup) Apply(path string, _ *configs.Resources, pid int) error {
+	return apply(path, pid)
 }

-func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
-	switch cgroup.Resources.Freezer {
-	case configs.Frozen, configs.Thawed:
-		for {
-			// In case this loop does not exit because it doesn't get the expected
-			// state, let's write again this state, hoping it's going to be properly
-			// set this time. Otherwise, this loop could run infinitely, waiting for
-			// a state change that would never happen.
-			if err := fscommon.WriteFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
+func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
+	switch r.Freezer {
+	case configs.Frozen:
+		defer func() {
+			if Err != nil {
+				// Freezing failed, and it is bad and dangerous
+				// to leave the cgroup in FROZEN or FREEZING
+				// state, so (try to) thaw it back.
+				_ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
+			}
+		}()
+
+		// As per older kernel docs (freezer-subsystem.txt before
+		// kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
+		// userspace should either retry or thaw. While current
+		// kernel cgroup v1 docs no longer mention a need to retry,
+		// even a recent kernel (v5.4, Ubuntu 20.04) can't reliably
+		// freeze a cgroup v1 while new processes keep appearing in it
+		// (either via fork/clone or by writing new PIDs to
+		// cgroup.procs).
+		//
+		// The numbers below are empirically chosen to have a decent
+		// chance to succeed in various scenarios ("runc pause/unpause
+		// with parallel runc exec" and "bare freeze/unfreeze on a very
+		// slow system"), tested on RHEL7 and Ubuntu 20.04 kernels.
+		//
+		// Adding any amount of sleep in between retries did not
+		// increase the chances of successful freeze in "pause/unpause
+		// with parallel exec" reproducer. OTOH, adding an occasional
+		// sleep helped for the case where the system is extremely slow
+		// (CentOS 7 VM on GHA CI).
+		//
+		// Alas, this is still a game of chances, since the real fix
+		// belong to the kernel (cgroup v2 do not have this bug).
+
+		for i := 0; i < 1000; i++ {
+			if i%50 == 49 {
+				// Occasional thaw and sleep improves
+				// the chances to succeed in freezing
+				// in case new processes keep appearing
+				// in the cgroup.
+				_ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
+				time.Sleep(10 * time.Millisecond)
+			}
+
+			if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
 				return err
 			}

-			state, err := fscommon.ReadFile(path, "freezer.state")
+			if i%25 == 24 {
+				// Occasional short sleep before reading
+				// the state back also improves the chances to
+				// succeed in freezing in case of a very slow
+				// system.
+				time.Sleep(10 * time.Microsecond)
+			}
+			state, err := cgroups.ReadFile(path, "freezer.state")
 			if err != nil {
 				return err
 			}
-			if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) {
-				break
+			state = strings.TrimSpace(state)
+			switch state {
+			case "FREEZING":
+				continue
+			case string(configs.Frozen):
+				if i > 1 {
+					logrus.Debugf("frozen after %d retries", i)
+				}
+				return nil
+			default:
+				// should never happen
+				return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state))
 			}
-
-			time.Sleep(1 * time.Millisecond)
 		}
+		// Despite our best efforts, it got stuck in FREEZING.
+		return errors.New("unable to freeze")
+	case configs.Thawed:
+		return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
 	case configs.Undefined:
 		return nil
 	default:
-		return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
+		return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer))
 	}
-
-	return nil
-}
-
-func (s *FreezerGroup) Remove(d *cgroupData) error {
-	return removePath(d.path("freezer"))
 }

 func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
 	return nil
 }
+
+func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) {
+	for {
+		state, err := cgroups.ReadFile(path, "freezer.state")
+		if err != nil {
+			// If the kernel is too old, then we just treat the freezer as
+			// being in an "undefined" state.
+			if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
+				err = nil
+			}
+			return configs.Undefined, err
+		}
+		switch strings.TrimSpace(state) {
+		case "THAWED":
+			return configs.Thawed, nil
+		case "FROZEN":
+			// Find out whether the cgroup is frozen directly,
+			// or indirectly via an ancestor.
+			self, err := cgroups.ReadFile(path, "freezer.self_freezing")
+			if err != nil {
+				// If the kernel is too old, then we just treat
+				// it as being frozen.
+				if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) {
+					err = nil
+				}
+				return configs.Frozen, err
+			}
+			switch self {
+			case "0\n":
+				return configs.Thawed, nil
+			case "1\n":
+				return configs.Frozen, nil
+			default:
+				return configs.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self)
+			}
+		case "FREEZING":
+			// Make sure we get a stable freezer state, so retry if the cgroup
+			// is still undergoing freezing. This should be a temporary delay.
+			time.Sleep(1 * time.Millisecond)
+			continue
+		default:
+			return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state)
+		}
+	}
+}
--- a/libcontainer/cgroups/fs/freezer_test.go
+++ b/libcontainer/cgroups/fs/freezer_test.go
@ -1,5 +1,3 @@
-// +build linux
-
 package fs

 import (
@ -10,22 +8,23 @@ import (
 )

 func TestFreezerSetState(t *testing.T) {
-	helper := NewCgroupTestUtil("freezer", t)
-	defer helper.cleanup()
+	path := tempDir(t, "freezer")

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"freezer.state": string(configs.Frozen),
 	})

-	helper.CgroupData.config.Resources.Freezer = configs.Thawed
+	r := &configs.Resources{
+		Freezer: configs.Thawed,
+	}
 	freezer := &FreezerGroup{}
-	if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := freezer.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "freezer.state")
+	value, err := fscommon.GetCgroupParamString(path, "freezer.state")
 	if err != nil {
-		t.Fatalf("Failed to parse freezer.state - %s", err)
+		t.Fatal(err)
 	}
 	if value != string(configs.Thawed) {
 		t.Fatal("Got the wrong value, set freezer.state failed.")
@ -33,16 +32,15 @@ func TestFreezerSetState(t *testing.T) {
 }

 func TestFreezerSetInvalidState(t *testing.T) {
-	helper := NewCgroupTestUtil("freezer", t)
-	defer helper.cleanup()
+	path := tempDir(t, "freezer")

-	const (
-		invalidArg configs.FreezerState = "Invalid"
-	)
+	const invalidArg configs.FreezerState = "Invalid"

-	helper.CgroupData.config.Resources.Freezer = invalidArg
+	r := &configs.Resources{
+		Freezer: invalidArg,
+	}
 	freezer := &FreezerGroup{}
-	if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err == nil {
+	if err := freezer.Set(path, r); err == nil {
 		t.Fatal("Failed to return invalid argument error")
 	}
 }
--- a/libcontainer/cgroups/fs/fs.go
+++ b/libcontainer/cgroups/fs/fs.go
@ -0,0 +1,264 @@
+package fs
+
+import (
+	"errors"
+	"fmt"
+	"os"
+	"sync"
+
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+var subsystems = []subsystem{
+	&CpusetGroup{},
+	&DevicesGroup{},
+	&MemoryGroup{},
+	&CpuGroup{},
+	&CpuacctGroup{},
+	&PidsGroup{},
+	&BlkioGroup{},
+	&HugetlbGroup{},
+	&NetClsGroup{},
+	&NetPrioGroup{},
+	&PerfEventGroup{},
+	&FreezerGroup{},
+	&RdmaGroup{},
+	&NameGroup{GroupName: "name=systemd", Join: true},
+}
+
+var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
+
+func init() {
+	// If using cgroups-hybrid mode then add a "" controller indicating
+	// it should join the cgroups v2.
+	if cgroups.IsCgroup2HybridMode() {
+		subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true})
+	}
+}
+
+type subsystem interface {
+	// Name returns the name of the subsystem.
+	Name() string
+	// GetStats fills in the stats for the subsystem.
+	GetStats(path string, stats *cgroups.Stats) error
+	// Apply creates and joins a cgroup, adding pid into it. Some
+	// subsystems use resources to pre-configure the cgroup parents
+	// before creating or joining it.
+	Apply(path string, r *configs.Resources, pid int) error
+	// Set sets the cgroup resources.
+	Set(path string, r *configs.Resources) error
+}
+
+type manager struct {
+	mu      sync.Mutex
+	cgroups *configs.Cgroup
+	paths   map[string]string
+}
+
+func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
+	// Some v1 controllers (cpu, cpuset, and devices) expect
+	// cgroups.Resources to not be nil in Apply.
+	if cg.Resources == nil {
+		return nil, errors.New("cgroup v1 manager needs configs.Resources to be set during manager creation")
+	}
+	if cg.Resources.Unified != nil {
+		return nil, cgroups.ErrV1NoUnified
+	}
+
+	if paths == nil {
+		var err error
+		paths, err = initPaths(cg)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	return &manager{
+		cgroups: cg,
+		paths:   paths,
+	}, nil
+}
+
+// isIgnorableError returns whether err is a permission error (in the loose
+// sense of the word). This includes EROFS (which for an unprivileged user is
+// basically a permission error) and EACCES (for similar reasons) as well as
+// the normal EPERM.
+func isIgnorableError(rootless bool, err error) bool {
+	// We do not ignore errors if we are root.
+	if !rootless {
+		return false
+	}
+	// Is it an ordinary EPERM?
+	if errors.Is(err, os.ErrPermission) {
+		return true
+	}
+	// Handle some specific syscall errors.
+	var errno unix.Errno
+	if errors.As(err, &errno) {
+		return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
+	}
+	return false
+}
+
+func (m *manager) Apply(pid int) (err error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	c := m.cgroups
+
+	for _, sys := range subsystems {
+		name := sys.Name()
+		p, ok := m.paths[name]
+		if !ok {
+			continue
+		}
+
+		if err := sys.Apply(p, c.Resources, pid); err != nil {
+			// In the case of rootless (including euid=0 in userns), where an
+			// explicit cgroup path hasn't been set, we don't bail on error in
+			// case of permission problems here, but do delete the path from
+			// the m.paths map, since it is either non-existent and could not
+			// be created, or the pid could not be added to it.
+			//
+			// Cases where limits for the subsystem have been set are handled
+			// later by Set, which fails with a friendly error (see
+			// if path == "" in Set).
+			if isIgnorableError(c.Rootless, err) && c.Path == "" {
+				delete(m.paths, name)
+				continue
+			}
+			return err
+		}
+
+	}
+	return nil
+}
+
+func (m *manager) Destroy() error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	return cgroups.RemovePaths(m.paths)
+}
+
+func (m *manager) Path(subsys string) string {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	return m.paths[subsys]
+}
+
+func (m *manager) GetStats() (*cgroups.Stats, error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	stats := cgroups.NewStats()
+	for _, sys := range subsystems {
+		path := m.paths[sys.Name()]
+		if path == "" {
+			continue
+		}
+		if err := sys.GetStats(path, stats); err != nil {
+			return nil, err
+		}
+	}
+	return stats, nil
+}
+
+func (m *manager) Set(r *configs.Resources) error {
+	if r == nil {
+		return nil
+	}
+
+	if r.Unified != nil {
+		return cgroups.ErrV1NoUnified
+	}
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	for _, sys := range subsystems {
+		path := m.paths[sys.Name()]
+		if err := sys.Set(path, r); err != nil {
+			// When rootless is true, errors from the device subsystem
+			// are ignored, as it is really not expected to work.
+			if m.cgroups.Rootless && sys.Name() == "devices" {
+				continue
+			}
+			// However, errors from other subsystems are not ignored.
+			// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
+			if path == "" {
+				// We never created a path for this cgroup, so we cannot set
+				// limits for it (though we have already tried at this point).
+				return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
+			}
+			return err
+		}
+	}
+
+	return nil
+}
+
+// Freeze toggles the container's freezer cgroup depending on the state
+// provided
+func (m *manager) Freeze(state configs.FreezerState) error {
+	path := m.Path("freezer")
+	if path == "" {
+		return errors.New("cannot toggle freezer: cgroups not configured for container")
+	}
+
+	prevState := m.cgroups.Resources.Freezer
+	m.cgroups.Resources.Freezer = state
+	freezer := &FreezerGroup{}
+	if err := freezer.Set(path, m.cgroups.Resources); err != nil {
+		m.cgroups.Resources.Freezer = prevState
+		return err
+	}
+	return nil
+}
+
+func (m *manager) GetPids() ([]int, error) {
+	return cgroups.GetPids(m.Path("devices"))
+}
+
+func (m *manager) GetAllPids() ([]int, error) {
+	return cgroups.GetAllPids(m.Path("devices"))
+}
+
+func (m *manager) GetPaths() map[string]string {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	return m.paths
+}
+
+func (m *manager) GetCgroups() (*configs.Cgroup, error) {
+	return m.cgroups, nil
+}
+
+func (m *manager) GetFreezerState() (configs.FreezerState, error) {
+	dir := m.Path("freezer")
+	// If the container doesn't have the freezer cgroup, say it's undefined.
+	if dir == "" {
+		return configs.Undefined, nil
+	}
+	freezer := &FreezerGroup{}
+	return freezer.GetState(dir)
+}
+
+func (m *manager) Exists() bool {
+	return cgroups.PathExists(m.Path("devices"))
+}
+
+func OOMKillCount(path string) (uint64, error) {
+	return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
+}
+
+func (m *manager) OOMKillCount() (uint64, error) {
+	c, err := OOMKillCount(m.Path("memory"))
+	// Ignore ENOENT when rootless as it couldn't create cgroup.
+	if err != nil && m.cgroups.Rootless && os.IsNotExist(err) {
+		err = nil
+	}
+
+	return c, err
+}
--- a/libcontainer/cgroups/fs/fs_test.go
+++ b/libcontainer/cgroups/fs/fs_test.go
@ -0,0 +1,50 @@
+package fs
+
+import (
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+func BenchmarkGetStats(b *testing.B) {
+	if cgroups.IsCgroup2UnifiedMode() {
+		b.Skip("cgroup v2 is not supported")
+	}
+
+	// Unset TestMode as we work with real cgroupfs here,
+	// and we want OpenFile to perform the fstype check.
+	cgroups.TestMode = false
+	defer func() {
+		cgroups.TestMode = true
+	}()
+
+	cg := &configs.Cgroup{
+		Path:      "/some/kind/of/a/path/here",
+		Resources: &configs.Resources{},
+	}
+	m, err := NewManager(cg, nil)
+	if err != nil {
+		b.Fatal(err)
+	}
+	err = m.Apply(-1)
+	if err != nil {
+		b.Fatal(err)
+	}
+	defer func() {
+		_ = m.Destroy()
+	}()
+
+	var st *cgroups.Stats
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		st, err = m.GetStats()
+		if err != nil {
+			b.Fatal(err)
+		}
+	}
+	if st.CpuStats.CpuUsage.TotalUsage != 0 {
+		b.Fatalf("stats: %+v", st)
+	}
+}
--- a/libcontainer/cgroups/fs/fs_unsupported.go
+++ b/libcontainer/cgroups/fs/fs_unsupported.go
@ -1,3 +0,0 @@
-// +build !linux
-
-package fs
--- a/libcontainer/cgroups/fs/hugetlb.go
+++ b/libcontainer/cgroups/fs/hugetlb.go
@ -1,35 +1,26 @@
-// +build linux
-
 package fs

 import (
-	"fmt"
 	"strconv"
-	"strings"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

-type HugetlbGroup struct {
-}
+type HugetlbGroup struct{}

 func (s *HugetlbGroup) Name() string {
 	return "hugetlb"
 }

-func (s *HugetlbGroup) Apply(d *cgroupData) error {
-	_, err := d.join("hugetlb")
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
+func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error {
+	return apply(path, pid)
 }

-func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
-	for _, hugetlb := range cgroup.Resources.HugetlbLimit {
-		if err := fscommon.WriteFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
+func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
+	for _, hugetlb := range r.HugetlbLimit {
+		if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
 			return err
 		}
 	}
@ -37,31 +28,30 @@ func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
 	return nil
 }

-func (s *HugetlbGroup) Remove(d *cgroupData) error {
-	return removePath(d.path("hugetlb"))
-}
-
 func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
+	if !cgroups.PathExists(path) {
+		return nil
+	}
 	hugetlbStats := cgroups.HugetlbStats{}
-	for _, pageSize := range HugePageSizes {
-		usage := strings.Join([]string{"hugetlb", pageSize, "usage_in_bytes"}, ".")
+	for _, pageSize := range cgroups.HugePageSizes() {
+		usage := "hugetlb." + pageSize + ".usage_in_bytes"
 		value, err := fscommon.GetCgroupParamUint(path, usage)
 		if err != nil {
-			return fmt.Errorf("failed to parse %s - %v", usage, err)
+			return err
 		}
 		hugetlbStats.Usage = value

-		maxUsage := strings.Join([]string{"hugetlb", pageSize, "max_usage_in_bytes"}, ".")
+		maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
 		value, err = fscommon.GetCgroupParamUint(path, maxUsage)
 		if err != nil {
-			return fmt.Errorf("failed to parse %s - %v", maxUsage, err)
+			return err
 		}
 		hugetlbStats.MaxUsage = value

-		failcnt := strings.Join([]string{"hugetlb", pageSize, "failcnt"}, ".")
+		failcnt := "hugetlb." + pageSize + ".failcnt"
 		value, err = fscommon.GetCgroupParamUint(path, failcnt)
 		if err != nil {
-			return fmt.Errorf("failed to parse %s - %v", failcnt, err)
+			return err
 		}
 		hugetlbStats.Failcnt = value

--- a/libcontainer/cgroups/fs/hugetlb_test.go
+++ b/libcontainer/cgroups/fs/hugetlb_test.go
@ -1,5 +1,3 @@
-// +build linux
-
 package fs

 import (
@ -18,7 +16,7 @@ const (
 	hugetlbFailcnt          = "100\n"
 )

-var (
+const (
 	usage    = "hugetlb.%s.usage_in_bytes"
 	limit    = "hugetlb.%s.limit_in_bytes"
 	maxUsage = "hugetlb.%s.max_usage_in_bytes"
@ -26,38 +24,38 @@ var (
 )

 func TestHugetlbSetHugetlb(t *testing.T) {
-	helper := NewCgroupTestUtil("hugetlb", t)
-	defer helper.cleanup()
+	path := tempDir(t, "hugetlb")

 	const (
 		hugetlbBefore = 256
 		hugetlbAfter  = 512
 	)

-	for _, pageSize := range HugePageSizes {
-		helper.writeFileContents(map[string]string{
+	for _, pageSize := range cgroups.HugePageSizes() {
+		writeFileContents(t, path, map[string]string{
 			fmt.Sprintf(limit, pageSize): strconv.Itoa(hugetlbBefore),
 		})
 	}

-	for _, pageSize := range HugePageSizes {
-		helper.CgroupData.config.Resources.HugetlbLimit = []*configs.HugepageLimit{
+	r := &configs.Resources{}
+	for _, pageSize := range cgroups.HugePageSizes() {
+		r.HugetlbLimit = []*configs.HugepageLimit{
 			{
 				Pagesize: pageSize,
 				Limit:    hugetlbAfter,
 			},
 		}
 		hugetlb := &HugetlbGroup{}
-		if err := hugetlb.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+		if err := hugetlb.Set(path, r); err != nil {
 			t.Fatal(err)
 		}
 	}

-	for _, pageSize := range HugePageSizes {
+	for _, pageSize := range cgroups.HugePageSizes() {
 		limit := fmt.Sprintf(limit, pageSize)
-		value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, limit)
+		value, err := fscommon.GetCgroupParamUint(path, limit)
 		if err != nil {
-			t.Fatalf("Failed to parse %s - %s", limit, err)
+			t.Fatal(err)
 		}
 		if value != hugetlbAfter {
 			t.Fatalf("Set hugetlb.limit_in_bytes failed. Expected: %v, Got: %v", hugetlbAfter, value)
@ -66,10 +64,9 @@ func TestHugetlbSetHugetlb(t *testing.T) {
 }

 func TestHugetlbStats(t *testing.T) {
-	helper := NewCgroupTestUtil("hugetlb", t)
-	defer helper.cleanup()
-	for _, pageSize := range HugePageSizes {
-		helper.writeFileContents(map[string]string{
+	path := tempDir(t, "hugetlb")
+	for _, pageSize := range cgroups.HugePageSizes() {
+		writeFileContents(t, path, map[string]string{
 			fmt.Sprintf(usage, pageSize):    hugetlbUsageContents,
 			fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents,
 			fmt.Sprintf(failcnt, pageSize):  hugetlbFailcnt,
@ -78,56 +75,50 @@ func TestHugetlbStats(t *testing.T) {

 	hugetlb := &HugetlbGroup{}
 	actualStats := *cgroups.NewStats()
-	err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+	err := hugetlb.GetStats(path, &actualStats)
 	if err != nil {
 		t.Fatal(err)
 	}
 	expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100}
-	for _, pageSize := range HugePageSizes {
+	for _, pageSize := range cgroups.HugePageSizes() {
 		expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize])
 	}
 }

 func TestHugetlbStatsNoUsageFile(t *testing.T) {
-t.Skip("Disabled unreliable test")
-	helper := NewCgroupTestUtil("hugetlb", t)
-	defer helper.cleanup()
-	helper.writeFileContents(map[string]string{
+	path := tempDir(t, "hugetlb")
+	writeFileContents(t, path, map[string]string{
 		maxUsage: hugetlbMaxUsageContents,
 	})

 	hugetlb := &HugetlbGroup{}
 	actualStats := *cgroups.NewStats()
-	err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+	err := hugetlb.GetStats(path, &actualStats)
 	if err == nil {
 		t.Fatal("Expected failure")
 	}
 }

 func TestHugetlbStatsNoMaxUsageFile(t *testing.T) {
-t.Skip("Disabled unreliable test")
-	helper := NewCgroupTestUtil("hugetlb", t)
-	defer helper.cleanup()
-	for _, pageSize := range HugePageSizes {
-		helper.writeFileContents(map[string]string{
+	path := tempDir(t, "hugetlb")
+	for _, pageSize := range cgroups.HugePageSizes() {
+		writeFileContents(t, path, map[string]string{
 			fmt.Sprintf(usage, pageSize): hugetlbUsageContents,
 		})
 	}

 	hugetlb := &HugetlbGroup{}
 	actualStats := *cgroups.NewStats()
-	err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+	err := hugetlb.GetStats(path, &actualStats)
 	if err == nil {
 		t.Fatal("Expected failure")
 	}
 }

 func TestHugetlbStatsBadUsageFile(t *testing.T) {
-t.Skip("Disabled unreliable test")
-	helper := NewCgroupTestUtil("hugetlb", t)
-	defer helper.cleanup()
-	for _, pageSize := range HugePageSizes {
-		helper.writeFileContents(map[string]string{
+	path := tempDir(t, "hugetlb")
+	for _, pageSize := range cgroups.HugePageSizes() {
+		writeFileContents(t, path, map[string]string{
 			fmt.Sprintf(usage, pageSize): "bad",
 			maxUsage:                     hugetlbMaxUsageContents,
 		})
@ -135,24 +126,22 @@ t.Skip("Disabled unreliable test")

 	hugetlb := &HugetlbGroup{}
 	actualStats := *cgroups.NewStats()
-	err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+	err := hugetlb.GetStats(path, &actualStats)
 	if err == nil {
 		t.Fatal("Expected failure")
 	}
 }

 func TestHugetlbStatsBadMaxUsageFile(t *testing.T) {
-t.Skip("Disabled unreliable test")
-	helper := NewCgroupTestUtil("hugetlb", t)
-	defer helper.cleanup()
-	helper.writeFileContents(map[string]string{
+	path := tempDir(t, "hugetlb")
+	writeFileContents(t, path, map[string]string{
 		usage:    hugetlbUsageContents,
 		maxUsage: "bad",
 	})

 	hugetlb := &HugetlbGroup{}
 	actualStats := *cgroups.NewStats()
-	err := hugetlb.GetStats(helper.CgroupPath, &actualStats)
+	err := hugetlb.GetStats(path, &actualStats)
 	if err == nil {
 		t.Fatal("Expected failure")
 	}
--- a/libcontainer/cgroups/fs/kmem.go
+++ b/libcontainer/cgroups/fs/kmem.go
@ -1,62 +0,0 @@
-// +build linux,!nokmem
-
-package fs
-
-import (
-	"errors"
-	"fmt"
-	"io/ioutil"
-	"os"
-	"path/filepath"
-	"strconv"
-	"syscall" // for Errno type only
-
-	"github.com/opencontainers/runc/libcontainer/cgroups"
-	"golang.org/x/sys/unix"
-)
-
-const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
-
-func EnableKernelMemoryAccounting(path string) error {
-	// Ensure that kernel memory is available in this kernel build. If it
-	// isn't, we just ignore it because EnableKernelMemoryAccounting is
-	// automatically called for all memory limits.
-	if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
-		return nil
-	}
-	// We have to limit the kernel memory here as it won't be accounted at all
-	// until a limit is set on the cgroup and limit cannot be set once the
-	// cgroup has children, or if there are already tasks in the cgroup.
-	for _, i := range []int64{1, -1} {
-		if err := setKernelMemory(path, i); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-func setKernelMemory(path string, kernelMemoryLimit int64) error {
-	if path == "" {
-		return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
-	}
-	if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
-		// We have specifically been asked to set a kmem limit. If the kernel
-		// doesn't support it we *must* error out.
-		return errors.New("kernel memory accounting not supported by this kernel")
-	}
-	if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
-		// Check if the error number returned by the syscall is "EBUSY"
-		// The EBUSY signal is returned on attempts to write to the
-		// memory.kmem.limit_in_bytes file if the cgroup has children or
-		// once tasks have been attached to the cgroup
-		if pathErr, ok := err.(*os.PathError); ok {
-			if errNo, ok := pathErr.Err.(syscall.Errno); ok {
-				if errNo == unix.EBUSY {
-					return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
-				}
-			}
-		}
-		return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
-	}
-	return nil
-}
--- a/libcontainer/cgroups/fs/kmem_disabled.go
+++ b/libcontainer/cgroups/fs/kmem_disabled.go
@ -1,15 +0,0 @@
-// +build linux,nokmem
-
-package fs
-
-import (
-	"errors"
-)
-
-func EnableKernelMemoryAccounting(path string) error {
-	return nil
-}
-
-func setKernelMemory(path string, kernelMemoryLimit int64) error {
-	return errors.New("kernel memory accounting disabled in this runc build")
-}
--- a/libcontainer/cgroups/fs/memory.go
+++ b/libcontainer/cgroups/fs/memory.go
@ -1,15 +1,17 @@
-// +build linux
-
 package fs

 import (
 	"bufio"
+	"errors"
 	"fmt"
+	"math"
 	"os"
 	"path/filepath"
 	"strconv"
 	"strings"

+	"golang.org/x/sys/unix"
+
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
@ -18,65 +20,66 @@ import (
 const (
 	cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
 	cgroupMemoryLimit     = "memory.limit_in_bytes"
+	cgroupMemoryUsage     = "memory.usage_in_bytes"
+	cgroupMemoryMaxUsage  = "memory.max_usage_in_bytes"
 )

-type MemoryGroup struct {
-}
+type MemoryGroup struct{}

 func (s *MemoryGroup) Name() string {
 	return "memory"
 }

-func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
-	path, err := d.path("memory")
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	} else if path == "" {
-		return nil
-	}
-	if memoryAssigned(d.config) {
-		if _, err := os.Stat(path); os.IsNotExist(err) {
-			if err := os.MkdirAll(path, 0755); err != nil {
-				return err
-			}
-			// Only enable kernel memory accouting when this cgroup
-			// is created by libcontainer, otherwise we might get
-			// error when people use `cgroupsPath` to join an existed
-			// cgroup whose kernel memory is not initialized.
-			if err := EnableKernelMemoryAccounting(path); err != nil {
-				return err
-			}
-		}
-	}
-	defer func() {
-		if err != nil {
-			os.RemoveAll(path)
-		}
-	}()
-
-	// We need to join memory cgroup after set memory limits, because
-	// kmem.limit_in_bytes can only be set when the cgroup is empty.
-	_, err = d.join("memory")
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
+func (s *MemoryGroup) Apply(path string, _ *configs.Resources, pid int) error {
+	return apply(path, pid)
 }

-func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
-	// If the memory update is set to -1 we should also
-	// set swap to -1, it means unlimited memory.
-	if cgroup.Resources.Memory == -1 {
+func setMemory(path string, val int64) error {
+	if val == 0 {
+		return nil
+	}
+
+	err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10))
+	if !errors.Is(err, unix.EBUSY) {
+		return err
+	}
+
+	// EBUSY means the kernel can't set new limit as it's too low
+	// (lower than the current usage). Return more specific error.
+	usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage)
+	if err != nil {
+		return err
+	}
+	max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage)
+	if err != nil {
+		return err
+	}
+
+	return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max)
+}
+
+func setSwap(path string, val int64) error {
+	if val == 0 {
+		return nil
+	}
+
+	return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10))
+}
+
+func setMemoryAndSwap(path string, r *configs.Resources) error {
+	// If the memory update is set to -1 and the swap is not explicitly
+	// set, we should also set swap to -1, it means unlimited memory.
+	if r.Memory == -1 && r.MemorySwap == 0 {
 		// Only set swap if it's enabled in kernel
 		if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
-			cgroup.Resources.MemorySwap = -1
+			r.MemorySwap = -1
 		}
 	}

 	// When memory and swap memory are both set, we need to handle the cases
 	// for updating container.
-	if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 {
-		memoryUsage, err := getMemoryData(path, "")
+	if r.Memory != 0 && r.MemorySwap != 0 {
+		curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit)
 		if err != nil {
 			return err
 		}
@ -84,84 +87,61 @@ func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
 		// When update memory limit, we should adapt the write sequence
 		// for memory and swap memory, so it won't fail because the new
 		// value and the old value don't fit kernel's validation.
-		if cgroup.Resources.MemorySwap == -1 || memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
-			if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+		if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) {
+			if err := setSwap(path, r.MemorySwap); err != nil {
 				return err
 			}
-			if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
-				return err
-			}
-		} else {
-			if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
-				return err
-			}
-			if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
-				return err
-			}
-		}
-	} else {
-		if cgroup.Resources.Memory != 0 {
-			if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
-				return err
-			}
-		}
-		if cgroup.Resources.MemorySwap != 0 {
-			if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+			if err := setMemory(path, r.Memory); err != nil {
 				return err
 			}
+			return nil
 		}
 	}

-	return nil
-}
-
-func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
-	if err := setMemoryAndSwap(path, cgroup); err != nil {
+	if err := setMemory(path, r.Memory); err != nil {
+		return err
+	}
+	if err := setSwap(path, r.MemorySwap); err != nil {
 		return err
 	}

-	if cgroup.Resources.KernelMemory != 0 {
-		if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil {
+	return nil
+}
+
+func (s *MemoryGroup) Set(path string, r *configs.Resources) error {
+	if err := setMemoryAndSwap(path, r); err != nil {
+		return err
+	}
+
+	// ignore KernelMemory and KernelMemoryTCP
+
+	if r.MemoryReservation != 0 {
+		if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil {
 			return err
 		}
 	}

-	if cgroup.Resources.MemoryReservation != 0 {
-		if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
+	if r.OomKillDisable {
+		if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil {
 			return err
 		}
 	}
-
-	if cgroup.Resources.KernelMemoryTCP != 0 {
-		if err := fscommon.WriteFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
-			return err
-		}
-	}
-	if cgroup.Resources.OomKillDisable {
-		if err := fscommon.WriteFile(path, "memory.oom_control", "1"); err != nil {
-			return err
-		}
-	}
-	if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
+	if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 {
 		return nil
-	} else if *cgroup.Resources.MemorySwappiness <= 100 {
-		if err := fscommon.WriteFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
+	} else if *r.MemorySwappiness <= 100 {
+		if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil {
 			return err
 		}
 	} else {
-		return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness)
+		return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness)
 	}

 	return nil
 }

-func (s *MemoryGroup) Remove(d *cgroupData) error {
-	return removePath(d.path("memory"))
-}
-
 func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
-	// Set stats from memory.stat.
-	statsFile, err := os.Open(filepath.Join(path, "memory.stat"))
+	const file = "memory.stat"
+	statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY)
 	if err != nil {
 		if os.IsNotExist(err) {
 			return nil
@ -172,9 +152,9 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {

 	sc := bufio.NewScanner(statsFile)
 	for sc.Scan() {
-		t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
+		t, v, err := fscommon.ParseKeyValue(sc.Text())
 		if err != nil {
-			return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err)
+			return &parseError{Path: path, File: file, Err: err}
 		}
 		stats.MemoryStats.Stats[t] = v
 	}
@ -201,25 +181,21 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
 	}
 	stats.MemoryStats.KernelTCPUsage = kernelTCPUsage

-	useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".")
-	value, err := fscommon.GetCgroupParamUint(path, useHierarchy)
+	value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy")
 	if err != nil {
 		return err
 	}
 	if value == 1 {
 		stats.MemoryStats.UseHierarchy = true
 	}
-	return nil
-}

-func memoryAssigned(cgroup *configs.Cgroup) bool {
-	return cgroup.Resources.Memory != 0 ||
-		cgroup.Resources.MemoryReservation != 0 ||
-		cgroup.Resources.MemorySwap > 0 ||
-		cgroup.Resources.KernelMemory > 0 ||
-		cgroup.Resources.KernelMemoryTCP > 0 ||
-		cgroup.Resources.OomKillDisable ||
-		(cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1)
+	pagesByNUMA, err := getPageUsageByNUMA(path)
+	if err != nil {
+		return err
+	}
+	stats.MemoryStats.PageUsageByNUMA = pagesByNUMA
+
+	return nil
 }

 func getMemoryData(path, name string) (cgroups.MemoryData, error) {
@ -227,45 +203,146 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {

 	moduleName := "memory"
 	if name != "" {
-		moduleName = strings.Join([]string{"memory", name}, ".")
+		moduleName = "memory." + name
 	}
-	usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
-	maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
-	failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
-	limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
+	var (
+		usage    = moduleName + ".usage_in_bytes"
+		maxUsage = moduleName + ".max_usage_in_bytes"
+		failcnt  = moduleName + ".failcnt"
+		limit    = moduleName + ".limit_in_bytes"
+	)

 	value, err := fscommon.GetCgroupParamUint(path, usage)
 	if err != nil {
-		if moduleName != "memory" && os.IsNotExist(err) {
+		if name != "" && os.IsNotExist(err) {
+			// Ignore ENOENT as swap and kmem controllers
+			// are optional in the kernel.
 			return cgroups.MemoryData{}, nil
 		}
-		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err)
+		return cgroups.MemoryData{}, err
 	}
 	memoryData.Usage = value
 	value, err = fscommon.GetCgroupParamUint(path, maxUsage)
 	if err != nil {
-		if moduleName != "memory" && os.IsNotExist(err) {
-			return cgroups.MemoryData{}, nil
-		}
-		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", maxUsage, err)
+		return cgroups.MemoryData{}, err
 	}
 	memoryData.MaxUsage = value
 	value, err = fscommon.GetCgroupParamUint(path, failcnt)
 	if err != nil {
-		if moduleName != "memory" && os.IsNotExist(err) {
-			return cgroups.MemoryData{}, nil
-		}
-		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
+		return cgroups.MemoryData{}, err
 	}
 	memoryData.Failcnt = value
 	value, err = fscommon.GetCgroupParamUint(path, limit)
 	if err != nil {
-		if moduleName != "memory" && os.IsNotExist(err) {
-			return cgroups.MemoryData{}, nil
-		}
-		return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
+		return cgroups.MemoryData{}, err
 	}
 	memoryData.Limit = value

 	return memoryData, nil
 }
+
+func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) {
+	const (
+		maxColumns = math.MaxUint8 + 1
+		file       = "memory.numa_stat"
+	)
+	stats := cgroups.PageUsageByNUMA{}
+
+	fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
+	if os.IsNotExist(err) {
+		return stats, nil
+	} else if err != nil {
+		return stats, err
+	}
+	defer fd.Close()
+
+	// File format is documented in linux/Documentation/cgroup-v1/memory.txt
+	// and it looks like this:
+	//
+	// total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
+	// file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
+	// anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
+	// unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
+	// hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
+
+	scanner := bufio.NewScanner(fd)
+	for scanner.Scan() {
+		var field *cgroups.PageStats
+
+		line := scanner.Text()
+		columns := strings.SplitN(line, " ", maxColumns)
+		for i, column := range columns {
+			byNode := strings.SplitN(column, "=", 2)
+			// Some custom kernels have non-standard fields, like
+			//   numa_locality 0 0 0 0 0 0 0 0 0 0
+			//   numa_exectime 0
+			if len(byNode) < 2 {
+				if i == 0 {
+					// Ignore/skip those.
+					break
+				} else {
+					// The first column was already validated,
+					// so be strict to the rest.
+					return stats, malformedLine(path, file, line)
+				}
+			}
+			key, val := byNode[0], byNode[1]
+			if i == 0 { // First column: key is name, val is total.
+				field = getNUMAField(&stats, key)
+				if field == nil { // unknown field (new kernel?)
+					break
+				}
+				field.Total, err = strconv.ParseUint(val, 0, 64)
+				if err != nil {
+					return stats, &parseError{Path: path, File: file, Err: err}
+				}
+				field.Nodes = map[uint8]uint64{}
+			} else { // Subsequent columns: key is N<id>, val is usage.
+				if len(key) < 2 || key[0] != 'N' {
+					// This is definitely an error.
+					return stats, malformedLine(path, file, line)
+				}
+
+				n, err := strconv.ParseUint(key[1:], 10, 8)
+				if err != nil {
+					return stats, &parseError{Path: path, File: file, Err: err}
+				}
+
+				usage, err := strconv.ParseUint(val, 10, 64)
+				if err != nil {
+					return stats, &parseError{Path: path, File: file, Err: err}
+				}
+
+				field.Nodes[uint8(n)] = usage
+			}
+
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err}
+	}
+
+	return stats, nil
+}
+
+func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats {
+	switch name {
+	case "total":
+		return &stats.Total
+	case "file":
+		return &stats.File
+	case "anon":
+		return &stats.Anon
+	case "unevictable":
+		return &stats.Unevictable
+	case "hierarchical_total":
+		return &stats.Hierarchical.Total
+	case "hierarchical_file":
+		return &stats.Hierarchical.File
+	case "hierarchical_anon":
+		return &stats.Hierarchical.Anon
+	case "hierarchical_unevictable":
+		return &stats.Hierarchical.Unevictable
+	}
+	return nil
+}
--- a/libcontainer/cgroups/fs/memory_test.go
+++ b/libcontainer/cgroups/fs/memory_test.go
@ -1,5 +1,3 @@
-// +build linux
-
 package fs

 import (
@ -8,6 +6,7 @@ import (

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
 )

 const (
@ -18,11 +17,29 @@ rss 1024`
 	memoryFailcnt              = "100\n"
 	memoryLimitContents        = "8192\n"
 	memoryUseHierarchyContents = "1\n"
+	memoryNUMAStatContents     = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497
+file=44428 N0=32614 N1=7335 N2=1982 N3=2497
+anon=183 N0=17 N1=166 N2=0 N3=0
+unevictable=0 N0=0 N1=0 N2=0 N3=0
+hierarchical_total=768133 N0=509113 N1=138887 N2=20464 N3=99669
+hierarchical_file=722017 N0=496516 N1=119997 N2=20181 N3=85323
+hierarchical_anon=46096 N0=12597 N1=18890 N2=283 N3=14326
+hierarchical_unevictable=20 N0=0 N1=0 N2=0 N3=20
+`
+	memoryNUMAStatNoHierarchyContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497
+file=44428 N0=32614 N1=7335 N2=1982 N3=2497
+anon=183 N0=17 N1=166 N2=0 N3=0
+unevictable=0 N0=0 N1=0 N2=0 N3=0
+`
+	// Some custom kernels has extra fields that should be ignored
+	memoryNUMAStatExtraContents = `numa_locality 0 0 0 0 0 0 0 0 0 0
+numa_exectime 0
+whatever=100 N0=0
+`
 )

 func TestMemorySetMemory(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
+	path := tempDir(t, "memory")

 	const (
 		memoryBefore      = 314572800 // 300M
@ -31,29 +48,31 @@ func TestMemorySetMemory(t *testing.T) {
 		reservationAfter  = 314572800 // 300M
 	)

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"memory.limit_in_bytes":      strconv.Itoa(memoryBefore),
 		"memory.soft_limit_in_bytes": strconv.Itoa(reservationBefore),
 	})

-	helper.CgroupData.config.Resources.Memory = memoryAfter
-	helper.CgroupData.config.Resources.MemoryReservation = reservationAfter
+	r := &configs.Resources{
+		Memory:            memoryAfter,
+		MemoryReservation: reservationAfter,
+	}
 	memory := &MemoryGroup{}
-	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := memory.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
+	value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes")
 	if err != nil {
-		t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
+		t.Fatal(err)
 	}
 	if value != memoryAfter {
 		t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
 	}

-	value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.soft_limit_in_bytes")
+	value, err = fscommon.GetCgroupParamUint(path, "memory.soft_limit_in_bytes")
 	if err != nil {
-		t.Fatalf("Failed to parse memory.soft_limit_in_bytes - %s", err)
+		t.Fatal(err)
 	}
 	if value != reservationAfter {
 		t.Fatal("Got the wrong value, set memory.soft_limit_in_bytes failed.")
@ -61,27 +80,28 @@ func TestMemorySetMemory(t *testing.T) {
 }

 func TestMemorySetMemoryswap(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
+	path := tempDir(t, "memory")

 	const (
 		memoryswapBefore = 314572800 // 300M
 		memoryswapAfter  = 524288000 // 500M
 	)

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
 	})

-	helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
+	r := &configs.Resources{
+		MemorySwap: memoryswapAfter,
+	}
 	memory := &MemoryGroup{}
-	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := memory.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
+	value, err := fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes")
 	if err != nil {
-		t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
+		t.Fatal(err)
 	}
 	if value != memoryswapAfter {
 		t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
@ -89,8 +109,7 @@ func TestMemorySetMemoryswap(t *testing.T) {
 }

 func TestMemorySetMemoryLargerThanSwap(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
+	path := tempDir(t, "memory")

 	const (
 		memoryBefore     = 314572800 // 300M
@ -99,7 +118,7 @@ func TestMemorySetMemoryLargerThanSwap(t *testing.T) {
 		memoryswapAfter  = 838860800 // 800M
 	)

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"memory.limit_in_bytes":       strconv.Itoa(memoryBefore),
 		"memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
 		// Set will call getMemoryData when memory and swap memory are
@ -109,23 +128,26 @@ func TestMemorySetMemoryLargerThanSwap(t *testing.T) {
 		"memory.failcnt":            "0",
 	})

-	helper.CgroupData.config.Resources.Memory = memoryAfter
-	helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
+	r := &configs.Resources{
+		Memory:     memoryAfter,
+		MemorySwap: memoryswapAfter,
+	}
 	memory := &MemoryGroup{}
-	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := memory.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
+	value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes")
 	if err != nil {
-		t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
+		t.Fatal(err)
 	}
 	if value != memoryAfter {
 		t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
 	}
-	value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
+
+	value, err = fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes")
 	if err != nil {
-		t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
+		t.Fatal(err)
 	}
 	if value != memoryswapAfter {
 		t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
@ -133,8 +155,7 @@ func TestMemorySetMemoryLargerThanSwap(t *testing.T) {
 }

 func TestMemorySetSwapSmallerThanMemory(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
+	path := tempDir(t, "memory")

 	const (
 		memoryBefore     = 629145600 // 600M
@ -143,115 +164,58 @@ func TestMemorySetSwapSmallerThanMemory(t *testing.T) {
 		memoryswapAfter  = 524288000 // 500M
 	)

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"memory.limit_in_bytes":       strconv.Itoa(memoryBefore),
 		"memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore),
-		// Set will call getMemoryData when memory and swap memory are
-		// both set, fake these fields so we don't get error.
-		"memory.usage_in_bytes":     "0",
-		"memory.max_usage_in_bytes": "0",
-		"memory.failcnt":            "0",
 	})

-	helper.CgroupData.config.Resources.Memory = memoryAfter
-	helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter
+	r := &configs.Resources{
+		Memory:     memoryAfter,
+		MemorySwap: memoryswapAfter,
+	}
 	memory := &MemoryGroup{}
-	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := memory.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes")
+	value, err := fscommon.GetCgroupParamUint(path, "memory.limit_in_bytes")
 	if err != nil {
-		t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err)
+		t.Fatal(err)
 	}
 	if value != memoryAfter {
-		t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.")
+		t.Fatalf("Got the wrong value (%d != %d), set memory.limit_in_bytes failed", value, memoryAfter)
 	}
-	value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes")
+
+	value, err = fscommon.GetCgroupParamUint(path, "memory.memsw.limit_in_bytes")
 	if err != nil {
-		t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err)
+		t.Fatal(err)
 	}
 	if value != memoryswapAfter {
-		t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.")
-	}
-}
-
-func TestMemorySetKernelMemory(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
-
-	const (
-		kernelMemoryBefore = 314572800 // 300M
-		kernelMemoryAfter  = 524288000 // 500M
-	)
-
-	helper.writeFileContents(map[string]string{
-		"memory.kmem.limit_in_bytes": strconv.Itoa(kernelMemoryBefore),
-	})
-
-	helper.CgroupData.config.Resources.KernelMemory = kernelMemoryAfter
-	memory := &MemoryGroup{}
-	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
-		t.Fatal(err)
-	}
-
-	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.limit_in_bytes")
-	if err != nil {
-		t.Fatalf("Failed to parse memory.kmem.limit_in_bytes - %s", err)
-	}
-	if value != kernelMemoryAfter {
-		t.Fatal("Got the wrong value, set memory.kmem.limit_in_bytes failed.")
-	}
-}
-
-func TestMemorySetKernelMemoryTCP(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
-
-	const (
-		kernelMemoryTCPBefore = 314572800 // 300M
-		kernelMemoryTCPAfter  = 524288000 // 500M
-	)
-
-	helper.writeFileContents(map[string]string{
-		"memory.kmem.tcp.limit_in_bytes": strconv.Itoa(kernelMemoryTCPBefore),
-	})
-
-	helper.CgroupData.config.Resources.KernelMemoryTCP = kernelMemoryTCPAfter
-	memory := &MemoryGroup{}
-	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
-		t.Fatal(err)
-	}
-
-	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.kmem.tcp.limit_in_bytes")
-	if err != nil {
-		t.Fatalf("Failed to parse memory.kmem.tcp.limit_in_bytes - %s", err)
-	}
-	if value != kernelMemoryTCPAfter {
-		t.Fatal("Got the wrong value, set memory.kmem.tcp.limit_in_bytes failed.")
+		t.Fatalf("Got the wrong value (%d != %d), set memory.memsw.limit_in_bytes failed", value, memoryswapAfter)
 	}
 }

 func TestMemorySetMemorySwappinessDefault(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
+	path := tempDir(t, "memory")

-	swappinessBefore := 60 //default is 60
+	swappinessBefore := 60 // default is 60
 	swappinessAfter := uint64(0)

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"memory.swappiness": strconv.Itoa(swappinessBefore),
 	})

-	helper.CgroupData.config.Resources.MemorySwappiness = &swappinessAfter
+	r := &configs.Resources{
+		MemorySwappiness: &swappinessAfter,
+	}
 	memory := &MemoryGroup{}
-	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := memory.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.swappiness")
+	value, err := fscommon.GetCgroupParamUint(path, "memory.swappiness")
 	if err != nil {
-		t.Fatalf("Failed to parse memory.swappiness - %s", err)
+		t.Fatal(err)
 	}
 	if value != swappinessAfter {
 		t.Fatalf("Got the wrong value (%d), set memory.swappiness = %d failed.", value, swappinessAfter)
@ -259,9 +223,8 @@ func TestMemorySetMemorySwappinessDefault(t *testing.T) {
 }

 func TestMemoryStats(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
-	helper.writeFileContents(map[string]string{
+	path := tempDir(t, "memory")
+	writeFileContents(t, path, map[string]string{
 		"memory.stat":                     memoryStatContents,
 		"memory.usage_in_bytes":           memoryUsageContents,
 		"memory.limit_in_bytes":           memoryLimitContents,
@ -276,22 +239,43 @@ func TestMemoryStats(t *testing.T) {
 		"memory.kmem.failcnt":             memoryFailcnt,
 		"memory.kmem.limit_in_bytes":      memoryLimitContents,
 		"memory.use_hierarchy":            memoryUseHierarchyContents,
+		"memory.numa_stat":                memoryNUMAStatContents + memoryNUMAStatExtraContents,
 	})

 	memory := &MemoryGroup{}
 	actualStats := *cgroups.NewStats()
-	err := memory.GetStats(helper.CgroupPath, &actualStats)
+	err := memory.GetStats(path, &actualStats)
 	if err != nil {
 		t.Fatal(err)
 	}
-	expectedStats := cgroups.MemoryStats{Cache: 512, Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, Stats: map[string]uint64{"cache": 512, "rss": 1024}, UseHierarchy: true}
+	expectedStats := cgroups.MemoryStats{
+		Cache:        512,
+		Usage:        cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
+		SwapUsage:    cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
+		KernelUsage:  cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192},
+		Stats:        map[string]uint64{"cache": 512, "rss": 1024},
+		UseHierarchy: true,
+		PageUsageByNUMA: cgroups.PageUsageByNUMA{
+			PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{
+				Total:       cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}},
+				File:        cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}},
+				Anon:        cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}},
+				Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}},
+			},
+			Hierarchical: cgroups.PageUsageByNUMAInner{
+				Total:       cgroups.PageStats{Total: 768133, Nodes: map[uint8]uint64{0: 509113, 1: 138887, 2: 20464, 3: 99669}},
+				File:        cgroups.PageStats{Total: 722017, Nodes: map[uint8]uint64{0: 496516, 1: 119997, 2: 20181, 3: 85323}},
+				Anon:        cgroups.PageStats{Total: 46096, Nodes: map[uint8]uint64{0: 12597, 1: 18890, 2: 283, 3: 14326}},
+				Unevictable: cgroups.PageStats{Total: 20, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 20}},
+			},
+		},
+	}
 	expectMemoryStatEquals(t, expectedStats, actualStats.MemoryStats)
 }

 func TestMemoryStatsNoStatFile(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
-	helper.writeFileContents(map[string]string{
+	path := tempDir(t, "memory")
+	writeFileContents(t, path, map[string]string{
 		"memory.usage_in_bytes":     memoryUsageContents,
 		"memory.max_usage_in_bytes": memoryMaxUsageContents,
 		"memory.limit_in_bytes":     memoryLimitContents,
@ -299,16 +283,15 @@ func TestMemoryStatsNoStatFile(t *testing.T) {

 	memory := &MemoryGroup{}
 	actualStats := *cgroups.NewStats()
-	err := memory.GetStats(helper.CgroupPath, &actualStats)
+	err := memory.GetStats(path, &actualStats)
 	if err != nil {
 		t.Fatal(err)
 	}
 }

 func TestMemoryStatsNoUsageFile(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
-	helper.writeFileContents(map[string]string{
+	path := tempDir(t, "memory")
+	writeFileContents(t, path, map[string]string{
 		"memory.stat":               memoryStatContents,
 		"memory.max_usage_in_bytes": memoryMaxUsageContents,
 		"memory.limit_in_bytes":     memoryLimitContents,
@ -316,16 +299,15 @@ func TestMemoryStatsNoUsageFile(t *testing.T) {

 	memory := &MemoryGroup{}
 	actualStats := *cgroups.NewStats()
-	err := memory.GetStats(helper.CgroupPath, &actualStats)
+	err := memory.GetStats(path, &actualStats)
 	if err == nil {
 		t.Fatal("Expected failure")
 	}
 }

 func TestMemoryStatsNoMaxUsageFile(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
-	helper.writeFileContents(map[string]string{
+	path := tempDir(t, "memory")
+	writeFileContents(t, path, map[string]string{
 		"memory.stat":           memoryStatContents,
 		"memory.usage_in_bytes": memoryUsageContents,
 		"memory.limit_in_bytes": memoryLimitContents,
@ -333,16 +315,15 @@ func TestMemoryStatsNoMaxUsageFile(t *testing.T) {

 	memory := &MemoryGroup{}
 	actualStats := *cgroups.NewStats()
-	err := memory.GetStats(helper.CgroupPath, &actualStats)
+	err := memory.GetStats(path, &actualStats)
 	if err == nil {
 		t.Fatal("Expected failure")
 	}
 }

 func TestMemoryStatsNoLimitInBytesFile(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
-	helper.writeFileContents(map[string]string{
+	path := tempDir(t, "memory")
+	writeFileContents(t, path, map[string]string{
 		"memory.stat":               memoryStatContents,
 		"memory.usage_in_bytes":     memoryUsageContents,
 		"memory.max_usage_in_bytes": memoryMaxUsageContents,
@ -350,16 +331,15 @@ func TestMemoryStatsNoLimitInBytesFile(t *testing.T) {

 	memory := &MemoryGroup{}
 	actualStats := *cgroups.NewStats()
-	err := memory.GetStats(helper.CgroupPath, &actualStats)
+	err := memory.GetStats(path, &actualStats)
 	if err == nil {
 		t.Fatal("Expected failure")
 	}
 }

 func TestMemoryStatsBadStatFile(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
-	helper.writeFileContents(map[string]string{
+	path := tempDir(t, "memory")
+	writeFileContents(t, path, map[string]string{
 		"memory.stat":               "rss rss",
 		"memory.usage_in_bytes":     memoryUsageContents,
 		"memory.max_usage_in_bytes": memoryMaxUsageContents,
@ -368,16 +348,15 @@ func TestMemoryStatsBadStatFile(t *testing.T) {

 	memory := &MemoryGroup{}
 	actualStats := *cgroups.NewStats()
-	err := memory.GetStats(helper.CgroupPath, &actualStats)
+	err := memory.GetStats(path, &actualStats)
 	if err == nil {
 		t.Fatal("Expected failure")
 	}
 }

 func TestMemoryStatsBadUsageFile(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
-	helper.writeFileContents(map[string]string{
+	path := tempDir(t, "memory")
+	writeFileContents(t, path, map[string]string{
 		"memory.stat":               memoryStatContents,
 		"memory.usage_in_bytes":     "bad",
 		"memory.max_usage_in_bytes": memoryMaxUsageContents,
@ -386,16 +365,15 @@ func TestMemoryStatsBadUsageFile(t *testing.T) {

 	memory := &MemoryGroup{}
 	actualStats := *cgroups.NewStats()
-	err := memory.GetStats(helper.CgroupPath, &actualStats)
+	err := memory.GetStats(path, &actualStats)
 	if err == nil {
 		t.Fatal("Expected failure")
 	}
 }

 func TestMemoryStatsBadMaxUsageFile(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
-	helper.writeFileContents(map[string]string{
+	path := tempDir(t, "memory")
+	writeFileContents(t, path, map[string]string{
 		"memory.stat":               memoryStatContents,
 		"memory.usage_in_bytes":     memoryUsageContents,
 		"memory.max_usage_in_bytes": "bad",
@ -404,16 +382,15 @@ func TestMemoryStatsBadMaxUsageFile(t *testing.T) {

 	memory := &MemoryGroup{}
 	actualStats := *cgroups.NewStats()
-	err := memory.GetStats(helper.CgroupPath, &actualStats)
+	err := memory.GetStats(path, &actualStats)
 	if err == nil {
 		t.Fatal("Expected failure")
 	}
 }

 func TestMemoryStatsBadLimitInBytesFile(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
-	helper.writeFileContents(map[string]string{
+	path := tempDir(t, "memory")
+	writeFileContents(t, path, map[string]string{
 		"memory.stat":               memoryStatContents,
 		"memory.usage_in_bytes":     memoryUsageContents,
 		"memory.max_usage_in_bytes": memoryMaxUsageContents,
@ -422,35 +399,108 @@ func TestMemoryStatsBadLimitInBytesFile(t *testing.T) {

 	memory := &MemoryGroup{}
 	actualStats := *cgroups.NewStats()
-	err := memory.GetStats(helper.CgroupPath, &actualStats)
+	err := memory.GetStats(path, &actualStats)
 	if err == nil {
 		t.Fatal("Expected failure")
 	}
 }

 func TestMemorySetOomControl(t *testing.T) {
-	helper := NewCgroupTestUtil("memory", t)
-	defer helper.cleanup()
+	path := tempDir(t, "memory")

 	const (
 		oomKillDisable = 1 // disable oom killer, default is 0
 	)

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"memory.oom_control": strconv.Itoa(oomKillDisable),
 	})

 	memory := &MemoryGroup{}
-	if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	r := &configs.Resources{}
+	if err := memory.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.oom_control")
+	value, err := fscommon.GetCgroupParamUint(path, "memory.oom_control")
 	if err != nil {
-		t.Fatalf("Failed to parse memory.oom_control - %s", err)
+		t.Fatal(err)
 	}
-
 	if value != oomKillDisable {
 		t.Fatalf("Got the wrong value, set memory.oom_control failed.")
 	}
 }
+
+func TestNoHierarchicalNumaStat(t *testing.T) {
+	path := tempDir(t, "memory")
+	writeFileContents(t, path, map[string]string{
+		"memory.numa_stat": memoryNUMAStatNoHierarchyContents + memoryNUMAStatExtraContents,
+	})
+
+	actualStats, err := getPageUsageByNUMA(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	pageUsageByNUMA := cgroups.PageUsageByNUMA{
+		PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{
+			Total:       cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}},
+			File:        cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}},
+			Anon:        cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}},
+			Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}},
+		},
+		Hierarchical: cgroups.PageUsageByNUMAInner{},
+	}
+	expectPageUsageByNUMAEquals(t, pageUsageByNUMA, actualStats)
+}
+
+func TestBadNumaStat(t *testing.T) {
+	memoryNUMAStatBadContents := []struct {
+		desc, contents string
+	}{
+		{
+			desc: "Nx where x is not a number",
+			contents: `total=44611 N0=44611,
+file=44428 Nx=0
+`,
+		}, {
+			desc:     "Nx where x > 255",
+			contents: `total=44611 N333=444`,
+		}, {
+			desc:     "Nx argument missing",
+			contents: `total=44611 N0=123 N1=`,
+		}, {
+			desc:     "Nx argument is not a number",
+			contents: `total=44611 N0=123 N1=a`,
+		}, {
+			desc:     "Missing = after Nx",
+			contents: `total=44611 N0=123 N1`,
+		}, {
+			desc: "No Nx at non-first position",
+			contents: `total=44611 N0=32631
+file=44428 N0=32614
+anon=183 N0=12 badone
+`,
+		},
+	}
+	path := tempDir(t, "memory")
+	for _, c := range memoryNUMAStatBadContents {
+		writeFileContents(t, path, map[string]string{
+			"memory.numa_stat": c.contents,
+		})
+
+		_, err := getPageUsageByNUMA(path)
+		if err == nil {
+			t.Errorf("case %q: expected error, got nil", c.desc)
+		}
+	}
+}
+
+func TestWithoutNumaStat(t *testing.T) {
+	path := tempDir(t, "memory")
+
+	actualStats, err := getPageUsageByNUMA(path)
+	if err != nil {
+		t.Fatal(err)
+	}
+	expectPageUsageByNUMAEquals(t, cgroups.PageUsageByNUMA{}, actualStats)
+}
--- a/libcontainer/cgroups/fs/name.go
+++ b/libcontainer/cgroups/fs/name.go
@ -1,5 +1,3 @@
-// +build linux
-
 package fs

 import (
@ -16,22 +14,15 @@ func (s *NameGroup) Name() string {
 	return s.GroupName
 }

-func (s *NameGroup) Apply(d *cgroupData) error {
+func (s *NameGroup) Apply(path string, _ *configs.Resources, pid int) error {
 	if s.Join {
-		// ignore errors if the named cgroup does not exist
-		d.join(s.GroupName)
+		// Ignore errors if the named cgroup does not exist.
+		_ = apply(path, pid)
 	}
 	return nil
 }

-func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
-	return nil
-}
-
-func (s *NameGroup) Remove(d *cgroupData) error {
-	if s.Join {
-		removePath(d.path(s.GroupName))
-	}
+func (s *NameGroup) Set(_ string, _ *configs.Resources) error {
 	return nil
 }

--- a/libcontainer/cgroups/fs/net_cls.go
+++ b/libcontainer/cgroups/fs/net_cls.go
@ -1,33 +1,25 @@
-// +build linux
-
 package fs

 import (
 	"strconv"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
-	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

-type NetClsGroup struct {
-}
+type NetClsGroup struct{}

 func (s *NetClsGroup) Name() string {
 	return "net_cls"
 }

-func (s *NetClsGroup) Apply(d *cgroupData) error {
-	_, err := d.join("net_cls")
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
+func (s *NetClsGroup) Apply(path string, _ *configs.Resources, pid int) error {
+	return apply(path, pid)
 }

-func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
-	if cgroup.Resources.NetClsClassid != 0 {
-		if err := fscommon.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil {
+func (s *NetClsGroup) Set(path string, r *configs.Resources) error {
+	if r.NetClsClassid != 0 {
+		if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil {
 			return err
 		}
 	}
@ -35,10 +27,6 @@ func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
 	return nil
 }

-func (s *NetClsGroup) Remove(d *cgroupData) error {
-	return removePath(d.path("net_cls"))
-}
-
 func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error {
 	return nil
 }
--- a/libcontainer/cgroups/fs/net_cls_test.go
+++ b/libcontainer/cgroups/fs/net_cls_test.go
@ -1,5 +1,3 @@
-// +build linux
-
 package fs

 import (
@ -7,6 +5,7 @@ import (
 	"testing"

 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
 )

 const (
@ -15,25 +14,26 @@ const (
 )

 func TestNetClsSetClassid(t *testing.T) {
-	helper := NewCgroupTestUtil("net_cls", t)
-	defer helper.cleanup()
+	path := tempDir(t, "net_cls")

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"net_cls.classid": strconv.FormatUint(classidBefore, 10),
 	})

-	helper.CgroupData.config.Resources.NetClsClassid = classidAfter
+	r := &configs.Resources{
+		NetClsClassid: classidAfter,
+	}
 	netcls := &NetClsGroup{}
-	if err := netcls.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := netcls.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

 	// As we are in mock environment, we can't get correct value of classid from
 	// net_cls.classid.
 	// So. we just judge if we successfully write classid into file
-	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "net_cls.classid")
+	value, err := fscommon.GetCgroupParamUint(path, "net_cls.classid")
 	if err != nil {
-		t.Fatalf("Failed to parse net_cls.classid - %s", err)
+		t.Fatal(err)
 	}
 	if value != classidAfter {
 		t.Fatal("Got the wrong value, set net_cls.classid failed.")
--- a/libcontainer/cgroups/fs/net_prio.go
+++ b/libcontainer/cgroups/fs/net_prio.go
@ -1,31 +1,23 @@
-// +build linux
-
 package fs

 import (
 	"github.com/opencontainers/runc/libcontainer/cgroups"
-	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

-type NetPrioGroup struct {
-}
+type NetPrioGroup struct{}

 func (s *NetPrioGroup) Name() string {
 	return "net_prio"
 }

-func (s *NetPrioGroup) Apply(d *cgroupData) error {
-	_, err := d.join("net_prio")
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
+func (s *NetPrioGroup) Apply(path string, _ *configs.Resources, pid int) error {
+	return apply(path, pid)
 }

-func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error {
-	for _, prioMap := range cgroup.Resources.NetPrioIfpriomap {
-		if err := fscommon.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
+func (s *NetPrioGroup) Set(path string, r *configs.Resources) error {
+	for _, prioMap := range r.NetPrioIfpriomap {
+		if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
 			return err
 		}
 	}
@ -33,10 +25,6 @@ func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error {
 	return nil
 }

-func (s *NetPrioGroup) Remove(d *cgroupData) error {
-	return removePath(d.path("net_prio"))
-}
-
 func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error {
 	return nil
 }
--- a/libcontainer/cgroups/fs/net_prio_test.go
+++ b/libcontainer/cgroups/fs/net_prio_test.go
@ -1,5 +1,3 @@
-// +build linux
-
 package fs

 import (
@ -10,28 +8,27 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

-var (
-	prioMap = []*configs.IfPrioMap{
-		{
-			Interface: "test",
-			Priority:  5,
-		},
-	}
-)
+var prioMap = []*configs.IfPrioMap{
+	{
+		Interface: "test",
+		Priority:  5,
+	},
+}

 func TestNetPrioSetIfPrio(t *testing.T) {
-	helper := NewCgroupTestUtil("net_prio", t)
-	defer helper.cleanup()
+	path := tempDir(t, "net_prio")

-	helper.CgroupData.config.Resources.NetPrioIfpriomap = prioMap
+	r := &configs.Resources{
+		NetPrioIfpriomap: prioMap,
+	}
 	netPrio := &NetPrioGroup{}
-	if err := netPrio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := netPrio.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "net_prio.ifpriomap")
+	value, err := fscommon.GetCgroupParamString(path, "net_prio.ifpriomap")
 	if err != nil {
-		t.Fatalf("Failed to parse net_prio.ifpriomap - %s", err)
+		t.Fatal(err)
 	}
 	if !strings.Contains(value, "test 5") {
 		t.Fatal("Got the wrong value, set net_prio.ifpriomap failed.")
--- a/libcontainer/cgroups/fs/paths.go
+++ b/libcontainer/cgroups/fs/paths.go
@ -0,0 +1,186 @@
+package fs
+
+import (
+	"errors"
+	"os"
+	"path/filepath"
+	"sync"
+
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/utils"
+)
+
+// The absolute path to the root of the cgroup hierarchies.
+var (
+	cgroupRootLock sync.Mutex
+	cgroupRoot     string
+)
+
+const defaultCgroupRoot = "/sys/fs/cgroup"
+
+func initPaths(cg *configs.Cgroup) (map[string]string, error) {
+	root, err := rootPath()
+	if err != nil {
+		return nil, err
+	}
+
+	inner, err := innerPath(cg)
+	if err != nil {
+		return nil, err
+	}
+
+	paths := make(map[string]string)
+	for _, sys := range subsystems {
+		name := sys.Name()
+		path, err := subsysPath(root, inner, name)
+		if err != nil {
+			// The non-presence of the devices subsystem
+			// is considered fatal for security reasons.
+			if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") {
+				continue
+			}
+
+			return nil, err
+		}
+		paths[name] = path
+	}
+
+	return paths, nil
+}
+
+func tryDefaultCgroupRoot() string {
+	var st, pst unix.Stat_t
+
+	// (1) it should be a directory...
+	err := unix.Lstat(defaultCgroupRoot, &st)
+	if err != nil || st.Mode&unix.S_IFDIR == 0 {
+		return ""
+	}
+
+	// (2) ... and a mount point ...
+	err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst)
+	if err != nil {
+		return ""
+	}
+
+	if st.Dev == pst.Dev {
+		// parent dir has the same dev -- not a mount point
+		return ""
+	}
+
+	// (3) ... of 'tmpfs' fs type.
+	var fst unix.Statfs_t
+	err = unix.Statfs(defaultCgroupRoot, &fst)
+	if err != nil || fst.Type != unix.TMPFS_MAGIC {
+		return ""
+	}
+
+	// (4) it should have at least 1 entry ...
+	dir, err := os.Open(defaultCgroupRoot)
+	if err != nil {
+		return ""
+	}
+	names, err := dir.Readdirnames(1)
+	if err != nil {
+		return ""
+	}
+	if len(names) < 1 {
+		return ""
+	}
+	// ... which is a cgroup mount point.
+	err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst)
+	if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
+		return ""
+	}
+
+	return defaultCgroupRoot
+}
+
+// rootPath finds and returns path to the root of the cgroup hierarchies.
+func rootPath() (string, error) {
+	cgroupRootLock.Lock()
+	defer cgroupRootLock.Unlock()
+
+	if cgroupRoot != "" {
+		return cgroupRoot, nil
+	}
+
+	// fast path
+	cgroupRoot = tryDefaultCgroupRoot()
+	if cgroupRoot != "" {
+		return cgroupRoot, nil
+	}
+
+	// slow path: parse mountinfo
+	mi, err := cgroups.GetCgroupMounts(false)
+	if err != nil {
+		return "", err
+	}
+	if len(mi) < 1 {
+		return "", errors.New("no cgroup mount found in mountinfo")
+	}
+
+	// Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"),
+	// use its parent directory.
+	root := filepath.Dir(mi[0].Mountpoint)
+
+	if _, err := os.Stat(root); err != nil {
+		return "", err
+	}
+
+	cgroupRoot = root
+	return cgroupRoot, nil
+}
+
+func innerPath(c *configs.Cgroup) (string, error) {
+	if (c.Name != "" || c.Parent != "") && c.Path != "" {
+		return "", errors.New("cgroup: either Path or Name and Parent should be used")
+	}
+
+	// XXX: Do not remove CleanPath. Path safety is important! -- cyphar
+	innerPath := utils.CleanPath(c.Path)
+	if innerPath == "" {
+		cgParent := utils.CleanPath(c.Parent)
+		cgName := utils.CleanPath(c.Name)
+		innerPath = filepath.Join(cgParent, cgName)
+	}
+
+	return innerPath, nil
+}
+
+func subsysPath(root, inner, subsystem string) (string, error) {
+	// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
+	if filepath.IsAbs(inner) {
+		mnt, err := cgroups.FindCgroupMountpoint(root, subsystem)
+		// If we didn't mount the subsystem, there is no point we make the path.
+		if err != nil {
+			return "", err
+		}
+
+		// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
+		return filepath.Join(root, filepath.Base(mnt), inner), nil
+	}
+
+	// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
+	// process could in container and shared pid namespace with host, and
+	// /proc/1/cgroup could point to whole other world of cgroups.
+	parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	return filepath.Join(parentPath, inner), nil
+}
+
+func apply(path string, pid int) error {
+	if path == "" {
+		return nil
+	}
+	if err := os.MkdirAll(path, 0o755); err != nil {
+		return err
+	}
+	return cgroups.WriteCgroupProc(path, pid)
+}
--- a/libcontainer/cgroups/fs/paths_test.go
+++ b/libcontainer/cgroups/fs/paths_test.go
@ -0,0 +1,104 @@
+package fs
+
+import (
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+func TestInvalidCgroupPath(t *testing.T) {
+	if cgroups.IsCgroup2UnifiedMode() {
+		t.Skip("cgroup v2 is not supported")
+	}
+
+	root, err := rootPath()
+	if err != nil {
+		t.Fatalf("couldn't get cgroup root: %v", err)
+	}
+
+	testCases := []struct {
+		test               string
+		path, name, parent string
+	}{
+		{
+			test: "invalid cgroup path",
+			path: "../../../../../../../../../../some/path",
+		},
+		{
+			test: "invalid absolute cgroup path",
+			path: "/../../../../../../../../../../some/path",
+		},
+		{
+			test:   "invalid cgroup parent",
+			parent: "../../../../../../../../../../some/path",
+			name:   "name",
+		},
+		{
+			test:   "invalid absolute cgroup parent",
+			parent: "/../../../../../../../../../../some/path",
+			name:   "name",
+		},
+		{
+			test:   "invalid cgroup name",
+			parent: "parent",
+			name:   "../../../../../../../../../../some/path",
+		},
+		{
+			test:   "invalid absolute cgroup name",
+			parent: "parent",
+			name:   "/../../../../../../../../../../some/path",
+		},
+		{
+			test:   "invalid cgroup name and parent",
+			parent: "../../../../../../../../../../some/path",
+			name:   "../../../../../../../../../../some/path",
+		},
+		{
+			test:   "invalid absolute cgroup name and parent",
+			parent: "/../../../../../../../../../../some/path",
+			name:   "/../../../../../../../../../../some/path",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.test, func(t *testing.T) {
+			config := &configs.Cgroup{Path: tc.path, Name: tc.name, Parent: tc.parent}
+
+			inner, err := innerPath(config)
+			if err != nil {
+				t.Fatalf("couldn't get cgroup data: %v", err)
+			}
+
+			// Make sure the final inner path doesn't go outside the cgroup mountpoint.
+			if strings.HasPrefix(inner, "..") {
+				t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!")
+			}
+
+			// Double-check, using an actual cgroup.
+			deviceRoot := filepath.Join(root, "devices")
+			devicePath, err := subsysPath(root, inner, "devices")
+			if err != nil {
+				t.Fatalf("couldn't get cgroup path: %v", err)
+			}
+			if !strings.HasPrefix(devicePath, deviceRoot) {
+				t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!")
+			}
+		})
+	}
+}
+
+func TestTryDefaultCgroupRoot(t *testing.T) {
+	res := tryDefaultCgroupRoot()
+	exp := defaultCgroupRoot
+	if cgroups.IsCgroup2UnifiedMode() {
+		// checking that tryDefaultCgroupRoot does return ""
+		// in case /sys/fs/cgroup is not cgroup v1 root dir.
+		exp = ""
+	}
+	if res != exp {
+		t.Errorf("tryDefaultCgroupRoot: want %q, got %q", exp, res)
+	}
+}
--- a/libcontainer/cgroups/fs/perf_event.go
+++ b/libcontainer/cgroups/fs/perf_event.go
@ -1,5 +1,3 @@
-// +build linux
-
 package fs

 import (
@ -7,29 +5,20 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

-type PerfEventGroup struct {
-}
+type PerfEventGroup struct{}

 func (s *PerfEventGroup) Name() string {
 	return "perf_event"
 }

-func (s *PerfEventGroup) Apply(d *cgroupData) error {
-	// we just want to join this group even though we don't set anything
-	if _, err := d.join("perf_event"); err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
+func (s *PerfEventGroup) Apply(path string, _ *configs.Resources, pid int) error {
+	return apply(path, pid)
 }

-func (s *PerfEventGroup) Set(path string, cgroup *configs.Cgroup) error {
+func (s *PerfEventGroup) Set(_ string, _ *configs.Resources) error {
 	return nil
 }

-func (s *PerfEventGroup) Remove(d *cgroupData) error {
-	return removePath(d.path("perf_event"))
-}
-
 func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error {
 	return nil
 }
--- a/libcontainer/cgroups/fs/pids.go
+++ b/libcontainer/cgroups/fs/pids.go
@ -1,10 +1,7 @@
-// +build linux
-
 package fs

 import (
-	"fmt"
-	"path/filepath"
+	"math"
 	"strconv"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
@ -12,31 +9,26 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

-type PidsGroup struct {
-}
+type PidsGroup struct{}

 func (s *PidsGroup) Name() string {
 	return "pids"
 }

-func (s *PidsGroup) Apply(d *cgroupData) error {
-	_, err := d.join("pids")
-	if err != nil && !cgroups.IsNotFound(err) {
-		return err
-	}
-	return nil
+func (s *PidsGroup) Apply(path string, _ *configs.Resources, pid int) error {
+	return apply(path, pid)
 }

-func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
-	if cgroup.Resources.PidsLimit != 0 {
+func (s *PidsGroup) Set(path string, r *configs.Resources) error {
+	if r.PidsLimit != 0 {
 		// "max" is the fallback value.
 		limit := "max"

-		if cgroup.Resources.PidsLimit > 0 {
-			limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
+		if r.PidsLimit > 0 {
+			limit = strconv.FormatInt(r.PidsLimit, 10)
 		}

-		if err := fscommon.WriteFile(path, "pids.max", limit); err != nil {
+		if err := cgroups.WriteFile(path, "pids.max", limit); err != nil {
 			return err
 		}
 	}
@ -44,28 +36,24 @@ func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
 	return nil
 }

-func (s *PidsGroup) Remove(d *cgroupData) error {
-	return removePath(d.path("pids"))
-}
-
 func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
+	if !cgroups.PathExists(path) {
+		return nil
+	}
 	current, err := fscommon.GetCgroupParamUint(path, "pids.current")
 	if err != nil {
-		return fmt.Errorf("failed to parse pids.current - %s", err)
+		return err
 	}

-	maxString, err := fscommon.GetCgroupParamString(path, "pids.max")
+	max, err := fscommon.GetCgroupParamUint(path, "pids.max")
 	if err != nil {
-		return fmt.Errorf("failed to parse pids.max - %s", err)
+		return err
 	}
-
-	// Default if pids.max == "max" is 0 -- which represents "no limit".
-	var max uint64
-	if maxString != "max" {
-		max, err = fscommon.ParseUint(maxString, 10, 64)
-		if err != nil {
-			return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max"))
-		}
+	// If no limit is set, read from pids.max returns "max", which is
+	// converted to MaxUint64 by GetCgroupParamUint. Historically, we
+	// represent "no limit" for pids as 0, thus this conversion.
+	if max == math.MaxUint64 {
+		max = 0
 	}

 	stats.PidsStats.Current = current
--- a/libcontainer/cgroups/fs/pids_test.go
+++ b/libcontainer/cgroups/fs/pids_test.go
@ -1,5 +1,3 @@
-// +build linux
-
 package fs

 import (
@ -8,6 +6,7 @@ import (

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
 )

 const (
@ -16,65 +15,64 @@ const (
 )

 func TestPidsSetMax(t *testing.T) {
-	helper := NewCgroupTestUtil("pids", t)
-	defer helper.cleanup()
+	path := tempDir(t, "pids")

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"pids.max": "max",
 	})

-	helper.CgroupData.config.Resources.PidsLimit = maxLimited
+	r := &configs.Resources{
+		PidsLimit: maxLimited,
+	}
 	pids := &PidsGroup{}
-	if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := pids.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "pids.max")
+	value, err := fscommon.GetCgroupParamUint(path, "pids.max")
 	if err != nil {
-		t.Fatalf("Failed to parse pids.max - %s", err)
+		t.Fatal(err)
 	}
-
 	if value != maxLimited {
 		t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value)
 	}
 }

 func TestPidsSetUnlimited(t *testing.T) {
-	helper := NewCgroupTestUtil("pids", t)
-	defer helper.cleanup()
+	path := tempDir(t, "pids")

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"pids.max": strconv.Itoa(maxLimited),
 	})

-	helper.CgroupData.config.Resources.PidsLimit = maxUnlimited
+	r := &configs.Resources{
+		PidsLimit: maxUnlimited,
+	}
 	pids := &PidsGroup{}
-	if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil {
+	if err := pids.Set(path, r); err != nil {
 		t.Fatal(err)
 	}

-	value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "pids.max")
+	value, err := fscommon.GetCgroupParamString(path, "pids.max")
 	if err != nil {
-		t.Fatalf("Failed to parse pids.max - %s", err)
+		t.Fatal(err)
 	}
-
 	if value != "max" {
 		t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value)
 	}
 }

 func TestPidsStats(t *testing.T) {
-	helper := NewCgroupTestUtil("pids", t)
-	defer helper.cleanup()
+	path := tempDir(t, "pids")

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"pids.current": strconv.Itoa(1337),
 		"pids.max":     strconv.Itoa(maxLimited),
 	})

 	pids := &PidsGroup{}
 	stats := *cgroups.NewStats()
-	if err := pids.GetStats(helper.CgroupPath, &stats); err != nil {
+	if err := pids.GetStats(path, &stats); err != nil {
 		t.Fatal(err)
 	}

@ -88,17 +86,16 @@ func TestPidsStats(t *testing.T) {
 }

 func TestPidsStatsUnlimited(t *testing.T) {
-	helper := NewCgroupTestUtil("pids", t)
-	defer helper.cleanup()
+	path := tempDir(t, "pids")

-	helper.writeFileContents(map[string]string{
+	writeFileContents(t, path, map[string]string{
 		"pids.current": strconv.Itoa(4096),
 		"pids.max":     "max",
 	})

 	pids := &PidsGroup{}
 	stats := *cgroups.NewStats()
-	if err := pids.GetStats(helper.CgroupPath, &stats); err != nil {
+	if err := pids.GetStats(path, &stats); err != nil {
 		t.Fatal(err)
 	}

--- a/libcontainer/cgroups/fs/rdma.go
+++ b/libcontainer/cgroups/fs/rdma.go
@ -0,0 +1,25 @@
+package fs
+
+import (
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+type RdmaGroup struct{}
+
+func (s *RdmaGroup) Name() string {
+	return "rdma"
+}
+
+func (s *RdmaGroup) Apply(path string, _ *configs.Resources, pid int) error {
+	return apply(path, pid)
+}
+
+func (s *RdmaGroup) Set(path string, r *configs.Resources) error {
+	return fscommon.RdmaSet(path, r)
+}
+
+func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error {
+	return fscommon.RdmaGetStats(path, stats)
+}
--- a/Show More
+++ b/Show More