TLPI WebサイトのSeccomp user space notification サンプルを触ってみた

Linux アドベントカレンダー2020 その2 16日目の記事です。昨日はn01e0さんのreadfile(2) のいい話でした。

ところでその1では、先日、tenforwardさんによりSeccomp user space notificationの分かりやすい解説記事が上がっていました。

今日もSeccomp user space notificationのお話の流れに乗っかろうと思います。ではやっていきましょう。

Seccomp の仕組み（コード編）

Seccomp はLinuxにおける、システムコールのフィルタリングの仕組みです。詳細な説明は上述のtenforwardさんの記事に譲りますが...。

拙作 mruby-seccomp を用いたフィルターの例を示します。mruby-seccompを組み込んだmrubyで以下のスクリプトを起動すると、新しくシェルセッションが立ち上がります。

# Build a seccomp context that allows every system call by default and
# registers a kill rule for mkdir.
context = Seccomp.new(default: :allow) { |rule|
  rule.kill(:mkdir, Seccomp::ARG(:>=, 0), Seccomp::ARG(:>=, 0))
}

# Activate the filter for this process before replacing it with the shell.
context.load

puts("==== It will be jailed. Please try to mkdir")
exec("/bin/sh")

このセッションでは、 mkdir の操作ができません。上記のスクリプトの rule.kill(:mkdir, ...) の箇所で mkdir(2) システムコールの呼び出しを禁止リストに加えているためです。

# ./mruby/bin/mruby examples/bash_wituout_mkdir.rb 
==== It will be jailed. Please try to mkdir
# mkdir /foo
Bad system call (core dumped)

# strace -e mkdir mkdir /foo
mkdir("/foo", 0777)                     = ?
+++ killed by SIGSYS (core dumped) +++
Bad system call (core dumped)

C言語レベルで同等のものを実装すると、以下のようなコードになります。

#define _GNU_SOURCE

#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <linux/filter.h>
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <sys/prctl.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/ptrace.h>

/*
 * Invoke the seccomp system call directly through syscall().
 *
 * op, flags and args are forwarded unchanged. errno is cleared before the
 * call; the value returned by syscall() is handed back to the caller as int.
 */
int seccomp(unsigned int op, unsigned int flags, void *args)
{
  long rc;

  /* Reset errno first so a stale value is never attributed to this call. */
  errno = 0;
  rc = syscall(__NR_seccomp, op, flags, args);

  return (int) rc;
}

/*
 * Install a seccomp BPF filter that kills the process when mkdir(2) is
 * called and allows every other system call.
 *
 * The filter is written for x86-64 only: any other architecture is killed.
 * System calls made through the x32 ABI are also killed, because they report
 * AUDIT_ARCH_X86_64 with bit 30 set in the syscall number and would otherwise
 * slip past the __NR_mkdir comparison in this deny-list.
 *
 * The caller is expected to have set PR_SET_NO_NEW_PRIVS beforehand (main()
 * does this) so that an unprivileged process may install the filter.
 *
 * Returns 0 on success, 1 on failure (after reporting it with perror()).
 */
static int install_filter(void)
{
  // ref: https://man7.org/linux/man-pages/man2/seccomp.2.html and others

  /* Bit set in the syscall number when the x32 ABI is used on x86-64. */
  enum { X32_ABI_SYSCALL_BIT = 0x40000000 };

  struct sock_filter filter[] = {
    // Load the architecture and verify it is x86-64
    BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
             (offsetof(struct seccomp_data, arch))),
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
    // Load the system call number
    BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
             (offsetof(struct seccomp_data, nr))),
    // Reject x32 ABI system calls so they cannot bypass the deny-list
    BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_ABI_SYSCALL_BIT, 0, 1),
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
    // Deny mkdir: kill the whole process, not just the calling thread
    BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_mkdir, 0, 1),
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
    // Everything else is allowed
    BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
  };

  struct sock_fprog prog = {
    .len = (unsigned short) (sizeof(filter) / sizeof(filter[0])),
    .filter = filter,
  };

  if (seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog)) {
    perror("seccomp");
    return 1;
  }
  return 0;
}

/*
 * Entry point: set no_new_privs, install the mkdir-denying seccomp filter,
 * print a banner and replace this process with /bin/sh.
 *
 * Exits with status 1 if prctl(), install_filter() or execve() fails.
 */
int main(void)
{
  if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
    perror("prctl");
    exit(1);
  }

  if (install_filter()) {
    exit(1);
  }

  printf("==== It will be jailed. Please try to mkdir\n");
  /* execve() discards the stdio buffer, so flush it or the banner is lost
     whenever stdout is not line-buffered (e.g. redirected to a pipe). */
  fflush(stdout);

  /* By convention argv[0] holds the program name; an empty argv can break
     programs that dereference argv[0]. */
  char *newargv[] = { "/bin/sh", NULL };
  /* The shell is started with an empty environment. */
  char *newenviron[] = { NULL };
  execve("/bin/sh", newargv, newenviron);

  /* Reached only when execve() failed. */
  perror("execve");
  exit(1);
}

オプションにより、特定のシステムコールの禁止（denylist）のほか、逆に特定のリストのシステムコールのみの許可（allowlist）、あるいは呼び出しのトレースや単にaudit logに吐き出すだけ、など挙動を選ぶことができます。また、システムコールが特定の条件を満たす引数で呼び出された時のみ、といったような制限もできます。拙作 mruby-seccomp ではこれらのフラグを一通りサポートしたつもりです。お試しください。

なお、一般にユーザによるOSの操作を制限する方法には、Kernel capability（rootの権限をいくつかに分割した上での許可・禁止）やAppArmorなどのLSMがありますが、seccompも基本的にはそのような位置づけです。

また、 seccomp(2) に渡すBPFプログラムを自分で書かないで済むよう、専用のCライブラリ libseccomp というのも存在します。

User space notification を動かす

Seccomp でどのようにシステムコールを禁止・許可するかは事前にseccompのルールをプログラムで記述して定義し、カーネルに投げ入れることになります。

この禁止・許可の判断をユーザスペースのプログラムに委ねることが可能で、それを実現するのがSeccomp user space notificationです。

今回、TLPIのサイトに掲載されているサンプルコードを動かしました。環境はUbuntu Focalで、カーネルはUbuntu公式リポジトリで配られている 5.8.0-29-generic を利用しています。

man7.org

ちなみに動作には scm_functions.c と scm_functions.h も必要なので同じディレクトリにダウンロードしておきます。

ビルドは以下のように。

$ gcc seccomp_user_notification.c scm_functions.c -o seccomp_user_notification
$ ./seccomp_user_notification 
At least one pathname argument should be supplied
Usage: ./seccomp_user_notification [options] <dir> <dir>...
      Options
      -d <nsecs>    Tracer delays 'nsecs' before inspecting target
      -f <val>      Install second filter whose return value is:
                    'e' - SECCOMP_RET_ERRNO
                    't' - SECCOMP_RET_TRACE
      -K            Don't kill tracer on termination of target process

動作させてみると、確かに mkdir(2) システムコールを EPERM エラーにしています。

$ sudo ./seccomp_user_notification /foo
Target process: PID = 1848
Tracer: PID = 1849

Target process: about to make directory "/foo"
Tracer: got notification for PID 1848; ID is 2a056be00eadd9f1
Tracer: mkdir("/foo", 600)
Tracer: ioctlSECCOMP_IOCTL_NOTIF_RECV: Invalid argument
Target process: mkdir: Operation not permitted
Target process: terminating
Parent: target process has terminated
Parent: killing tracer

処理もざっくり読んだのでメモがてら残します。

targetProcess() , tracerProcess() と二つの子プロセスを立ち上げる。この時事前に socketpair(2) でUNIX ドメインソケットを作っておく。
targetProcess() 内部で installNotifyFilter() としてseccompフィルタを有効にする。このseccompフィルタは mkdir(2) をトレースし、 SECCOMP_FILTER_FLAG_NEW_LISTENER フラグを渡して notifyFd を生成する。

   125  static int
   126 installNotifyFilter(void)
   127 {
   128     struct sock_filter filter[] = {
   129         X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR,
   130 
   131         /* mkdir() triggers notification to user-space tracer */
   132 
   133         BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_mkdir, 0, 1),
   134         BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
   135 
   136         /* Every other system call is allowed */
   137 
   138         BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
   139     };
   140 
   141     struct sock_fprog prog = {
   142         .len = (unsigned short) (sizeof(filter) / sizeof(filter[0])),
   143         .filter = filter,
   144     };
  //...
   156     int notifyFd = seccomp(SECCOMP_SET_MODE_FILTER,
   157                            SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
   158     if (notifyFd == -1)
   159         errExit("seccomp-install-notify-filter");
   160 
   161     return notifyFd;
   162 }

この notifyFd は、UNIX ドメインソケットを経由してfdを他のプロセスに送付する機能を用いて、tracerProcessに渡される。なお、targetProcess側ではちゃんと閉じないとダメ。

   286      if (sendfd(sockPair[0], notifyFd) == -1)
   287         errExit("sendfd");
   288 
   289     /* Notification and socket FDs are no longer needed in target process */
   290 
   291     if (close(notifyFd) == -1)
   292         errExit("close-target-notify-fd");

送られたfdをtracerProcessで受け取り、 watchForNotifications() 内部で ioctl(fd, SECCOMP_IOCTL_NOTIF_RECV, buf) を呼ぶことで、システムコールが「呼び出されようとしている」という通知を取得できる。

   330  static void
   331 watchForNotifications(int notifyFd, struct cmdLineOpts *opts)
   332 {
   333     struct seccomp_notif *req;
   334     struct seccomp_notif_resp *resp;
   335     struct seccomp_notif_sizes sizes;
   336     char path[PATH_MAX];
//...
   356     for (;;) {
   357 
   358         /* Wait for next notification, returning info in '*req' */
   359 
   360         if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_RECV, req) == -1)
   361             errExit("Tracer: ioctlSECCOMP_IOCTL_NOTIF_RECV");
   362 
   363         printf("Tracer: got notification for PID %d; ID is %llx\n",
   364                 req->pid, req->id);
//...

そのシステムコール呼び出しをどうするかについては ioctl(fd, SECCOMP_IOCTL_NOTIF_SEND, resp) で指示を送り返せる。以下のL453の EPERM を他のerrnoに変更すれば任意のエラーを返せる。

   438          resp->id = req->id;
   439         resp->flags = 0;        /* Must be zero as at Linux 5.0 */
   440 
   441         /* Success return value is the length of the pathname given to
   442            mkdir() */
   443 
   444         resp->val = strlen(path);
   445 
   446         /* If the directory is in /tmp, then create it on behalf of the tracer;
   447            give an error for a directory pathname in any other location. */
   448 
   449         if (strncmp(path, "/tmp/", strlen("/tmp/")) == 0) {
   450             mkdir(path, req->data.args[1]);
   451             resp->error = 0;
   452         } else {
   453             resp->error = -EPERM;
   454         }
   455 
   456         /* Provide a response to the target process */
   457 
   458         if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_SEND, resp) == -1) {
//...
   473         }
   474     } // L356 へ無限ループ

といった感じ。

元コードのエラーを調べてみた

無事動作確認ができたのですが、元のプログラムを動作させた際に若干気になる表示があります。

$ sudo ./seccomp_user_notification /foo /bar /buz
Tracer: PID = 1853
Target process: PID = 1852

Target process: about to make directory "/foo"
Tracer: got notification for PID 1852; ID is 20e29046893d05da
Tracer: mkdir("/foo", 600)
Tracer: ioctlSECCOMP_IOCTL_NOTIF_RECV: Invalid argument
Target process: mkdir: Operation not permitted

Target process: about to make directory "/bar"
Target process: mkdir: Function not implemented

Target process: about to make directory "/buz"
Target process: mkdir: Function not implemented
Target process: terminating
Parent: target process has terminated
Parent: killing tracer

複数の引数で複数回mkdirを呼べるのですが、2回目以降に関しては ioctl(SECCOMP_IOCTL_NOTIF_RECV) が失敗して、seccompデフォルトの禁止時のerrno（ENOSYS）になっていそうに見えます。

なぜ失敗しているのか？

ioctlの当該箇所ドキュメントがうまく見つからなかったため、カーネル5.8.0における ioctl(SECCOMP_IOCTL_NOTIF_RECV) に該当する処理を眺めてみます。

コードを見る限り EINVAL になる条件は、渡されたバッファがゼロ埋めされておらずゴミデータが残ってる場合、となっているようです。

static long seccomp_notify_recv(struct seccomp_filter *filter,
                void __user *buf)
{
    struct seccomp_knotif *knotif = NULL, *cur;
    struct seccomp_notif unotif;
    ssize_t ret;

    /* Verify that we're not given garbage to keep struct extensible. */
    ret = check_zeroed_user(buf, sizeof(unotif));
    if (ret < 0)
        return ret;
    if (!ret)
        return -EINVAL;
//...

ref: seccomp.c - kernel/seccomp.c - Linux source code (v5.8) - Bootlin

そこで、元のコードでは単に一回 malloc() しただけになっている req バッファを、ループごとに毎回 calloc() で初期化するように変更したところ複数回のnotificationの受け取りに成功するようになりました。

$ sudo ./seccomp_user_notification /foo /bar /buz
Target process: PID = 2426
Tracer: PID = 2427

Target process: about to make directory "/foo"
Tracer: got notification for PID 2426; ID is 443de1bca6af2f8a
Tracer: mkdir("/foo", 600)
Target process: mkdir: Operation not permitted

Target process: about to make directory "/bar"
Tracer: got notification for PID 2426; ID is 443de1bca6af2f8b
Tracer: mkdir("/bar", 600)
Target process: mkdir: Operation not permitted

Target process: about to make directory "/buz"
Tracer: got notification for PID 2426; ID is 443de1bca6af2f8c
Tracer: mkdir("/buz", 600)
Target process: mkdir: Operation not permitted
Target process: terminating
Parent: target process has terminated
Parent: killing tracer

パッチは以下のようになると思います。

diff --git a/seccomp_user_notification.orig.c b/seccomp_user_notification.c
index 70fc12c..85f5c54 100644
--- a/seccomp_user_notification.orig.c
+++ b/seccomp_user_notification.c
@@ -343,10 +343,6 @@ watchForNotifications(int notifyFd, struct cmdLineOpts *opts)
     if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) == -1)
         errExit("Tracer: seccomp-SECCOMP_GET_NOTIF_SIZES");
 
-    req = malloc(sizes.seccomp_notif);
-    if (req == NULL)
-        errExit("Tracer: malloc");
-
     resp = malloc(sizes.seccomp_notif_resp);
     if (resp == NULL)
         errExit("Tracer: malloc");
@@ -354,6 +350,10 @@ watchForNotifications(int notifyFd, struct cmdLineOpts *opts)
     /* Loop handling notifications */
 
     for (;;) {
+        req = calloc(1, sizes.seccomp_notif);
+        if (req == NULL)
+            errExit("Tracer: malloc");
+
 
         /* Wait for next notification, returning info in '*req' */
 
@@ -471,6 +471,7 @@ watchForNotifications(int notifyFd, struct cmdLineOpts *opts)
             printf("Tracer: terminating <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n");
             exit(EXIT_FAILURE);
         }
+       free(req);
     }
 }