[LibOS,PAL/Linux-SGX] Add EDMM lazy allocation support #1513

Open · wants to merge 6 commits into base: master
Changes from 2 commits
6 changes: 6 additions & 0 deletions CI-Examples/blender/blender.manifest.template
@@ -24,6 +24,12 @@ fs.mounts = [

 sgx.debug = true
 sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
+
+# `use_exinfo = true` is needed because Blender uses `madvise(MADV_DONTNEED)`. When EDMM is enabled,
+# it will free the committed pages but automatically recommit them on subsequent accesses via page
+# fault handling.
+sgx.use_exinfo = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
+
 sys.stack.size = "8M"
 sgx.enclave_size = "2048M"
 sgx.max_threads = {{ '1' if env.get('EDMM', '0') == '1' else '64' }}
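Editor's note, for context: a minimal C sketch (not part of this diff) of the `madvise(MADV_DONTNEED)` pattern the comment above describes. Under EDMM with `sgx.use_exinfo = true`, the first touch after `madvise()` page-faults and Gramine recommits the page:

#include <assert.h>
#include <string.h>
#include <sys/mman.h>

int main(void) {
    size_t len = 4 * 4096;
    /* anonymous private mapping; under Gramine-SGX these are enclave pages */
    char* p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    assert(p != MAP_FAILED);

    memset(p, 0xAB, len); /* commit and dirty the pages */
    if (madvise(p, len, MADV_DONTNEED) != 0) /* frees the committed pages under EDMM */
        return 1;

    /* first access after MADV_DONTNEED page-faults; with `sgx.use_exinfo = true`
     * Gramine recommits the page, and it reads back as zeros */
    assert(p[0] == 0);
    return 0;
}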
4 changes: 4 additions & 0 deletions CI-Examples/nginx/nginx.manifest.template
@@ -32,6 +32,10 @@ sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
 sgx.enclave_size = "512M"
 sgx.max_threads = {{ '1' if env.get('EDMM', '0') == '1' else '4' }}
+
+# `use_exinfo = true` is needed because the application may trigger lazy allocation of pages
+# (through exception handling) when EDMM is enabled
+sgx.use_exinfo = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
 
 sgx.trusted_files = [
   "file:{{ install_dir }}/sbin/nginx",
   "file:{{ install_dir }}/conf/",
4 changes: 4 additions & 0 deletions CI-Examples/rust/rust-hyper-http-server.manifest.template
@@ -22,6 +22,10 @@ fs.mounts = [
 sgx.debug = true
 sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
+
+# `use_exinfo = true` is needed because the application may trigger lazy allocation of pages
+# (through exception handling) when EDMM is enabled
+sgx.use_exinfo = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
 
 sgx.trusted_files = [
   "file:{{ self_exe }}",
   "file:{{ gramine.runtimedir() }}/",
4 changes: 4 additions & 0 deletions CI-Examples/sqlite/manifest.template
@@ -33,6 +33,10 @@ sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
 sgx.enclave_size = "256M"
 sgx.max_threads = {{ '1' if env.get('EDMM', '0') == '1' else '4' }}
+
+# `use_exinfo = true` is needed because the application may trigger lazy allocation of pages
+# (through exception handling) when EDMM is enabled
+sgx.use_exinfo = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
 
 sgx.trusted_files = [
   "file:{{ execdir }}/sqlite3",
   "file:{{ gramine.runtimedir() }}/",
12 changes: 8 additions & 4 deletions Documentation/devel/features.md
@@ -1715,13 +1715,17 @@ mappings, it depends on the type of file:
 - allowed for encrypted files (but synchronization happens only on explicit system calls like
   `msync()` and `close()`).
 
-`MAP_LOCKED`, `MAP_NORESERVE`, `MAP_POPULATE`, `MAP_NONBLOCK`, `MAP_HUGETLB`, `MAP_HUGE_2MB`,
-`MAP_HUGE_1GB` flags are ignored (allowed but have no effect). `MAP_SYNC` flag is not supported.
+`MAP_NORESERVE`'s original semantics are not implemented and it is silently ignored. However, in
+case of SGX backend and on systems supporting {term}`EDMM`, `MAP_NORESERVE` flag is used as a
+lazy-allocation heuristic/hint for anonymous mappings -- instead of pre-accepting the region of
+enclave pages on mmap requests, the enclave pages are lazily accepted on page-fault events.
+
+`MAP_LOCKED`, `MAP_POPULATE`, `MAP_NONBLOCK`, `MAP_HUGETLB`, `MAP_HUGE_2MB`, `MAP_HUGE_1GB` flags
+are ignored (allowed but have no effect). `MAP_SYNC` flag is not supported.
 
 `mprotect()` supports all flags except `PROT_SEM` and `PROT_GROWSUP`. We haven't encountered any
 applications that would use these flags. In case of SGX backend, `mprotect()` behavior differs:
-- on systems supporting {term}`EDMM`, `mprotect()` correctly applies
-  permissions;
+- on systems supporting EDMM, `mprotect()` correctly applies permissions;
 - on systems not supporting EDMM, all enclave memory is allocated with Read-Write-Execute
   permissions, and `mprotect()` calls are silently ignored.
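Editor's sketch (not from the diff) of the lazy-allocation hint described above; with the SGX backend on EDMM-capable systems, each page of such a mapping is accepted only on first touch:

#include <assert.h>
#include <sys/mman.h>

int main(void) {
    size_t len = 64 * 1024 * 1024; /* large region: cheap to map when lazily allocated */
    char* p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
    assert(p != MAP_FAILED);

    /* under Gramine-SGX with EDMM, only the two touched pages are accepted
     * (via page-fault events); the rest of the region consumes no EPC */
    p[0] = 1;
    p[len - 1] = 1;

    munmap(p, len);
    return 0;
}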

14 changes: 8 additions & 6 deletions Documentation/manifest-syntax.rst
@@ -748,12 +748,14 @@ not support :term:`EDMM` feature.

 When this feature is enabled, Gramine does not add heap pages (uninitialized
 memory) to the enclave at creation time. Instead, memory is added to the enclave
-on demand. This can greatly reduce startup time for bigger enclaves, reduce
-the :term:`EPC` usage (as only actually allocated memory is used) and allow for
-changing memory permissions (without this Gramine allocates all dynamic memory
-as RWX). Unfortunately it can negatively impact performance, as adding a page
-to the enclave at runtime is a more expensive operation than adding the page
-before enclave creation (because it involves more enclave exits and syscalls).
+on demand (note that for mappings requested with `MAP_NORESERVE`, the enclave
+pages are lazily committed on page fault events). This can greatly reduce
+startup time for bigger enclaves, reduce the :term:`EPC` usage (as only actually
+allocated memory is used) and allow for changing memory permissions (without
+this Gramine allocates all dynamic memory as RWX). Unfortunately it can
+negatively impact performance, as adding a page to the enclave at runtime is a
+more expensive operation than adding the page before enclave creation (because
+it involves more enclave exits and syscalls).
 
 When this feature is enabled, it is not necessary to specify
 ``sgx.enclave_size`` (Gramine will automatically set it to 1TB which should be
6 changes: 6 additions & 0 deletions Documentation/pal/host-abi.rst
@@ -366,3 +366,9 @@ random bits, to obtain an attestation report and quote, etc.

 .. doxygenfunction:: PalDeviceMap
    :project: pal
+
+.. doxygenfunction:: PalGetLazyCommitPages
+   :project: pal
+
+.. doxygenfunction:: PalFreeThenLazyReallocCommittedPages
+   :project: pal
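Editor's note: the exact prototypes live in this PR's PAL headers; judging only from the call sites in `libos/src/bookkeep/libos_vma.c`, usage looks roughly like the sketch below (the helper name and the bit semantics are inferred from this diff, not authoritative):

/* hypothetical helper; prototypes inferred from this PR's call sites */
static int dontneed_pages(uintptr_t addr, size_t size, uint8_t* bitvector) {
    /* fill `bitvector` with one bit per page; a set bit appears to mean
     * "still lazily committed", i.e. not yet backed by an EPC page */
    PalGetLazyCommitPages(addr, size, bitvector);

    /* free the committed pages in the range and let them be recommitted
     * lazily on the next page fault (MADV_DONTNEED on patched kernels) */
    int ret = PalFreeThenLazyReallocCommittedPages((void*)addr, size);
    if (ret < 0)
        return pal_to_unix_errno(ret); /* PAL error -> unix errno */
    return 0;
}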
3 changes: 2 additions & 1 deletion libos/include/libos_flags_conv.h
@@ -25,7 +25,8 @@ static inline pal_prot_flags_t LINUX_PROT_TO_PAL(int prot, int map_flags) {
     return (prot & PROT_READ ? PAL_PROT_READ : 0) |
            (prot & PROT_WRITE ? PAL_PROT_WRITE : 0) |
            (prot & PROT_EXEC ? PAL_PROT_EXEC : 0) |
-           (map_flags & MAP_PRIVATE ? PAL_PROT_WRITECOPY : 0);
+           (map_flags & MAP_PRIVATE ? PAL_PROT_WRITECOPY : 0) |
+           (map_flags & MAP_NORESERVE ? PAL_PROT_LAZYALLOC : 0);
 }
 
 static inline int PAL_PROT_TO_LINUX(pal_prot_flags_t prot) {
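For example (editor's note, derived from the function above): a private anonymous mapping requested with `MAP_NORESERVE` now also carries the lazy-allocation hint:

pal_prot_flags_t prot = LINUX_PROT_TO_PAL(PROT_READ | PROT_WRITE,
                                          MAP_PRIVATE | MAP_NORESERVE);
/* prot == PAL_PROT_READ | PAL_PROT_WRITE | PAL_PROT_WRITECOPY | PAL_PROT_LAZYALLOC */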
136 changes: 130 additions & 6 deletions libos/src/bookkeep/libos_vma.c
@@ -1,6 +1,8 @@
 /* SPDX-License-Identifier: LGPL-3.0-or-later */
 /* Copyright (C) 2014 Stony Brook University
  * Copyright (C) 2020 Invisible Things Lab
+ * Copyright (C) 2024 Intel Corporation
+ *                    Kailun Qin <kailun.qin@intel.com>
  */
 
 #include <stddef.h> /* needed by <linux/signal.h> for size_t */
@@ -34,7 +36,7 @@ static size_t g_peak_total_memory_size = 0;
  * MAP_FIXED or unsupported flags. */
 static int filter_saved_flags(int flags) {
     return flags & (MAP_SHARED | MAP_SHARED_VALIDATE | MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN
-                    | MAP_HUGETLB | MAP_HUGE_2MB | MAP_HUGE_1GB | MAP_STACK
+                    | MAP_HUGETLB | MAP_HUGE_2MB | MAP_HUGE_1GB | MAP_STACK | MAP_NORESERVE
                     | VMA_UNMAPPED | VMA_INTERNAL | VMA_TAINTED);
 }

@@ -92,6 +94,10 @@ static bool is_addr_in_vma(uintptr_t addr, struct libos_vma* vma) {
     return vma->begin <= addr && addr < vma->end;
 }
 
+static bool is_addr_in_vma_valid_range(uintptr_t addr, struct libos_vma* vma) {
+    return vma->begin <= addr && addr < vma->valid_end;
+}
+
 /* Returns whether `addr` is smaller or inside a vma (`node`). */
 static bool cmp_addr_to_vma(void* addr, struct avl_tree_node* node) {
     struct libos_vma* vma = container_of(node, struct libos_vma, tree_node);
@@ -591,14 +597,16 @@ static int _bkeep_initial_vma(struct libos_vma* new_vma) {

 static int pal_mem_bkeep_alloc(size_t size, uintptr_t* out_addr);
 static int pal_mem_bkeep_free(uintptr_t addr, size_t size);
+static int pal_mem_bkeep_get_vma_info(uintptr_t addr, pal_prot_flags_t* out_prot_flags);
 
 #define ASLR_BITS 12
 /* This variable is written to only once, during initialization, so it does not need to
  * be atomic. */
 static void* g_aslr_addr_top = NULL;
 
 int init_vma(void) {
-    PalSetMemoryBookkeepingUpcalls(pal_mem_bkeep_alloc, pal_mem_bkeep_free);
+    PalSetMemoryBookkeepingUpcalls(pal_mem_bkeep_alloc, pal_mem_bkeep_free,
+                                   pal_mem_bkeep_get_vma_info);
 
     size_t initial_ranges_count = 0;
     for (size_t i = 0; i < g_pal_public_state->initial_mem_ranges_len; i++) {
@@ -1241,6 +1249,30 @@ static void dump_vma(struct libos_vma_info* vma_info, struct libos_vma* vma) {
     memcpy(vma_info->comment, vma->comment, sizeof(vma_info->comment));
 }
 
+static int pal_mem_bkeep_get_vma_info(uintptr_t addr, pal_prot_flags_t* out_prot_flags) {
+    int ret = 0;
+
+    spinlock_lock(&vma_tree_lock);
+    struct libos_vma* vma = _lookup_vma((uintptr_t)addr);
+    if (!vma || !is_addr_in_vma_valid_range((uintptr_t)addr, vma)) {
+        ret = -EACCES;
+        goto out;
+    }
+
+    struct libos_vma_info vma_info;
+    dump_vma(&vma_info, vma);
+
+    *out_prot_flags = LINUX_PROT_TO_PAL(vma_info.prot, vma_info.flags);
+
+    if (vma_info.file) {
+        put_handle(vma_info.file);
+    }
+
+out:
+    spinlock_unlock(&vma_tree_lock);
+    return ret;
+}
+
 int lookup_vma(void* addr, struct libos_vma_info* vma_info) {
     assert(vma_info);
     int ret = 0;
@@ -1376,10 +1408,15 @@ void free_vma_info_array(struct libos_vma_info* vma_infos, size_t count) {
 struct madvise_dontneed_ctx {
     uintptr_t begin;
     uintptr_t end;
+#ifndef LINUX_KERNEL_PATCHED
+    uint8_t* bitvector;
+#endif
     int error;
 };
 
 static bool madvise_dontneed_visitor(struct libos_vma* vma, void* visitor_arg) {
+    assert(spinlock_is_locked(&vma_tree_lock));
+
     struct madvise_dontneed_ctx* ctx = (struct madvise_dontneed_ctx*)visitor_arg;
 
     if (vma->flags & (VMA_UNMAPPED | VMA_INTERNAL)) {
@@ -1398,6 +1435,21 @@ static bool madvise_dontneed_visitor(struct libos_vma* vma, void* visitor_arg) {
         return true;
     }
 
+/* There is a data race in the SGX driver where two enclave threads may try to add and remove the
+ * same enclave page simultaneously (e.g., if both lazy allocation and `MADV_DONTNEED` semantics are
+ * supported), see below for details:
+ * https://lore.kernel.org/lkml/20240429104330.3636113-3-dmitrii.kuvaiskii@intel.com.
+ *
+ * TODO: remove this once the Linux kernel is patched. */
+#ifdef LINUX_KERNEL_PATCHED
+    uintptr_t start = MAX(ctx->begin, vma->begin);
+    uintptr_t end = MIN(ctx->end, vma->valid_end);
+    int ret = PalFreeThenLazyReallocCommittedPages((void*)start, end - start);
+    if (ret < 0) {
+        ctx->error = pal_to_unix_errno(ret);
+        return false;
+    }
+#else
     uintptr_t zero_start = MAX(ctx->begin, vma->begin);
     uintptr_t zero_end = MIN(ctx->end, vma->valid_end);
 
@@ -1414,7 +1466,29 @@ static bool madvise_dontneed_visitor(struct libos_vma* vma, void* visitor_arg) {
         }
     }
 
-    memset((void*)zero_start, 0, zero_end - zero_start);
+    if (vma->flags & MAP_NORESERVE) {
+        /* Lazy allocation of pages, zeroize only the committed pages. Note that the uncommitted
+         * pages have to be skipped to avoid deadlocks. This is because we're holding the
+         * non-reentrant/recursive `vma_tree_lock` when we're in this visitor callback (which is
+         * invoked during VMA traversing). And if we hit page faults on accessing the uncommitted
+         * pages, our lazy allocation logic would also try to acquire the same lock for VMA lookup
+         * in g_mem_bkeep_get_vma_info_upcall (see `pal_mem_bkeep_get_vma_info()` for details). */
+        memset(ctx->bitvector, 0, UDIV_ROUND_UP((ctx->end - ctx->begin) / PAGE_SIZE, 8));
+
+        PalGetLazyCommitPages(zero_start, (zero_end - zero_start), ctx->bitvector);
+
+        size_t zero_pages = (zero_end - zero_start) / PAGE_SIZE;
+        for (size_t bit_idx = 0; bit_idx < zero_pages; bit_idx++) {
+            size_t byte_idx = bit_idx / 8;
+            size_t bit_position = bit_idx % 8;
+
+            uint8_t byte = (ctx->bitvector)[byte_idx];
+            if (!(byte & (1 << bit_position)))
+                memset((void*)(zero_start + bit_idx * PAGE_SIZE), 0, PAGE_SIZE);
+        }
+    } else {
+        memset((void*)zero_start, 0, zero_end - zero_start);
+    }
 
     if (pal_prot != pal_prot_writable) {
         /* the area was made writable above; restore the original permissions */
@@ -1424,10 +1498,12 @@ static bool madvise_dontneed_visitor(struct libos_vma* vma, void* visitor_arg) {
             BUG();
         }
     }
+#endif
     return true;
 }
 
 int madvise_dontneed_range(uintptr_t begin, uintptr_t end) {
+#ifdef LINUX_KERNEL_PATCHED
     struct madvise_dontneed_ctx ctx = {
         .begin = begin,
         .end = end,
@@ -1442,6 +1518,31 @@ int madvise_dontneed_range(uintptr_t begin, uintptr_t end) {
     if (!is_continuous)
         return -ENOMEM;
     return ctx.error;
+#else
+    /* allocate the bitvector for committed pages info outside the VMA traversing to avoid
+     * recursively holding `vma_tree_lock` */
+    size_t bitvector_size = UDIV_ROUND_UP((end - begin) / PAGE_SIZE, 8);
+    uint8_t* bitvector = calloc(1, bitvector_size);
+    if (!bitvector)
+        return -ENOMEM;
+
+    struct madvise_dontneed_ctx ctx = {
+        .begin = begin,
+        .end = end,
+        .bitvector = bitvector,
+        .error = 0,
+    };
+
+    spinlock_lock(&vma_tree_lock);
+    bool is_continuous = _traverse_vmas_in_range(begin, end, /*use_only_valid_part=*/false,
+                                                 madvise_dontneed_visitor, &ctx);
+    spinlock_unlock(&vma_tree_lock);
+
+    if (!is_continuous)
+        ctx.error = -ENOMEM;
+    free(bitvector);
+    return ctx.error;
+#endif
 }

static bool vma_filter_needs_reload(struct libos_vma* vma, void* arg) {
@@ -1770,10 +1871,33 @@ BEGIN_CP_FUNC(vma) {

     if (!vma->file) {
         /* Send anonymous memory region. */
-        struct libos_mem_entry* mem;
         assert(IS_ALLOC_ALIGNED_PTR(vma->addr));
         assert(IS_ALLOC_ALIGNED(vma->length));
         assert(vma->valid_length == vma->length);
-        DO_CP_SIZE(memory, vma->addr, vma->valid_length, &mem);
-        mem->prot = LINUX_PROT_TO_PAL(vma->prot, /*map_flags=*/0);
+
+        size_t vma_pages = vma->length / PAGE_SIZE;
+        size_t bitvector_size = UDIV_ROUND_UP(vma_pages, 8);
+        uint8_t* bitvector = calloc(1, bitvector_size);
+        if (!bitvector)
+            return -ENOMEM;
+
+        PalGetLazyCommitPages((uintptr_t)vma->addr, vma->length, bitvector);
+
+        for (size_t bit_idx = 0; bit_idx < vma_pages; bit_idx++) {
+            size_t byte_idx = bit_idx / 8;
+            size_t bit_position = bit_idx % 8;
+
+            uint8_t byte = bitvector[byte_idx];
+            /* skip the lazily-committed pages */
+            if ((byte & (1 << bit_position)))
+                continue;
+
+            struct libos_mem_entry* mem;
+            DO_CP_SIZE(memory, vma->addr + bit_idx * PAGE_SIZE, PAGE_SIZE, &mem);
+            mem->prot = LINUX_PROT_TO_PAL(vma->prot, /*map_flags=*/0);
+        }
+
+        free(bitvector);
     } else {
         /*
          * Send file-backed memory region.
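Editor's illustration: a small self-contained model of the per-page bitvector convention that `PalGetLazyCommitPages()` fills in and that both `madvise_dontneed_visitor()` and the checkpointing loop above decode. Only the bit encoding mirrors the real code; everything else here is hypothetical:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096
#define UDIV_ROUND_UP(x, y) (((x) + (y) - 1) / (y))

/* bit i set = page i is still lazily committed (no EPC page behind it yet) */
static bool page_is_lazy(const uint8_t* bitvector, size_t bit_idx) {
    return bitvector[bit_idx / 8] & (1 << (bit_idx % 8));
}

int main(void) {
    enum { PAGES = 20 };
    uint8_t bitvector[UDIV_ROUND_UP(PAGES, 8)];
    memset(bitvector, 0, sizeof(bitvector));

    /* pretend pages 3 and 17 were never touched, hence still lazy */
    bitvector[3 / 8]  |= 1 << (3 % 8);
    bitvector[17 / 8] |= 1 << (17 % 8);

    for (size_t i = 0; i < PAGES; i++) {
        if (page_is_lazy(bitvector, i))
            printf("page %zu: lazy -- skip (touching it would page-fault)\n", i);
        else
            printf("page %zu: committed -- safe to memset()/checkpoint\n", i);
    }
    return 0;
}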
4 changes: 4 additions & 0 deletions libos/src/sys/libos_mmap.c
@@ -144,6 +144,10 @@ void* libos_syscall_mmap(void* addr, size_t length, int prot, int flags, int fd,
             default:
                 return (void*)-EINVAL;
         }
+
+        /* ignore MAP_NORESERVE for file-backed mappings as we consider this rare and not worth
+         * optimizing */
+        flags &= ~MAP_NORESERVE;
     }

#ifdef MAP_32BIT
9 changes: 9 additions & 0 deletions libos/test/regression/common.h
@@ -30,3 +30,12 @@
     static_assert(IS_STATIC_ARRAY(arr), "not a static array"); \
     sizeof(arr) / sizeof(arr[0]); \
 })
+
+/* We need this artificial assignment in READ_ONCE because of a GCC bug:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99258
+ */
+#define READ_ONCE(x) ({ __typeof__(x) y = *(volatile __typeof__(x)*)&(x); y;})
+
+#define WRITE_ONCE(x, y) do { *(volatile __typeof__(x)*)&(x) = (y); } while (0)
+
+#define COMPILER_BARRIER() ({ __asm__ __volatile__("" ::: "memory"); })
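Editor's sketch of how such helpers are typically used (hypothetical example, not from this PR): `READ_ONCE` forces a fresh memory read on every loop iteration, so the compiler cannot cache the flag that the signal handler updates:

#include <signal.h>
#include <unistd.h>

/* in a real test these would come from the "common.h" above */
#define READ_ONCE(x) ({ __typeof__(x) y = *(volatile __typeof__(x)*)&(x); y;})
#define WRITE_ONCE(x, y) do { *(volatile __typeof__(x)*)&(x) = (y); } while (0)

static int g_seen_signal = 0;

static void handler(int sig) {
    (void)sig;
    WRITE_ONCE(g_seen_signal, 1); /* observed by the main loop's READ_ONCE */
}

int main(void) {
    signal(SIGALRM, handler);
    alarm(1);
    while (!READ_ONCE(g_seen_signal)) /* re-reads memory each iteration */
        ;
    return 0;
}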
22 changes: 22 additions & 0 deletions libos/test/regression/madvise.manifest.template
@@ -0,0 +1,22 @@
+libos.entrypoint = "{{ entrypoint }}"
+
+loader.env.LD_LIBRARY_PATH = "/lib"
+
+fs.mounts = [
+  { path = "/lib", uri = "file:{{ gramine.runtimedir(libc) }}" },
+  { path = "/{{ entrypoint }}", uri = "file:{{ binary_dir }}/{{ entrypoint }}" },
+]
+
+sgx.max_threads = {{ '1' if env.get('EDMM', '0') == '1' else '4' }}
+sgx.debug = true
+sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
+
+# `use_exinfo = true` is needed because `madvise(MADV_DONTNEED)` is used in this test. When EDMM is
+# enabled, it will free the committed pages but automatically recommit them on subsequent accesses
+# via page fault handling.
+sgx.use_exinfo = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }}
+
+sgx.trusted_files = [
+  "file:{{ gramine.runtimedir(libc) }}/",
+  "file:{{ binary_dir }}/{{ entrypoint }}",
+]
1 change: 1 addition & 0 deletions libos/test/regression/meson.build
@@ -73,6 +73,7 @@ tests = {
     'mmap_file_backed': {},
     'mmap_file_emulated': {},
     'mmap_file_sigbus': {},
+    'mmap_map_noreserve': {},
     'mock_syscalls': {},
     'mprotect_file_fork': {},
     'mprotect_prot_growsdown': {},