Implement NVENC support for KMS and wlgrab capture methods

LizardByte · Jan 28, 2024 · 8182f59 · 8182f59
1 parent 9a3553d
commit 8182f59
Show file tree

Hide file tree

Showing 6 changed files with 391 additions and 6 deletions.
diff --git a/src/platform/linux/cuda.cpp b/src/platform/linux/cuda.cpp
@@ -4,6 +4,10 @@
  */
 #include <bitset>
 
+#include <fcntl.h>
+
+#include <filesystem>
+
 #include <NvFBC.h>
 #include <ffnvcodec/dynlink_loader.h>
 
@@ -29,6 +33,8 @@ extern "C" {
 #define CU_CHECK_IGNORE(x, y) \
   check((x), SUNSHINE_STRINGVIEW(y ": "))
 
+namespace fs = std::filesystem;
+
 using namespace std::literals;
 namespace cuda {
   constexpr auto cudaDevAttrMaxThreadsPerBlock = (CUdevice_attribute) 1;
@@ -69,6 +75,13 @@ namespace cuda {
     CU_CHECK_IGNORE(cdf->cuStreamDestroy(stream), "Couldn't destroy cuda stream");
   }
 
+  /**
+   * @brief Unregisters a graphics resource from CUDA.
+   * @param resource Graphics resource handle to unregister.
+   */
+  void
+  unregisterResource(CUgraphicsResource resource) {
+    // The result is handed to check() together with a message prefix; this
+    // function returns nothing, so a failure is not propagated to the caller.
+    const auto status = cdf->cuGraphicsUnregisterResource(resource);
+    CU_CHECK_IGNORE(status, "Couldn't unregister resource");
+  }
+
+  using registered_resource_t = util::safe_ptr<CUgraphicsResource_st, unregisterResource>;
+
   class img_t: public platf::img_t {
   public:
     tex_t tex;
@@ -223,6 +236,236 @@ namespace cuda {
     }
   };
 
+  /**
+   * @brief Opens the DRM device associated with the CUDA device index.
+   *
+   * The CUDA device's PCI bus ID is used to locate the matching `card*` entry
+   * under `/sys/bus/pci/devices/<id>/drm`, and the node of the same name in
+   * `/dev/dri` is opened read/write.
+   *
+   * @param index CUDA device index to open.
+   * @return File descriptor or -1 on failure.
+   */
+  file_t
+  open_drm_fd_for_cuda_device(int index) {
+    CUdevice device;
+    CU_CHECK(cdf->cuDeviceGet(&device, index), "Couldn't get CUDA device");
+
+    // There's no way to directly go from CUDA to a DRM device, so we'll
+    // use sysfs to look up the DRM device name from the PCI ID.
+    char pci_bus_id[13] = {};
+    CU_CHECK(cdf->cuDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), device), "Couldn't get CUDA device PCI bus ID");
+    BOOST_LOG(debug) << "Found CUDA device with PCI bus ID: "sv << pci_bus_id;
+
+    // sysfs names PCI devices using lowercase hexadecimal digits while CUDA
+    // reports them in uppercase, so normalize the ID before building the path.
+    for (char &c : pci_bus_id) {
+      if (c >= 'A' && c <= 'F') {
+        c = static_cast<char>(c - 'A' + 'a');
+      }
+    }
+
+    // Look for the name of the primary node in sysfs. Directory iteration
+    // throws if the path is missing or unreadable, so treat that as "not found".
+    try {
+      char sysfs_path[PATH_MAX];
+      std::snprintf(sysfs_path, sizeof(sysfs_path), "/sys/bus/pci/devices/%s/drm", pci_bus_id);
+      fs::path sysfs_dir { sysfs_path };
+      for (auto &entry : fs::directory_iterator { sysfs_dir }) {
+        auto file = entry.path().filename();
+        auto filestring = file.generic_u8string();
+        if (std::string_view { filestring }.substr(0, 4) != "card"sv) {
+          continue;
+        }
+
+        BOOST_LOG(debug) << "Found DRM primary node: "sv << filestring;
+
+        fs::path dri_path { "/dev/dri"sv };
+        auto device_path = dri_path / file;
+        return open(device_path.c_str(), O_RDWR);
+      }
+    }
+    catch (const fs::filesystem_error &err) {
+      BOOST_LOG(error) << "Failed to read sysfs: "sv << err.what();
+    }
+
+    BOOST_LOG(error) << "Unable to find DRM device with PCI bus ID: "sv << pci_bus_id;
+    return -1;
+  }
+
+  /**
+   * @brief Encoding device that converts captured frames in GL and copies the
+   *        resulting planes into a CUDA hardware frame.
+   */
+  class gl_cuda_vram_t: public platf::avcodec_encode_device_t {
+  public:
+    /**
+     * @brief Initialize the GL->CUDA encoding device.
+     * @param in_width Width of captured frames.
+     * @param in_height Height of captured frames.
+     * @param offset_x Offset of content in captured frame.
+     * @param offset_y Offset of content in captured frame.
+     * @return 0 on success or -1 on failure.
+     */
+    int
+    init(int in_width, int in_height, int offset_x, int offset_y) {
+      // This must be non-zero to tell the video core that it's a hardware encoding device.
+      data = (void *) 0x1;
+
+      // TODO: Support more than one CUDA device
+      file = open_drm_fd_for_cuda_device(0);
+      if (file.el < 0) {
+        // NOTE(review): errno is only meaningful here when open() itself failed;
+        // the other failure paths of open_drm_fd_for_cuda_device() may leave a stale value.
+        char string[1024];
+        BOOST_LOG(error) << "Couldn't open DRM FD for CUDA device: "sv << strerror_r(errno, string, sizeof(string));
+        return -1;
+      }
+
+      gbm.reset(gbm::create_device(file.el));
+      if (!gbm) {
+        BOOST_LOG(error) << "Couldn't create GBM device: ["sv << util::hex(eglGetError()).to_string_view() << ']';
+        return -1;
+      }
+
+      display = egl::make_display(gbm.get());
+      if (!display) {
+        return -1;
+      }
+
+      auto ctx_opt = egl::make_ctx(display.get());
+      if (!ctx_opt) {
+        return -1;
+      }
+
+      ctx = std::move(*ctx_opt);
+
+      width = in_width;
+      height = in_height;
+
+      sequence = 0;
+
+      this->offset_x = offset_x;
+      this->offset_y = offset_y;
+
+      return 0;
+    }
+
+    /**
+     * @brief Initialize color conversion into target CUDA frame.
+     * @param frame Destination CUDA frame to write into.
+     * @param hw_frames_ctx_buf FFmpeg hardware frame context.
+     * @return 0 on success or -1 on failure.
+     */
+    int
+    set_frame(AVFrame *frame, AVBufferRef *hw_frames_ctx_buf) override {
+      this->hwframe.reset(frame);
+      this->frame = frame;
+
+      // Allocate backing storage for the frame if the caller has not done so already
+      if (!frame->buf[0]) {
+        if (av_hwframe_get_buffer(hw_frames_ctx_buf, frame, 0)) {
+          BOOST_LOG(error) << "Couldn't get hwframe for CUDA"sv;
+          return -1;
+        }
+      }
+
+      auto hw_frames_ctx = (AVHWFramesContext *) hw_frames_ctx_buf->data;
+      sw_format = hw_frames_ctx->sw_format;
+
+      auto nv12_opt = egl::create_target(frame->width, frame->height, sw_format);
+      if (!nv12_opt) {
+        return -1;
+      }
+
+      auto sws_opt = egl::sws_t::make(width, height, frame->width, frame->height, sw_format);
+      if (!sws_opt) {
+        return -1;
+      }
+
+      this->sws = std::move(*sws_opt);
+      this->nv12 = std::move(*nv12_opt);
+
+      auto cuda_ctx = (AVCUDADeviceContext *) hw_frames_ctx->device_ctx->hwctx;
+
+      stream = make_stream();
+      if (!stream) {
+        return -1;
+      }
+
+      cuda_ctx->stream = stream.get();
+
+      // Register both GL plane textures with CUDA so convert() can map and read them
+      CU_CHECK(cdf->cuGraphicsGLRegisterImage(&y_res, nv12->tex[0], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY),
+        "Couldn't register Y plane texture");
+      CU_CHECK(cdf->cuGraphicsGLRegisterImage(&uv_res, nv12->tex[1], GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY),
+        "Couldn't register UV plane texture");
+
+      return 0;
+    }
+
+    /**
+     * @brief Convert the captured image into the target CUDA frame.
+     * @param img Captured screen image.
+     * @return 0 on success or -1 on failure.
+     */
+    int
+    convert(platf::img_t &img) override {
+      auto &descriptor = (egl::img_descriptor_t &) img;
+
+      if (descriptor.sequence == 0) {
+        // For dummy images, use a blank RGB texture instead of importing a DMA-BUF
+        rgb = egl::create_blank(img);
+      }
+      else if (descriptor.sequence > sequence) {
+        sequence = descriptor.sequence;
+
+        rgb = egl::rgb_t {};
+
+        auto rgb_opt = egl::import_source(display.get(), descriptor.sd);
+
+        if (!rgb_opt) {
+          return -1;
+        }
+
+        rgb = std::move(*rgb_opt);
+      }
+
+      // Perform the color conversion and scaling in GL
+      sws.load_vram(descriptor, offset_x, offset_y, rgb->tex[0]);
+      sws.convert(nv12->buf);
+
+      // Map the GL textures to read for CUDA
+      CUgraphicsResource resources[2] = { y_res.get(), uv_res.get() };
+      CU_CHECK(cdf->cuGraphicsMapResources(2, resources, stream.get()), "Couldn't map GL textures in CUDA");
+
+      // Copy from the GL textures to the target CUDA frame. The result is held
+      // until after the unmap so a failed copy never leaves the textures mapped.
+      const int copy_result = copy_planes_to_frame(resources);
+
+      // Unmap the textures to allow modification from GL again
+      CU_CHECK(cdf->cuGraphicsUnmapResources(2, resources, stream.get()), "Couldn't unmap GL textures from CUDA");
+      return copy_result;
+    }
+
+    /**
+     * @brief Configures shader parameters for the specified colorspace.
+     */
+    void
+    apply_colorspace() override {
+      sws.apply_colorspace(colorspace);
+    }
+
+    // DRM device file descriptor opened for the CUDA device
+    file_t file;
+    gbm::gbm_t gbm;
+    egl::display_t display;
+    egl::ctx_t ctx;
+
+    // This must be destroyed before display_t
+    stream_t stream;
+    frame_t hwframe;
+
+    egl::sws_t sws;
+    egl::nv12_t nv12;
+    AVPixelFormat sw_format;
+
+    // Dimensions of the captured frames, as passed to init()
+    int width {}, height {};
+
+    // Sequence number of the most recently imported capture image
+    std::uint64_t sequence {};
+    egl::rgb_t rgb;
+
+    // CUDA registrations of the Y and UV plane textures in nv12
+    registered_resource_t y_res;
+    registered_resource_t uv_res;
+
+    int offset_x {}, offset_y {};
+
+  private:
+    /**
+     * @brief Copies both mapped plane textures into the target CUDA frame.
+     * @param resources The two mapped graphics resources (Y plane first, then UV).
+     * @return 0 when both plane arrays were obtained; CU_CHECK returns early
+     *         if a mapped array cannot be obtained.
+     */
+    int
+    copy_planes_to_frame(CUgraphicsResource *resources) {
+      auto fmt_desc = av_pix_fmt_desc_get(sw_format);
+
+      for (int i = 0; i < 2; i++) {
+        CUDA_MEMCPY2D cpy = {};
+        cpy.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+        CU_CHECK(cdf->cuGraphicsSubResourceGetMappedArray(&cpy.srcArray, resources[i], 0, 0), "Couldn't get mapped plane array");
+
+        cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+        cpy.dstDevice = (CUdeviceptr) frame->data[i];
+        cpy.dstPitch = frame->linesize[i];
+        // The second plane is subsampled by the format's chroma shift
+        cpy.WidthInBytes = (frame->width * fmt_desc->comp[i].step) >> (i ? fmt_desc->log2_chroma_w : 0);
+        cpy.Height = frame->height >> (i ? fmt_desc->log2_chroma_h : 0);
+
+        CU_CHECK_IGNORE(cdf->cuMemcpy2DAsync(&cpy, stream.get()), "Couldn't copy texture to CUDA frame");
+      }
+
+      return 0;
+    }
+  };
+
   std::unique_ptr<platf::avcodec_encode_device_t>
   make_avcodec_encode_device(int width, int height, bool vram) {
     if (init()) {
@@ -245,6 +488,29 @@ namespace cuda {
     return cuda;
   }
 
+  /**
+   * @brief Create a GL->CUDA encoding device for consuming captured dmabufs.
+   * @param width Width of captured frames.
+   * @param height Height of captured frames.
+   * @param offset_x Offset of content in captured frame.
+   * @param offset_y Offset of content in captured frame.
+   * @return FFmpeg encoding device context, or nullptr if initialization fails.
+   */
+  std::unique_ptr<platf::avcodec_encode_device_t>
+  make_avcodec_gl_encode_device(int width, int height, int offset_x, int offset_y) {
+    // Run the namespace-level init() first and bail out if it reports failure.
+    if (init() != 0) {
+      return nullptr;
+    }
+
+    auto device = std::make_unique<gl_cuda_vram_t>();
+
+    const int status = device->init(width, height, offset_x, offset_y);
+    if (status != 0) {
+      return nullptr;
+    }
+
+    return device;
+  }
+
   namespace nvfbc {
     static PNVFBCCREATEINSTANCE createInstance {};
     static NVFBC_API_FUNCTION_LIST func { NVFBC_VERSION };

diff --git a/src/platform/linux/cuda.h b/src/platform/linux/cuda.h
@@ -27,6 +27,18 @@ namespace cuda {
   }
   std::unique_ptr<platf::avcodec_encode_device_t>
   make_avcodec_encode_device(int width, int height, bool vram);
+
+  /**
+   * @brief Create a GL->CUDA encoding device for consuming captured dmabufs.
+   * @param width Width of captured frames.
+   * @param height Height of captured frames.
+   * @param offset_x Offset of content in captured frame.
+   * @param offset_y Offset of content in captured frame.
+   * @return FFmpeg encoding device context.
+   */
+  std::unique_ptr<platf::avcodec_encode_device_t>
+  make_avcodec_gl_encode_device(int width, int height, int offset_x, int offset_y);
+
   int
   init();
 }  // namespace cuda

diff --git a/src/platform/linux/graphics.cpp b/src/platform/linux/graphics.cpp
@@ -647,6 +647,71 @@ namespace egl {
     return nv12;
   }
 
+  /**
+   * @brief Creates biplanar YUV textures to render into.
+   * @param width Width of the target frame.
+   * @param height Height of the target frame.
+   * @param format Format of the target frame.
+   * @return The new biplanar YUV target, or std::nullopt if the format is unsupported.
+   */
+  std::optional<nv12_t>
+  create_target(int width, int height, AVPixelFormat format) {
+    // Validate the format before allocating any GL objects.
+    auto fmt_desc = av_pix_fmt_desc_get(format);
+    if (!fmt_desc) {
+      BOOST_LOG(error) << "Unsupported target pixel format: "sv << format;
+      return std::nullopt;
+    }
+
+    GLint y_format;
+    GLint uv_format;
+
+    // Determine the size of each plane element
+    if (fmt_desc->comp[0].depth <= 8) {
+      y_format = GL_R8;
+      uv_format = GL_RG8;
+    }
+    else if (fmt_desc->comp[0].depth <= 16) {
+      y_format = GL_R16;
+      uv_format = GL_RG16;
+    }
+    else {
+      BOOST_LOG(error) << "Unsupported target pixel format: "sv << format;
+      return std::nullopt;
+    }
+
+    nv12_t nv12 {
+      EGL_NO_DISPLAY,
+      EGL_NO_IMAGE,
+      EGL_NO_IMAGE,
+      gl::tex_t::make(2),
+      gl::frame_buf_t::make(2),
+    };
+
+    // Full-resolution single-channel texture for the Y plane
+    gl::ctx.BindTexture(GL_TEXTURE_2D, nv12->tex[0]);
+    gl::ctx.TexStorage2D(GL_TEXTURE_2D, 1, y_format, width, height);
+
+    // Two-channel texture for the UV plane, reduced by the format's chroma shift
+    gl::ctx.BindTexture(GL_TEXTURE_2D, nv12->tex[1]);
+    gl::ctx.TexStorage2D(GL_TEXTURE_2D, 1, uv_format,
+      width >> fmt_desc->log2_chroma_w, height >> fmt_desc->log2_chroma_h);
+
+    nv12->buf.bind(std::begin(nv12->tex), std::end(nv12->tex));
+
+    GLenum attachments[] {
+      GL_COLOR_ATTACHMENT0,
+      GL_COLOR_ATTACHMENT1
+    };
+
+    // Clear values: zero for the Y plane, mid-range for the UV plane
+    const float y_black[] = { 0.0f, 0.0f, 0.0f, 0.0f };
+    const float uv_black[] = { 0.5f, 0.5f, 0.5f, 0.5f };
+
+    constexpr int plane_count = static_cast<int>(sizeof(attachments) / sizeof(attachments[0]));
+    for (int x = 0; x < plane_count; ++x) {
+      gl::ctx.BindFramebuffer(GL_FRAMEBUFFER, nv12->buf[x]);
+      gl::ctx.DrawBuffers(1, &attachments[x]);
+
+      gl::ctx.ClearBufferfv(GL_COLOR, 0, x == 0 ? y_black : uv_black);
+    }
+
+    gl::ctx.BindFramebuffer(GL_FRAMEBUFFER, 0);
+
+    gl_drain_errors;
+
+    return nv12;
+  }
+
   void
   sws_t::apply_colorspace(const video::sunshine_colorspace_t &colorspace) {
     auto color_p = video::color_vectors_from_colorspace(colorspace);