diff --git a/images/figures/1-architecture.png b/images/figures/1-architecture.png new file mode 100644 index 0000000..ad9bca6 Binary files /dev/null and b/images/figures/1-architecture.png differ diff --git a/images/figures/audio-qual.png b/images/figures/audio-qual.png new file mode 100644 index 0000000..65f992e Binary files /dev/null and b/images/figures/audio-qual.png differ diff --git a/images/figures/comparison-prev_versions.png b/images/figures/comparison-prev_versions.png new file mode 100644 index 0000000..7052741 Binary files /dev/null and b/images/figures/comparison-prev_versions.png differ diff --git a/images/figures/grounding-qual.png b/images/figures/grounding-qual.png new file mode 100644 index 0000000..ffcd9c4 Binary files /dev/null and b/images/figures/grounding-qual.png differ diff --git a/images/figures/teaser.png b/images/figures/teaser.png new file mode 100644 index 0000000..9eb96ba Binary files /dev/null and b/images/figures/teaser.png differ diff --git a/images/logos/IVAL_logo.png b/images/logos/IVAL_logo.png new file mode 100644 index 0000000..5cd6523 Binary files /dev/null and b/images/logos/IVAL_logo.png differ diff --git a/images/logos/MBZUAI_logo.png b/images/logos/MBZUAI_logo.png new file mode 100644 index 0000000..1aededc Binary files /dev/null and b/images/logos/MBZUAI_logo.png differ diff --git a/images/logos/Oryx_logo.png b/images/logos/Oryx_logo.png new file mode 100644 index 0000000..745cbdf Binary files /dev/null and b/images/logos/Oryx_logo.png differ diff --git a/images/logos/logo.png b/images/logos/logo.png new file mode 100644 index 0000000..060457e Binary files /dev/null and b/images/logos/logo.png differ diff --git a/index.html b/index.html new file mode 100644 index 0000000..36b0f8d --- /dev/null +++ b/index.html @@ -0,0 +1,411 @@ + + + + + + + + + Video-LLaVA: Pixel Grounding in Large Multimodal Video Models + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+ Video-LLaVA_face +

Video-LLaVA: Pixel Grounding in Large Multimodal Video Models

+
+ + + + +
+ + Muhammad Maaz, + + + Hanoona Rasheed, + + + Salman Khan, + +
+ + +
+ + Mubarak Shah, + + + Fahad S. Khan + +
+
+
+ Mohamed bin Zayed University of AI, Australian National University
+
+
+ Linköping University, University of Central Florida
+
+ + + +
+
+
+
+
+ + +
+
+
+

Extending image-based Large Multimodal Models (LMMs) to videos is challenging due to the inherent complexity of video data. Recent approaches that extend image-based LMMs to videos either lack grounding capabilities (e.g., VideoChat, Video-ChatGPT, Video-LLaMA) or do not utilize audio signals for better video understanding (e.g., Video-ChatGPT). Addressing these gaps, we propose Video-LLaVA, the first video-based LMM with pixel-level grounding capability, which integrates audio cues by transcribing them into text to enrich video-context understanding. Our framework uses an off-the-shelf tracker and a novel grounding module, enabling it to spatially and temporally ground objects in videos following user instructions. We evaluate Video-LLaVA using video-based generative and question-answering benchmarks and introduce new benchmarks specifically designed to measure prompt-based object grounding performance. Further, we propose using the open-source Vicuna LLM instead of GPT-3.5, as used in Video-ChatGPT, for video-based conversation benchmarking, ensuring reproducibility of results, which is a concern with the proprietary nature of GPT-3.5. Our code, pretrained models, and interactive demo will be made publicly available.

+
+
+
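To make the audio-as-text idea in the abstract concrete, here is a minimal, hypothetical sketch of how a transcript could be folded into the conversation prompt alongside the video tokens. The function name, the <video> placeholder, and the prompt format are illustrative assumptions, not the released implementation.

# Hypothetical sketch: combine the user question and the audio transcript into
# the text side of the prompt; the projected video features are injected where
# the <video> placeholder appears. Names and format are assumptions.
def build_prompt(question: str, transcript: str = "") -> str:
    audio_part = f"Audio transcript: {transcript}\n" if transcript else ""
    return f"<video>\n{audio_part}USER: {question}\nASSISTANT:"

print(build_prompt("What is the speaker announcing?",
                   "Good evening, and welcome to the nine o'clock news."))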
+ + + + + + +
+
+ +
+
+

🔥Highlights

+
+ The key contributions of this work are: + +
    +
  1. We propose Video-LLaVA, the first video-based LMM with pixel-level grounding capabilities, featuring a modular design for enhanced flexibility.
  2. By incorporating audio context, Video-LLaVA significantly enhances its understanding of video content, making it more comprehensive and well suited for scenarios where the audio signal is crucial for video understanding (e.g., dialogues and conversations, news videos, etc.).
  3. We introduce improved quantitative benchmarks for video-based conversational models. Our benchmarks utilize the open-source Vicuna LLM to ensure better reproducibility and transparency (a toy sketch of this evaluation setup follows this list). We also propose benchmarks to evaluate the grounding capabilities of video-based conversational models.
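As noted in contribution 3, here is a toy, hypothetical sketch of the Vicuna-based scoring idea: an open-source judge LLM is shown the question, the reference answer, and the model's prediction, and asked to return a 1-5 correctness score. The prompt wording below is an assumption for illustration, not the benchmark's exact prompt, and the call to the judge model itself is omitted.

# Hypothetical judge prompt for Vicuna-style evaluation; wording is illustrative.
def judge_prompt(question: str, reference: str, prediction: str) -> str:
    return (
        "You are evaluating a video question-answering model.\n"
        f"Question: {question}\n"
        f"Correct answer: {reference}\n"
        f"Predicted answer: {prediction}\n"
        "Rate the predicted answer for correctness on a scale of 1 to 5 "
        "and reply with the number only."
    )

print(judge_prompt("What sport is shown?", "Gymnastics on a balance beam.",
                   "A gymnast performing on a balance beam."))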
+
+
+
+
+
+ + + + +
+
+
+

Video-LLaVA_face Video-LLaVA

+
+
+ +
+
+
+
+

+ +

+
+
+
+ +
+
+
+ +
Overview of the Video-LLaVA architecture, showcasing the integration of a CLIP-based visual encoder with a multimodal language model for video understanding. The CLIP visual encoder extracts spatio-temporal features from videos by averaging frame-level features across temporal and spatial dimensions. These features are then projected into the LLM's input space using a learnable Multi-Layer Perceptron (MLP). The system features a grounding module for spatially locating textual descriptions within video frames, a class-agnostic object tracker, and an entity-matching module. Audio processing incorporates Voice Activity Detection, phoneme modeling, and Whisper-based audio transcription, culminating in a multimodal pipeline that facilitates robust video question answering. The architecture is trained on a hybrid dataset of video instructions, enabling the handling of diverse conversational contexts with high accuracy.
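To make the feature-aggregation step concrete, below is a minimal PyTorch sketch of averaging frame-level CLIP features across the temporal and spatial dimensions and projecting them into the LLM's input space with a learnable MLP. Tensor sizes are purely illustrative, and this is an assumption-laden illustration rather than the released implementation.

# Sketch of spatio-temporal pooling and MLP projection (illustrative sizes only).
import torch
import torch.nn as nn

T, P, D, d_llm = 8, 256, 1024, 4096            # frames, patches, CLIP dim, LLM dim (not the paper's exact config)
frame_feats = torch.randn(T, P, D)             # frame-level CLIP patch features

per_patch = frame_feats.mean(dim=0)            # average across frames  -> (P, D)
per_frame = frame_feats.mean(dim=1)            # average across patches -> (T, D)
video_tokens = torch.cat([per_frame, per_patch], dim=0)   # (T + P, D) spatio-temporal tokens

# Learnable MLP projecting visual tokens into the LLM's input embedding space.
projector = nn.Sequential(nn.Linear(D, d_llm), nn.GELU(), nn.Linear(d_llm, d_llm))
llm_inputs = projector(video_tokens)           # (T + P, d_llm), prepended to the text embeddings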
+
+
+
+
+
+ + + +
+
+
+

Qualitative Results: Video Grounding

+
+
+ +
+ + +
+
+
+ +
Visual representation of the grounding capability built into Video-LLaVA's advanced video-conversational abilities. The highlighted regions in each video frame indicate the model's ability to identify and spatially locate key subjects mentioned in the textual description, such as the giraffe, the statue, and the gymnast on a balance beam.
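For intuition, the following is a deliberately simplified, hypothetical stand-in for the entity-matching step: referred phrases from the model's answer are matched against the labels of tracked objects, and the matched tracklet's boxes are returned. The real module relies on vision-language matching rather than the plain string similarity used here; the tracklet labels and box coordinates are made up for the example.

# Simplified, hypothetical grounding step: match a referred phrase to a tracklet
# by string similarity over labels (the real system uses vision-language matching).
from difflib import SequenceMatcher

tracklets = {                              # tracklet id -> (label, per-frame boxes)
    0: ("giraffe", [(40, 60, 220, 400)]),  # made-up box coordinates
    1: ("statue",  [(300, 80, 420, 390)]),
}

def ground_phrase(phrase: str):
    score = lambda label: SequenceMatcher(None, phrase.lower(), label).ratio()
    best_id = max(tracklets, key=lambda i: score(tracklets[i][0]))
    return best_id, tracklets[best_id][1]

print(ground_phrase("the tall giraffe"))   # -> (0, [(40, 60, 220, 400)])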
+
+
+
+
+
+ + + +
+
+
+

Qualitative Results: Including Audio Modality

+
+
+ +
+ + +
+
+
+ +
The figure illustrates the integrated audio processing pipeline that augments video question answering with audio cues. The side-by-side comparisons show how audio cues provide additional context, leading to a more accurate interpretation of the video content.
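For reference, a minimal sketch of the transcription step using the open-source Whisper package is shown below. The full pipeline also applies voice activity detection and phoneme-level filtering, which are omitted here; the model size and file name are arbitrary choices for the example.

# Minimal transcription sketch with openai-whisper (pip install openai-whisper).
# VAD and phoneme-based filtering from the full pipeline are omitted here.
import whisper

model = whisper.load_model("base")           # model size chosen arbitrarily
result = model.transcribe("news_clip.wav")   # hypothetical local audio file
transcript = result["text"].strip()

# The transcript is then placed into the conversation prompt alongside the
# projected video tokens (see the prompt sketch earlier on this page).
print(transcript)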
+
+
+
+
+
+ + + +
+
+
+

Video-ChatGPT vs Video-LLaVA

+
+
+ +
+ +
+
+
+ +
+ Qualitative analysis of video descriptions generated by Video-ChatGPT, Video-LLaVA (7B), and Video-LLaVA (13B) models. The evolution in model performance is evident, with enhancements in the accuracy of information, richness of descriptive detail, and alignment with the video’s context and sequence of events as we move from the baseline Video-ChatGPT to the more advanced Video-LLaVA (13B) model. +
+
+
+
+
+
+ + + + + + +
+
+

Acknowledgement

+

+ This website is adapted from Nerfies, licensed under a Creative + Commons Attribution-ShareAlike 4.0 International License. +

+
+
+ + + + + +
+ + IVAL Logo + + + Oryx Logo + + + MBZUAI Logo + +
diff --git a/static/css/index.css b/static/css/index.css new file mode 100644 index 0000000..338dbdd --- /dev/null +++ b/static/css/index.css @@ -0,0 +1,159 @@ +body { + font-family: 'Noto Sans', sans-serif; + } + + + .footer .icon-link { + font-size: 25px; + color: #000; + } + + .link-block a { + margin-top: 5px; + margin-bottom: 5px; + } + + .dnerf { + font-variant: small-caps; + } + + + .teaser .hero-body { + padding-top: 0; + padding-bottom: 3rem; + } + + .teaser { + font-family: 'Google Sans', sans-serif; + } + + + .publication-title { + } + + .publication-banner { + max-height: parent; + + } + + .publication-banner video { + position: relative; + left: auto; + top: auto; + transform: none; + object-fit: fit; + } + + .publication-header .hero-body { + } + + .publication-title { + font-family: 'Google Sans', sans-serif; + } + + .publication-authors { + font-family: 'Google Sans', sans-serif; + } + + .publication-venue { + color: #555; + width: fit-content; + font-weight: bold; + } + + .publication-awards { + color: #ff3860; + /* width: fit-content; */ + font-weight: bolder; + } + + .title + .publication-authors, + .subtitle + .publication-authors { + margin-top: -1.25rem; + } + + .publication-authors a { + color: hsl(204, 86%, 53%) !important; + } + + .publication-authors a:hover { + text-decoration: underline; + } + + .author-block { + display: inline-block; + } + + .publication-banner img { + } + + .publication-authors { + /*color: #4286f4;*/ + } + + .publication-video { + position: relative; + width: 100%; + height: 0; + padding-bottom: 56.25%; + + overflow: hidden; + border-radius: 10px !important; + } + + .publication-video iframe { + position: absolute; + top: 0; + left: 0; + width: 100%; + height: 100%; + } + + .publication-body img { + } + + .results-carousel { + overflow: hidden; + } + + .results-carousel .item { + margin: 5px; + overflow: hidden; + border: 1px solid #bbb; + border-radius: 10px; + padding: 0; + font-size: 0; + } + + .results-carousel video { + margin: 0; + } + + + .interpolation-panel { + background: #f5f5f5; + border-radius: 10px; + } + + .interpolation-panel .interpolation-image { + width: 100%; + border-radius: 5px; + } + + .interpolation-video-column { + } + + .interpolation-panel .slider { + margin: 0 !important; + } + + .interpolation-panel .slider { + margin: 0 !important; + } + + #interpolation-image-wrapper { + width: 100%; + } + #interpolation-image-wrapper img { + border-radius: 5px; + } \ No newline at end of file