process guess

cvlab-columbia · Dec 22, 2023 · ac2fa26 · ac2fa26
1 parent 12782b6
commit ac2fa26
Show file tree

Hide file tree

Showing 4 changed files with 191 additions and 26 deletions.
diff --git a/configs/base_config.yaml b/configs/base_config.yaml
@@ -43,6 +43,7 @@ best_match_model: xvlm                              # Which model to use for bes
 gpt3:                                               # GPT-3 configuration
     n_votes: 1                                      # Number of tries to use for GPT-3. Use with temperature > 0
     qa_prompt: ./prompts/gpt3/gpt3_qa.txt
+    guess_prompt: ./prompts/gpt3/gpt3_process_guess.txt
     temperature: 0.                                 # Temperature for GPT-3. Almost deterministic if 0
     model: text-davinci-003                         # See openai.Model.list() for available models
 

diff --git a/image_patch.py b/image_patch.py
@@ -434,6 +434,10 @@ def llm_query(query, context=None, long_answer=True, queues=None):
         return forward(model_name='gpt3_qa', prompt=[query, context], queues=queues)
 
 
+def process_guesses(prompt, guess1=None, guess2=None, queues=None):
+    return forward(model_name='gpt3_guess', prompt=[prompt, guess1, guess2], queues=queues)
+
+
 def coerce_to_numeric(string, no_string=False):
     """
     This function takes a string as input and returns a numeric value after removing any non-numeric characters.

diff --git a/prompts/gpt3/gpt3_process_guess.txt b/prompts/gpt3/gpt3_process_guess.txt
@@ -0,0 +1,121 @@
+Please answer the following questions using the given guesses.
+If a unique answer cannot be determined, choose only one of the possible answers.
+Aim to reply in ONE word (at MOST 2).
+
+Question: What kind of flowers are these?
+Guess 1: these flowers are purple, so lavender, lilac, iris, and hyacinth
+Guess 2: purple flowers
+Answer: lilac
+
+Question: What do these people on the bikes normally write and give out?
+Guess 1: the people on bikes are police, so Tickets
+Guess 2: tickets
+Answer: tickets
+
+Question: What kind of cold meet is this?
+Guess 1: what kind of meat is this is beef, so roast beef
+Guess 2: beef
+Answer: beef
+
+Question: Can you guess the place shown in this picture?
+Guess 1: the place is tourist attraction, so the Eiffel Tower in Paris, France
+Guess 2: big ben
+Answer: big ben
+
+Question: When was this type of vehicle with two equal sized wheels invented?
+Guess 1: the vehicle is a bicycle, so 19th century
+Guess 2: 1819
+Answer: 1800s
+
+Question: What is the flavor of the pink topping on this dessert?
+Guess 1: the topping is whipped cream, so strawberry, vanilla, chocolate, and raspberry
+Guess 2: strawberry
+Answer: strawberry
+
+Question: How are these festive lights held in place?
+Guess 1: these festive lights are christmas lights, so with hooks clips
+Guess 2: string
+Answer: string
+
+Question: Who is famous for allegedly doing this in a lightning storm?
+Guess 1: what is being done is flying a kite, so Benjamin Franklin
+Guess 2: Charles Manson
+Answer: Benjamin Franklin
+
+Question: What is the object atop the skier's head used for?
+Guess 1: the object atop the skier's head is helmet, so protection from head injuries
+Guess 2: sunglasses
+Answer: protection
+
+Question: What rank is the man on the right?
+Guess 1: who is the man on the right is sailor, so seaman
+Guess 2: captain
+Answer: captain
+
+Question: Chemically what kind of water is in the picture?
+Guess 1: the water in the picture is waves, so salt water
+Guess 2: salt water
+Answer: salt
+
+Question: Is the material tweed or canvas?
+Guess 1: the material is fabric, so fabric
+Guess 2: canvas
+Answer: canvas
+
+Question: Which type of meat are in the photo?
+Guess 1: the meat in the photo is sausage, so pork
+Guess 2: hot dogs
+Answer: hotdogs
+
+Question: What sort of predator might there be in an area like this?
+Guess 1: this area is mountains, so predators like wolves fox
+Guess 2: shark
+Answer: shark
+
+Question: Can you name a sport this person could be a part of?
+Guess 1: this person is a racer, so racing such as auto
+Guess 2: motorcycle racing
+Answer: racing
+
+Question: Who makes the yellow top worn in this photograph?
+Guess 1: the top is red, so brand is unknown
+Guess 2: Burton
+Answer: Burton
+
+Question: Is the athlete right or left handed?
+Guess 1: what is the athlete doing is playing baseball, so unclear
+Guess 2: right handed
+Answer: right handed
+
+Question: Is this food high or low on fat?
+Guess 1: what kind of food is this is sandwich, so depends on ingredients
+Guess 2: high
+Answer: high
+
+Question: What wood are those cabinets made of?
+Guess 1: what kind of cabinets are these is kitchen cabinets, so typically wood such as oak
+Guess 2: maple
+Answer: maple
+
+Question: Which objects shown are typically associated with small children?
+Guess 1: what objects are shown are stuffed animals, so toys
+Guess 2: teddy bears
+Answer: teddy bears
+
+Question: What small appliance is that stuffed animal inside?
+Guess 1: the stuffed animal is a teddy bear, so vacuum cleaner
+Guess 2: microwave
+Answer: microwave
+
+Question: What is this made with?
+Guess 1: what is this is muffin, so flour sugar eggs
+Guess 2: oats
+Answer: flour
+
+Question: What is the position name of the player squatting down?
+Guess 1: who is squatting down is the batter, so hitter
+Guess 2: catcher
+Answer: catcher
+
+Question: {}
+Guess 1: {}
+Guess 2: {}
+Answer (remember, only 1-2 words):
diff --git a/vision_models.py b/vision_models.py
@@ -78,6 +78,7 @@ def list_processes(cls):
         """
         return [cls.name]
 
+
 # ------------------------------ Specific models ---------------------------- #
 
 
@@ -381,7 +382,7 @@ def forward(self, image: torch.Tensor, text: List[str], return_labels: bool = Fa
             text = [text]
         text_original = text
         text = ['a photo of a ' + t for t in text]
-        inputs = self.processor(text=text, images=image, return_tensors="pt") # padding="longest",
+        inputs = self.processor(text=text, images=image, return_tensors="pt")  # padding="longest",
         inputs = {k: v.to(self.dev) for k, v in inputs.items()}
         outputs = self.model(**inputs)
 
@@ -512,7 +513,7 @@ def compute_prediction(self, original_image, original_caption, custom_entity=Non
                 tic = timeit.time.perf_counter()
 
                 # compute predictions
-                with HiddenPrints():   # Hide some deprecated notices
+                with HiddenPrints():  # Hide some deprecated notices
                     predictions = self.model(image_list, captions=[original_caption],
                                              positive_map=positive_map_label_to_token)
                 predictions = [o.to(self.cpu_device) for o in predictions]
@@ -779,6 +780,8 @@ def __init__(self, gpu_number=0):
         super().__init__(gpu_number=gpu_number)
         with open(config.gpt3.qa_prompt) as f:
             self.qa_prompt = f.read().strip()
+        with open(config.gpt3.guess_prompt) as f:
+            self.guess_prompt = f.read().strip()
         self.temperature = config.gpt3.temperature
         self.n_votes = config.gpt3.n_votes
         self.model = config.gpt3.model
@@ -802,7 +805,40 @@ def most_frequent(answers):
         answer_counts = Counter(answers)
         return answer_counts.most_common(1)[0][0]
 
-    def get_qa(self, prompts, prompt_base: str=None) -> list[str]:
+    def process_guesses(self, prompts):
+        prompt_base = self.guess_prompt
+        prompts_total = []
+        for p in prompts:
+            question, guess1, _ = p
+            if len(guess1) == 1:
+                # In case only one option is given as a guess
+                guess1 = [guess1[0], guess1[0]]
+            prompts_total.append(prompt_base.format(question, guess1[0], guess1[1]))
+        response = self.process_guesses_fn(prompts_total)
+        if self.n_votes > 1:
+            response_ = []
+            for i in range(len(prompts)):
+                if self.model == 'chatgpt':
+                    resp_i = [r['message']['content'] for r in
+                              response['choices'][i * self.n_votes:(i + 1) * self.n_votes]]
+                else:
+                    resp_i = [r['text'] for r in response['choices'][i * self.n_votes:(i + 1) * self.n_votes]]
+                response_.append(self.most_frequent(resp_i).lstrip())
+            response = response_
+        else:
+            if self.model == 'chatgpt':
+                response = [r['message']['content'].lstrip() for r in response['choices']]
+            else:
+                response = [r['text'].lstrip() for r in response['choices']]
+        return response
+
+    def process_guesses_fn(self, prompt):
+        # The code is the same as get_qa_fn, but we separate in case we want to modify it later
+        response = self.query_gpt3(prompt, model=self.model, max_tokens=5, logprobs=1, stream=False,
+                                   stop=["\n", "<|endoftext|>"])
+        return response
+
+    def get_qa(self, prompts, prompt_base: str = None) -> list[str]:
         if prompt_base is None:
             prompt_base = self.qa_prompt
         prompts_total = []
@@ -814,8 +850,8 @@ def get_qa(self, prompts, prompt_base: str=None) -> list[str]:
             response_ = []
             for i in range(len(prompts)):
                 if self.model == 'chatgpt':
-                    resp_i = [r['message']['content']
-                              for r in response['choices'][i * self.n_votes:(i + 1) * self.n_votes]]
+                    resp_i = [r['message']['content'] for r in
+                              response['choices'][i * self.n_votes:(i + 1) * self.n_votes]]
                 else:
                     resp_i = [r['text'] for r in response['choices'][i * self.n_votes:(i + 1) * self.n_votes]]
                 response_.append(self.most_frequent(resp_i))
@@ -891,6 +927,8 @@ def forward(self, prompt, process_name):
         if len(prompt) > 0:
             if process_name == 'gpt3_qa':
                 response = self.get_qa(prompt)
+            elif process_name == 'gpt3_guess':
+                response = self.process_guesses(prompt)
             else:  # 'gpt3_general', general prompt, has to be given all of it
                 response = self.get_general(prompt)
         else:
@@ -911,7 +949,7 @@ def forward(self, prompt, process_name):
 
     @classmethod
     def list_processes(cls):
-        return ['gpt3_' + n for n in ['qa', 'general']]
+        return ['gpt3_' + n for n in ['qa', 'guess', 'general']]
 
 
 # @cache.cache
@@ -924,24 +962,26 @@ def codex_helper(extended_prompt):
         if not isinstance(extended_prompt, list):
             extended_prompt = [extended_prompt]
         responses = [openai.ChatCompletion.create(
-                model=config.codex.model,
-                messages=[
-                    # {"role": "system", "content": "You are a helpful assistant."},
-                    {"role": "system", "content": "Only answer with a function starting def execute_command."},
-                    {"role": "user", "content": prompt}
-                ],
-                temperature=config.codex.temperature,
-                max_tokens=config.codex.max_tokens,
-                top_p = 1.,
-                frequency_penalty=0,
-                presence_penalty=0,
-#                 best_of=config.codex.best_of,
-                stop=["\n\n"],
-                )
-                    for prompt in extended_prompt]
-        resp = [r['choices'][0]['message']['content'].replace("execute_command(image)", "execute_command(image, my_fig, time_wait_between_lines, syntax)") for r in responses]
-#         if len(resp) == 1:
-#             resp = resp[0]
+            model=config.codex.model,
+            messages=[
+                # {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "system", "content": "Only answer with a function starting def execute_command."},
+                {"role": "user", "content": prompt}
+            ],
+            temperature=config.codex.temperature,
+            max_tokens=config.codex.max_tokens,
+            top_p=1.,
+            frequency_penalty=0,
+            presence_penalty=0,
+            #                 best_of=config.codex.best_of,
+            stop=["\n\n"],
+        )
+            for prompt in extended_prompt]
+        resp = [r['choices'][0]['message']['content'].replace("execute_command(image)",
+                                                              "execute_command(image, my_fig, time_wait_between_lines, syntax)")
+                for r in responses]
+    #         if len(resp) == 1:
+    #             resp = resp[0]
     else:
         warnings.warn('OpenAI Codex is deprecated. Please use GPT-4 or GPT-3.5-turbo.')
         response = openai.Completion.create(
@@ -1161,7 +1201,7 @@ def caption(self, image, prompt=None):
         generated_text = [cap.strip() for cap in
                           self.processor.batch_decode(generated_ids, skip_special_tokens=True)]
         return generated_text
-    
+
     def pre_question(self, question):
         # from LAVIS blip_processors
         question = re.sub(
@@ -1223,7 +1263,6 @@ class SaliencyModel(BaseModel):
 
     def __init__(self, gpu_number=0,
                  path_checkpoint=f'{config.path_pretrained_models}/saliency_inspyrenet_plus_ultra'):
-
         from base_models.inspyrenet.saliency_transforms import get_transform
         from base_models.inspyrenet.InSPyReNet import InSPyReNet
         from base_models.inspyrenet.backbones.SwinTransformer import SwinB