From 201242dcda21f04cbab03045b08696b1908f4755 Mon Sep 17 00:00:00 2001 From: afernand Date: Fri, 9 Jun 2023 14:00:28 +0200 Subject: [PATCH 1/5] Add HuggingFace support --- src/review_bot/hf_model.py | 146 ++++++++++++++++ src/review_bot/open_ai_interface.py | 253 +++++++++++++++++++++------- 2 files changed, 342 insertions(+), 57 deletions(-) create mode 100644 src/review_bot/hf_model.py diff --git a/src/review_bot/hf_model.py b/src/review_bot/hf_model.py new file mode 100644 index 00000000..e9c09b52 --- /dev/null +++ b/src/review_bot/hf_model.py @@ -0,0 +1,146 @@ +"""Module for HuggingFace model implementations.""" +import logging + +import torch +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + GenerationConfig, + StoppingCriteria, + StoppingCriteriaList, +) + +LOG = logging.getLogger(__name__) + + +class StopOnTokens(StoppingCriteria): + """Wrapper for stop token strings. + + Determines on which tokens the model should stop the inference. + + Parameters + ---------- + tokenizer : AutoTokenizer + Tokenizer that will be used with the model. + stopwords : list, optional + Words that will mark the end of the inference, by default ["<|im_end|>", "<|endoftext|>"]. + """ + + def __init__(self, tokenizer, stopwords=["<|im_end|>", "<|endoftext|>"]): + """Initialize the stop tokens.""" + self.stop_token_ids = tokenizer.convert_tokens_to_ids(stopwords) + + def __call__( + self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs + ) -> bool: + """Run when called from the model inference. + + Parameters + ---------- + input_ids : torch.LongTensor + IDs of the input tokens. + scores : torch.FloatTensor + Score of the tokens. + + Returns + ------- + bool + Whether to stop the inference or not. + """ + for stop_id in self.stop_token_ids: + if input_ids[0][-1] == stop_id: + return True + return False + + +class Inference: + """Wrapper for HuggingFace text generation models. 
+ + Parameters + ---------- + model_name : str, optional + Name of the model in HuggingFace, by default "mosaicml/mpt-7b" + tokenizer_name : str, optional + Name of the tokenizer in HuggingFace, by default "mosaicml/mpt-7b" + """ + + def __init__( + self, model_name="mosaicml/mpt-7b", tokenizer_name: str = "mosaicml/mpt-7b" + ): + """Initialize the model and the tokenizer.""" + self.device = "cpu" + self.tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, trust_remote_code=True + ) + self.model = AutoModelForCausalLM.from_pretrained( + model_name, trust_remote_code=True, load_in_8bit=True, device_map="auto" + ) + print(self.model.get_memory_footprint()) + + def evaluate( + self, + instruction: str, + input: str = None, + temperature: float = 0.1, + top_p: float = 0.75, + top_k=40, + num_beams=4, + max_new_tokens=128, + **kwargs, + ): + """Perform the inference of the model. + + Parameters + ---------- + instruction : str + Query we want to ask the model. + input : str, optional + Additional context we can add to the query, by default None. + temperature : float, optional + Sampling temperature, by default 0.1 + top_p : float, optional + Top-p (nucleus) sampling threshold, by default 0.75 + top_k : int, optional + Top-k sampling cutoff, by default 40 + num_beams : int, optional + Number of beams for beam search, by default 4 + max_new_tokens : int, optional + Maximum number of new tokens to generate, by default 128 + + Returns + ------- + str + Result from the query. 
+ """ + stop = StopOnTokens(self.tokenizer) + + prompt = self._generate_prompt(instruction, input) + inputs = self.tokenizer(prompt, return_tensors="pt") + input_ids = inputs["input_ids"].to(self.device) + generation_config = GenerationConfig( + temperature=temperature, + top_p=top_p, + top_k=top_k, + num_beams=num_beams, + **kwargs, + ) + with torch.no_grad(): + generation_output = self.model.generate( + input_ids=input_ids, + generation_config=generation_config, + return_dict_in_generate=True, + output_scores=True, + max_new_tokens=max_new_tokens, + stopping_criteria=StoppingCriteriaList([stop]), + ) + s = generation_output.sequences[0] + output_str = self.tokenizer.decode(s) + + try: + output_str = output_str.split("### Response:")[1] + except IndexError: + LOG.warning("Output might be malformed.") + output_str = output_str.replace("<|endoftext|>", "") + output_str = output_str.replace("<|im_end|>", "") + + return output_str diff --git a/src/review_bot/open_ai_interface.py b/src/review_bot/open_ai_interface.py index ed757310..5a200f5e 100644 --- a/src/review_bot/open_ai_interface.py +++ b/src/review_bot/open_ai_interface.py @@ -1,4 +1,5 @@ """Functions to interface with OpenAI.""" +from enum import Enum import logging from typing import Dict, List @@ -7,8 +8,17 @@ from review_bot.exceptions import EmptyOpenAIResponseException from review_bot.gh_interface import get_changed_files_and_contents from review_bot.git_interface import LocalGit +from review_bot.hf_model import Inference from review_bot.misc import _set_open_ai_token, add_line_numbers, parse_suggestions + +class InferenceMotor(Enum): + """Enum for the types of inference motors available.""" + + OPENAI = 0 + HUGGINGFACE = 1 + + LOG = logging.getLogger(__name__) LOG.setLevel("DEBUG") @@ -17,6 +27,165 @@ OPEN_AI_MODEL = "gpt-4" +def hf_prompt_patch(filename: str, patch: str) -> str: + """Generate the prompt for HF patch review. + + Parameters + ---------- + filename : str + Name of the file to review. 
+ patch : str + Diff patch to review + + Returns + ------- + str + Prompt result. + """ + prompt = f""" +You are a GitHub review bot. You first expect full filename. You then expect a patch from a GitHub pull request and you provide 'review items' to improve just the patch code using the context from the full source file. Do not include the line numbers in any code suggestions. There are 3 TYPEs of review items [GLOBAL, SUGGESTION, COMMENT]. Each review item must be in the format [], [(-)], [TYPE], always between brackets: + +Type: GLOBAL +This must always included. This is a general overview of the file patch. If the file looks good, simply respond with "No issues found, LGTM!". Otherwise, indicate the kind of comments and suggestions that follow. Make this section short and do not include any line numbers (i.e., leave [(-)] empty. + +Type: SUGGESTION +This is where code must be changed or should be changed. If you are replacing code, it must use the GitHub markdown code block with ```suggestion, and the [-] must match the line(s) that will be replaced. If you are adding new code, you should only include the [] where you expect the code to be inserted. Do not insert code that is outside of the patch. + +Type: COMMENT +This is for comments that do not include code that you want to replace. These should be logical errors, style suggestions, or other issues with the code. You can feel free to include example code, and if you do use markdown formatting, but this is primarily for text comments. + +Filename: {filename} + +Source: +{patch}\n\nReview the above code patch and provide recommendations for improvement or point out errors." +""" + + return prompt + + +def hf_prompt_source(source: str, patch: str) -> str: + """Generate the prompt for HF patch review with source. + + Parameters + ---------- + source : str + Source of the file to review. + patch : str + Diff patch to review + + Returns + ------- + str + Prompt result. 
+ """ + prompt = f""" +You are a GitHub review bot. You first expect the full source of the file to be reviewed followed by the patch which contains the line number. You respond after the full source file with 'Ready for the patch.'. After the patch, you provide 'review items' to improve just the patch code using the context from the full source file. Do not include the line numbers in any code suggestions. There are 3 TYPEs of review items [GLOBAL, SUGGESTION, INFO]. Each review item must be in the format [], [(-)], [TYPE]: + +Type: GLOBAL +This must always included. This is a general overview of the file patch. If the file looks good, simply respond with "No issues found, LGTM!". Otherwise, indicate the kind of comments and suggestions that will be given in the files tab. Make this section short and do not include any line numbers (i.e., leave [(-)] empty. + +Type: SUGGESTION +This is where code must be changed or should be changed. If you are replacing code, it must use the GitHub markdown code block with ```suggestion, and the [-] must match the line(s) that will be replaced. If you are adding new code, you should only include the [] where you expect the code to be inserted. Do not insert code that is outside of the patch. + +Type: INFO +This is for comments that do not include code that you want to replace. These should be logical errors, style suggestions, or other issues with the code. You can feel free to include example code, and if you do use markdown formatting, but this is primarily for text comments. + +Source: + +{source} + +Patch: + +{patch} + +\n\nReview the above code patch and provide recommendations for improvement or point out errors. + +""" + return prompt + + +def openai_prompt_patch(filename: str, patch: str) -> List: + """Generate the prompt for OpenAI patch review. + + Parameters + ---------- + filename : str + Name of the file to review. 
+ patch : str + Diff patch to review + + Returns + ------- + List + List with the required format for OpenAI. + """ + messages = [ + { + "role": "system", + "content": """ +You are a GitHub review bot. You first expect full filename. You then expect a patch from a GitHub pull request and you provide 'review items' to improve just the patch code using the context from the full source file. Do not include the line numbers in any code suggestions. There are 3 TYPEs of review items [GLOBAL, SUGGESTION, COMMENT]. Each review item must be in the format [], [(-)], [TYPE], always between brackets: + +Type: GLOBAL +This must always included. This is a general overview of the file patch. If the file looks good, simply respond with "No issues found, LGTM!". Otherwise, indicate the kind of comments and suggestions that follow. Make this section short and do not include any line numbers (i.e., leave [(-)] empty. + +Type: SUGGESTION +This is where code must be changed or should be changed. If you are replacing code, it must use the GitHub markdown code block with ```suggestion, and the [-] must match the line(s) that will be replaced. If you are adding new code, you should only include the [] where you expect the code to be inserted. Do not insert code that is outside of the patch. + +Type: COMMENT +This is for comments that do not include code that you want to replace. These should be logical errors, style suggestions, or other issues with the code. You can feel free to include example code, and if you do use markdown formatting, but this is primarily for text comments. 
+""", + }, + {"role": "user", "content": filename}, + {"role": "assistant", "content": "Ready for the patch."}, + { + "role": "user", + "content": f"{patch}\n\nReview the above code patch and provide recommendations for improvement or point out errors.", + }, + ] + return messages + + +def openai_prompt_source(source: str, patch: str) -> List: + """Generate the prompt for OpenAI patch review with the source code. + + Parameters + ---------- + source : str + Source of the file to review. + patch : str + Diff patch to review + + Returns + ------- + List + List with the required format for OpenAI. + """ + messages = [ + { + "role": "system", + "content": """ +You are a GitHub review bot. You first expect the full source of the file to be reviewed followed by the patch which contains the line number. You respond after the full source file with 'Ready for the patch.'. After the patch, you provide 'review items' to improve just the patch code using the context from the full source file. Do not include the line numbers in any code suggestions. There are 3 TYPEs of review items [GLOBAL, SUGGESTION, INFO]. Each review item must be in the format [], [(-)], [TYPE]: + +Type: GLOBAL +This must always included. This is a general overview of the file patch. If the file looks good, simply respond with "No issues found, LGTM!". Otherwise, indicate the kind of comments and suggestions that will be given in the files tab. Make this section short and do not include any line numbers (i.e., leave [(-)] empty. + +Type: SUGGESTION +This is where code must be changed or should be changed. If you are replacing code, it must use the GitHub markdown code block with ```suggestion, and the [-] must match the line(s) that will be replaced. If you are adding new code, you should only include the [] where you expect the code to be inserted. Do not insert code that is outside of the patch. + +Type: INFO +This is for comments that do not include code that you want to replace. 
These should be logical errors, style suggestions, or other issues with the code. You can feel free to include example code, and if you do use markdown formatting, but this is primarily for text comments. +""", + }, + {"role": "user", "content": source}, + {"role": "assistant", "content": "Ready for the patch."}, + { + "role": "user", + "content": f"{patch}\n\nReview the above code patch and provide recommendations for improvement or point out errors.", + }, + ] + return messages + + def review_patch( owner, repo, pr, use_src=False, filter_filename=None, gh_access_token=None ): @@ -130,7 +299,12 @@ def review_patch_local( return suggestions -def generate_suggestions_with_source(filename, file_src, patch) -> List[Dict[str, str]]: +def generate_suggestions_with_source( + filename: str, + file_src: str, + patch: str, + inference_motor: InferenceMotor = InferenceMotor.OPENAI, +) -> List[Dict[str, str]]: """Generate suggestions for a given file source and patch. Parameters @@ -141,6 +315,8 @@ def generate_suggestions_with_source(filename, file_src, patch) -> List[Dict[str The source file text including the file name and its contents. patch : str The patch text containing line numbers and changes. + inference_motor : InferenceMotor + Which inference motor you want to use, by default OPENAI. Returns ------- @@ -152,41 +328,24 @@ def generate_suggestions_with_source(filename, file_src, patch) -> List[Dict[str LOG.debug("FILENAME: %s", filename) LOG.debug("PATCH: %s", patch) - response = openai.ChatCompletion.create( - model=OPEN_AI_MODEL, - messages=[ - { - "role": "system", - "content": """ -You are a GitHub review bot. You first expect the full source of the file to be reviewed followed by the patch which contains the line number. You respond after the full source file with 'Ready for the patch.'. After the patch, you provide 'review items' to improve just the patch code using the context from the full source file. Do not include the line numbers in any code suggestions. 
There are 3 TYPEs of review items [GLOBAL, SUGGESTION, INFO]. Each review item must be in the format [], [(-)], [TYPE]: - -Type: GLOBAL -This must always included. This is a general overview of the file patch. If the file looks good, simply respond with "No issues found, LGTM!". Otherwise, indicate the kind of comments and suggestions that will be given in the files tab. Make this section short and do not include any line numbers (i.e., leave [(-)] empty. - -Type: SUGGESTION -This is where code must be changed or should be changed. If you are replacing code, it must use the GitHub markdown code block with ```suggestion, and the [-] must match the line(s) that will be replaced. If you are adding new code, you should only include the [] where you expect the code to be inserted. Do not insert code that is outside of the patch. - -Type: INFO -This is for comments that do not include code that you want to replace. These should be logical errors, style suggestions, or other issues with the code. You can feel free to include example code, and if you do use markdown formatting, but this is primarily for text comments. 
-""", - }, - {"role": "user", "content": file_src}, - {"role": "assistant", "content": "Ready for the patch."}, - { - "role": "user", - "content": f"{patch}\n\nReview the above code patch and provide recommendations for improvement or point out errors.", - }, - ], - ) + if inference_motor == InferenceMotor.OPENAI: + response = openai.ChatCompletion.create( + model=OPEN_AI_MODEL, messages=openai_prompt_source(file_src, patch) + ) + text = response["choices"][0].message.content + elif inference_motor == InferenceMotor.HUGGINGFACE: + model = Inference() + text = model.evaluate(instruction=hf_prompt_source(file_src, patch)) # Extract suggestions - text = response["choices"][0].message.content if len(text) == 0: raise EmptyOpenAIResponseException() return parse_suggestions(text) -def generate_suggestions(filename, patch) -> List[Dict[str, str]]: +def generate_suggestions( + filename, patch, inference_motor: InferenceMotor = InferenceMotor.OPENAI +) -> List[Dict[str, str]]: """ Generate suggestions for a given file source and patch. @@ -207,36 +366,16 @@ def generate_suggestions(filename, patch) -> List[Dict[str, str]]: LOG.debug("FILENAME: %s", filename) LOG.debug("PATCH: %s", patch) - response = openai.ChatCompletion.create( - model=OPEN_AI_MODEL, - messages=[ - { - "role": "system", - "content": """ -You are a GitHub review bot. You first expect full filename. You then expect a patch from a GitHub pull request and you provide 'review items' to improve just the patch code using the context from the full source file. Do not include the line numbers in any code suggestions. There are 3 TYPEs of review items [GLOBAL, SUGGESTION, COMMENT]. Each review item must be in the format [], [(-)], [TYPE], always between brackets: - -Type: GLOBAL -This must always included. This is a general overview of the file patch. If the file looks good, simply respond with "No issues found, LGTM!". Otherwise, indicate the kind of comments and suggestions that follow. 
Make this section short and do not include any line numbers (i.e., leave [(-)] empty. - -Type: SUGGESTION -This is where code must be changed or should be changed. If you are replacing code, it must use the GitHub markdown code block with ```suggestion, and the [-] must match the line(s) that will be replaced. If you are adding new code, you should only include the [] where you expect the code to be inserted. Do not insert code that is outside of the patch. - -Type: COMMENT -This is for comments that do not include code that you want to replace. These should be logical errors, style suggestions, or other issues with the code. You can feel free to include example code, and if you do use markdown formatting, but this is primarily for text comments. -""", - }, - {"role": "user", "content": filename}, - {"role": "assistant", "content": "Ready for the patch."}, - { - "role": "user", - "content": f"{patch}\n\nReview the above code patch and provide recommendations for improvement or point out errors.", - }, - ], - # n=3, - ) + if inference_motor == InferenceMotor.OPENAI: + response = openai.ChatCompletion.create( + model=OPEN_AI_MODEL, messages=openai_prompt_patch(filename, patch) + ) + text = response["choices"][0].message.content + elif inference_motor == InferenceMotor.HUGGINGFACE: + model = Inference() + text = model.evaluate(instruction=hf_prompt_patch(filename, patch)) # Extract suggestions - text = response["choices"][0].message.content if len(text) == 0: raise EmptyOpenAIResponseException() return parse_suggestions(text) From 403fa141b4c2c701249a797fef5d46cf13c2d0f0 Mon Sep 17 00:00:00 2001 From: afernand Date: Fri, 9 Jun 2023 15:55:21 +0200 Subject: [PATCH 2/5] Add dependencies --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 32dd6557..bc30e218 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,9 @@ dependencies = [ "requests", "openai", "jsonschema", - "gitpython" + 
"gitpython", + "torch>=2.0.1", + "transformers>=4.29.2" ] [project.optional-dependencies] From 60af2afe422fa5e12fed75dd074700464550c349 Mon Sep 17 00:00:00 2001 From: afernand Date: Fri, 9 Jun 2023 16:03:31 +0200 Subject: [PATCH 3/5] Remove conflictive comment with Sphinx --- src/review_bot/hf_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/review_bot/hf_model.py b/src/review_bot/hf_model.py index e9c09b52..a118f999 100644 --- a/src/review_bot/hf_model.py +++ b/src/review_bot/hf_model.py @@ -23,7 +23,7 @@ class StopOnTokens(StoppingCriteria): tokenizer : AutoTokenizer Tokenizer that will be used with the model. stopwords : list, optional - Words that will mark the end of the inference, by default ["<|im_end|>", "<|endoftext|>"]. + Words that will mark the end of the inference. """ def __init__(self, tokenizer, stopwords=["<|im_end|>", "<|endoftext|>"]): From 6a97626fd92ad095d82e053ef9046b033b4ce2d1 Mon Sep 17 00:00:00 2001 From: afernand Date: Mon, 12 Jun 2023 12:15:45 +0200 Subject: [PATCH 4/5] Rename file --- src/review_bot/{open_ai_interface.py => review.py} | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) rename src/review_bot/{open_ai_interface.py => review.py} (99%) diff --git a/src/review_bot/open_ai_interface.py b/src/review_bot/review.py similarity index 99% rename from src/review_bot/open_ai_interface.py rename to src/review_bot/review.py index 5a200f5e..b8302d80 100644 --- a/src/review_bot/open_ai_interface.py +++ b/src/review_bot/review.py @@ -9,7 +9,7 @@ from review_bot.gh_interface import get_changed_files_and_contents from review_bot.git_interface import LocalGit from review_bot.hf_model import Inference -from review_bot.misc import _set_open_ai_token, add_line_numbers, parse_suggestions +from review_bot.misc import _set_open_ai_config, add_line_numbers, parse_suggestions class InferenceMotor(Enum): @@ -323,7 +323,7 @@ def generate_suggestions_with_source( list[dict] A list of dictionaries containing 
suggestions for the patch. """ - _set_open_ai_token() + _set_open_ai_config() LOG.debug("Generating suggestions for a given file source and patch.") LOG.debug("FILENAME: %s", filename) LOG.debug("PATCH: %s", patch) @@ -361,7 +361,7 @@ def generate_suggestions( list[dict] A list of dictionaries containing suggestions for the patch. """ - _set_open_ai_token() + _set_open_ai_config() LOG.debug("Generating suggestions for a given file source and patch.") LOG.debug("FILENAME: %s", filename) LOG.debug("PATCH: %s", patch) From 0326771b2b9c64cf8d6d0384ccfd9ca4e7e547f3 Mon Sep 17 00:00:00 2001 From: afernand Date: Mon, 12 Jun 2023 12:16:23 +0200 Subject: [PATCH 5/5] Change OpenAI to azure back --- .github/workflows/ci_cd.yml | 4 +++- src/review_bot/__init__.py | 2 +- src/review_bot/misc.py | 5 ++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci_cd.yml b/.github/workflows/ci_cd.yml index 8b82afaa..a5697d3c 100644 --- a/.github/workflows/ci_cd.yml +++ b/.github/workflows/ci_cd.yml @@ -67,7 +67,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11'] fail-fast: false steps: - uses: ansys/actions/tests-pytest@v4 @@ -76,6 +76,8 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} OPEN_AI_TOKEN: ${{ secrets.OPEN_AI_TOKEN }} + OPEN_AI_TYPE: "azure" + OPENAI_API_BASE: "https://csebu-chatgpt.openai.azure.com/" doc-build: name: Build documentation diff --git a/src/review_bot/__init__.py b/src/review_bot/__init__.py index 3f5d28de..cfed1417 100644 --- a/src/review_bot/__init__.py +++ b/src/review_bot/__init__.py @@ -8,4 +8,4 @@ __version__ = importlib_metadata.version(__name__.replace(".", "-")) from .misc import open_logger -from .open_ai_interface import review_patch, review_patch_local +from .review import review_patch, review_patch_local diff --git a/src/review_bot/misc.py b/src/review_bot/misc.py index 95b1f81f..932722ce 100644 --- 
a/src/review_bot/misc.py +++ b/src/review_bot/misc.py @@ -22,12 +22,15 @@ def _get_gh_token(): return access_token -def _set_open_ai_token(): +def _set_open_ai_config(): """Return the github access token from the GITHUB_TOKEN environment variable.""" access_token = os.environ.get("OPEN_AI_TOKEN") + api_base = os.environ.get("OPENAI_API_BASE") if access_token is None: raise OSError('Missing "OPEN_AI_TOKEN" environment variable') openai.api_key = access_token + openai.api_type = "azure" + openai.api_base = api_base def open_logger(