# Source code for oscopilot.modules.base_module

import re
import json
import os
from oscopilot.utils.llms import OpenAI,LLAMA
# from oscopilot.environments.py_env import PythonEnv
# from oscopilot.environments.py_jupyter_env import PythonJupyterEnv
from oscopilot.environments import Env
from oscopilot.utils import get_os_version
from dotenv import load_dotenv

load_dotenv(dotenv_path='.env', override=True)
MODEL_TYPE = os.getenv('MODEL_TYPE')


# [docs] (Sphinx viewcode link marker left over from HTML extraction; not part of the module)
class BaseModule:
    """
    Common base for modules that talk to an LLM and run code in an environment.

    On construction the instance is given an LLM client (chosen by the
    module-level ``MODEL_TYPE`` setting), an ``Env`` execution environment and
    the detected OS version. The remaining methods are stateless helpers for
    pulling structured pieces (delimited spans, fenced JSON, numbered lists)
    out of free-form text such as an LLM response.
    """

    def __init__(self):
        """
        Initializes a new instance of BaseModule with default values for its attributes.

        Attributes set:
            llm: ``OpenAI()`` when ``MODEL_TYPE == "OpenAI"``, ``LLAMA()`` when
                ``MODEL_TYPE == "LLAMA"``.
            environment: A fresh ``Env()`` instance.
            system_version: The value returned by ``get_os_version()``.
        """
        # NOTE: any other MODEL_TYPE value (including an unset variable) leaves
        # ``self.llm`` undefined; accessing it later raises AttributeError.
        if MODEL_TYPE == "OpenAI":
            self.llm = OpenAI()
        elif MODEL_TYPE == "LLAMA":
            self.llm = LLAMA()
        self.environment = Env()
        self.system_version = get_os_version()

    def extract_information(self, message, begin_str='[BEGIN]', end_str='[END]'):
        """
        Extracts substrings from a message that are enclosed within specified begin and end markers.

        Each end marker is searched for *after* its begin marker, so a stray
        end marker that precedes the first begin marker is ignored, and the
        same string may be used for both markers (e.g. a triple backtick).

        Args:
            message (str): The message from which information is to be extracted.
            begin_str (str): The marker indicating the start of the information to be extracted.
            end_str (str): The marker indicating the end of the information to be extracted.

        Returns:
            list[str]: The extracted substrings, in order of appearance, with
            leading newlines stripped. Empty when no complete begin/end pair
            is present.

        Raises:
            ValueError: If ``begin_str`` or ``end_str`` is empty (an empty
                marker matches everywhere and could never terminate the scan).
        """
        if not begin_str or not end_str:
            raise ValueError("begin_str and end_str must be non-empty strings")

        result = []
        cursor = 0
        while True:
            begin = message.find(begin_str, cursor)
            if begin == -1:
                break
            content_start = begin + len(begin_str)
            # Look for the end marker only past the begin marker; searching
            # from the start of the text would pair mismatched markers.
            end = message.find(end_str, content_start)
            if end == -1:
                break
            result.append(message[content_start:end].lstrip("\n"))
            cursor = end + len(end_str)
        return result

    def extract_json_from_string(self, text):
        """
        Identifies and extracts JSON data embedded within a given string.

        This method searches for a JSON object inside a Markdown code fence
        tagged ``json`` (three backticks followed by ``json``) and parses the
        first such block. Both pretty-printed and single-line objects are
        recognised.

        Args:
            text (str): The text containing the JSON data to be extracted.

        Returns:
            dict: The parsed JSON data as a dictionary if successful.
            str: An error message indicating a parsing error or that no JSON data was found.
        """
        # Capture only the object itself so the fences never have to be
        # stripped out of the payload afterwards (which could damage JSON
        # strings that themselves contain backticks).
        json_regex = r'```json\s*(\{[\s\S]*?\})\s*```'

        match = re.search(json_regex, text)
        if match is None:
            return "No JSON data found in the string."

        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError as e:
            return f"Error parsing JSON data: {e}"

    def extract_list_from_string(self, text):
        """
        Extracts a list of task descriptions from a given string containing enumerated tasks.

        A task is a number followed by a dot and whitespace; its description is
        the rest of that line. A description is captured only when the line is
        followed by the next numbered item, by a blank line, or by the end of
        the text (with or without a trailing newline).

        Args:
            text (str): A string containing multiple enumerated tasks. Each task is numbered and followed by its description.

        Returns:
            list[str]: A list of strings, each representing the description of a task extracted from the input string.
        """
        # \d+\.\s+      task number, dot, and the whitespace after it
        # ([^\n]*?)     the description: the remainder of the line (lazy)
        # (?=...)       the line must be followed by one of:
        #   \n\d+\.     the next numbered item
        #   \n?\Z       the end of the text, optionally after one last newline
        #   \n\n        a blank line
        task_pattern = r'\d+\.\s+([^\n]*?)(?=\n\d+\.|\n?\Z|\n\n)'

        return re.findall(task_pattern, text)