Merge pull request 'matrix-bot: Fix ratelimiting from openai. Only commit messages no code diff anymore.' (#211) from Qubasa/clan-infra:Qubasa-main into main

2024-07-08 12:49:45 +00:00 · 2024-07-08 12:49:45 +00:00 · 7a3fddf088
commit 7a3fddf088
parent 102b25ddc2 83bcdc0edb
7 changed files with 85 additions and 21 deletions
--- a/pkgs/matrix-bot/default.nix
+++ b/pkgs/matrix-bot/default.nix
@ -6,6 +6,7 @@
  aiohttp,
  markdown2,
  git,
+  tiktoken,
  ...
 }:

@ -16,6 +17,7 @@ let
    aiofiles
    aiohttp
    markdown2
+    tiktoken
  ];

  runtimeDependencies = [ git ];
--- a/pkgs/matrix-bot/matrix_bot/init.py
+++ b/pkgs/matrix-bot/matrix_bot/init.py
@ -36,6 +36,11 @@ def create_parser(prog: str | None = None) -> argparse.ArgumentParser:
        help="The matrix server to connect to",
        default="https://matrix.clan.lol",
    )
+    parser.add_argument(
+        "--admin",
+        help="The matrix user to ping on error",
+        default="@qubasa:gchq.icu",
+    )

    parser.add_argument(
        "--user",
@ -148,6 +153,7 @@ def main() -> None:
        publish_day=args.publish_day,
        review_room=args.review_room,
        password=matrix_password(),
+        admin=args.admin,
    )

    gitea = GiteaData(
--- a/pkgs/matrix-bot/matrix_bot/changelog_bot.py
+++ b/pkgs/matrix-bot/matrix_bot/changelog_bot.py
@ -81,7 +81,6 @@ async def git_log(repo_path: str, ndays: int) -> str:
        f"--since={ndays} days ago",
        "--pretty=format:%h - %an, %ar : %s",
        "--stat",
-        "--patch",
    ]
    log.debug(f"Running command: {shlex.join(cmd)}")
    process = await asyncio.create_subprocess_exec(
@ -163,25 +162,35 @@ async def changelog_bot(
    log.info(f"Generating changelog from {fromdate} to {todate}")

    system_prompt = f"""
-Create a concise changelog for the {matrix.changelog_frequency}.
+Create a concise changelog
 Follow these guidelines:

- The header should include the date range from {fromdate} to {todate}
- Use present tense
 - Keep the summary brief
 - Follow commit message format: "scope: message (#number)"
 - Link pull requests as: '{gitea.url}/{gitea.owner}/{gitea.repo}/pulls/<number>'
    - Use markdown links to make the pull request number clickable
- Mention each scope and pull request number only once
- Have these headers in the changelog if applicable:
-    - New Features
-    - Documentation
-    - Refactoring
-    - Bug Fixes
-    - Other Changes
+- Mention each scope and pull request number at most once
+- Focus on the most interesting changes for end users

-Changelog:
 ---
+Example Changelog:
+### Changelog:
+For the last {matrix.changelog_frequency} days from {fromdate} to {todate}
+#### New Features
+- `secrets`: added settings and generator submodules, improved tests [#1679]({gitea.url}/{gitea.owner}/{gitea.repo}/pulls/1679)
+- `sshd`: added a workaround for CVE-2024-6387 [#1674]({gitea.url}/{gitea.owner}/{gitea.repo}/pulls/1674)
+...
+#### Refactoring
+...
+#### Documentation
+...
+#### Bug Fixes
+...
+#### Other Changes
+...
+
+---
+### Changelog:
    """

    # Step 1: Create the JSONL file
--- a/pkgs/matrix-bot/matrix_bot/main.py
+++ b/pkgs/matrix-bot/matrix_bot/main.py
@ -11,7 +11,7 @@ from nio import AsyncClient, ClientConfig, ProfileGetAvatarResponse, RoomMessage
 from .changelog_bot import changelog_bot
 from .gitea import GiteaData
 from .matrix import MatrixData, set_avatar, upload_image
-from .review_bot import message_callback, review_requested_bot
+from .review_bot import message_callback, review_requested_bot, send_error


 async def bot_main(
@ -45,8 +45,20 @@ async def bot_main(
    try:
        async with aiohttp.ClientSession() as session:
            while True:
-                await changelog_bot(client, session, matrix, gitea, data_dir)
-                await review_requested_bot(client, session, matrix, gitea, data_dir)
+                try:
+                    await changelog_bot(client, session, matrix, gitea, data_dir)
+                except Exception as e:
+                    log.exception(e)
+                    await send_error(client, matrix, f"Changelog bot failed: {e}")
+
+                try:
+                    await review_requested_bot(client, session, matrix, gitea, data_dir)
+                except Exception as e:
+                    log.exception(e)
+                    await send_error(
+                        client, matrix, f"Review requested bot failed: {e}"
+                    )
+
                await asyncio.sleep(60 * 5)
    except Exception as e:
        log.exception(e)
--- a/pkgs/matrix-bot/matrix_bot/matrix.py
+++ b/pkgs/matrix-bot/matrix_bot/matrix.py
@ -86,3 +86,4 @@ class MatrixData:
    review_room: str
    changelog_frequency: int
    publish_day: str
+    admin: str
--- a/pkgs/matrix-bot/matrix_bot/openai.py
+++ b/pkgs/matrix-bot/matrix_bot/openai.py
@ -2,8 +2,10 @@ import asyncio
 import json
 import logging
 from os import environ
+from typing import Any

 import aiohttp
+import tiktoken

 log = logging.getLogger(__name__)

@ -23,15 +25,12 @@ def api_key() -> str:
        return f.read().strip()


-from typing import Any
-
-
 async def create_jsonl_data(
    *,
    user_prompt: str,
    system_prompt: str,
    model: str = "gpt-4o",
-    max_tokens: int = 1000,
+    max_tokens: int = 2046,
 ) -> bytes:
    summary_request = {
        "custom_id": "request-1",
@ -46,8 +45,34 @@ async def create_jsonl_data(
            "max_tokens": max_tokens,
        },
    }
+    # Serialize once so the token count is taken on the same text that is
+    # encoded and returned below.
+    dumped = json.dumps(summary_request)
+    num_tokens = count_tokens(dumped)
+    log.debug(f"Number of tokens in the JSONL data: {num_tokens}")
+    # Fail early on oversized gpt-4o requests. The compared name has to match
+    # the "gpt-4o" default of this function, otherwise the guard never fires.
+    if model == "gpt-4o" and num_tokens > 90_000:
+        raise ValueError(f"Number of tokens {num_tokens} exceeds the limit of 90,000")

-    return json.dumps(summary_request).encode("utf-8")
+    return dumped.encode("utf-8")
+
+
+def count_tokens(string: str, model: str = "gpt-4") -> int:
+    """
+    Return how many tokens `string` occupies under a model's tokenizer.
+
+    The encoding is looked up with `tiktoken.encoding_for_model`, so `model`
+    has to be a name tiktoken can map to an encoding.
+
+    Parameters:
+    - string (str): The text to measure.
+    - model (str): The model whose tokenizer is used. Default is "gpt-4".
+
+    Returns:
+    - int: The length of the token sequence produced by encoding `string`.
+    """
+    return len(tiktoken.encoding_for_model(model).encode(string))


 async def upload_and_process_file(
@ -118,7 +143,7 @@ async def upload_and_process_file(
    async with session.get(output_url, headers=headers) as response:
        if response.status != 200:
            raise Exception(
-                f"Failed to retrieve batch results with status code {response.status}"
+                f"Failed to retrieve batch results with status code {response.status} reason {response.reason}"
            )

        # Read content as text
--- a/pkgs/matrix-bot/matrix_bot/review_bot.py
+++ b/pkgs/matrix-bot/matrix_bot/review_bot.py
@ -30,6 +30,15 @@ async def message_callback(room: MatrixRoom, event: RoomMessageText) -> None:
    )


+async def send_error(client: AsyncClient, matrix: MatrixData, msg: str) -> None:
+    """
+    Join the review room and post an error message there.
+
+    Parameters:
+    - client (AsyncClient): The client used to join `matrix.review_room`.
+    - matrix (MatrixData): Supplies the review room and the admin user id.
+    - msg (str): The text handed to `send_message`.
+
+    Raises:
+    - Exception: If the join response's transport response is not ok.
+    """
+    joined: JoinResponse = await client.join(matrix.review_room)
+    if joined.transport_response.ok:
+        # `matrix.admin` is passed as the only entry of `user_ids`.
+        await send_message(client, joined, msg, user_ids=[matrix.admin])
+        return
+    log.error("This can happen if the room doesn't exist or the bot isn't invited")
+    raise Exception(f"Failed to join room {joined}")
+
+
 async def review_requested_bot(
    client: AsyncClient,
    http: aiohttp.ClientSession,