matrix-bot: Incorporate higher openai ratelimits
All checks were successful
buildbot/nix-build .#checks.x86_64-linux.devShell-matrix-bot Build done.
buildbot/nix-build .#checks.x86_64-linux.clan-merge Build done.
buildbot/nix-build .#checks.x86_64-linux.devShell-clan-merge Build done.
buildbot/nix-build .#checks.x86_64-linux.devShell-default Build done.
buildbot/nix-build .#checks.x86_64-linux.package-action-ensure-tea-login Build done.
buildbot/nix-build .#checks.x86_64-linux.treefmt Build done.
buildbot/nix-build .#checks.x86_64-linux.package-action-create-pr Build done.
buildbot/nix-build .#checks.x86_64-linux.package-action-flake-update-pr-clan Build done.
buildbot/nix-build .#checks.x86_64-linux.package-action-flake-update Build done.
buildbot/nix-build .#checks.x86_64-linux.package-clan-merge Build done.
buildbot/nix-build .#checks.x86_64-linux.package-gitea Build done.
buildbot/nix-build .#checks.x86_64-linux.package-matrix-bot Build done.
buildbot/nix-build .#checks.x86_64-linux.package-job-flake-update-clan-homepage Build done.
buildbot/nix-build .#checks.x86_64-linux.package-job-flake-update-clan-infra Build done.
buildbot/nix-build .#checks.x86_64-linux.package-job-flake-update-data-mesher Build done.
buildbot/nix-build .#checks.x86_64-linux.package-job-flake-update-clan-core Build done.
buildbot/nix-build .#checks.x86_64-linux.nixos-web01 Build done.
buildbot/nix-eval Build done.

This commit is contained in:
Luis Hebendanz 2024-07-16 22:42:39 +02:00
parent 8228bbcfe2
commit f845fa8525
4 changed files with 52 additions and 36 deletions

View File

@@ -23,7 +23,7 @@ from .openai import create_jsonl_data, upload_and_process_file
log = logging.getLogger(__name__)
def last_ndays_to_today(ndays: int) -> (str, str):
def last_ndays_to_today(ndays: int) -> tuple[str, str]:
# Get today's date
today = datetime.datetime.now()
@@ -74,18 +74,19 @@ async def git_pull(repo_path: Path) -> None:
await process.wait()
async def git_log(repo_path: str, ndays: int) -> str:
async def git_log(repo_path: Path, ndays: int) -> str:
cmd = [
"git",
"log",
f"--since={ndays} days ago",
"--pretty=format:%h - %an, %ar : %s",
"--stat",
"--patch",
]
log.debug(f"Running command: {shlex.join(cmd)}")
process = await asyncio.create_subprocess_exec(
*cmd,
cwd=repo_path,
cwd=str(repo_path),
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
@@ -172,19 +173,22 @@ Create a concise changelog
Follow these guidelines:
- Keep the summary brief
- Follow commit message format: "scope: message (#number)"
- Follow commit message format: "scope: message (#number1, #number2)"
- Link pull requests as: '{gitea.url}/{gitea.owner}/{gitea.repo}/pulls/<number>'
- Use markdown links to make the pull request number clickable
- Mention each scope and pull request number at most once
- Mention each pull request number at most once
- Focus on the most interesting changes for end users
- Explain the impact of the changes in a user-friendly way
---
Example Changelog:
### Changelog:
For the last {matrix.changelog_frequency} days from {fromdate} to {todate}
#### New Features
- `secrets`: added settings and generator submodules, improved tests [#1679]({gitea.url}/{gitea.owner}/{gitea.repo}/pulls/1679)
- `sshd`: added a workaround for CVE-2024-6387 [#1674]({gitea.url}/{gitea.owner}/{gitea.repo}/pulls/1674)
- `secrets`: added settings and generator submodules, improved tests [#1679]({gitea.url}/{gitea.owner}/{gitea.repo}/pulls/1679)
> Users can now generate secrets and manage settings in the new submodules
- `sshd`: added a workaround for CVE-2024-6387 [#1674]({gitea.url}/{gitea.owner}/{gitea.repo}/pulls/1674)
> A workaround has been added to mitigate the security vulnerability
...
#### Refactoring
...

View File

@@ -16,7 +16,7 @@ from nio import (
)
async def upload_image(client: AsyncClient, image_path: str) -> str:
async def upload_image(client: AsyncClient, image_path: Path) -> str:
with open(image_path, "rb") as image_file:
response: UploadResponse
response, _ = await client.upload(image_file, content_type="image/png")

View File

@@ -30,9 +30,9 @@ async def create_jsonl_data(
user_prompt: str,
system_prompt: str,
model: str = "gpt-4o",
max_tokens: int = 2046,
max_tokens: int = 4096,
) -> bytes:
summary_request = {
summary_request: dict[str, Any] = {
"custom_id": "request-1",
"method": "POST",
"url": "/v1/chat/completions",
@@ -45,39 +45,43 @@ async def create_jsonl_data(
"max_tokens": max_tokens,
},
}
dumped = json.dumps(summary_request)
num_tokens = count_tokens(dumped)
log.debug(f"Number of tokens in the JSONL data: {num_tokens}")
if model == "gtp-4o" and num_tokens > 90_000:
raise ValueError(f"Number of tokens {num_tokens} exceeds the limit of 90,000")
encoder = tiktoken.encoding_for_model(model)
count_tokens: int = len(encoder.encode(dumped))
used_tokens = max_tokens + count_tokens + 1000
log.debug(f"Number of tokens in the JSONL data: {used_tokens}")
if used_tokens > 128_000:
# Cut off the excess tokens
tokens_to_remove: int = used_tokens - 128_000
message = summary_request["body"]["messages"][1]
content = message["content"]
content_tokens = encoder.encode(content)
if len(content_tokens) > tokens_to_remove:
# Remove the excess tokens
encoded_content = content_tokens[:-tokens_to_remove]
log.debug(f"Removed {tokens_to_remove} tokens from the content")
# Decode the tokens back to string
content = encoder.decode(encoded_content)
summary_request["body"]["messages"][1]["content"] = content
dumped = json.dumps(summary_request)
else:
raise Exception("Not enough tokens to remove")
new_count_tokens: int = len(encoder.encode(dumped))
if new_count_tokens > 128_000:
raise Exception(f"Too many tokens in the JSONL data {new_count_tokens}")
return dumped.encode("utf-8")
def count_tokens(string: str, model: str = "gpt-4") -> int:
"""
Count the number of tokens in a string using the specified model's tokenizer.
Parameters:
- string (str): The input string to tokenize.
- model (str): The model to use for tokenization. Default is "gpt-4".
Returns:
- int: The number of tokens in the string.
"""
# Get the encoder for the specified model
encoder = tiktoken.encoding_for_model(model)
# Encode the string to get the tokens
tokens = encoder.encode(string)
# Return the number of tokens
return len(tokens)
async def upload_and_process_file(
*, session: aiohttp.ClientSession, jsonl_data: bytes, api_key: str = api_key()
) -> dict[str, Any]:
) -> list[dict[str, Any]]:
"""
Upload a JSONL file to OpenAI's Batch API and process it asynchronously.
"""

View File

@@ -52,6 +52,14 @@ ignore_missing_imports = true
module = "setuptools.*"
ignore_missing_imports = true
[[tool.mypy.overrides]]
module = "nio.*"
ignore_missing_imports = true
[[tool.mypy.overrides]]
module = "markdown2.*"
ignore_missing_imports = true
[tool.ruff]
target-version = "py311"
line-length = 88