From 4686fc734e2e43371aaa578acefc95f5df815a12 Mon Sep 17 00:00:00 2001 From: Severian Date: Thu, 13 Feb 2025 09:39:34 +0800 Subject: [PATCH] chore: 1.2 out --- README.md | 1 + cardextractor_severian_dev.ipynb | 290 +++++++++++++++++++++++++++++++ src/app/api/proxy/route.ts | 97 +++++------ src/app/page.tsx | 14 +- 4 files changed, 342 insertions(+), 60 deletions(-) create mode 100644 cardextractor_severian_dev.ipynb diff --git a/README.md b/README.md index 95b9bce..2963762 100644 --- a/README.md +++ b/README.md @@ -4,5 +4,6 @@ Check package.json for commands, I can't be bothered. ### Changelog +- 1.2: Fix for Janitor having changed their prompt system - 1.1: Fix for R1 handling change, blank user persona bug - 1.0: Yup. diff --git a/cardextractor_severian_dev.ipynb b/cardextractor_severian_dev.ipynb new file mode 100644 index 0000000..bad26b8 --- /dev/null +++ b/cardextractor_severian_dev.ipynb @@ -0,0 +1,290 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "zMVSOtXUASM8" + }, + "source": [ + "## **Card Definition Extractor**\n", + "\n", + "Standalone version with directions: https://sucker.severian.dev\n", + "\n", + "I've gotten into making models at [trashpanda-org](https://huggingface.co/trashpanda-org), check out hasnonname's [Mullein](https://huggingface.co/trashpanda-org/MS-24B-Mullein-v0)!\n", + "\n", + "> _lmk on Discord if you have any issues while using this - Severian_\n", + "\n", + "---\n", + "\n", + "**Changelog:**\n", + "- v0.2: fixed to handle Janitor making changes due to R1 handling.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "background_save": true, + "base_uri": "https://localhost:8080/" + }, + "id": "a0pFE9KCDh8P", + "outputId": "d647688d-e541-4e5f-e13d-4b385ee84d8b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: flask-cors in /usr/local/lib/python3.11/dist-packages (5.0.0)\n", + "Requirement already satisfied: Flask>=0.9 in /usr/local/lib/python3.11/dist-packages (from flask-cors) (3.1.0)\n", + "Requirement already satisfied: Werkzeug>=3.1 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (3.1.3)\n", + "Requirement already satisfied: Jinja2>=3.1.2 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (3.1.5)\n", + "Requirement already satisfied: itsdangerous>=2.2 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (2.2.0)\n", + "Requirement already satisfied: click>=8.1.3 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (8.1.8)\n", + "Requirement already satisfied: blinker>=1.9 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (1.9.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from Jinja2>=3.1.2->Flask>=0.9->flask-cors) (3.0.2)\n", + "Requirement already satisfied: flask-cors in /usr/local/lib/python3.11/dist-packages (5.0.0)\n", + "Requirement already satisfied: flask_cloudflared in /usr/local/lib/python3.11/dist-packages (0.0.14)\n", + "Requirement already satisfied: Flask>=0.9 in /usr/local/lib/python3.11/dist-packages (from flask-cors) (3.1.0)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from flask_cloudflared) (2.32.3)\n", + "Requirement already satisfied: Werkzeug>=3.1 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (3.1.3)\n", + "Requirement already satisfied: Jinja2>=3.1.2 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (3.1.5)\n", + "Requirement already satisfied: itsdangerous>=2.2 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (2.2.0)\n", + "Requirement already satisfied: click>=8.1.3 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (8.1.8)\n", + "Requirement already satisfied: blinker>=1.9 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (1.9.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->flask_cloudflared) (3.4.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->flask_cloudflared) (3.10)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->flask_cloudflared) (2.3.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->flask_cloudflared) (2024.12.14)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from Jinja2>=3.1.2->Flask>=0.9->flask-cors) (3.0.2)\n", + " * Serving Flask app '__main__'\n", + " * Debug mode: off\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:werkzeug:\u001b[31m\u001b[1mWARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.\u001b[0m\n", + " * Running on http://127.0.0.1:5000\n", + "INFO:werkzeug:\u001b[33mPress CTRL+C to quit\u001b[0m\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " * Running on https://little-disputes-posting-palmer.trycloudflare.com\n", + " * Traffic stats available on http://127.0.0.1:8396/metrics\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:werkzeug:127.0.0.1 - - [04/Feb/2025 22:53:13] \"OPTIONS / HTTP/1.1\" 200 -\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Card definition JSON created at: /tmp/tmpynlda8kv.json\n" + ] + }, + { + "data": { + "application/javascript": "\n async function download(id, filename, size) {\n if (!google.colab.kernel.accessAllowed) {\n return;\n }\n const div = document.createElement('div');\n const label = document.createElement('label');\n label.textContent = `Downloading \"${filename}\": `;\n div.appendChild(label);\n const progress = document.createElement('progress');\n progress.max = size;\n div.appendChild(progress);\n document.body.appendChild(div);\n\n const buffers = [];\n let downloaded = 0;\n\n const channel = await google.colab.kernel.comms.open(id);\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n\n for await (const message of channel.messages) {\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n if (message.buffers) {\n for (const buffer of message.buffers) {\n buffers.push(buffer);\n downloaded += buffer.byteLength;\n progress.value = downloaded;\n }\n }\n }\n const blob = new Blob(buffers, {type: 'application/binary'});\n const a = document.createElement('a');\n a.href = window.URL.createObjectURL(blob);\n a.download = filename;\n div.appendChild(a);\n a.click();\n div.remove();\n }\n ", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": "download(\"download_8462ad82-aca0-40a4-8cbb-0342ff5a7e1c\", \"tmpynlda8kv.json\", 14791)", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:werkzeug:127.0.0.1 - - [04/Feb/2025 22:53:14] \"POST / HTTP/1.1\" 200 -\n" + ] + } + ], + "source": [ + "# @title Card Definition Extractor\n", + "\n", + "# @markdown Directions for use:\n", + "# @markdown - If enabled, starts the proxy in character card extraction mode.\n", + "# @markdown - Use the proxy as normal, and start a new chat with your character of choice.\n", + "# @markdown - After sending the first message, the proxy will process the character card in v1 format\n", + "# @markdown - Stop the proxy and Colab will download the JSON file on your device\n", + "# @markdown - Your custom prompt will appear on the description field so this is best used with a cleared-out custom prompt section on janitor.ai\n", + "# @markdown - You can start multiple new chats and send messages for the extractor to capture cards, and when you stop the notebook, it will download all extracted files at once.\n", + "\n", + "# @markdown **Select Tunnel Provider**\n", + "tunnel_provider = \"Cloudflare\" # @param [\"Cloudflare\", \"Localtunnel\", \"Ngrok\"]\n", + "\n", + "# @markdown **Ngrok Auth Token**: If using ngrok, sign up for an auth token at https://dashboard.ngrok.com/signup\n", + "ngrok_auth_token = \"\" # @param {type:\"string\"}\n", + "\n", + "card_definition_extractor = True\n", + "!pip install flask-cors\n", + "\n", + "import json\n", + "import requests\n", + "import time\n", + "from flask import Flask, request, jsonify\n", + "from flask_cors import CORS\n", + "import re\n", + "import tempfile\n", + "import os\n", + "\n", + "app = Flask(__name__)\n", + "CORS(app)\n", + "\n", + "# Depending on the provider, set up the tunnel\n", + "if tunnel_provider == \"Cloudflare\":\n", + " !pip install flask-cors flask_cloudflared\n", + " from flask_cloudflared import run_with_cloudflared\n", + " run_with_cloudflared(app)\n", + "elif tunnel_provider == \"Localtunnel\":\n", + " !pip install flask-cors flask_localtunnel\n", + " from flask_lt import run_with_lt\n", + " run_with_lt(app)\n", + "elif tunnel_provider == \"Ngrok\":\n", + " !pip install flask-cors pyngrok==7.1.2\n", + " from pyngrok import ngrok\n", + " if ngrok_auth_token.strip():\n", + " ngrok.set_auth_token(ngrok_auth_token.strip())\n", + " public_url = ngrok.connect(5000).public_url\n", + " print(\"Public URL:\", public_url)\n", + "\n", + "def extract_between_tags(content, tag):\n", + " \"\"\"\n", + " Extracts content between XML-like tags.\n", + " Returns empty string if tag not found.\n", + " \"\"\"\n", + " start_tag = f\"<{tag}>\"\n", + " end_tag = f\"\"\n", + " start_idx = content.find(start_tag)\n", + " if start_idx == -1:\n", + " return \"\"\n", + " \n", + " end_idx = content.find(end_tag, start_idx)\n", + " if end_idx == -1:\n", + " return \"\"\n", + " \n", + " return content[start_idx + len(start_tag):end_idx].strip()\n", + "\n", + "def find_tags_between(content, start_marker, end_marker):\n", + " \"\"\"\n", + " Finds all XML-like tags and their content between two marker tags.\n", + " Returns list of {tag, content} dictionaries.\n", + " \"\"\"\n", + " start_idx = content.find(f\"<{start_marker}>\")\n", + " if start_idx == -1:\n", + " return []\n", + " \n", + " end_idx = content.find(f\"<{end_marker}>\")\n", + " if end_idx == -1:\n", + " return []\n", + " \n", + " section = content[start_idx + len(start_marker) + 2:end_idx]\n", + " tag_regex = r\"<([^/>]+)>([^<]+)\"\n", + " matches = re.finditer(tag_regex, section)\n", + " \n", + " return [{\"tag\": match.group(1), \"content\": match.group(2).strip()} for match in matches]\n", + "\n", + "def extract_card_data(messages):\n", + " content0 = messages[0][\"content\"]\n", + " content1 = messages[2][\"content\"]\n", + "\n", + " # Find all persona tags between system and scenario, take the last one as character\n", + " personas = find_tags_between(content0, \"system\", \"scenario\")\n", + " char_persona = personas[-1] if personas else {\"tag\": \"\", \"content\": \"\"}\n", + " char_name = char_persona[\"tag\"]\n", + "\n", + " card_data = {\n", + " \"name\": char_name,\n", + " \"description\": char_persona[\"content\"],\n", + " \"scenario\": extract_between_tags(content0, \"scenario\"),\n", + " \"mes_example\": extract_between_tags(content0, \"example_dialogs\"),\n", + " \"personality\": \"\", # This field isn't used in the new format\n", + " \"first_mes\": content1\n", + " }\n", + "\n", + " # Replace character name with placeholder in all fields\n", + " def safe_replace(text, old, new):\n", + " return text.replace(old, new) if old else text\n", + "\n", + " for field in card_data:\n", + " if field != \"name\": # Exclude the \"name\" field\n", + " val = card_data[field]\n", + " val = safe_replace(val, char_name, \"{{char}}\")\n", + " card_data[field] = val\n", + "\n", + " return card_data\n", + "\n", + "@app.route('/', methods=['GET'])\n", + "def default():\n", + " return {\"status\": \"online\"}\n", + "\n", + "@app.route('/', methods=['POST'])\n", + "def process_card():\n", + " body = request.json\n", + " if 'messages' not in body:\n", + " return jsonify(error=\"Missing 'messages' in request body\"), 400\n", + "\n", + " if card_definition_extractor and len(body[\"messages\"]) >= 2:\n", + " card_data = extract_card_data(body[\"messages\"])\n", + " # If running in Colab, download the file\n", + " try:\n", + " from google.colab import files\n", + " import tempfile\n", + " temp_json = tempfile.NamedTemporaryFile(delete=False, suffix=\".json\")\n", + " with open(temp_json.name, 'w', encoding='utf-8') as f:\n", + " json.dump(card_data, f, ensure_ascii=False, indent=2)\n", + " print(\"Card definition JSON created at:\", temp_json.name)\n", + " files.download(temp_json.name)\n", + " except ImportError:\n", + " pass # Not in Colab, just return JSON\n", + "\n", + " return jsonify(card_data), 200\n", + " else:\n", + " return jsonify(status=\"Card definition extractor not enabled or insufficient messages\"), 200\n", + "\n", + "if __name__ == '__main__':\n", + " if tunnel_provider != \"Cloudflare\":\n", + " print('\\n Colab IP: ', end='')\n", + " !curl ipecho.net/plain\n", + " print('\\n')\n", + " app.run()\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/src/app/api/proxy/route.ts b/src/app/api/proxy/route.ts index 31628fd..f7fe105 100644 --- a/src/app/api/proxy/route.ts +++ b/src/app/api/proxy/route.ts @@ -33,17 +33,38 @@ interface CardData { scenario: string; } -function extractPersonaName(content: string, personaIndex: number = 0): string { - const personaMatches = Array.from(content.matchAll(/'s Persona:/g)); - if (personaMatches.length <= personaIndex) return ""; +interface PersonaMatch { + tag: string; + content: string; +} - const personaIdx = personaMatches[personaIndex].index!; - const lineStartIdx = content.lastIndexOf("\n", personaIdx); - const lineEndIdx = personaIdx; +function findTagsBetween(content: string, startMarker: string, endMarker: string): PersonaMatch[] { + const startIdx = content.indexOf(`<${startMarker}>`); + if (startIdx === -1) return []; + + const endIdx = content.indexOf(`<${endMarker}>`); + if (endIdx === -1) return []; + + const section = content.slice(startIdx + startMarker.length + 2, endIdx); + const tagRegex = /<([^/>]+)>([^<]+)<\/\1>/g; + const matches = Array.from(section.matchAll(tagRegex)); + + return matches.map(match => ({ + tag: match[1], + content: match[2].trim() + })); +} - return content - .slice(lineStartIdx === -1 ? 0 : lineStartIdx + 1, lineEndIdx) - .trim(); +function extractBetweenTags(content: string, tag: string): string { + const startTag = `<${tag}>`; + const endTag = ``; + const startIndex = content.indexOf(startTag); + if (startIndex === -1) return ""; + + const endIndex = content.indexOf(endTag, startIndex); + if (endIndex === -1) return ""; + + return content.slice(startIndex + startTag.length, endIndex).trim(); } function safeReplace(text: string, old: string, newStr: string): string { @@ -54,63 +75,27 @@ function extractCardData(messages: Message[]): CardData { const content0 = messages[0].content; const content1 = messages[2].content; - const userName = extractPersonaName(content0, 0); - const charName = extractPersonaName(content0, 1); + // Find all persona tags between system and scenario, take the last one as character + const personas = findTagsBetween(content0, "system", "scenario"); + const charPersona = personas[personas.length - 1]; + const charName = charPersona?.tag || ""; - const personaMatches = Array.from(content0.matchAll(/'s Persona:/g)); + // Initialize card data with the character name let cardData: CardData = { name: charName, - description: "", - scenario: "", - mes_example: "", - personality: "", + description: charPersona?.content || "", + scenario: extractBetweenTags(content0, "scenario"), + mes_example: extractBetweenTags(content0, "example_dialogs"), + personality: "", // This field isn't used in the new format first_mes: content1, }; - // blank user persona handling, or at least an attempt - let secondPersonaIdx = - personaMatches[personaMatches.length >= 2 ? 1 : 0]?.index; - const startDesc = secondPersonaIdx + "'s Persona:".length; - const remaining = content0.slice(startDesc); - - const scenarioMarker = remaining.match(/Scenario of the roleplay:/); - const exampleMarker = remaining.match(/Example conversations between/); - - let endIdx = remaining.length; - if (scenarioMarker) endIdx = Math.min(endIdx, scenarioMarker.index!); - if (exampleMarker) endIdx = Math.min(endIdx, exampleMarker.index!); - - cardData.description = remaining.slice(0, endIdx).trim(); - - if (scenarioMarker) { - const scenarioStart = scenarioMarker.index! + scenarioMarker[0].length; - const scenarioRemaining = remaining.slice(scenarioStart); - const exampleInScenarioMarker = scenarioRemaining.match( - /Example conversations between/ - ); - const scenarioEnd = exampleInScenarioMarker - ? exampleInScenarioMarker.index! - : scenarioRemaining.length; - cardData.scenario = scenarioRemaining.slice(0, scenarioEnd).trim(); - } - - if (exampleMarker) { - const exampleStart = exampleMarker.index!; - const rawExampleStr = remaining.slice(exampleStart).trim(); - const colonIdx = rawExampleStr.indexOf(":"); - cardData.mes_example = - colonIdx !== -1 - ? rawExampleStr.slice(colonIdx + 1).trim() - : rawExampleStr.trim(); - } - + // Replace character name with placeholder in all fields for (const field in cardData) { if (field !== "name") { const val = cardData[field as keyof CardData]; if (typeof val === "string") { - let newVal = safeReplace(val, userName, "{{user}}"); - newVal = safeReplace(newVal, charName, "{{char}}"); - cardData[field as keyof CardData] = newVal; + cardData[field as keyof CardData] = safeReplace(val, charName, "{{char}}"); } } } diff --git a/src/app/page.tsx b/src/app/page.tsx index e348363..68e4b28 100644 --- a/src/app/page.tsx +++ b/src/app/page.tsx @@ -187,9 +187,9 @@ export default function Home() {
-

Sucker v1.1

+

Sucker v1.2

- Fixed R1 handling change, blank user persona bug. + If it's any consolation, I hate making these changes too.