From 4686fc734e2e43371aaa578acefc95f5df815a12 Mon Sep 17 00:00:00 2001
From: Severian <yo@severian.dev>
Date: Thu, 13 Feb 2025 09:39:34 +0800
Subject: [PATCH] chore: 1.2 out

---
 README.md                        |   1 +
 cardextractor_severian_dev.ipynb | 290 +++++++++++++++++++++++++++++++
 src/app/api/proxy/route.ts       |  97 +++++------
 src/app/page.tsx                 |  14 +-
 4 files changed, 342 insertions(+), 60 deletions(-)
 create mode 100644 cardextractor_severian_dev.ipynb

diff --git a/README.md b/README.md
index 95b9bce..2963762 100644
--- a/README.md
+++ b/README.md
@@ -4,5 +4,6 @@ Check package.json for commands, I can't be bothered.
 
 ### Changelog
 
+- 1.2: Fix for Janitor's new prompt system (card fields are now read from XML-like tags)
 - 1.1: Fix for R1 handling change, blank user persona bug
 - 1.0: Yup.
diff --git a/cardextractor_severian_dev.ipynb b/cardextractor_severian_dev.ipynb
new file mode 100644
index 0000000..bad26b8
--- /dev/null
+++ b/cardextractor_severian_dev.ipynb
@@ -0,0 +1,290 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "zMVSOtXUASM8"
+      },
+      "source": [
+        "## **Card Definition Extractor**\n",
+        "\n",
+        "Standalone version with directions: https://sucker.severian.dev\n",
+        "\n",
+        "I've gotten into making models at [trashpanda-org](https://huggingface.co/trashpanda-org), check out hasnonname's [Mullein](https://huggingface.co/trashpanda-org/MS-24B-Mullein-v0)!\n",
+        "\n",
+        "> _lmk on Discord if you have any issues while using this - Severian_\n",
+        "\n",
+        "---\n",
+        "\n",
+        "**Changelog:**\n",
+        "- v0.2: Fixed to handle the changes Janitor made for R1 handling.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "background_save": true,
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "a0pFE9KCDh8P",
+        "outputId": "d647688d-e541-4e5f-e13d-4b385ee84d8b"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Requirement already satisfied: flask-cors in /usr/local/lib/python3.11/dist-packages (5.0.0)\n",
+            "Requirement already satisfied: Flask>=0.9 in /usr/local/lib/python3.11/dist-packages (from flask-cors) (3.1.0)\n",
+            "Requirement already satisfied: Werkzeug>=3.1 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (3.1.3)\n",
+            "Requirement already satisfied: Jinja2>=3.1.2 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (3.1.5)\n",
+            "Requirement already satisfied: itsdangerous>=2.2 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (2.2.0)\n",
+            "Requirement already satisfied: click>=8.1.3 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (8.1.8)\n",
+            "Requirement already satisfied: blinker>=1.9 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (1.9.0)\n",
+            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from Jinja2>=3.1.2->Flask>=0.9->flask-cors) (3.0.2)\n",
+            "Requirement already satisfied: flask-cors in /usr/local/lib/python3.11/dist-packages (5.0.0)\n",
+            "Requirement already satisfied: flask_cloudflared in /usr/local/lib/python3.11/dist-packages (0.0.14)\n",
+            "Requirement already satisfied: Flask>=0.9 in /usr/local/lib/python3.11/dist-packages (from flask-cors) (3.1.0)\n",
+            "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from flask_cloudflared) (2.32.3)\n",
+            "Requirement already satisfied: Werkzeug>=3.1 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (3.1.3)\n",
+            "Requirement already satisfied: Jinja2>=3.1.2 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (3.1.5)\n",
+            "Requirement already satisfied: itsdangerous>=2.2 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (2.2.0)\n",
+            "Requirement already satisfied: click>=8.1.3 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (8.1.8)\n",
+            "Requirement already satisfied: blinker>=1.9 in /usr/local/lib/python3.11/dist-packages (from Flask>=0.9->flask-cors) (1.9.0)\n",
+            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->flask_cloudflared) (3.4.1)\n",
+            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->flask_cloudflared) (3.10)\n",
+            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->flask_cloudflared) (2.3.0)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->flask_cloudflared) (2024.12.14)\n",
+            "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from Jinja2>=3.1.2->Flask>=0.9->flask-cors) (3.0.2)\n",
+            " * Serving Flask app '__main__'\n",
+            " * Debug mode: off\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:werkzeug:\u001b[31m\u001b[1mWARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.\u001b[0m\n",
+            " * Running on http://127.0.0.1:5000\n",
+            "INFO:werkzeug:\u001b[33mPress CTRL+C to quit\u001b[0m\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            " * Running on https://little-disputes-posting-palmer.trycloudflare.com\n",
+            " * Traffic stats available on http://127.0.0.1:8396/metrics\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:werkzeug:127.0.0.1 - - [04/Feb/2025 22:53:13] \"OPTIONS / HTTP/1.1\" 200 -\n"
+          ]
+        },
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Card definition JSON created at: /tmp/tmpynlda8kv.json\n"
+          ]
+        },
+        {
+          "data": {
+            "application/javascript": "\n    async function download(id, filename, size) {\n      if (!google.colab.kernel.accessAllowed) {\n        return;\n      }\n      const div = document.createElement('div');\n      const label = document.createElement('label');\n      label.textContent = `Downloading \"${filename}\": `;\n      div.appendChild(label);\n      const progress = document.createElement('progress');\n      progress.max = size;\n      div.appendChild(progress);\n      document.body.appendChild(div);\n\n      const buffers = [];\n      let downloaded = 0;\n\n      const channel = await google.colab.kernel.comms.open(id);\n      // Send a message to notify the kernel that we're ready.\n      channel.send({})\n\n      for await (const message of channel.messages) {\n        // Send a message to notify the kernel that we're ready.\n        channel.send({})\n        if (message.buffers) {\n          for (const buffer of message.buffers) {\n            buffers.push(buffer);\n            downloaded += buffer.byteLength;\n            progress.value = downloaded;\n          }\n        }\n      }\n      const blob = new Blob(buffers, {type: 'application/binary'});\n      const a = document.createElement('a');\n      a.href = window.URL.createObjectURL(blob);\n      a.download = filename;\n      div.appendChild(a);\n      a.click();\n      div.remove();\n    }\n  ",
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        },
+        {
+          "data": {
+            "application/javascript": "download(\"download_8462ad82-aca0-40a4-8cbb-0342ff5a7e1c\", \"tmpynlda8kv.json\", 14791)",
+            "text/plain": [
+              "<IPython.core.display.Javascript object>"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:werkzeug:127.0.0.1 - - [04/Feb/2025 22:53:14] \"POST / HTTP/1.1\" 200 -\n"
+          ]
+        }
+      ],
+      "source": [
+        "# @title Card Definition Extractor\n",
+        "\n",
+        "# @markdown Directions for use:\n",
+        "# @markdown - If enabled, starts the proxy in character card extraction mode.\n",
+        "# @markdown - Use the proxy as normal, and start a new chat with your character of choice.\n",
+        "# @markdown - After you send the first message, the proxy will extract the character card in v1 format.\n",
+        "# @markdown - Stop the proxy and Colab will download the JSON file to your device.\n",
+        "# @markdown - Your custom prompt will appear in the description field, so this is best used with a cleared-out custom prompt section on janitor.ai.\n",
+        "# @markdown - You can start multiple new chats and send messages for the extractor to capture cards, and when you stop the notebook, it will download all extracted files at once.\n",
+        "\n",
+        "# @markdown **Select Tunnel Provider**\n",
+        "tunnel_provider = \"Cloudflare\"  # @param [\"Cloudflare\", \"Localtunnel\", \"Ngrok\"]\n",
+        "\n",
+        "# @markdown **Ngrok Auth Token**: If using ngrok, sign up for an auth token at https://dashboard.ngrok.com/signup\n",
+        "ngrok_auth_token = \"\"  # @param {type:\"string\"}\n",
+        "\n",
+        "card_definition_extractor = True\n",
+        "!pip install flask-cors\n",
+        "\n",
+        "import json\n",
+        "import requests\n",
+        "import time\n",
+        "from flask import Flask, request, jsonify\n",
+        "from flask_cors import CORS\n",
+        "import re\n",
+        "import tempfile\n",
+        "import os\n",
+        "\n",
+        "app = Flask(__name__)\n",
+        "CORS(app)\n",
+        "\n",
+        "# Depending on the provider, set up the tunnel\n",
+        "if tunnel_provider == \"Cloudflare\":\n",
+        "    !pip install flask-cors flask_cloudflared\n",
+        "    from flask_cloudflared import run_with_cloudflared\n",
+        "    run_with_cloudflared(app)\n",
+        "elif tunnel_provider == \"Localtunnel\":\n",
+        "    !pip install flask-cors flask_localtunnel\n",
+        "    from flask_lt import run_with_lt\n",
+        "    run_with_lt(app)\n",
+        "elif tunnel_provider == \"Ngrok\":\n",
+        "    !pip install flask-cors pyngrok==7.1.2\n",
+        "    from pyngrok import ngrok\n",
+        "    if ngrok_auth_token.strip():\n",
+        "        ngrok.set_auth_token(ngrok_auth_token.strip())\n",
+        "    public_url = ngrok.connect(5000).public_url\n",
+        "    print(\"Public URL:\", public_url)\n",
+        "\n",
+        "def extract_between_tags(content, tag):\n",
+        "    \"\"\"\n",
+        "    Extracts content between XML-like tags.\n",
+        "    Returns empty string if tag not found.\n",
+        "    \"\"\"\n",
+        "    start_tag = f\"<{tag}>\"\n",
+        "    end_tag = f\"</{tag}>\"\n",
+        "    start_idx = content.find(start_tag)\n",
+        "    if start_idx == -1:\n",
+        "        return \"\"\n",
+        "    \n",
+        "    end_idx = content.find(end_tag, start_idx)\n",
+        "    if end_idx == -1:\n",
+        "        return \"\"\n",
+        "    \n",
+        "    return content[start_idx + len(start_tag):end_idx].strip()\n",
+        "\n",
+        "def find_tags_between(content, start_marker, end_marker):\n",
+        "    \"\"\"\n",
+        "    Finds all XML-like tags and their content between two marker tags.\n",
+        "    Returns list of {tag, content} dictionaries.\n",
+        "    \"\"\"\n",
+        "    start_idx = content.find(f\"<{start_marker}>\")\n",
+        "    if start_idx == -1:\n",
+        "        return []\n",
+        "    \n",
+        "    end_idx = content.find(f\"<{end_marker}>\")\n",
+        "    if end_idx == -1:\n",
+        "        return []\n",
+        "    \n",
+        "    section = content[start_idx + len(start_marker) + 2:end_idx]\n",
+        "    tag_regex = r\"<([^/>]+)>([^<]+)</\\1>\"\n",
+        "    matches = re.finditer(tag_regex, section)\n",
+        "    \n",
+        "    return [{\"tag\": match.group(1), \"content\": match.group(2).strip()} for match in matches]\n",
+        "\n",
+        "def extract_card_data(messages):\n",
+        "    content0 = messages[0][\"content\"]\n",
+        "    content1 = messages[2][\"content\"]\n",
+        "\n",
+        "    # Find all persona tags between system and scenario, take the last one as character\n",
+        "    personas = find_tags_between(content0, \"system\", \"scenario\")\n",
+        "    char_persona = personas[-1] if personas else {\"tag\": \"\", \"content\": \"\"}\n",
+        "    char_name = char_persona[\"tag\"]\n",
+        "\n",
+        "    card_data = {\n",
+        "        \"name\": char_name,\n",
+        "        \"description\": char_persona[\"content\"],\n",
+        "        \"scenario\": extract_between_tags(content0, \"scenario\"),\n",
+        "        \"mes_example\": extract_between_tags(content0, \"example_dialogs\"),\n",
+        "        \"personality\": \"\",  # This field isn't used in the new format\n",
+        "        \"first_mes\": content1\n",
+        "    }\n",
+        "\n",
+        "    # Replace character name with placeholder in all fields\n",
+        "    def safe_replace(text, old, new):\n",
+        "        return text.replace(old, new) if old else text\n",
+        "\n",
+        "    for field in card_data:\n",
+        "        if field != \"name\":  # Exclude the \"name\" field\n",
+        "            val = card_data[field]\n",
+        "            val = safe_replace(val, char_name, \"{{char}}\")\n",
+        "            card_data[field] = val\n",
+        "\n",
+        "    return card_data\n",
+        "\n",
+        "@app.route('/', methods=['GET'])\n",
+        "def default():\n",
+        "    return {\"status\": \"online\"}\n",
+        "\n",
+        "@app.route('/', methods=['POST'])\n",
+        "def process_card():\n",
+        "    body = request.json\n",
+        "    if 'messages' not in body:\n",
+        "        return jsonify(error=\"Missing 'messages' in request body\"), 400\n",
+        "\n",
+        "    if card_definition_extractor and len(body[\"messages\"]) >= 2:\n",
+        "        card_data = extract_card_data(body[\"messages\"])\n",
+        "        # If running in Colab, download the file\n",
+        "        try:\n",
+        "            from google.colab import files\n",
+        "            import tempfile\n",
+        "            temp_json = tempfile.NamedTemporaryFile(delete=False, suffix=\".json\")\n",
+        "            with open(temp_json.name, 'w', encoding='utf-8') as f:\n",
+        "                json.dump(card_data, f, ensure_ascii=False, indent=2)\n",
+        "            print(\"Card definition JSON created at:\", temp_json.name)\n",
+        "            files.download(temp_json.name)\n",
+        "        except ImportError:\n",
+        "            pass  # Not in Colab, just return JSON\n",
+        "\n",
+        "        return jsonify(card_data), 200\n",
+        "    else:\n",
+        "        return jsonify(status=\"Card definition extractor not enabled or insufficient messages\"), 200\n",
+        "\n",
+        "if __name__ == '__main__':\n",
+        "    if tunnel_provider != \"Cloudflare\":\n",
+        "        print('\\n Colab IP: ', end='')\n",
+        "        !curl ipecho.net/plain\n",
+        "        print('\\n')\n",
+        "    app.run()\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/src/app/api/proxy/route.ts b/src/app/api/proxy/route.ts
index 31628fd..f7fe105 100644
--- a/src/app/api/proxy/route.ts
+++ b/src/app/api/proxy/route.ts
@@ -33,17 +33,38 @@ interface CardData {
   scenario: string;
 }
 
-function extractPersonaName(content: string, personaIndex: number = 0): string {
-  const personaMatches = Array.from(content.matchAll(/'s Persona:/g));
-  if (personaMatches.length <= personaIndex) return "";
+interface PersonaMatch {
+  tag: string;
+  content: string;
+}
 
-  const personaIdx = personaMatches[personaIndex].index!;
-  const lineStartIdx = content.lastIndexOf("\n", personaIdx);
-  const lineEndIdx = personaIdx;
+function findTagsBetween(content: string, startMarker: string, endMarker: string): PersonaMatch[] {
+  const startIdx = content.indexOf(`<${startMarker}>`);
+  if (startIdx === -1) return [];
+  
+  const endIdx = content.indexOf(`<${endMarker}>`);
+  if (endIdx === -1) return [];
+  
+  const section = content.slice(startIdx + startMarker.length + 2, endIdx);
+  const tagRegex = /<([^/>]+)>([^<]+)<\/\1>/g;
+  const matches = Array.from(section.matchAll(tagRegex));
+  
+  return matches.map(match => ({
+    tag: match[1],
+    content: match[2].trim()
+  }));
+}
 
-  return content
-    .slice(lineStartIdx === -1 ? 0 : lineStartIdx + 1, lineEndIdx)
-    .trim();
+function extractBetweenTags(content: string, tag: string): string {
+  const startTag = `<${tag}>`;
+  const endTag = `</${tag}>`;
+  const startIndex = content.indexOf(startTag);
+  if (startIndex === -1) return "";
+  
+  const endIndex = content.indexOf(endTag, startIndex);
+  if (endIndex === -1) return "";
+  
+  return content.slice(startIndex + startTag.length, endIndex).trim();
 }
 
 function safeReplace(text: string, old: string, newStr: string): string {
@@ -54,63 +75,27 @@ function extractCardData(messages: Message[]): CardData {
   const content0 = messages[0].content;
   const content1 = messages[2].content;
 
-  const userName = extractPersonaName(content0, 0);
-  const charName = extractPersonaName(content0, 1);
+  // Find all persona tags between system and scenario, take the last one as character
+  const personas = findTagsBetween(content0, "system", "scenario");
+  const charPersona = personas[personas.length - 1];
+  const charName = charPersona?.tag || "";
 
-  const personaMatches = Array.from(content0.matchAll(/'s Persona:/g));
+  // Initialize card data with the character name
   let cardData: CardData = {
     name: charName,
-    description: "",
-    scenario: "",
-    mes_example: "",
-    personality: "",
+    description: charPersona?.content || "",
+    scenario: extractBetweenTags(content0, "scenario"),
+    mes_example: extractBetweenTags(content0, "example_dialogs"),
+    personality: "", // This field isn't used in the new format
     first_mes: content1,
   };
 
-  // blank user persona handling, or at least an attempt
-  let secondPersonaIdx =
-    personaMatches[personaMatches.length >= 2 ? 1 : 0]?.index;
-  const startDesc = secondPersonaIdx + "'s Persona:".length;
-  const remaining = content0.slice(startDesc);
-
-  const scenarioMarker = remaining.match(/Scenario of the roleplay:/);
-  const exampleMarker = remaining.match(/Example conversations between/);
-
-  let endIdx = remaining.length;
-  if (scenarioMarker) endIdx = Math.min(endIdx, scenarioMarker.index!);
-  if (exampleMarker) endIdx = Math.min(endIdx, exampleMarker.index!);
-
-  cardData.description = remaining.slice(0, endIdx).trim();
-
-  if (scenarioMarker) {
-    const scenarioStart = scenarioMarker.index! + scenarioMarker[0].length;
-    const scenarioRemaining = remaining.slice(scenarioStart);
-    const exampleInScenarioMarker = scenarioRemaining.match(
-      /Example conversations between/
-    );
-    const scenarioEnd = exampleInScenarioMarker
-      ? exampleInScenarioMarker.index!
-      : scenarioRemaining.length;
-    cardData.scenario = scenarioRemaining.slice(0, scenarioEnd).trim();
-  }
-
-  if (exampleMarker) {
-    const exampleStart = exampleMarker.index!;
-    const rawExampleStr = remaining.slice(exampleStart).trim();
-    const colonIdx = rawExampleStr.indexOf(":");
-    cardData.mes_example =
-      colonIdx !== -1
-        ? rawExampleStr.slice(colonIdx + 1).trim()
-        : rawExampleStr.trim();
-  }
-
+  // Replace character name with placeholder in all fields
   for (const field in cardData) {
     if (field !== "name") {
       const val = cardData[field as keyof CardData];
       if (typeof val === "string") {
-        let newVal = safeReplace(val, userName, "{{user}}");
-        newVal = safeReplace(newVal, charName, "{{char}}");
-        cardData[field as keyof CardData] = newVal;
+        cardData[field as keyof CardData] = safeReplace(val, charName, "{{char}}");
       }
     }
   }
diff --git a/src/app/page.tsx b/src/app/page.tsx
index e348363..68e4b28 100644
--- a/src/app/page.tsx
+++ b/src/app/page.tsx
@@ -187,9 +187,9 @@ export default function Home() {
       <div className="container mx-auto px-4 py-8">
         <div className="flex justify-between items-center mb-4">
           <div>
-            <h1 className="text-3xl font-bold">Sucker v1.1</h1>
+            <h1 className="text-3xl font-bold">Sucker v1.2</h1>
             <p className="text-sm text-muted-foreground">
-              Fixed R1 handling change, blank user persona bug.
+              If it's any consolation, I hate making these changes too.
             </p>
           </div>
           <Button
@@ -233,8 +233,14 @@ export default function Home() {
                   API settings, any value for model and key.
                 </li>
                 <li className="mb-2">
-                  Remove your custom prompt - otherwise, it'll get inserted into
-                  cards, on the example message section.
+                  <span className="line-through">
+                    Remove your custom prompt - otherwise, it'll get inserted
+                    into cards, on the example message section.
+                  </span>
+                  <span>
+                    &nbsp;No need for this anymore. At least the new prompt system has
+                    it separate now.
+                  </span>
                 </li>
                 <li className="mb-2">
                   Save settings and refresh the page. Not this page. <i>That</i>{" "}