Merge pull request #6645 from oobabooga/dev
Merge dev branch
oobabooga authored Jan 9, 2025
2 parents 88a6331 + f3c0f96 commit e6eda6a
Showing 42 changed files with 838 additions and 530 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -10,7 +10,7 @@ Its goal is to become the [AUTOMATIC1111/stable-diffusion-webui](https://github.

## Features

- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM) are also supported but you need to install them manually.
- Supports multiple text generation backends in one UI/API, including [Transformers](https://github.com/huggingface/transformers), [llama.cpp](https://github.com/ggerganov/llama.cpp), and [ExLlamaV2](https://github.com/turboderp-org/exllamav2). [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) is supported via its own [Dockerfile](https://github.com/oobabooga/text-generation-webui/blob/main/docker/TensorRT-LLM/Dockerfile), and the Transformers loader is compatible with libraries like [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ), [AutoAWQ](https://github.com/casper-hansen/AutoAWQ), [HQQ](https://github.com/mobiusml/hqq), and [AQLM](https://github.com/Vahe1994/AQLM), but they must be installed manually.
- OpenAI-compatible API with Chat and Completions endpoints – see [examples](https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API#examples).
- Automatic prompt formatting using Jinja2 templates.
- Three chat modes: `instruct`, `chat-instruct`, and `chat`, with automatic prompt templates in `chat-instruct`.
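As a concrete illustration of the manual-install note above, the extra quantization backends can be added to the web UI's own environment before loading a model with the Transformers loader. This is a minimal sketch, assuming the one-click installer was used and that its `cmd_linux.sh` helper (or `cmd_windows.bat` / `cmd_macos.sh`) is available to open a shell inside the project's environment; AutoAWQ is just one example package:

```bash
# Open a shell inside the environment created by the one-click installer
./cmd_linux.sh

# Then install the extra backend manually, e.g. AutoAWQ
pip install autoawq
```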
1 change: 1 addition & 0 deletions css/chat_style-cai-chat.css
@@ -9,6 +9,7 @@

.message-body {
margin-top: 3px;
font-size: 15px !important;
}

.circle-you {
23 changes: 17 additions & 6 deletions css/main.css
@@ -226,6 +226,7 @@ button {
max-width: 500px;
background-color: var(--input-background-fill);
border: var(--input-border-width) solid var(--input-border-color) !important;
padding: 10px;
}

.file-saver > :first-child > :last-child {
@@ -499,8 +500,8 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
margin-bottom: 0.5em !important;
}

.message-body ul.long-list li,
.message-body ol.long-list li {
.message-body ul.long-list > li,
.message-body ol.long-list > li {
margin-top: 1.25em !important;
margin-bottom: 1.25em !important;
}
@@ -538,8 +539,9 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}

.message-body pre > code {
white-space: pre-wrap !important;
word-wrap: break-word !important;
white-space: pre !important;
overflow-x: auto !important;
max-width: calc(100dvw - 39px);
border: 1px solid #666;
border-radius: 5px;
font-size: 82%;
@@ -838,7 +840,11 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
Past chats menus
---------------------------------------------- */
#rename-row label {
margin-top: var(--layout-gap);
margin-top: 0;
}

#rename-row > :nth-child(2) {
justify-content: center;
}

/* ----------------------------------------------
@@ -875,6 +881,10 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
flex-shrink: 1;
}

#search_chat > :nth-child(2) > :first-child {
display: none;
}

/* ----------------------------------------------
Keep dropdown menus above errored components
---------------------------------------------- */
@@ -910,7 +920,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
}

#past-chats {
max-height: calc(100dvh - 90px);
max-height: calc(100dvh - 135px);
overflow-y: scroll !important;
border-radius: 0;
scrollbar-width: auto;
@@ -980,6 +990,7 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
#rename-row {
width: 100%;
justify-content: center;
gap: 9px;
}


2 changes: 1 addition & 1 deletion docker/amd/Dockerfile
@@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=B USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
RUN GPU_CHOICE=C LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
WORKDIR /home/app/text-generation-webui
2 changes: 1 addition & 1 deletion docker/cpu/Dockerfile
@@ -17,7 +17,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=N USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
RUN GPU_CHOICE=N LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
# set umask to ensure group read / write at runtime
2 changes: 1 addition & 1 deletion docker/intel/Dockerfile
@@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=D USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
RUN GPU_CHOICE=E LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
# set umask to ensure group read / write at runtime
2 changes: 1 addition & 1 deletion docker/nvidia/Dockerfile
@@ -13,7 +13,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,rw \
WORKDIR /home/app/
RUN git clone https://github.com/oobabooga/text-generation-webui.git
WORKDIR /home/app/text-generation-webui
RUN GPU_CHOICE=A USE_CUDA118=FALSE LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
RUN GPU_CHOICE=A LAUNCH_AFTER_INSTALL=FALSE INSTALL_EXTENSIONS=TRUE ./start_linux.sh --verbose
COPY CMD_FLAGS.txt /home/app/text-generation-webui/
EXPOSE ${CONTAINER_PORT:-7860} ${CONTAINER_API_PORT:-5000} ${CONTAINER_API_STREAM_PORT:-5005}
WORKDIR /home/app/text-generation-webui
22 changes: 15 additions & 7 deletions download-model.py
@@ -72,7 +72,7 @@ def sanitize_model_and_branch_names(self, model, branch):

return model, branch

def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None):
def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None, exclude_pattern=None):
session = self.session
page = f"/api/models/{model}/tree/{branch}"
cursor = b""
@@ -100,13 +100,17 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp
if specific_file not in [None, ''] and fname != specific_file:
continue

# Exclude files matching the exclude pattern
if exclude_pattern is not None and re.match(exclude_pattern, fname):
continue

if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')):
is_lora = True

is_pytorch = re.match(r"(pytorch|adapter|gptq)_model.*\.bin", fname)
is_safetensors = re.match(r".*\.safetensors", fname)
is_pt = re.match(r".*\.pt", fname)
is_gguf = re.match(r'.*\.gguf', fname)
is_gguf = re.match(r".*\.gguf", fname)
is_tiktoken = re.match(r".*\.tiktoken", fname)
is_tokenizer = re.match(r"(tokenizer|ice|spiece).*\.model", fname) or is_tiktoken
is_text = re.match(r".*\.(txt|json|py|md)", fname) or is_tokenizer
@@ -140,16 +144,13 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp

# If both pytorch and safetensors are available, download safetensors only
# Also if GGUF and safetensors are available, download only safetensors
# (why do people do this?)
if (has_pytorch or has_pt or has_gguf) and has_safetensors:
has_gguf = False
for i in range(len(classifications) - 1, -1, -1):
if classifications[i] in ['pytorch', 'pt', 'gguf']:
links.pop(i)

# For GGUF, try to download only the Q4_K_M if no specific file is specified.
# If not present, exclude all GGUFs, as that's likely a repository with both
# GGUF and fp16 files.
if has_gguf and specific_file is None:
has_q4km = False
for i in range(len(classifications) - 1, -1, -1):
@@ -312,6 +313,7 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
parser.add_argument('--threads', type=int, default=4, help='Number of files to download simultaneously.')
parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
parser.add_argument('--specific-file', type=str, default=None, help='Name of the specific file to download (if not provided, downloads all).')
parser.add_argument('--exclude-pattern', type=str, default=None, help='Regex pattern to exclude files from download.')
parser.add_argument('--output', type=str, default=None, help='Save the model files to this folder.')
parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/models).')
parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
@@ -322,6 +324,7 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
branch = args.branch
model = args.MODEL
specific_file = args.specific_file
exclude_pattern = args.exclude_pattern

if model is None:
print("Error: Please specify the model you'd like to download (e.g. 'python download-model.py facebook/opt-1.3b').")
@@ -336,7 +339,9 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
sys.exit()

# Get the download links from Hugging Face
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=args.text_only, specific_file=specific_file)
links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(
model, branch, text_only=args.text_only, specific_file=specific_file, exclude_pattern=exclude_pattern
)

# Get the output folder
if args.output:
@@ -349,4 +354,7 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
downloader.check_model_files(model, branch, links, sha256, output_folder)
else:
# Download files
downloader.download_model_files(model, branch, links, sha256, output_folder, specific_file=specific_file, threads=args.threads, is_llamacpp=is_llamacpp)
downloader.download_model_files(
model, branch, links, sha256, output_folder,
specific_file=specific_file, threads=args.threads, is_llamacpp=is_llamacpp
)
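The new `--exclude-pattern` option filters file names with `re.match`, so the pattern is anchored at the start of each name and usually needs a leading `.*` to match an extension anywhere. A hypothetical invocation (the repository name is purely illustrative) that pulls a mixed repository while skipping its GGUF files:

```bash
# Skip any file whose name matches the regex; re.match anchors at the
# start of the name, hence the leading ".*"
python download-model.py some-org/some-model --exclude-pattern ".*\.gguf"

# The flag combines with the existing options, e.g. a specific branch
python download-model.py some-org/some-model --branch main --exclude-pattern "consolidated.*"
```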
6 changes: 5 additions & 1 deletion extensions/Training_PRO/script.py
@@ -789,7 +789,11 @@ def generate_and_tokenize_prompt(data_point):
if not hasattr(shared.model, 'lm_head') or hasattr(shared.model.lm_head, 'weight'):
logger.info("Getting model ready...")
# here we can disable gradient checkpoint, by default = true, use_gradient_checkpointing=True
prepare_model_for_kbit_training(shared.model)
if 'quantization_config' in shared.model.config.to_dict():
print(f"Method: {RED}QLORA{RESET}")
prepare_model_for_kbit_training(shared.model)
else:
print(f"Method: {RED}LoRA{RESET}")

# base model is now frozen and should not be reused for any other LoRA training than this one
shared.model_dirty_from_training = True
36 changes: 26 additions & 10 deletions extensions/openai/script.py
@@ -353,23 +353,38 @@ async def handle_unload_loras():


def run_server():
server_addr = '0.0.0.0' if shared.args.listen else '127.0.0.1'
# Parse configuration
port = int(os.environ.get('OPENEDAI_PORT', shared.args.api_port))

ssl_certfile = os.environ.get('OPENEDAI_CERT_PATH', shared.args.ssl_certfile)
ssl_keyfile = os.environ.get('OPENEDAI_KEY_PATH', shared.args.ssl_keyfile)

if shared.args.public_api:
def on_start(public_url: str):
logger.info(f'OpenAI-compatible API URL:\n\n{public_url}\n')
# In the server configuration:
server_addrs = []
if os.environ.get('OPENEDAI_ENABLE_IPV6', shared.args.api_enable_ipv6):
server_addrs.append('[::]' if shared.args.listen else '[::1]')
if not os.environ.get('OPENEDAI_DISABLE_IPV4', shared.args.api_disable_ipv4):
server_addrs.append('0.0.0.0' if shared.args.listen else '127.0.0.1')

_start_cloudflared(port, shared.args.public_api_id, max_attempts=3, on_start=on_start)
if not server_addrs:
raise Exception('you MUST enable IPv6 or IPv4 for the API to work')

# Log server information
if shared.args.public_api:
_start_cloudflared(
port,
shared.args.public_api_id,
max_attempts=3,
on_start=lambda url: logger.info(f'OpenAI-compatible API URL:\n\n{url}\n')
)
else:
if ssl_keyfile and ssl_certfile:
logger.info(f'OpenAI-compatible API URL:\n\nhttps://{server_addr}:{port}\n')
url_proto = 'https://' if (ssl_certfile and ssl_keyfile) else 'http://'
urls = [f'{url_proto}{addr}:{port}' for addr in server_addrs]
if len(urls) > 1:
logger.info('OpenAI-compatible API URLs:\n\n' + '\n'.join(urls) + '\n')
else:
logger.info(f'OpenAI-compatible API URL:\n\nhttp://{server_addr}:{port}\n')
logger.info('OpenAI-compatible API URL:\n\n' + '\n'.join(urls) + '\n')

# Log API keys
if shared.args.api_key:
if not shared.args.admin_key:
shared.args.admin_key = shared.args.api_key
@@ -379,8 +394,9 @@ def on_start(public_url: str):
if shared.args.admin_key and shared.args.admin_key != shared.args.api_key:
logger.info(f'OpenAI API admin key (for loading/unloading models):\n\n{shared.args.admin_key}\n')

# Start server
logging.getLogger("uvicorn.error").propagate = False
uvicorn.run(app, host=server_addr, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)
uvicorn.run(app, host=server_addrs, port=port, ssl_certfile=ssl_certfile, ssl_keyfile=ssl_keyfile)


def setup():
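The block above makes IPv4/IPv6 binding configurable per launch: the address list is built from `OPENEDAI_ENABLE_IPV6` / `OPENEDAI_DISABLE_IPV4` (falling back to the corresponding `shared.args` options), and at least one address family must stay enabled. A hedged usage sketch, assuming the command-line flags follow the attribute names above as `--api-enable-ipv6` and `--api-disable-ipv4`, and that `server.py --api` is the usual entry point:

```bash
# Default behaviour: IPv4 only, bound to 127.0.0.1 on the default API port
python server.py --api

# Add an IPv6 loopback listener alongside IPv4
python server.py --api --api-enable-ipv6

# IPv6 only, listening on all interfaces
python server.py --api --listen --api-enable-ipv6 --api-disable-ipv4

# The OPENEDAI_* environment variables are consulted before the flags
OPENEDAI_ENABLE_IPV6=1 python server.py --api
```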
1 change: 1 addition & 0 deletions extensions/openai/typing.py
@@ -42,6 +42,7 @@ class GenerationOptions(BaseModel):
truncation_length: int = 0
max_tokens_second: int = 0
prompt_lookup_num_tokens: int = 0
static_cache: bool = False
custom_token_bans: str = ""
sampler_priority: List[str] | str | None = Field(default=None, description="List of samplers where the first items will appear first in the stack. Example: [\"top_k\", \"temperature\", \"top_p\"].")
auto_max_new_tokens: bool = False
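The new `static_cache` flag can be set per request like the other fields of `GenerationOptions`. A minimal sketch, assuming the API server is running locally on the default port 5000 and that the field is forwarded to the loader unchanged:

```bash
curl http://127.0.0.1:5000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "Once upon a time",
    "max_tokens": 64,
    "static_cache": true
  }'
```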
20 changes: 14 additions & 6 deletions js/main.js
@@ -446,25 +446,33 @@ function toggleBigPicture() {
//------------------------------------------------
// Handle the chat input box growth
//------------------------------------------------
let currentChatInputHeight = 0;

// Cache DOM elements
const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
const chatInput = document.querySelector("#chat-input textarea");

// Variables to store current dimensions
let currentChatInputHeight = chatInput.clientHeight;

// Update chat layout based on chat and input dimensions
function updateCssProperties() {
const chatContainer = document.getElementById("chat").parentNode.parentNode.parentNode;
const chatInputHeight = document.querySelector("#chat-input textarea").clientHeight;
const chatInputHeight = chatInput.clientHeight;

// Check if the chat container is visible
if (chatContainer.clientHeight > 0) {
const newChatHeight = `${chatContainer.parentNode.clientHeight - chatInputHeight + 40 - 100 - 20}px`;
const chatContainerParentHeight = chatContainer.parentNode.clientHeight;
const newChatHeight = `${chatContainerParentHeight - chatInputHeight - 80}px`;

document.documentElement.style.setProperty("--chat-height", newChatHeight);
document.documentElement.style.setProperty("--input-delta", `${chatInputHeight - 40}px`);

// Adjust scrollTop based on input height change
if (chatInputHeight !== currentChatInputHeight) {
if (!isScrolled && chatInputHeight < currentChatInputHeight) {
const deltaHeight = chatInputHeight - currentChatInputHeight;
if (!isScrolled && deltaHeight < 0) {
chatContainer.scrollTop = chatContainer.scrollHeight;
} else {
chatContainer.scrollTop += chatInputHeight - currentChatInputHeight;
chatContainer.scrollTop += deltaHeight;
}

currentChatInputHeight = chatInputHeight;