Skip to content

Commit

Permalink
Merge pull request #6271 from oobabooga/dev
Browse files Browse the repository at this point in the history
Merge dev branch
  • Loading branch information
oobabooga authored Jul 25, 2024
2 parents af839d2 + e4624fb commit dd97a83
Show file tree
Hide file tree
Showing 28 changed files with 135 additions and 123 deletions.
49 changes: 16 additions & 33 deletions Colab-TextGen-GPU.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"source": [
"# oobabooga/text-generation-webui\n",
"\n",
"After running both cells, a public gradio URL will appear at the bottom in a few minutes. You can optionally generate an API link.\n",
"After running both cells, a public gradio URL will appear at the bottom in around 10 minutes. You can optionally generate an API link.\n",
"\n",
"* Project page: https://github.com/oobabooga/text-generation-webui\n",
"* Gradio server status: https://status.gradio.app/"
Expand Down Expand Up @@ -53,44 +53,28 @@
"\n",
"#@markdown If unsure about the branch, write \"main\" or leave it blank.\n",
"\n",
"import torch\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"os.environ.pop('PYTHONPATH', None)\n",
"\n",
"if Path.cwd().name != 'text-generation-webui':\n",
" print(\"Installing the webui...\")\n",
" print(\"\\033[1;32;1m\\n --> Installing the web UI. This will take a while, but after the initial setup, you can download and test as many models as you like.\\033[0;37;0m\\n\")\n",
"\n",
" !git clone https://github.com/oobabooga/text-generation-webui\n",
" %cd text-generation-webui\n",
"\n",
" torver = torch.__version__\n",
" print(f\"TORCH: {torver}\")\n",
" is_cuda118 = '+cu118' in torver # 2.1.0+cu118\n",
"\n",
" if is_cuda118:\n",
" !python -m pip install --upgrade torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu118\n",
" else:\n",
" !python -m pip install --upgrade torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121\n",
"\n",
" textgen_requirements = open('requirements.txt').read().splitlines()\n",
" if is_cuda118:\n",
" textgen_requirements = [req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements]\n",
" with open('temp_requirements.txt', 'w') as file:\n",
" file.write('\\n'.join(textgen_requirements))\n",
"\n",
" !pip install -r temp_requirements.txt --upgrade\n",
"\n",
" print(\"\\033[1;32;1m\\n --> If you see a warning about \\\"previously imported packages\\\", just ignore it.\\033[0;37;0m\")\n",
" print(\"\\033[1;32;1m\\n --> There is no need to restart the runtime.\\n\\033[0;37;0m\")\n",
"\n",
" try:\n",
" import flash_attn\n",
" except:\n",
" !pip uninstall -y flash_attn\n",
" # Install the project in an isolated environment\n",
" !GPU_CHOICE=A \\\n",
" USE_CUDA118=FALSE \\\n",
" LAUNCH_AFTER_INSTALL=FALSE \\\n",
" INSTALL_EXTENSIONS=FALSE \\\n",
" ./start_linux.sh\n",
"\n",
"# Parameters\n",
"model_url = \"https://huggingface.co/TheBloke/MythoMax-L2-13B-GPTQ\" #@param {type:\"string\"}\n",
"branch = \"gptq-4bit-32g-actorder_True\" #@param {type:\"string\"}\n",
"command_line_flags = \"--n-gpu-layers 128 --load-in-4bit --use_double_quant\" #@param {type:\"string\"}\n",
"model_url = \"https://huggingface.co/turboderp/gemma-2-9b-it-exl2\" #@param {type:\"string\"}\n",
"branch = \"8.0bpw\" #@param {type:\"string\"}\n",
"command_line_flags = \"--n-gpu-layers 128 --load-in-4bit --use_double_quant --no_flash_attn\" #@param {type:\"string\"}\n",
"api = False #@param {type:\"boolean\"}\n",
"\n",
"if api:\n",
Expand All @@ -116,11 +100,10 @@
" output_folder = \"\"\n",
"\n",
"# Start the web UI\n",
"cmd = f\"python server.py --share\"\n",
"cmd = f\"./start_linux.sh {command_line_flags} --share\"\n",
"if output_folder != \"\":\n",
" cmd += f\" --model {output_folder}\"\n",
"cmd += f\" {command_line_flags}\"\n",
"print(cmd)\n",
"\n",
"!$cmd"
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion cmd_linux.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash

cd "$(dirname "${BASH_SOURCE[0]}")"

Expand Down
8 changes: 0 additions & 8 deletions css/html_instruct_style.css
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,6 @@
margin-bottom: 0 !important;
}

/* Emphasised (<em>) text inside message paragraphs gets its own grey,
   with a lighter shade when the .dark class is present. */
.dark .message-body p em {
color: rgb(198 202 214) !important;
}

.message-body p em {
color: rgb(110 110 110) !important;
}

.gradio-container .chat .assistant-message {
padding: 20px;
background: #f4f4f4;
Expand Down
8 changes: 8 additions & 0 deletions css/main.css
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,14 @@ div.svelte-362y77>*, div.svelte-362y77>.form>* {
color: var(--body-text-color);
}

/* Colour <q> elements inside messages when the .dark class is present. */
.dark .message q {
color: #f5b031;
}

/* Empty the generated ::before/::after content of <q> so that no extra
   quotation marks are rendered around it. */
.message q::before, .message q::after {
content: "";
}

.message-body li {
list-style-position: outside;
}
Expand Down
2 changes: 0 additions & 2 deletions js/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -213,12 +213,10 @@ function doSyntaxHighlighting() {
renderMathInElement(element, {
delimiters: [
{ left: "$$", right: "$$", display: true },
{ left: "$", right: "$", display: false },
{ left: "\\(", right: "\\)", display: false },
{ left: "\\[", right: "\\]", display: true },
],
});

});

observer.observe(targetElement, config);
Expand Down
2 changes: 0 additions & 2 deletions modules/LoRA.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,6 @@ def add_lora_autogptq(lora_names):
else:
if len(lora_names) > 1:
logger.warning('AutoGPTQ can only work with 1 LoRA at the moment. Only the first one in the list will be loaded.')
if not shared.args.no_inject_fused_attention:
logger.warning('Fused Attention + AutoGPTQ may break Lora loading. Disable it.')

peft_config = GPTQLoraConfig(
inference_mode=True,
Expand Down
21 changes: 18 additions & 3 deletions modules/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
import modules.shared as shared
from modules import utils
from modules.extensions import apply_extensions
from modules.html_generator import chat_html_wrapper, make_thumbnail
from modules.html_generator import (
chat_html_wrapper,
convert_to_markdown,
make_thumbnail
)
from modules.logging_colors import logger
from modules.text_generation import (
generate_reply,
Expand Down Expand Up @@ -368,7 +372,6 @@ def chatbot_wrapper(text, state, regenerate=False, _continue=False, loading_mess


def impersonate_wrapper(text, state):

static_output = chat_html_wrapper(state['history'], state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

prompt = generate_chat_prompt('', state, impersonate=True)
Expand Down Expand Up @@ -488,7 +491,7 @@ def start_new_chat(state):
greeting = replace_character_names(state['greeting'], state['name1'], state['name2'])
if greeting != '':
history['internal'] += [['<|BEGIN-VISIBLE-CHAT|>', greeting]]
history['visible'] += [['', apply_extensions('output', greeting, state, is_chat=True)]]
history['visible'] += [['', apply_extensions('output', html.escape(greeting), state, is_chat=True)]]

unique_id = datetime.now().strftime('%Y%m%d-%H-%M-%S')
save_history(history, unique_id, state['character_menu'], state['mode'])
Expand Down Expand Up @@ -1044,6 +1047,8 @@ def handle_unique_id_select(state):
history = load_history(state['unique_id'], state['character_menu'], state['mode'])
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

convert_to_markdown.cache_clear()

return [history, html]


Expand All @@ -1052,6 +1057,8 @@ def handle_start_new_chat_click(state):
histories = find_all_histories_with_first_prompts(state)
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

convert_to_markdown.cache_clear()

return [history, html, gr.update(choices=histories, value=histories[0][1])]


Expand All @@ -1061,6 +1068,8 @@ def handle_delete_chat_confirm_click(state):
history, unique_id = load_history_after_deletion(state, index)
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

convert_to_markdown.cache_clear()

return [
history,
html,
Expand Down Expand Up @@ -1099,6 +1108,8 @@ def handle_upload_chat_history(load_chat_history, state):

html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

convert_to_markdown.cache_clear()

return [
history,
html,
Expand All @@ -1119,6 +1130,8 @@ def handle_character_menu_change(state):
histories = find_all_histories_with_first_prompts(state)
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

convert_to_markdown.cache_clear()

return [
history,
html,
Expand All @@ -1136,6 +1149,8 @@ def handle_mode_change(state):
histories = find_all_histories_with_first_prompts(state)
html = redraw_html(history, state['name1'], state['name2'], state['mode'], state['chat_style'], state['character_menu'])

convert_to_markdown.cache_clear()

return [
history,
html,
Expand Down
29 changes: 28 additions & 1 deletion modules/html_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,39 @@ def fix_newlines(string):
return string


def replace_quotes(text):
    """Wrap quoted passages in ``<q>`` tags, keeping the quote entities.

    Quotation marks are recognised only in their HTML-entity form (for
    example ``&quot;`` or ``&ldquo;`` / ``&rdquo;``). Every span running from
    an opening entity to the nearest matching closing entity -- newlines
    included -- is replaced by ``<q>`` + the original span + ``</q>``.
    Text outside such spans is returned unchanged.

    Args:
        text: The string to scan.

    Returns:
        The string with each quoted span wrapped in ``<q>...</q>``.
    """
    # Opening/closing quote pairs, written as HTML entities.
    quote_pairs = [
        ('&quot;', '&quot;'),  # Double quotes
        ('&ldquo;', '&rdquo;'),  # Left and right double quotation marks
        ('&lsquo;', '&rsquo;'),  # Left and right single quotation marks
        ('&laquo;', '&raquo;'),  # French quotes
        ('&bdquo;', '&ldquo;'),  # German quotes
        ('&#8220;', '&#8221;'),  # Double quotation marks (numeric entities)
        ('&#x201C;', '&#x201D;'),  # Double quotation marks (hex entities)
    ]

    # One alternative per pair. No capture groups are used: with an
    # alternation, per-pair groups are numbered consecutively, so reading
    # groups 1-3 only works for the first pair and yields None for the rest.
    pattern = '|'.join(
        f'{re.escape(open_q)}.*?{re.escape(close_q)}'
        for open_q, close_q in quote_pairs
    )

    # The whole match is exactly "opening quote + content + closing quote".
    # DOTALL lets a quotation span several lines.
    return re.sub(pattern, lambda m: f'<q>{m.group(0)}</q>', text, flags=re.DOTALL)


def replace_blockquote(m):
    r"""Rewrite a matched blockquote span as ``> ``-prefixed lines.

    Takes the full text of the match object ``m``, inserts ``> `` after every
    newline, and then removes the literal ``\begin{blockquote}`` and
    ``\end{blockquote}`` markers.
    """
    quoted = m.group().replace('\n', '\n> ')
    for marker in ('\\begin{blockquote}', '\\end{blockquote}'):
        quoted = quoted.replace(marker, '')
    return quoted


@functools.lru_cache(maxsize=4096)
@functools.lru_cache(maxsize=None)
def convert_to_markdown(string):

# Quote to <q></q>
string = replace_quotes(string)

# Blockquote
string = re.sub(r'(^|[\n])&gt;', r'\1>', string)
pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
Expand Down Expand Up @@ -124,6 +150,7 @@ def convert_to_markdown_wrapped(string, use_cache=True):


def generate_basic_html(string):
    """Build the "readable" HTML view for ``string``.

    The ``convert_to_markdown`` cache is emptied first, then the text is
    converted and placed inside a ``readable-container`` div, preceded by an
    inline ``<style>`` element holding ``readable_css``.
    """
    convert_to_markdown.cache_clear()
    body = convert_to_markdown(string)
    return f'<style>{readable_css}</style><div class="readable-container">{body}</div>'
Expand Down
10 changes: 0 additions & 10 deletions modules/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,15 +127,6 @@
'no_use_fast',
'autogptq_info',
],
'AutoAWQ': [
'cpu_memory',
'gpu_memory',
'auto_devices',
'max_seq_len',
'no_inject_fused_attention',
'trust_remote_code',
'no_use_fast',
],
'HQQ': [
'hqq_backend',
'trust_remote_code',
Expand Down Expand Up @@ -200,7 +191,6 @@ def transformers_samplers():
loaders_samplers = {
'Transformers': transformers_samplers(),
'AutoGPTQ': transformers_samplers(),
'AutoAWQ': transformers_samplers(),
'HQQ': transformers_samplers(),
'ExLlamav2': {
'temperature',
Expand Down
19 changes: 0 additions & 19 deletions modules/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ def load_model(model_name, loader=None):
'llamacpp_HF': llamacpp_HF_loader,
'ExLlamav2': ExLlamav2_loader,
'ExLlamav2_HF': ExLlamav2_HF_loader,
'AutoAWQ': AutoAWQ_loader,
'HQQ': HQQ_loader,
'TensorRT-LLM': TensorRT_LLM_loader,
}
Expand Down Expand Up @@ -292,24 +291,6 @@ def llamacpp_HF_loader(model_name):
return model


def AutoAWQ_loader(model_name):
    """Load ``model_name`` from ``shared.args.model_dir`` via AutoAWQ.

    The ``awq`` package is imported lazily, inside the function. Loading
    options are taken from ``shared.args``; safetensors loading is requested
    when the model directory contains at least one ``*.safetensors`` file.
    """
    from awq import AutoAWQForCausalLM

    model_dir = Path(f'{shared.args.model_dir}/{model_name}')

    # Keyword arguments are built in the same order the call evaluates them.
    load_kwargs = dict(
        quant_path=model_dir,
        max_new_tokens=shared.args.max_seq_len,
        trust_remote_code=shared.args.trust_remote_code,
        # The CLI flag is a negative ("no_..."), so invert it here.
        fuse_layers=not shared.args.no_inject_fused_attention,
        max_memory=get_max_memory_dict(),
        batch_size=1,
        safetensors=any(model_dir.glob('*.safetensors')),
    )

    return AutoAWQForCausalLM.from_quantized(**load_kwargs)


def AutoGPTQ_loader(model_name):
import modules.AutoGPTQ_loader

Expand Down
2 changes: 0 additions & 2 deletions modules/models_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,6 @@ def infer_loader(model_name, model_settings):
loader = None
elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0):
loader = 'ExLlamav2_HF'
elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()):
loader = 'AutoAWQ'
elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists():
loader = 'llamacpp_HF'
elif len(list(path_to_model.glob('*.gguf'))) > 0:
Expand Down
9 changes: 2 additions & 7 deletions modules/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@

# Model loader
group = parser.add_argument_group('Model loader')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ.')
group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.')

# Transformers/Accelerate
group = parser.add_argument_group('Transformers/Accelerate')
Expand Down Expand Up @@ -160,10 +160,6 @@
group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.')
group.add_argument('--groupsize', type=int, default=-1, help='Group size.')

# AutoAWQ
group = parser.add_argument_group('AutoAWQ')
group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.')

# HQQ
group = parser.add_argument_group('HQQ')
group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.')
Expand Down Expand Up @@ -217,6 +213,7 @@
group.add_argument('--pre_layer', type=int, nargs='+', help='DEPRECATED')
group.add_argument('--checkpoint', type=str, help='DEPRECATED')
group.add_argument('--monkey-patch', action='store_true', help='DEPRECATED')
group.add_argument('--no_inject_fused_attention', action='store_true', help='DEPRECATED')

args = parser.parse_args()
args_defaults = parser.parse_args([])
Expand Down Expand Up @@ -267,8 +264,6 @@ def fix_loader_name(name):
return 'ExLlamav2'
elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']:
return 'ExLlamav2_HF'
elif name in ['autoawq', 'awq', 'auto-awq']:
return 'AutoAWQ'
elif name in ['hqq']:
return 'HQQ'
elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']:
Expand Down
1 change: 0 additions & 1 deletion modules/ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ def list_model_elements():
'groupsize',
'triton',
'desc_act',
'no_inject_fused_attention',
'no_inject_fused_mlp',
'no_use_cuda_fp16',
'disable_exllama',
Expand Down
4 changes: 2 additions & 2 deletions modules/ui_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,13 @@ def create_ui():
shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar'])

with gr.Row():
shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')
shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode')

with gr.Row():
shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct')

with gr.Row():
shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=False, elem_classes=['add_scrollbar'])
shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar'])


def create_chat_settings_ui():
Expand Down
Loading

0 comments on commit dd97a83

Please sign in to comment.