From 2db1f2f7bb4dea89fb69aff93f4f1207f2974ace Mon Sep 17 00:00:00 2001 From: Gnome Ann <> Date: Tue, 25 Jan 2022 15:05:21 -0500 Subject: [PATCH 001/301] AvrilAI-style repetition penalty test --- aiserver.py | 5 ++--- tpu_mtj_backend.py | 45 ++++++++++++++++++++------------------------- warpers.py | 2 +- 3 files changed, 23 insertions(+), 29 deletions(-) diff --git a/aiserver.py b/aiserver.py index 64470d1e8..5be7a17a5 100644 --- a/aiserver.py +++ b/aiserver.py @@ -722,8 +722,6 @@ def new_call(self, *args, **kwargs): dynamic_processor_wrap(TopPLogitsWarper, "top_p", "top_p", cond=lambda x: x < 1.0) dynamic_processor_wrap(TailFreeLogitsWarper, "tfs", "tfs", cond=lambda x: x < 1.0) dynamic_processor_wrap(TemperatureLogitsWarper, "temperature", "temp", cond=lambda x: x != 1.0) - RepetitionPenaltyLogitsProcessor.__init__ = AdvancedRepetitionPenaltyLogitsProcessor.__init__ - RepetitionPenaltyLogitsProcessor.__call__ = AdvancedRepetitionPenaltyLogitsProcessor.__call__ class LuaLogitsProcessor(LogitsProcessor): @@ -767,6 +765,7 @@ def new_get_logits_warper(beams: int = 1,) -> LogitsProcessorList: warper_list.append(TopPLogitsWarper(top_p=0.5, min_tokens_to_keep=1 + (beams > 1))) warper_list.append(TailFreeLogitsWarper(tfs=0.5, min_tokens_to_keep=1 + (beams > 1))) warper_list.append(TemperatureLogitsWarper(temperature=0.5)) + warper_list.append(AdvancedRepetitionPenaltyLogitsProcessor()) return warper_list def new_sample(self, *args, **kwargs): @@ -2771,7 +2770,7 @@ def _generate(txt, minimum, maximum, found_entries): do_sample=True, min_length=minimum, max_length=int(2e9), - repetition_penalty=1.1, + repetition_penalty=1.0, bad_words_ids=vars.badwordsids, use_cache=True, num_return_sequences=numseqs diff --git a/tpu_mtj_backend.py b/tpu_mtj_backend.py index 653f8cf1d..e7632ebaf 100644 --- a/tpu_mtj_backend.py +++ b/tpu_mtj_backend.py @@ -149,7 +149,7 @@ def apply_repetition_penalty_dynamic(logits, tokens, repetition_penalty, generat logits[tokens] = penalty_logits return logits -def kobold_sample_dynamic(key, logits, top_p=0.9, temp=0.5, top_k=0, tfs=1.0): +def kobold_sample_dynamic(key, logits, rpargs, top_p=0.9, temp=0.5, top_k=0, tfs=1.0): ''' This gets called by generate_loop_fn to apply a series of 4 filters to the logits (top-k, then top-p, then TFS, then temperature) before @@ -245,6 +245,7 @@ def tail_free_filter(logits): # Finally, pick one token using the softmax thingy again (it gives # an array whose elements sum to 1 so it can be used nicely as a # probability distribution) + logits = apply_repetition_penalty_dynamic(logits, *rpargs) return jax.random.categorical(key, logits, -1).astype(np.uint32) def apply_repetition_penalty_static(logits, tokens, repetition_penalty, generated_index, gen_length, rpslope, rprange): @@ -292,7 +293,7 @@ def apply_slope(carry): # positions in the logits array return logits.at[tokens].set(penalty_logits) -def kobold_sample_static(key, logits, top_p=0.9, temp=0.5, top_k=0, tfs=1.0): +def kobold_sample_static(key, logits, rpargs, top_p=0.9, temp=0.5, top_k=0, tfs=1.0): ''' This gets called by generate_loop_fn to apply a series of 4 filters to the logits (top-k, then top-p, then TFS, then temperature) before @@ -387,6 +388,7 @@ def temp_filter(logits): # Finally, pick one token using the softmax thingy again (it gives # an array whose elements sum to 1 so it can be used nicely as a # probability distribution) + logits = apply_repetition_penalty_static(logits, *rpargs) return jax.random.categorical(key, logits, -1).astype(jnp.uint32) pad_token_id = 50256 @@ 
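The hunks above drop the old monkey-patching of RepetitionPenaltyLogitsProcessor and instead append an AdvancedRepetitionPenaltyLogitsProcessor to the warper list built by new_get_logits_warper, so the penalty runs alongside the other warpers during sampling. A minimal sketch of that pattern, assuming the transformers 4.x LogitsWarper interface used elsewhere in this patch (the class name and penalty rule here are illustrative, not the patch's implementation):

    import torch
    from transformers import LogitsProcessorList, LogitsWarper, TemperatureLogitsWarper

    class ExampleRepetitionPenaltyWarper(LogitsWarper):
        def __init__(self, penalty: float = 1.1):
            self.penalty = penalty

        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
            # Penalize every token id that already appears in the context.
            for b in range(input_ids.shape[0]):
                seen = input_ids[b]
                vals = scores[b, seen]
                # Standard rep-pen rule: divide positive logits, multiply negative ones.
                scores[b, seen] = torch.where(vals > 0, vals / self.penalty, vals * self.penalty)
            return scores

    warpers = LogitsProcessorList()
    warpers.append(TemperatureLogitsWarper(temperature=0.7))
    warpers.append(ExampleRepetitionPenaltyWarper(penalty=1.1))  # appended last, as in the patch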
-400,17 +402,6 @@ def sample_loop_fn(carry): # Get the pseudo-random number generator key that will # be used by kobold_sample_dynamic to randomly pick a token sample_key, new_key = jax.random.split(sample_key, num=2) - # Apply repetition penalty to all tokens that are - # currently inside the "generated" array - logits = apply_repetition_penalty_dynamic( - logits, - generated, - repetition_penalty, - generated_index, - gen_length, - rpslope, - rprange, - ) # Remove any tokens in the badwords list by setting # their logits to negative infinity which effectively # makes their probabilities of being chosen zero @@ -422,6 +413,14 @@ def sample_loop_fn(carry): next_token = kobold_sample_dynamic( sample_key, logits, + ( + generated, + repetition_penalty, + generated_index, + gen_length, + rpslope, + rprange, + ) **sampler_options, ) # Remember what token was picked @@ -493,18 +492,6 @@ def generate_loop_fn(carry): assert logits.shape == (1, config["n_vocab"]) # Flatten it into a 1D array to make it easier to use logits = logits[0] - # Apply repetition penalty to all tokens that are - # currently inside the "generated" array - if repetition_penalty is not None: - logits = apply_repetition_penalty_static( - logits, - generated, - repetition_penalty, - generated_index, - gen_length, - rpslope, - rprange, - ) # Remove any tokens in the badwords list by setting # their logits to negative infinity which effectively # makes their probabilities of being chosen zero @@ -516,6 +503,14 @@ def generate_loop_fn(carry): next_token = kobold_sample_static( sample_key, logits, + ( + generated, + repetition_penalty, + generated_index, + gen_length, + rpslope, + rprange, + ), **sampler_options, ) # Remember what token was picked diff --git a/warpers.py b/warpers.py index 07670f6d5..122bc1cdd 100644 --- a/warpers.py +++ b/warpers.py @@ -31,7 +31,7 @@ from transformers import LogitsWarper, LogitsProcessor -class AdvancedRepetitionPenaltyLogitsProcessor(LogitsProcessor): +class AdvancedRepetitionPenaltyLogitsProcessor(LogitsWarper): def __init__(self, *args, **kwargs): pass From 2ddf45141b60e596b7792280d08b2351e2a9c3ac Mon Sep 17 00:00:00 2001 From: ebolam Date: Sun, 6 Mar 2022 19:51:35 -0500 Subject: [PATCH 002/301] Initial UI based model loading. 
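On the TPU backend, the same change threads the penalty arguments (rpargs) into kobold_sample_dynamic/static so the penalty is applied to the already-filtered logits immediately before jax.random.categorical draws a token. A simplified NumPy sketch of that ordering, with a flat penalty in place of the backend's slope/range curve (function names are invented for illustration):

    import numpy as np

    def penalize_recent_tokens(logits, generated, penalty=1.2, rep_pen_range=64):
        # Penalize only the ids generated within the last rep_pen_range steps.
        logits = logits.copy()
        recent = np.unique(np.asarray(generated[-rep_pen_range:], dtype=np.int64))
        vals = logits[recent]
        logits[recent] = np.where(vals > 0, vals / penalty, vals * penalty)
        return logits

    def sample_after_filters(filtered_logits, generated, rng):
        # filtered_logits have already been through top-k, top-p, TFS and temperature.
        penalized = penalize_recent_tokens(filtered_logits, generated)
        probs = np.exp(penalized - penalized.max())
        probs /= probs.sum()
        return int(rng.choice(len(probs), p=probs))

    # e.g. rng = np.random.default_rng(0); tok = sample_after_filters(logits, generated, rng)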
Includes all parameters except breakmodel chunks, engine # for OAI, and url for ngrok url for google colab --- aiserver.py | 1873 +++++++++++++++++++++-------------------- static/application.js | 86 ++ templates/index.html | 29 + 3 files changed, 1060 insertions(+), 928 deletions(-) diff --git a/aiserver.py b/aiserver.py index 89b371242..db507136c 100644 --- a/aiserver.py +++ b/aiserver.py @@ -39,6 +39,8 @@ import lupa +import torch + # KoboldAI import fileops import gensettings @@ -67,71 +69,67 @@ class colors: UNDERLINE = '\033[4m' # AI models -mainmenu = [ - ["Load a model from its directory", "NeoCustom", ""], - ["Load an old GPT-2 model (eg CloverEdition)", "GPT2Custom", ""], - ["Skein 6B (Hybrid)", "KoboldAI/GPT-J-6B-Skein", "16GB"], - ["Janeway 6B (Novel)", "KoboldAI/GPT-J-6B-Janeway", "16GB"], - ["Adventure 6B", "KoboldAI/GPT-J-6B-Adventure", "16GB"], - ["Lit 6B (NSFW)", "hakurei/lit-6B", "16GB"], - ["Shinen 6B (NSFW)", "KoboldAI/GPT-J-6B-Shinen", "16GB"], - ["C1 6B (Chatbot)", "hakurei/c1-6B", "16GB"], - ["Janeway Neo 2.7B (Novel)", "KoboldAI/GPT-Neo-2.7B-Janeway", "8GB"], - ["Janeway FSD 2.7B (Novel)", "KoboldAI/fairseq-dense-2.7B-Janeway", "8GB"], - ["Adventure 2.7B", "KoboldAI/GPT-Neo-2.7B-AID", "8GB"], - ["Picard 2.7B (Novel)", "KoboldAI/GPT-Neo-2.7B-Picard", "8GB"], - ["Horni 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Horni", "8GB"], - ["Horni-LN 2.7B (Novel)", "KoboldAI/GPT-Neo-2.7B-Horni-LN", "8GB"], - ["Shinen 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Shinen", "8GB"], - ["Untuned GPT-Neo/J", "gptneolist", ""], - ["Untuned Fairseq Dense", "fsdlist", ""], - ["Untuned XGLM", "xglmlist", ""], - ["Untuned GPT2", "gpt2list", ""], - ["Online Services", "apilist", ""], - ["Read Only (No AI)", "ReadOnly", ""] - ] - -gptneolist = [ - ["GPT-J 6B", "EleutherAI/gpt-j-6B", "16GB"], - ["GPT-Neo 2.7B", "EleutherAI/gpt-neo-2.7B", "8GB"], - ["GPT-Neo 1.3B", "EleutherAI/gpt-neo-1.3B", "6GB"], - ["Return to Main Menu", "Return", ""], -] - -gpt2list = [ - ["GPT-2 XL", "gpt2-xl", "6GB"], - ["GPT-2 Large", "gpt2-large", "4GB"], - ["GPT-2 Med", "gpt2-medium", "2GB"], - ["GPT-2", "gpt2", "2GB"], - ["Return to Main Menu", "Return", ""], - ] - -fsdlist = [ - ["Fairseq Dense 13B", "KoboldAI/fairseq-dense-13B", "32GB"], - ["Fairseq Dense 6.7B", "KoboldAI/fairseq-dense-6.7B", "16GB"], - ["Fairseq Dense 2.7B", "KoboldAI/fairseq-dense-2.7B", "8GB"], - ["Fairseq Dense 1.3B", "KoboldAI/fairseq-dense-1.3B", "6GB"], - ["Fairseq Dense 355M", "KoboldAI/fairseq-dense-355M", ""], - ["Fairseq Dense 125M", "KoboldAI/fairseq-dense-125M", ""], - ["Return to Main Menu", "Return", ""], - ] - -xglmlist = [ - ["XGLM 4.5B (Larger Dataset)", "facebook/xglm-4.5B", ""], - ["XGLM 7.5B", "facebook/xglm-7.5B", ""], - ["XGLM 2.9B", "facebook/xglm-2.9B", ""], - ["XGLM 1.7B", "facebook/xglm-1.7B", ""], - ["XGLM 564M", "facebook/xglm-564M", ""], - ["Return to Main Menu", "Return", ""], +model_menu = {'mainmenu': [ + ["Load a model from its directory", "NeoCustom", "", False], + ["Load an old GPT-2 model (eg CloverEdition)", "GPT2Custom", "", False], + ["Skein 6B (Hybrid)", "KoboldAI/GPT-J-6B-Skein", "16GB", False], + ["Janeway 6B (Novel)", "KoboldAI/GPT-J-6B-Janeway", "16GB", False], + ["Adventure 6B", "KoboldAI/GPT-J-6B-Adventure", "16GB", False], + ["Lit 6B (NSFW)", "hakurei/lit-6B", "16GB", False], + ["Shinen 6B (NSFW)", "KoboldAI/GPT-J-6B-Shinen", "16GB", False], + ["C1 6B (Chatbot)", "hakurei/c1-6B", "16GB", False], + ["Janeway Neo 2.7B (Novel)", "KoboldAI/GPT-Neo-2.7B-Janeway", "8GB", False], + ["Janeway FSD 2.7B (Novel)", 
"KoboldAI/fairseq-dense-2.7B-Janeway", "8GB", False], + ["Adventure 2.7B", "KoboldAI/GPT-Neo-2.7B-AID", "8GB", False], + ["Picard 2.7B (Novel)", "KoboldAI/GPT-Neo-2.7B-Picard", "8GB", False], + ["Horni 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Horni", "8GB", False], + ["Horni-LN 2.7B (Novel)", "KoboldAI/GPT-Neo-2.7B-Horni-LN", "8GB", False], + ["Shinen 2.7B (NSFW)", "KoboldAI/GPT-Neo-2.7B-Shinen", "8GB", False], + ["Untuned GPT-Neo/J", "gptneolist", "", True], + ["Untuned Fairseq Dense", "fsdlist", "", True], + ["Untuned XGLM", "xglmlist", "", True], + ["Untuned GPT2", "gpt2list", "", True], + ["Online Services", "apilist", "", True], + ["Read Only (No AI)", "ReadOnly", "", False] + ], + 'gptneolist': [ + ["GPT-J 6B", "EleutherAI/gpt-j-6B", "16GB", False], + ["GPT-Neo 2.7B", "EleutherAI/gpt-neo-2.7B", "8GB", False], + ["GPT-Neo 1.3B", "EleutherAI/gpt-neo-1.3B", "6GB", False], + ["Return to Main Menu", "mainmenu", "", True], + ], + 'gpt2list': [ + ["GPT-2 XL", "gpt2-xl", "6GB", False], + ["GPT-2 Large", "gpt2-large", "4GB", False], + ["GPT-2 Med", "gpt2-medium", "2GB", False], + ["GPT-2", "gpt2", "2GB", False], + ["Return to Main Menu", "mainmenu", "", True], + ], + 'fsdlist': [ + ["Fairseq Dense 13B", "KoboldAI/fairseq-dense-13B", "32GB", False], + ["Fairseq Dense 6.7B", "KoboldAI/fairseq-dense-6.7B", "16GB", False], + ["Fairseq Dense 2.7B", "KoboldAI/fairseq-dense-2.7B", "8GB", False], + ["Fairseq Dense 1.3B", "KoboldAI/fairseq-dense-1.3B", "6GB", False], + ["Fairseq Dense 355M", "KoboldAI/fairseq-dense-355M", "", False], + ["Fairseq Dense 125M", "KoboldAI/fairseq-dense-125M", "", False], + ["Return to Main Menu", "Return", "", True], + ], + 'xglmlist': [ + ["XGLM 4.5B (Larger Dataset)", "facebook/xglm-4.5B", "", False], + ["XGLM 7.5B", "facebook/xglm-7.5B", "", False], + ["XGLM 2.9B", "facebook/xglm-2.9B", "", False], + ["XGLM 1.7B", "facebook/xglm-1.7B", "", False], + ["XGLM 564M", "facebook/xglm-564M", "", False], + ["Return to Main Menu", "mainmenu", "", True], + ], + 'apilist': [ + ["OpenAI API (requires API key)", "OAI", "", False], + ["InferKit API (requires API key)", "InferKit", "", False], + ["KoboldAI Server API (Old Google Colab)", "Colab", "", False], + ["Return to Main Menu", "mainmenu", "", True], ] + } -apilist = [ - ["GooseAI API (requires API key)", "GooseAI", ""], - ["OpenAI API (requires API key)", "OAI", ""], - ["InferKit API (requires API key)", "InferKit", ""], - ["KoboldAI Server API (Old Google Colab)", "Colab", ""], - ["Return to Main Menu", "Return", ""], -] # Variables class vars: lastact = "" # The last action received from the user @@ -261,6 +259,9 @@ class vars: #==================================================================# # Function to get model selection at startup #==================================================================# +def sendModelSelection(menu="mainmenu"): + emit('from_server', {'cmd': 'show_model_menu', 'data': model_menu[menu], 'menu': menu}, broadcast=True) + def getModelSelection(modellist): print(" # Model\t\t\t\t\t\tVRAM\n ========================================================") i = 1 @@ -717,850 +718,844 @@ def spRequest(filename): # Startup #==================================================================# -# Parsing Parameters -parser = argparse.ArgumentParser(description="KoboldAI Server") -parser.add_argument("--remote", action='store_true', help="Optimizes KoboldAI for Remote Play") -parser.add_argument("--ngrok", action='store_true', help="Optimizes KoboldAI for Remote Play using Ngrok") -parser.add_argument("--host", 
action='store_true', help="Optimizes KoboldAI for Remote Play without using a proxy service") -parser.add_argument("--model", help="Specify the Model Type to skip the Menu") -parser.add_argument("--path", help="Specify the Path for local models (For model NeoCustom or GPT2Custom)") -parser.add_argument("--cpu", action='store_true', help="By default unattended launches are on the GPU use this option to force CPU usage.") -parser.add_argument("--breakmodel", action='store_true', help=argparse.SUPPRESS) -parser.add_argument("--breakmodel_layers", type=int, help=argparse.SUPPRESS) -parser.add_argument("--breakmodel_gpulayers", type=str, help="If using a model that supports hybrid generation, this is a comma-separated list that specifies how many layers to put on each GPU device. For example to put 8 layers on device 0, 9 layers on device 1 and 11 layers on device 2, use --beakmodel_gpulayers 8,9,11") -parser.add_argument("--override_delete", action='store_true', help="Deleting stories from inside the browser is disabled if you are using --remote and enabled otherwise. Using this option will instead allow deleting stories if using --remote and prevent deleting stories otherwise.") -parser.add_argument("--override_rename", action='store_true', help="Renaming stories from inside the browser is disabled if you are using --remote and enabled otherwise. Using this option will instead allow renaming stories if using --remote and prevent renaming stories otherwise.") -parser.add_argument("--configname", help="Force a fixed configuration name to aid with config management.") -parser.add_argument("--colab", action='store_true', help="Optimize for Google Colab.") -parser.add_argument("--nobreakmodel", action='store_true', help="Disables Breakmodel support completely.") -parser.add_argument("--unblock", action='store_true', default=False, help="Unblocks the KoboldAI port to be accessible from other machines without optimizing for remote play (It is recommended to use --host instead)") -parser.add_argument("--quiet", action='store_true', default=False, help="If present will suppress any story related text from showing on the console") -parser.add_argument("--lowmem", action='store_true', help="Extra Low Memory loading for the GPU, slower but memory does not peak to twice the usage") - -args: argparse.Namespace = None -if(os.environ.get("KOBOLDAI_ARGS") is not None): - import shlex - args = parser.parse_args(shlex.split(os.environ["KOBOLDAI_ARGS"])) -else: - args = parser.parse_args() +# Set logging level to reduce chatter from Flask +import logging +log = logging.getLogger('werkzeug') +log.setLevel(logging.ERROR) -vars.model = args.model; +# Start flask & SocketIO +print("{0}Initializing Flask... {1}".format(colors.PURPLE, colors.END), end="") +from flask import Flask, render_template, Response, request, copy_current_request_context +from flask_socketio import SocketIO, emit +app = Flask(__name__) +app.config['SECRET KEY'] = 'secret!' 
+socketio = SocketIO(app, async_method="eventlet") +print("{0}OK!{1}".format(colors.GREEN, colors.END)) -if args.colab: - args.remote = True; - args.override_rename = True; - args.override_delete = True; - args.nobreakmodel = True; - args.quiet = True; - args.lowmem = True; +def general_startup(): + global args + # Parsing Parameters + parser = argparse.ArgumentParser(description="KoboldAI Server") + parser.add_argument("--remote", action='store_true', help="Optimizes KoboldAI for Remote Play") + parser.add_argument("--ngrok", action='store_true', help="Optimizes KoboldAI for Remote Play using Ngrok") + parser.add_argument("--host", action='store_true', help="Optimizes KoboldAI for Remote Play without using a proxy service") + parser.add_argument("--model", help="Specify the Model Type to skip the Menu") + parser.add_argument("--path", help="Specify the Path for local models (For model NeoCustom or GPT2Custom)") + parser.add_argument("--cpu", action='store_true', help="By default unattended launches are on the GPU use this option to force CPU usage.") + parser.add_argument("--breakmodel", action='store_true', help=argparse.SUPPRESS) + parser.add_argument("--breakmodel_layers", type=int, help=argparse.SUPPRESS) + parser.add_argument("--breakmodel_gpulayers", type=str, help="If using a model that supports hybrid generation, this is a comma-separated list that specifies how many layers to put on each GPU device. For example to put 8 layers on device 0, 9 layers on device 1 and 11 layers on device 2, use --beakmodel_gpulayers 8,9,11") + parser.add_argument("--override_delete", action='store_true', help="Deleting stories from inside the browser is disabled if you are using --remote and enabled otherwise. Using this option will instead allow deleting stories if using --remote and prevent deleting stories otherwise.") + parser.add_argument("--override_rename", action='store_true', help="Renaming stories from inside the browser is disabled if you are using --remote and enabled otherwise. 
Using this option will instead allow renaming stories if using --remote and prevent renaming stories otherwise.") + parser.add_argument("--configname", help="Force a fixed configuration name to aid with config management.") + parser.add_argument("--colab", action='store_true', help="Optimize for Google Colab.") + parser.add_argument("--nobreakmodel", action='store_true', help="Disables Breakmodel support completely.") + parser.add_argument("--unblock", action='store_true', default=False, help="Unblocks the KoboldAI port to be accessible from other machines without optimizing for remote play (It is recommended to use --host instead)") + parser.add_argument("--quiet", action='store_true', default=False, help="If present will suppress any story related text from showing on the console") + parser.add_argument("--lowmem", action='store_true', help="Extra Low Memory loading for the GPU, slower but memory does not peak to twice the usage") -if args.quiet: - vars.quiet = True + + if(os.environ.get("KOBOLDAI_ARGS") is not None): + import shlex + args = parser.parse_args(shlex.split(os.environ["KOBOLDAI_ARGS"])) + else: + args = parser.parse_args() -if args.nobreakmodel: - vars.nobreakmodel = True; + vars.model = args.model; -if args.remote: - vars.host = True; + if args.colab: + args.remote = True; + args.override_rename = True; + args.override_delete = True; + args.nobreakmodel = True; + args.quiet = True; + args.lowmem = True; -if args.ngrok: - vars.host = True; + if args.quiet: + vars.quiet = True -if args.host: - vars.host = True; + if args.nobreakmodel: + vars.nobreakmodel = True; -if args.cpu: - vars.use_colab_tpu = False + if args.remote: + vars.host = True; -vars.smandelete = vars.host == args.override_delete -vars.smanrename = vars.host == args.override_rename + if args.ngrok: + vars.host = True; -# Select a model to run -if args.model: - print("Welcome to KoboldAI!\nYou have selected the following Model:", vars.model) - if args.path: - print("You have selected the following path for your Model :", args.path) - vars.custmodpth = args.path; - vars.colaburl = args.path + "/request"; # Lets just use the same parameter to keep it simple + if args.host: + vars.host = True; -else: - print("{0}Welcome to the KoboldAI Server!\nListed RAM is the optimal VRAM and CPU ram can be up to twice the amount.\nMost models can run at less VRAM with reduced max tokens or less layers on the GPU.\nSelect an AI model to continue:{1}\n".format(colors.CYAN, colors.END)) - getModelSelection(mainmenu) - -# If transformers model was selected & GPU available, ask to use CPU or GPU -if(vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]): - vars.allowsp = True - # Test for GPU support - import torch - - # Make model path the same as the model name to make this consistent with the other loading method if it isn't a known model type - # This code is not just a workaround for below, it is also used to make the behavior consistent with other loading methods - Henk717 - if(not vars.model in ["NeoCustom", "GPT2Custom"]): - vars.custmodpth = vars.model - elif(vars.model == "NeoCustom"): - vars.model = os.path.basename(os.path.normpath(vars.custmodpth)) - - # Get the model_type from the config or assume a model type if it isn't present - from transformers import AutoConfig - if(os.path.isdir(vars.custmodpth.replace('/', '_'))): - try: - model_config = AutoConfig.from_pretrained(vars.custmodpth.replace('/', '_'), cache_dir="cache/") - vars.model_type = model_config.model_type - 
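general_startup() keeps the existing behaviour of reading arguments either from the command line or from the KOBOLDAI_ARGS environment variable via shlex.split. A stand-alone sketch of that pattern with a small subset of the flags (parse_koboldai_args is an illustrative name):

    import argparse, os, shlex

    def parse_koboldai_args(argv=None):
        parser = argparse.ArgumentParser(description="KoboldAI Server")
        parser.add_argument("--model", help="Specify the Model Type to skip the Menu")
        parser.add_argument("--path", help="Specify the Path for local models")
        parser.add_argument("--remote", action="store_true")
        env_args = os.environ.get("KOBOLDAI_ARGS")
        if env_args is not None:
            # Split shell-style so quoting in the environment variable behaves like a command line.
            return parser.parse_args(shlex.split(env_args))
        return parser.parse_args(argv)

    # e.g. KOBOLDAI_ARGS='--model KoboldAI/GPT-J-6B-Skein --remote' python aiserver.py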
except ValueError as e: - vars.model_type = "not_found" - elif(os.path.isdir("models/{}".format(vars.custmodpth.replace('/', '_')))): - try: - model_config = AutoConfig.from_pretrained("models/{}".format(vars.custmodpth.replace('/', '_')), cache_dir="cache/") - vars.model_type = model_config.model_type - except ValueError as e: - vars.model_type = "not_found" - else: - try: - model_config = AutoConfig.from_pretrained(vars.custmodpth, cache_dir="cache/") - vars.model_type = model_config.model_type - except ValueError as e: - vars.model_type = "not_found" - if(vars.model_type == "not_found" and vars.model == "NeoCustom"): - vars.model_type = "gpt_neo" - elif(vars.model_type == "not_found" and vars.model == "GPT2Custom"): - vars.model_type = "gpt2" - elif(vars.model_type == "not_found"): - print("WARNING: No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)") - vars.model_type = "gpt_neo" - -if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]): - loadmodelsettings() - loadsettings() - print("{0}Looking for GPU support...{1}".format(colors.PURPLE, colors.END), end="") - vars.hascuda = torch.cuda.is_available() - vars.bmsupported = vars.model_type in ("gpt_neo", "gptj", "xglm") and not vars.nobreakmodel - if(args.breakmodel is not None and args.breakmodel): - print("WARNING: --breakmodel is no longer supported. Breakmodel mode is now automatically enabled when --breakmodel_gpulayers is used (see --help for details).", file=sys.stderr) - if(args.breakmodel_layers is not None): - print("WARNING: --breakmodel_layers is deprecated. Use --breakmodel_gpulayers instead (see --help for details).", file=sys.stderr) - if(args.model and vars.bmsupported and not args.breakmodel_gpulayers and not args.breakmodel_layers): - print("WARNING: Model launched without the --breakmodel_gpulayers argument, defaulting to GPU only mode.", file=sys.stderr) - vars.bmsupported = False - if(not vars.bmsupported and (args.breakmodel_gpulayers is not None or args.breakmodel_layers is not None)): - print("WARNING: This model does not support hybrid generation. 
--breakmodel_gpulayers will be ignored.", file=sys.stderr) - if(vars.hascuda): - print("{0}FOUND!{1}".format(colors.GREEN, colors.END)) - else: - print("{0}NOT FOUND!{1}".format(colors.YELLOW, colors.END)) - - if args.model: + if args.cpu: + vars.use_colab_tpu = False + + vars.smandelete = vars.host == args.override_delete + vars.smanrename = vars.host == args.override_rename + + +#==================================================================# +# Load Model +#==================================================================# +def load_model(use_gpu=True, key=''): + global model + global generator + vars.noai = False + set_aibusy(True) + # If transformers model was selected & GPU available, ask to use CPU or GPU + if(vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]): + vars.allowsp = True + # Test for GPU support + import torch + + # Make model path the same as the model name to make this consistent with the other loading method if it isn't a known model type + # This code is not just a workaround for below, it is also used to make the behavior consistent with other loading methods - Henk717 + if(not vars.model in ["NeoCustom", "GPT2Custom"]): + vars.custmodpth = vars.model + elif(vars.model == "NeoCustom"): + vars.model = os.path.basename(os.path.normpath(vars.custmodpth)) + + # Get the model_type from the config or assume a model type if it isn't present + from transformers import AutoConfig + if(os.path.isdir(vars.custmodpth.replace('/', '_'))): + try: + model_config = AutoConfig.from_pretrained(vars.custmodpth.replace('/', '_'), cache_dir="cache/") + vars.model_type = model_config.model_type + except ValueError as e: + vars.model_type = "not_found" + elif(os.path.isdir("models/{}".format(vars.custmodpth.replace('/', '_')))): + try: + model_config = AutoConfig.from_pretrained("models/{}".format(vars.custmodpth.replace('/', '_')), cache_dir="cache/") + vars.model_type = model_config.model_type + except ValueError as e: + vars.model_type = "not_found" + else: + try: + model_config = AutoConfig.from_pretrained(vars.custmodpth, cache_dir="cache/") + vars.model_type = model_config.model_type + except ValueError as e: + vars.model_type = "not_found" + if(vars.model_type == "not_found" and vars.model == "NeoCustom"): + vars.model_type = "gpt_neo" + elif(vars.model_type == "not_found" and vars.model == "GPT2Custom"): + vars.model_type = "gpt2" + elif(vars.model_type == "not_found"): + print("WARNING: No model type detected, assuming Neo (If this is a GPT2 model use the other menu option or --model GPT2Custom)") + vars.model_type = "gpt_neo" + + if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]): + loadmodelsettings() + loadsettings() + print("{0}Looking for GPU support...{1}".format(colors.PURPLE, colors.END), end="") + vars.hascuda = torch.cuda.is_available() + vars.bmsupported = vars.model_type in ("gpt_neo", "gptj", "xglm") and not vars.nobreakmodel + if(args.breakmodel is not None and args.breakmodel): + print("WARNING: --breakmodel is no longer supported. Breakmodel mode is now automatically enabled when --breakmodel_gpulayers is used (see --help for details).", file=sys.stderr) + if(args.breakmodel_layers is not None): + print("WARNING: --breakmodel_layers is deprecated. 
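load_model() probes the model type by reading the config with AutoConfig.from_pretrained from whichever location exists (legacy folder, models/ folder, or the hub id) and falls back to a "not_found" sentinel that is later mapped to gpt_neo or gpt2. A condensed sketch of that probe, assuming the same transformers API (detect_model_type is an illustrative wrapper, and the extra OSError catch is an assumption, not in the patch):

    import os
    from transformers import AutoConfig

    def detect_model_type(model_name_or_path, cache_dir="cache/"):
        candidates = [
            model_name_or_path.replace("/", "_"),                        # legacy local folder
            os.path.join("models", model_name_or_path.replace("/", "_")),
            model_name_or_path,                                          # Hugging Face hub id
        ]
        for candidate in candidates:
            try:
                return AutoConfig.from_pretrained(candidate, cache_dir=cache_dir).model_type
            except (ValueError, OSError):
                continue
        return "not_found"   # caller then assumes gpt_neo or gpt2, as in load_model()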
Use --breakmodel_gpulayers instead (see --help for details).", file=sys.stderr) + if(args.model and vars.bmsupported and not args.breakmodel_gpulayers and not args.breakmodel_layers): + print("WARNING: Model launched without the --breakmodel_gpulayers argument, defaulting to GPU only mode.", file=sys.stderr) + vars.bmsupported = False + if(not vars.bmsupported and (args.breakmodel_gpulayers is not None or args.breakmodel_layers is not None)): + print("WARNING: This model does not support hybrid generation. --breakmodel_gpulayers will be ignored.", file=sys.stderr) if(vars.hascuda): - genselected = True - vars.usegpu = True - vars.breakmodel = False - if(vars.bmsupported): - vars.usegpu = False - vars.breakmodel = True - if(args.cpu): - vars.usegpu = False - vars.breakmodel = False - elif(vars.hascuda): - if(vars.bmsupported): - genselected = True - vars.usegpu = False - vars.breakmodel = True + print("{0}FOUND!{1}".format(colors.GREEN, colors.END)) else: - print(" 1 - GPU\n 2 - CPU\n") - genselected = False - else: - genselected = False - - if(vars.hascuda): - while(genselected == False): - genselect = input("Mode> ") - if(genselect == ""): - vars.breakmodel = False - vars.usegpu = True + print("{0}NOT FOUND!{1}".format(colors.YELLOW, colors.END)) + + if args.model: + if(vars.hascuda): genselected = True - elif(genselect.isnumeric() and int(genselect) == 1): - if(vars.bmsupported): - vars.breakmodel = True - vars.usegpu = False - genselected = True - else: - vars.breakmodel = False - vars.usegpu = True - genselected = True - elif(genselect.isnumeric() and int(genselect) == 2): + vars.usegpu = True vars.breakmodel = False + if(vars.bmsupported): + vars.usegpu = False + vars.breakmodel = True + if(args.cpu): vars.usegpu = False + vars.breakmodel = False + elif(vars.hascuda): + if(vars.bmsupported): genselected = True + vars.usegpu = False + vars.breakmodel = True else: - print("{0}Please enter a valid selection.{1}".format(colors.RED, colors.END)) - -# Ask for API key if InferKit was selected -if(vars.model == "InferKit"): - if(not path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): - # If the client settings file doesn't exist, create it - print("{0}Please enter your InferKit API key:{1}\n".format(colors.CYAN, colors.END)) - vars.apikey = input("Key> ") - # Write API key to file - os.makedirs('settings', exist_ok=True) - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "w") - try: - js = {"apikey": vars.apikey} - file.write(json.dumps(js, indent=3)) - finally: - file.close() - else: - # Otherwise open it up - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") - # Check if API key exists - js = json.load(file) - if("apikey" in js and js["apikey"] != ""): - # API key exists, grab it and close the file - vars.apikey = js["apikey"] - file.close() + genselected = False else: - # Get API key, add it to settings object, and write it to disk - print("{0}Please enter your InferKit API key:{1}\n".format(colors.CYAN, colors.END)) - vars.apikey = input("Key> ") - js["apikey"] = vars.apikey + genselected = False + + if(vars.hascuda): + while(genselected == False): + if(use_gpu): + if(vars.bmsupported): + vars.breakmodel = True + vars.usegpu = False + genselected = True + else: + vars.breakmodel = False + vars.usegpu = True + genselected = True + else: + vars.breakmodel = False + vars.usegpu = False + genselected = True + + # Ask for API key if InferKit was selected + if(vars.model == "InferKit"): + if(not 
path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): + # If the client settings file doesn't exist, create it + vars.apikey = key # Write API key to file + os.makedirs('settings', exist_ok=True) file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "w") try: + js = {"apikey": vars.apikey} file.write(json.dumps(js, indent=3)) finally: file.close() - -# Swap OAI Server if GooseAI was selected -if(vars.model == "GooseAI"): - vars.oaiengines = "https://api.goose.ai/v1/engines" - vars.model = "OAI" - args.configname = "GooseAI" - -# Ask for API key if OpenAI was selected -if(vars.model == "OAI"): - if not args.configname: - args.configname = "OAI" - if(not path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): - # If the client settings file doesn't exist, create it - print("{0}Please enter your API key:{1}\n".format(colors.CYAN, colors.END)) - vars.oaiapikey = input("Key> ") - # Write API key to file - os.makedirs('settings', exist_ok=True) - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "w") - try: - js = {"oaiapikey": vars.oaiapikey} - file.write(json.dumps(js, indent=3)) - finally: - file.close() - else: - # Otherwise open it up - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") - # Check if API key exists - js = json.load(file) - if("oaiapikey" in js and js["oaiapikey"] != ""): - # API key exists, grab it and close the file - vars.oaiapikey = js["oaiapikey"] - file.close() else: - # Get API key, add it to settings object, and write it to disk - print("{0}Please enter your API key:{1}\n".format(colors.CYAN, colors.END)) - vars.oaiapikey = input("Key> ") - js["oaiapikey"] = vars.oaiapikey + # Otherwise open it up + file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") + # Check if API key exists + js = json.load(file) + if("apikey" in js and js["apikey"] != ""): + # API key exists, grab it and close the file + vars.apikey = js["apikey"] + file.close() + else: + # Get API key, add it to settings object, and write it to disk + vars.apikey = key + js["apikey"] = vars.apikey + # Write API key to file + file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "w") + try: + file.write(json.dumps(js, indent=3)) + finally: + file.close() + + # Swap OAI Server if GooseAI was selected + if(vars.model == "GooseAI"): + vars.oaiengines = "https://api.goose.ai/v1/engines" + vars.model = "OAI" + args.configname = "GooseAI" + + # Ask for API key if OpenAI was selected + if(vars.model == "OAI"): + if not args.configname: + args.configname = "OAI" + if(not path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): + # If the client settings file doesn't exist, create it + vars.oaiapikey = key # Write API key to file + os.makedirs('settings', exist_ok=True) file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "w") try: + js = {"oaiapikey": vars.oaiapikey} file.write(json.dumps(js, indent=3)) finally: file.close() - - # Get list of models from OAI - print("{0}Retrieving engine list...{1}".format(colors.PURPLE, colors.END), end="") - req = requests.get( - vars.oaiengines, - headers = { - 'Authorization': 'Bearer '+vars.oaiapikey - } - ) - if(req.status_code == 200): - print("{0}OK!{1}".format(colors.GREEN, colors.END)) - print("{0}Please select an engine to use:{1}\n".format(colors.CYAN, colors.END)) - engines = req.json()["data"] - # Print list of engines - i = 0 - for en in engines: - print(" {0} - {1} 
({2})".format(i, en["id"], "\033[92mready\033[0m" if en["ready"] == True else "\033[91mnot ready\033[0m")) - i += 1 - # Get engine to use - print("") - engselected = False - while(engselected == False): - engine = input("Engine #> ") - if(engine.isnumeric() and int(engine) < len(engines)): - vars.oaiurl = vars.oaiengines + "/{0}/completions".format(engines[int(engine)]["id"]) - args.configname = args.configname + "/" + engines[int(engine)]["id"] - engselected = True + else: + # Otherwise open it up + file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") + # Check if API key exists + js = json.load(file) + if("oaiapikey" in js and js["oaiapikey"] != ""): + # API key exists, grab it and close the file + vars.oaiapikey = js["oaiapikey"] + file.close() else: - print("{0}Please enter a valid selection.{1}".format(colors.RED, colors.END)) - else: - # Something went wrong, print the message and quit since we can't initialize an engine - print("{0}ERROR!{1}".format(colors.RED, colors.END)) - print(req.json()) - quit() + # Get API key, add it to settings object, and write it to disk + vars.oaiapikey = key + js["oaiapikey"] = vars.oaiapikey + # Write API key to file + file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "w") + try: + file.write(json.dumps(js, indent=3)) + finally: + file.close() + + # Get list of models from OAI + print("{0}Retrieving engine list...{1}".format(colors.PURPLE, colors.END), end="") + req = requests.get( + vars.oaiengines, + headers = { + 'Authorization': 'Bearer '+vars.oaiapikey + } + ) + if(req.status_code == 200): + print("{0}OK!{1}".format(colors.GREEN, colors.END)) + print("{0}Please select an engine to use:{1}\n".format(colors.CYAN, colors.END)) + engines = req.json()["data"] + # Print list of engines + i = 0 + for en in engines: + print(" {0} - {1} ({2})".format(i, en["id"], "\033[92mready\033[0m" if en["ready"] == True else "\033[91mnot ready\033[0m")) + i += 1 + # Get engine to use + print("") + engselected = False + while(engselected == False): + engine = input("Engine #> ") + if(engine.isnumeric() and int(engine) < len(engines)): + vars.oaiurl = vars.oaiengines + "/{0}/completions".format(engines[int(engine)]["id"]) + args.configname = args.configname + "/" + engines[int(engine)]["id"] + engselected = True + else: + print("{0}Please enter a valid selection.{1}".format(colors.RED, colors.END)) + else: + # Something went wrong, print the message and quit since we can't initialize an engine + print("{0}ERROR!{1}".format(colors.RED, colors.END)) + print(req.json()) + quit() -# Ask for ngrok url if Google Colab was selected -if(vars.model == "Colab"): - if(vars.colaburl == ""): - print("{0}NOTE: For the modern KoboldAI Colab's you open the links directly in your browser.\nThis option is only for the KoboldAI Server API, not all features are supported in this mode.\n".format(colors.YELLOW, colors.END)) - print("{0}Enter the URL of the server (For example a trycloudflare link):{1}\n".format(colors.CYAN, colors.END)) - vars.colaburl = input("URL> ") + "/request" + # Ask for ngrok url if Google Colab was selected + if(vars.model == "Colab"): + if(vars.colaburl == ""): + print("{0}NOTE: For the modern KoboldAI Colab's you open the links directly in your browser.\nThis option is only for the KoboldAI Server API, not all features are supported in this mode.\n".format(colors.YELLOW, colors.END)) + print("{0}Enter the URL of the server (For example a trycloudflare link):{1}\n".format(colors.CYAN, colors.END)) + vars.colaburl = 
input("URL> ") + "/request" + + if(vars.model == "ReadOnly"): + vars.noai = True + + # Start transformers and create pipeline + if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]): + if(not vars.noai): + print("{0}Initializing transformers, please wait...{1}".format(colors.PURPLE, colors.END)) + from transformers import StoppingCriteria, GPT2TokenizerFast, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoTokenizer + for m in ("GPTJModel", "XGLMModel"): + try: + globals()[m] = getattr(__import__("transformers"), m) + except: + pass + import transformers.generation_utils + from transformers import __version__ as transformers_version + + # Lazy loader + import torch_lazy_loader + def get_lazy_load_callback(n_layers, convert_to_float16=True): + if not vars.lazy_load: + return -if(vars.model == "ReadOnly"): - vars.noai = True + from tqdm import tqdm -# Set logging level to reduce chatter from Flask -import logging -log = logging.getLogger('werkzeug') -log.setLevel(logging.ERROR) + if "breakmodel" in globals(): + gpu_blocks = breakmodel.gpu_blocks + ram_blocks = ram_blocks = n_layers - sum(gpu_blocks) + cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) + else: + ram_blocks = gpu_blocks = cumulative_gpu_blocks = None -# Start flask & SocketIO -print("{0}Initializing Flask... {1}".format(colors.PURPLE, colors.END), end="") -from flask import Flask, render_template, Response, request, copy_current_request_context -from flask_socketio import SocketIO, emit -app = Flask(__name__) -app.config['SECRET KEY'] = 'secret!' -socketio = SocketIO(app, async_method="eventlet") -print("{0}OK!{1}".format(colors.GREEN, colors.END)) + def lazy_load_callback(model_dict, f, **_): + device_map = {} -# Start transformers and create pipeline -if(not vars.use_colab_tpu and vars.model not in ["InferKit", "Colab", "OAI", "GooseAI" , "ReadOnly", "TPUMeshTransformerGPTJ"]): - if(not vars.noai): - print("{0}Initializing transformers, please wait...{1}".format(colors.PURPLE, colors.END)) - from transformers import StoppingCriteria, GPT2TokenizerFast, GPT2LMHeadModel, GPTNeoForCausalLM, GPTNeoModel, AutoModelForCausalLM, AutoTokenizer - for m in ("GPTJModel", "XGLMModel"): - try: - globals()[m] = getattr(__import__("transformers"), m) - except: - pass - import transformers.generation_utils - from transformers import __version__ as transformers_version - - # Lazy loader - import torch_lazy_loader - def get_lazy_load_callback(n_layers, convert_to_float16=True): - if not vars.lazy_load: - return + for _key, spec in lazy_load_spec.get("layer_weights", {}).items(): + for layer in range(n_layers): + key = _key.format(layer=layer) + if key not in model_dict: + continue + device = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" if not vars.hascuda or not vars.breakmodel or layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) + device_map[key] = device - from tqdm import tqdm + for key, value in model_dict.items(): + if isinstance(value, torch_lazy_loader.LazyTensor) and key not in device_map: + device_map[key] = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" + + with zipfile.ZipFile(f, "r") as z: + try: + last_storage_key = None + f = None + for key in tqdm(sorted(device_map.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)), desc="Loading model tensors"): + storage_key = model_dict[key].key + if storage_key != last_storage_key: + 
last_storage_key = storage_key + if isinstance(f, zipfile.ZipExtFile): + f.close() + f = z.open(f"archive/data/{storage_key}") + current_offset = f.tell() + if current_offset != model_dict[key].seek_offset: + f.seek(model_dict[key].seek_offset - current_offset, 1) + device = device_map[key] + #print(f"Transferring <{key}> to {'(CPU)' if device == 'cpu' else '[device ' + str(device) + ']'} ... ", end="", flush=True) + model_dict[key] = model_dict[key].materialize(f, map_location="cpu") + if convert_to_float16 and vars.hascuda and (vars.breakmodel or vars.usegpu) and model_dict[key].dtype is torch.float32: + model_dict[key] = model_dict[key].to(torch.float16) + if not vars.usegpu and not vars.breakmodel and model_dict[key].dtype is torch.float16: + model_dict[key] = model_dict[key].to(torch.float32) + model_dict[key] = model_dict[key].to(device) + #print("OK", flush=True) + finally: + if isinstance(f, zipfile.ZipExtFile): + f.close() + + return lazy_load_callback + + lazy_load_config_path = os.path.join(path.dirname(path.realpath(__file__)), "maps", vars.model_type + ".json") + if(vars.lazy_load and "model_config" in globals() and os.path.isfile(lazy_load_config_path)): + with open(lazy_load_config_path) as f: + lazy_load_spec = json.load(f) - if "breakmodel" in globals(): - gpu_blocks = breakmodel.gpu_blocks - ram_blocks = ram_blocks = n_layers - sum(gpu_blocks) - cumulative_gpu_blocks = tuple(itertools.accumulate(gpu_blocks)) else: - ram_blocks = gpu_blocks = cumulative_gpu_blocks = None + vars.lazy_load = False - def lazy_load_callback(model_dict, f, **_): - device_map = {} + # Some versions of transformers 4.17.0.dev0 are affected by + # https://github.com/huggingface/transformers/issues/15736 + # This is a workaround for those versions of transformers. 
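Inside lazy_load_callback, each layer tensor is routed to a device: the first ram_blocks layers stay on the CPU and the remainder are spread across GPUs by bisecting the cumulative breakmodel gpu_blocks counts. The core of that assignment, reduced to a stand-alone function (layer_device is an illustrative name):

    import bisect, itertools

    def layer_device(layer, n_layers, gpu_blocks):
        ram_blocks = n_layers - sum(gpu_blocks)
        cumulative = tuple(itertools.accumulate(gpu_blocks))
        if layer < ram_blocks:
            return "cpu"
        return bisect.bisect_right(cumulative, layer - ram_blocks)  # GPU index

    # e.g. 28 layers with gpu_blocks=[8, 9, 11] gives ram_blocks=0,
    # layers 0..7 -> GPU 0, 8..16 -> GPU 1, 17..27 -> GPU 2,
    # matching the --breakmodel_gpulayers 8,9,11 example in the help text.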
+ if(transformers_version == "4.17.0.dev0"): + try: + from transformers.models.xglm.modeling_xglm import XGLMSinusoidalPositionalEmbedding + except ImportError: + pass + else: + @torch.no_grad() + def new_forward(self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0): + bsz, seq_len = inputs_embeds.size()[:-1] + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + position_ids = torch.arange( + past_key_values_length + self.padding_idx + 1, past_key_values_length + sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ).unsqueeze(0).expand(input_shape).contiguous() + max_pos = self.padding_idx + 1 + seq_len + past_key_values_length + if max_pos > self.weights.size(0): + self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx) + return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach() + XGLMSinusoidalPositionalEmbedding.forward = new_forward + + # Patch transformers to use our soft prompt + def patch_causallm(cls): + old_forward = cls.forward + def new_causallm_forward(self, *args, **kwargs): + input_ids = kwargs.get('input_ids').to(self.device) + assert input_ids is not None + kwargs['input_ids'] = None + if(vars.sp is not None): + shifted_input_ids = input_ids - self.config.vocab_size + input_ids.clamp_(max=self.config.vocab_size-1) + if(hasattr(self, "transformer")): + inputs_embeds = self.transformer.wte(input_ids) + else: + inputs_embeds = self.model.embed_tokens(input_ids) + if(vars.sp is not None): + vars.sp = vars.sp.to(inputs_embeds.dtype).to(inputs_embeds.device) + inputs_embeds = torch.where( + (shifted_input_ids >= 0)[..., None], + vars.sp[shifted_input_ids.clamp(min=0)], + inputs_embeds, + ) + if(not hasattr(self, "transformer")): + inputs_embeds *= self.model.embed_scale + kwargs['inputs_embeds'] = inputs_embeds + return old_forward(self, *args, **kwargs) + cls.forward = new_causallm_forward + for cls in (GPT2LMHeadModel, GPTNeoForCausalLM): + patch_causallm(cls) + for c in ("GPTJForCausalLM", "XGLMForCausalLM"): + try: + patch_causallm(getattr(__import__("transformers"), c)) + except: + pass - for _key, spec in lazy_load_spec.get("layer_weights", {}).items(): - for layer in range(n_layers): - key = _key.format(layer=layer) - if key not in model_dict: - continue - device = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" if not vars.hascuda or not vars.breakmodel or layer < ram_blocks else bisect.bisect_right(cumulative_gpu_blocks, layer - ram_blocks) - device_map[key] = device - for key, value in model_dict.items(): - if isinstance(value, torch_lazy_loader.LazyTensor) and key not in device_map: - device_map[key] = vars.gpu_device if vars.hascuda and vars.usegpu else "cpu" + # Patch transformers to use our custom logit warpers + from transformers import LogitsProcessorList, LogitsWarper, LogitsProcessor, TopKLogitsWarper, TopPLogitsWarper, TemperatureLogitsWarper, RepetitionPenaltyLogitsProcessor + from warpers import AdvancedRepetitionPenaltyLogitsProcessor, TailFreeLogitsWarper - with zipfile.ZipFile(f, "r") as z: - try: - last_storage_key = None - f = None - for key in tqdm(sorted(device_map.keys(), key=lambda k: (model_dict[k].key, model_dict[k].seek_offset)), desc="Loading model tensors"): - storage_key = model_dict[key].key - if storage_key != last_storage_key: - last_storage_key = storage_key - if isinstance(f, zipfile.ZipExtFile): - f.close() - f = z.open(f"archive/data/{storage_key}") - 
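patch_causallm() splices soft prompts into the embedding path: token ids at or beyond the vocabulary size index into vars.sp instead of the model's embedding table, and torch.where merges the two. That core idea as a stand-alone sketch (splice_soft_prompt is an illustrative name; the XGLM embed_scale handling is omitted):

    import torch

    def splice_soft_prompt(input_ids, embed_tokens, soft_prompt, vocab_size):
        # ids >= vocab_size refer to soft-prompt rows 0..n_sp-1.
        shifted = input_ids - vocab_size
        clamped_ids = input_ids.clamp(max=vocab_size - 1)
        inputs_embeds = embed_tokens(clamped_ids)
        soft = soft_prompt.to(inputs_embeds.dtype).to(inputs_embeds.device)
        return torch.where(
            (shifted >= 0)[..., None],          # broadcast over the embedding dimension
            soft[shifted.clamp(min=0)],
            inputs_embeds,
        )

    # embed_tokens would be model.transformer.wte or model.model.embed_tokens,
    # and soft_prompt is the (n_tokens, hidden_size) tensor held in vars.sp.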
current_offset = f.tell() - if current_offset != model_dict[key].seek_offset: - f.seek(model_dict[key].seek_offset - current_offset, 1) - device = device_map[key] - #print(f"Transferring <{key}> to {'(CPU)' if device == 'cpu' else '[device ' + str(device) + ']'} ... ", end="", flush=True) - model_dict[key] = model_dict[key].materialize(f, map_location="cpu") - if convert_to_float16 and vars.hascuda and (vars.breakmodel or vars.usegpu) and model_dict[key].dtype is torch.float32: - model_dict[key] = model_dict[key].to(torch.float16) - if not vars.usegpu and not vars.breakmodel and model_dict[key].dtype is torch.float16: - model_dict[key] = model_dict[key].to(torch.float32) - model_dict[key] = model_dict[key].to(device) - #print("OK", flush=True) - finally: - if isinstance(f, zipfile.ZipExtFile): - f.close() - - return lazy_load_callback - - lazy_load_config_path = os.path.join(path.dirname(path.realpath(__file__)), "maps", vars.model_type + ".json") - if(vars.lazy_load and "model_config" in globals() and os.path.isfile(lazy_load_config_path)): - with open(lazy_load_config_path) as f: - lazy_load_spec = json.load(f) + def dynamic_processor_wrap(cls, field_name, var_name, cond=None): + old_call = cls.__call__ + def new_call(self, *args, **kwargs): + if(not isinstance(field_name, str) and isinstance(field_name, Iterable)): + conds = [] + for f, v in zip(field_name, var_name): + conds.append(getattr(vars, v)) + setattr(self, f, conds[-1]) + else: + conds = getattr(vars, var_name) + setattr(self, field_name, conds) + assert len(args) == 2 + if(cond is None or cond(conds)): + return old_call(self, *args, **kwargs) + return args[1] + cls.__call__ = new_call + dynamic_processor_wrap(AdvancedRepetitionPenaltyLogitsProcessor, ("penalty", "penalty_slope", "penalty_range"), ("rep_pen", "rep_pen_slope", "rep_pen_range"), cond=lambda x: x[0] != 1.0) + dynamic_processor_wrap(TopKLogitsWarper, "top_k", "top_k", cond=lambda x: x > 0) + dynamic_processor_wrap(TopPLogitsWarper, "top_p", "top_p", cond=lambda x: x < 1.0) + dynamic_processor_wrap(TailFreeLogitsWarper, "tfs", "tfs", cond=lambda x: x < 1.0) + dynamic_processor_wrap(TemperatureLogitsWarper, "temperature", "temp", cond=lambda x: x != 1.0) + RepetitionPenaltyLogitsProcessor.__init__ = AdvancedRepetitionPenaltyLogitsProcessor.__init__ + RepetitionPenaltyLogitsProcessor.__call__ = AdvancedRepetitionPenaltyLogitsProcessor.__call__ + + class LuaLogitsProcessor(LogitsProcessor): + + def __init__(self): + pass + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + assert scores.ndim == 2 + assert input_ids.ndim == 2 + self.regeneration_required = False + self.halt = False - else: - vars.lazy_load = False + scores_shape = scores.shape + scores_list = scores.tolist() + vars.lua_koboldbridge.logits = vars.lua_state.table() + for r, row in enumerate(scores_list): + vars.lua_koboldbridge.logits[r+1] = vars.lua_state.table(*row) + vars.lua_koboldbridge.vocab_size = scores_shape[-1] - # Some versions of transformers 4.17.0.dev0 are affected by - # https://github.com/huggingface/transformers/issues/15736 - # This is a workaround for those versions of transformers. 
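dynamic_processor_wrap() above rebinds each warper's __call__ so the current sampler setting is re-read from vars on every step and the warper is skipped when its condition says it would be a no-op. A minimal single-field version of the same idea (wrap_dynamic and settings are illustrative stand-ins):

    def wrap_dynamic(cls, field_name, settings, var_name, cond=None):
        old_call = cls.__call__
        def new_call(self, input_ids, scores):
            value = getattr(settings, var_name)
            setattr(self, field_name, value)     # refresh e.g. top_p just before use
            if cond is None or cond(value):
                return old_call(self, input_ids, scores)
            return scores                        # condition says "disabled": pass logits through
        cls.__call__ = new_call

    # e.g. wrap_dynamic(TopPLogitsWarper, "top_p", settings, "top_p",
    #                   cond=lambda x: x < 1.0)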
- if(transformers_version == "4.17.0.dev0"): - try: - from transformers.models.xglm.modeling_xglm import XGLMSinusoidalPositionalEmbedding - except ImportError: - pass - else: - @torch.no_grad() - def new_forward(self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0): - bsz, seq_len = inputs_embeds.size()[:-1] - input_shape = inputs_embeds.size()[:-1] - sequence_length = input_shape[1] - position_ids = torch.arange( - past_key_values_length + self.padding_idx + 1, past_key_values_length + sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device - ).unsqueeze(0).expand(input_shape).contiguous() - max_pos = self.padding_idx + 1 + seq_len + past_key_values_length - if max_pos > self.weights.size(0): - self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx) - return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach() - XGLMSinusoidalPositionalEmbedding.forward = new_forward - - # Patch transformers to use our soft prompt - def patch_causallm(cls): - old_forward = cls.forward - def new_causallm_forward(self, *args, **kwargs): - input_ids = kwargs.get('input_ids').to(self.device) - assert input_ids is not None - kwargs['input_ids'] = None - if(vars.sp is not None): - shifted_input_ids = input_ids - self.config.vocab_size - input_ids.clamp_(max=self.config.vocab_size-1) - if(hasattr(self, "transformer")): - inputs_embeds = self.transformer.wte(input_ids) - else: - inputs_embeds = self.model.embed_tokens(input_ids) - if(vars.sp is not None): - vars.sp = vars.sp.to(inputs_embeds.dtype).to(inputs_embeds.device) - inputs_embeds = torch.where( - (shifted_input_ids >= 0)[..., None], - vars.sp[shifted_input_ids.clamp(min=0)], - inputs_embeds, + execute_genmod() + + scores = torch.tensor( + tuple(tuple(row.values()) for row in vars.lua_koboldbridge.logits.values()), + device=scores.device, + dtype=scores.dtype, ) - if(not hasattr(self, "transformer")): - inputs_embeds *= self.model.embed_scale - kwargs['inputs_embeds'] = inputs_embeds - return old_forward(self, *args, **kwargs) - cls.forward = new_causallm_forward - for cls in (GPT2LMHeadModel, GPTNeoForCausalLM): - patch_causallm(cls) - for c in ("GPTJForCausalLM", "XGLMForCausalLM"): - try: - patch_causallm(getattr(__import__("transformers"), c)) - except: - pass - - - # Patch transformers to use our custom logit warpers - from transformers import LogitsProcessorList, LogitsWarper, LogitsProcessor, TopKLogitsWarper, TopPLogitsWarper, TemperatureLogitsWarper, RepetitionPenaltyLogitsProcessor - from warpers import AdvancedRepetitionPenaltyLogitsProcessor, TailFreeLogitsWarper - - def dynamic_processor_wrap(cls, field_name, var_name, cond=None): - old_call = cls.__call__ - def new_call(self, *args, **kwargs): - if(not isinstance(field_name, str) and isinstance(field_name, Iterable)): - conds = [] - for f, v in zip(field_name, var_name): - conds.append(getattr(vars, v)) - setattr(self, f, conds[-1]) - else: - conds = getattr(vars, var_name) - setattr(self, field_name, conds) - assert len(args) == 2 - if(cond is None or cond(conds)): - return old_call(self, *args, **kwargs) - return args[1] - cls.__call__ = new_call - dynamic_processor_wrap(AdvancedRepetitionPenaltyLogitsProcessor, ("penalty", "penalty_slope", "penalty_range"), ("rep_pen", "rep_pen_slope", "rep_pen_range"), cond=lambda x: x[0] != 1.0) - dynamic_processor_wrap(TopKLogitsWarper, "top_k", "top_k", cond=lambda x: x > 0) - 
dynamic_processor_wrap(TopPLogitsWarper, "top_p", "top_p", cond=lambda x: x < 1.0) - dynamic_processor_wrap(TailFreeLogitsWarper, "tfs", "tfs", cond=lambda x: x < 1.0) - dynamic_processor_wrap(TemperatureLogitsWarper, "temperature", "temp", cond=lambda x: x != 1.0) - RepetitionPenaltyLogitsProcessor.__init__ = AdvancedRepetitionPenaltyLogitsProcessor.__init__ - RepetitionPenaltyLogitsProcessor.__call__ = AdvancedRepetitionPenaltyLogitsProcessor.__call__ - - class LuaLogitsProcessor(LogitsProcessor): - - def __init__(self): - pass - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - assert scores.ndim == 2 - assert input_ids.ndim == 2 - self.regeneration_required = False - self.halt = False - - scores_shape = scores.shape - scores_list = scores.tolist() - vars.lua_koboldbridge.logits = vars.lua_state.table() - for r, row in enumerate(scores_list): - vars.lua_koboldbridge.logits[r+1] = vars.lua_state.table(*row) - vars.lua_koboldbridge.vocab_size = scores_shape[-1] - - execute_genmod() - - scores = torch.tensor( - tuple(tuple(row.values()) for row in vars.lua_koboldbridge.logits.values()), - device=scores.device, - dtype=scores.dtype, - ) - assert scores.shape == scores_shape + assert scores.shape == scores_shape - return scores - - def new_get_logits_processor(*args, **kwargs) -> LogitsProcessorList: - processors = new_get_logits_processor.old_get_logits_processor(*args, **kwargs) - processors.insert(0, LuaLogitsProcessor()) - return processors - new_get_logits_processor.old_get_logits_processor = transformers.generation_utils.GenerationMixin._get_logits_processor - transformers.generation_utils.GenerationMixin._get_logits_processor = new_get_logits_processor - - def new_get_logits_warper(beams: int = 1,) -> LogitsProcessorList: - warper_list = LogitsProcessorList() - warper_list.append(TopKLogitsWarper(top_k=1, min_tokens_to_keep=1 + (beams > 1))) - warper_list.append(TopPLogitsWarper(top_p=0.5, min_tokens_to_keep=1 + (beams > 1))) - warper_list.append(TailFreeLogitsWarper(tfs=0.5, min_tokens_to_keep=1 + (beams > 1))) - warper_list.append(TemperatureLogitsWarper(temperature=0.5)) - return warper_list - - def new_sample(self, *args, **kwargs): - assert kwargs.pop("logits_warper", None) is not None - kwargs["logits_warper"] = new_get_logits_warper( - beams=1, - ) - if(vars.newlinemode == "s"): - kwargs["eos_token_id"] = -1 - kwargs.setdefault("pad_token_id", 2) - return new_sample.old_sample(self, *args, **kwargs) - new_sample.old_sample = transformers.generation_utils.GenerationMixin.sample - transformers.generation_utils.GenerationMixin.sample = new_sample - - - # Allow bad words filter to ban <|endoftext|> token - import transformers.generation_logits_process - def new_init(self, bad_words_ids: List[List[int]], eos_token_id: int): - return new_init.old_init(self, bad_words_ids, -1) - new_init.old_init = transformers.generation_logits_process.NoBadWordsLogitsProcessor.__init__ - transformers.generation_logits_process.NoBadWordsLogitsProcessor.__init__ = new_init - - - # Sets up dynamic world info scanner - class DynamicWorldInfoScanCriteria(StoppingCriteria): - def __init__( - self, - tokenizer, - excluded_world_info: List[Set], - ): - self.regeneration_required = False - self.halt = False - self.tokenizer = tokenizer - self.excluded_world_info = excluded_world_info - def __call__( - self, - input_ids: torch.LongTensor, - scores: torch.FloatTensor, - **kwargs, - ) -> bool: - vars.generated_tkns += 1 - if(vars.lua_koboldbridge.generated_cols 
and vars.generated_tkns != vars.lua_koboldbridge.generated_cols): - raise RuntimeError(f"Inconsistency detected between KoboldAI Python and Lua backends ({vars.generated_tkns} != {vars.lua_koboldbridge.generated_cols})") - if(vars.abort or vars.generated_tkns >= vars.genamt): + return scores + + def new_get_logits_processor(*args, **kwargs) -> LogitsProcessorList: + processors = new_get_logits_processor.old_get_logits_processor(*args, **kwargs) + processors.insert(0, LuaLogitsProcessor()) + return processors + new_get_logits_processor.old_get_logits_processor = transformers.generation_utils.GenerationMixin._get_logits_processor + transformers.generation_utils.GenerationMixin._get_logits_processor = new_get_logits_processor + + def new_get_logits_warper(beams: int = 1,) -> LogitsProcessorList: + warper_list = LogitsProcessorList() + warper_list.append(TopKLogitsWarper(top_k=1, min_tokens_to_keep=1 + (beams > 1))) + warper_list.append(TopPLogitsWarper(top_p=0.5, min_tokens_to_keep=1 + (beams > 1))) + warper_list.append(TailFreeLogitsWarper(tfs=0.5, min_tokens_to_keep=1 + (beams > 1))) + warper_list.append(TemperatureLogitsWarper(temperature=0.5)) + return warper_list + + def new_sample(self, *args, **kwargs): + assert kwargs.pop("logits_warper", None) is not None + kwargs["logits_warper"] = new_get_logits_warper( + beams=1, + ) + if(vars.newlinemode == "s"): + kwargs["eos_token_id"] = -1 + kwargs.setdefault("pad_token_id", 2) + return new_sample.old_sample(self, *args, **kwargs) + new_sample.old_sample = transformers.generation_utils.GenerationMixin.sample + transformers.generation_utils.GenerationMixin.sample = new_sample + + + # Allow bad words filter to ban <|endoftext|> token + import transformers.generation_logits_process + def new_init(self, bad_words_ids: List[List[int]], eos_token_id: int): + return new_init.old_init(self, bad_words_ids, -1) + new_init.old_init = transformers.generation_logits_process.NoBadWordsLogitsProcessor.__init__ + transformers.generation_logits_process.NoBadWordsLogitsProcessor.__init__ = new_init + + + # Sets up dynamic world info scanner + class DynamicWorldInfoScanCriteria(StoppingCriteria): + def __init__( + self, + tokenizer, + excluded_world_info: List[Set], + ): self.regeneration_required = False self.halt = False - return True - - assert input_ids.ndim == 2 - assert len(self.excluded_world_info) == input_ids.shape[0] - self.regeneration_required = vars.lua_koboldbridge.regeneration_required - self.halt = not vars.lua_koboldbridge.generating - vars.lua_koboldbridge.regeneration_required = False - - for i in range(vars.numseqs): - vars.lua_koboldbridge.generated[i+1][vars.generated_tkns] = int(input_ids[i, -1].item()) - - if(not vars.dynamicscan): + self.tokenizer = tokenizer + self.excluded_world_info = excluded_world_info + def __call__( + self, + input_ids: torch.LongTensor, + scores: torch.FloatTensor, + **kwargs, + ) -> bool: + vars.generated_tkns += 1 + if(vars.lua_koboldbridge.generated_cols and vars.generated_tkns != vars.lua_koboldbridge.generated_cols): + raise RuntimeError(f"Inconsistency detected between KoboldAI Python and Lua backends ({vars.generated_tkns} != {vars.lua_koboldbridge.generated_cols})") + if(vars.abort or vars.generated_tkns >= vars.genamt): + self.regeneration_required = False + self.halt = False + return True + + assert input_ids.ndim == 2 + assert len(self.excluded_world_info) == input_ids.shape[0] + self.regeneration_required = vars.lua_koboldbridge.regeneration_required + self.halt = not vars.lua_koboldbridge.generating 
+ vars.lua_koboldbridge.regeneration_required = False + + for i in range(vars.numseqs): + vars.lua_koboldbridge.generated[i+1][vars.generated_tkns] = int(input_ids[i, -1].item()) + + if(not vars.dynamicscan): + return self.regeneration_required or self.halt + tail = input_ids[..., -vars.generated_tkns:] + for i, t in enumerate(tail): + decoded = utils.decodenewlines(tokenizer.decode(t)) + _, found = checkworldinfo(decoded, force_use_txt=True, actions=vars._actions) + found -= self.excluded_world_info[i] + if(len(found) != 0): + self.regeneration_required = True + break return self.regeneration_required or self.halt - tail = input_ids[..., -vars.generated_tkns:] - for i, t in enumerate(tail): - decoded = utils.decodenewlines(tokenizer.decode(t)) - _, found = checkworldinfo(decoded, force_use_txt=True, actions=vars._actions) - found -= self.excluded_world_info[i] - if(len(found) != 0): - self.regeneration_required = True - break - return self.regeneration_required or self.halt - old_get_stopping_criteria = transformers.generation_utils.GenerationMixin._get_stopping_criteria - def new_get_stopping_criteria(self, *args, **kwargs): - stopping_criteria = old_get_stopping_criteria(self, *args, **kwargs) - global tokenizer - self.kai_scanner = DynamicWorldInfoScanCriteria( - tokenizer=tokenizer, - excluded_world_info=self.kai_scanner_excluded_world_info, - ) - stopping_criteria.insert(0, self.kai_scanner) - return stopping_criteria - transformers.generation_utils.GenerationMixin._get_stopping_criteria = new_get_stopping_criteria + old_get_stopping_criteria = transformers.generation_utils.GenerationMixin._get_stopping_criteria + def new_get_stopping_criteria(self, *args, **kwargs): + stopping_criteria = old_get_stopping_criteria(self, *args, **kwargs) + global tokenizer + self.kai_scanner = DynamicWorldInfoScanCriteria( + tokenizer=tokenizer, + excluded_world_info=self.kai_scanner_excluded_world_info, + ) + stopping_criteria.insert(0, self.kai_scanner) + return stopping_criteria + transformers.generation_utils.GenerationMixin._get_stopping_criteria = new_get_stopping_criteria - def get_hidden_size_from_model(model): - try: - return int(model.transformer.hidden_size) - except: + def get_hidden_size_from_model(model): try: - return int(model.transformer.embed_dim) + return int(model.transformer.hidden_size) except: - return int(model.lm_head.in_features) - - def maybe_low_cpu_mem_usage() -> Dict[str, Any]: - if(packaging.version.parse(transformers_version) < packaging.version.parse("4.11.0")): - print(f"\nWARNING: Please upgrade to transformers 4.11.0 for lower RAM usage. You have transformers {transformers_version}.", file=sys.stderr) - return {} - return {"low_cpu_mem_usage": True} - - @contextlib.contextmanager - def maybe_use_float16(always_use=False): - if(always_use or (vars.hascuda and args.lowmem and (vars.usegpu or vars.breakmodel))): - original_dtype = torch.get_default_dtype() - torch.set_default_dtype(torch.float16) - yield True - torch.set_default_dtype(original_dtype) - else: - yield False - - # If custom GPT2 model was chosen - if(vars.model == "GPT2Custom"): - vars.lazy_load = False - model_config = open(vars.custmodpth + "/config.json", "r") - js = json.load(model_config) - with(maybe_use_float16()): - model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, cache_dir="cache/") - tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, cache_dir="cache/") - vars.modeldim = get_hidden_size_from_model(model) - # Is CUDA available? 
If so, use GPU, otherwise fall back to CPU - if(vars.hascuda and vars.usegpu): - model = model.half().to(vars.gpu_device) - generator = model.generate - else: - model = model.to('cpu').float() - generator = model.generate - # Use the Generic implementation - else: - lowmem = maybe_low_cpu_mem_usage() - # We must disable low_cpu_mem_usage (by setting lowmem to {}) if - # using a GPT-2 model because GPT-2 is not compatible with this - # feature yet - if(vars.model_type == "gpt2"): - lowmem = {} + try: + return int(model.transformer.embed_dim) + except: + return int(model.lm_head.in_features) - # If we're using torch_lazy_loader, we need to get breakmodel config - # early so that it knows where to load the individual model tensors - if(vars.lazy_load and vars.hascuda and vars.breakmodel): - device_config(model_config) - - # Download model from Huggingface if it does not exist, otherwise load locally + def maybe_low_cpu_mem_usage() -> Dict[str, Any]: + if(packaging.version.parse(transformers_version) < packaging.version.parse("4.11.0")): + print(f"\nWARNING: Please upgrade to transformers 4.11.0 for lower RAM usage. You have transformers {transformers_version}.", file=sys.stderr) + return {} + return {"low_cpu_mem_usage": True} - #If we specify a model and it's in the root directory, we need to move it to the models directory (legacy folder structure to new) - if os.path.isdir(vars.model.replace('/', '_')): - import shutil - shutil.move(vars.model.replace('/', '_'), "models/{}".format(vars.model.replace('/', '_'))) - with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(enable=vars.lazy_load, callback=get_lazy_load_callback(model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layer), dematerialized_modules=True): - if(vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time - lowmem = {} - if(os.path.isdir(vars.custmodpth)): - try: - tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, cache_dir="cache") - except ValueError as e: - tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache", **lowmem) - except ValueError as e: - model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache", **lowmem) - elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))): - try: - tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), cache_dir="cache") - except ValueError as e: - tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), cache_dir="cache", **lowmem) - except ValueError as e: - model = GPTNeoForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), cache_dir="cache", **lowmem) + @contextlib.contextmanager + def maybe_use_float16(always_use=False): + if(always_use or (vars.hascuda and args.lowmem and (vars.usegpu or vars.breakmodel))): + original_dtype = torch.get_default_dtype() + torch.set_default_dtype(torch.float16) + yield True + torch.set_default_dtype(original_dtype) else: - try: - tokenizer = AutoTokenizer.from_pretrained(vars.model, cache_dir="cache") - except ValueError as e: - tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, cache_dir="cache") - try: - model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache", **lowmem) - 
except ValueError as e: - model = GPTNeoForCausalLM.from_pretrained(vars.model, cache_dir="cache", **lowmem) - - if not args.colab: - import shutil - model = model.half() - model.save_pretrained("models/{}".format(vars.model.replace('/', '_'))) - tokenizer.save_pretrained("models/{}".format(vars.model.replace('/', '_'))) - shutil.rmtree("cache/") - - if(vars.hascuda): - if(vars.usegpu): - vars.modeldim = get_hidden_size_from_model(model) + yield False + + # If custom GPT2 model was chosen + if(vars.model == "GPT2Custom"): + vars.lazy_load = False + model_config = open(vars.custmodpth + "/config.json", "r") + js = json.load(model_config) + with(maybe_use_float16()): + model = GPT2LMHeadModel.from_pretrained(vars.custmodpth, cache_dir="cache/") + tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, cache_dir="cache/") + vars.modeldim = get_hidden_size_from_model(model) + # Is CUDA available? If so, use GPU, otherwise fall back to CPU + if(vars.hascuda and vars.usegpu): model = model.half().to(vars.gpu_device) generator = model.generate - elif(vars.breakmodel): # Use both RAM and VRAM (breakmodel) - vars.modeldim = get_hidden_size_from_model(model) - if(not vars.lazy_load): - device_config(model.config) - move_model_to_devices(model) else: model = model.to('cpu').float() - vars.modeldim = get_hidden_size_from_model(model) generator = model.generate + # Use the Generic implementation else: - model.to('cpu').float() - vars.modeldim = get_hidden_size_from_model(model) - generator = model.generate + lowmem = maybe_low_cpu_mem_usage() + # We must disable low_cpu_mem_usage (by setting lowmem to {}) if + # using a GPT-2 model because GPT-2 is not compatible with this + # feature yet + if(vars.model_type == "gpt2"): + lowmem = {} + + # If we're using torch_lazy_loader, we need to get breakmodel config + # early so that it knows where to load the individual model tensors + if(vars.lazy_load and vars.hascuda and vars.breakmodel): + device_config(model_config) + + # Download model from Huggingface if it does not exist, otherwise load locally + + #If we specify a model and it's in the root directory, we need to move it to the models directory (legacy folder structure to new) + if os.path.isdir(vars.model.replace('/', '_')): + import shutil + shutil.move(vars.model.replace('/', '_'), "models/{}".format(vars.model.replace('/', '_'))) + with maybe_use_float16(), torch_lazy_loader.use_lazy_torch_load(enable=vars.lazy_load, callback=get_lazy_load_callback(model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layer), dematerialized_modules=True): + if(vars.lazy_load): # torch_lazy_loader.py and low_cpu_mem_usage can't be used at the same time + lowmem = {} + if(os.path.isdir(vars.custmodpth)): + try: + tokenizer = AutoTokenizer.from_pretrained(vars.custmodpth, cache_dir="cache") + except ValueError as e: + tokenizer = GPT2TokenizerFast.from_pretrained(vars.custmodpth, cache_dir="cache") + try: + model = AutoModelForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache", **lowmem) + except ValueError as e: + model = GPTNeoForCausalLM.from_pretrained(vars.custmodpth, cache_dir="cache", **lowmem) + elif(os.path.isdir("models/{}".format(vars.model.replace('/', '_')))): + try: + tokenizer = AutoTokenizer.from_pretrained("models/{}".format(vars.model.replace('/', '_')), cache_dir="cache") + except ValueError as e: + tokenizer = GPT2TokenizerFast.from_pretrained("models/{}".format(vars.model.replace('/', '_')), cache_dir="cache") + try: + model = 
AutoModelForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), cache_dir="cache", **lowmem) + except ValueError as e: + model = GPTNeoForCausalLM.from_pretrained("models/{}".format(vars.model.replace('/', '_')), cache_dir="cache", **lowmem) + else: + try: + tokenizer = AutoTokenizer.from_pretrained(vars.model, cache_dir="cache") + except ValueError as e: + tokenizer = GPT2TokenizerFast.from_pretrained(vars.model, cache_dir="cache") + try: + model = AutoModelForCausalLM.from_pretrained(vars.model, cache_dir="cache", **lowmem) + except ValueError as e: + model = GPTNeoForCausalLM.from_pretrained(vars.model, cache_dir="cache", **lowmem) + + if not args.colab: + import shutil + model = model.half() + model.save_pretrained("models/{}".format(vars.model.replace('/', '_'))) + tokenizer.save_pretrained("models/{}".format(vars.model.replace('/', '_'))) + shutil.rmtree("cache/") + + if(vars.hascuda): + if(vars.usegpu): + vars.modeldim = get_hidden_size_from_model(model) + model = model.half().to(vars.gpu_device) + generator = model.generate + elif(vars.breakmodel): # Use both RAM and VRAM (breakmodel) + vars.modeldim = get_hidden_size_from_model(model) + if(not vars.lazy_load): + device_config(model.config) + move_model_to_devices(model) + else: + model = model.to('cpu').float() + vars.modeldim = get_hidden_size_from_model(model) + generator = model.generate + else: + model.to('cpu').float() + vars.modeldim = get_hidden_size_from_model(model) + generator = model.generate + + # Suppress Author's Note by flagging square brackets (Old implementation) + #vocab = tokenizer.get_vocab() + #vocab_keys = vocab.keys() + #vars.badwords = gettokenids("[") + #for key in vars.badwords: + # vars.badwordsids.append([vocab[key]]) + + print("{0}OK! {1} pipeline created!{2}".format(colors.GREEN, vars.model, colors.END)) - # Suppress Author's Note by flagging square brackets (Old implementation) - #vocab = tokenizer.get_vocab() - #vocab_keys = vocab.keys() - #vars.badwords = gettokenids("[") - #for key in vars.badwords: - # vars.badwordsids.append([vocab[key]]) - - print("{0}OK! 
{1} pipeline created!{2}".format(colors.GREEN, vars.model, colors.END)) - + else: + from transformers import GPT2TokenizerFast + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache/") else: - from transformers import GPT2TokenizerFast - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache/") -else: - def tpumtjgetsofttokens(): - soft_tokens = None - if(vars.sp is None): - global np - if 'np' not in globals(): - import numpy as np - tensor = np.zeros((1, tpu_mtj_backend.params["d_model"]), dtype=np.float32) - rows = tensor.shape[0] - padding_amount = tpu_mtj_backend.params["seq"] - (tpu_mtj_backend.params["seq"] % -tpu_mtj_backend.params["cores_per_replica"]) - rows - tensor = np.pad(tensor, ((0, padding_amount), (0, 0))) - tensor = tensor.reshape( - tpu_mtj_backend.params["cores_per_replica"], - -1, - tpu_mtj_backend.params["d_model"], + def tpumtjgetsofttokens(): + soft_tokens = None + if(vars.sp is None): + global np + if 'np' not in globals(): + import numpy as np + tensor = np.zeros((1, tpu_mtj_backend.params["d_model"]), dtype=np.float32) + rows = tensor.shape[0] + padding_amount = tpu_mtj_backend.params["seq"] - (tpu_mtj_backend.params["seq"] % -tpu_mtj_backend.params["cores_per_replica"]) - rows + tensor = np.pad(tensor, ((0, padding_amount), (0, 0))) + tensor = tensor.reshape( + tpu_mtj_backend.params["cores_per_replica"], + -1, + tpu_mtj_backend.params["d_model"], + ) + vars.sp = tpu_mtj_backend.shard_xmap(tensor) + soft_tokens = np.arange( + tpu_mtj_backend.params["n_vocab"] + tpu_mtj_backend.params["n_vocab_padding"], + tpu_mtj_backend.params["n_vocab"] + tpu_mtj_backend.params["n_vocab_padding"] + vars.sp_length, + dtype=np.uint32 ) - vars.sp = tpu_mtj_backend.shard_xmap(tensor) - soft_tokens = np.arange( - tpu_mtj_backend.params["n_vocab"] + tpu_mtj_backend.params["n_vocab_padding"], - tpu_mtj_backend.params["n_vocab"] + tpu_mtj_backend.params["n_vocab_padding"] + vars.sp_length, - dtype=np.uint32 - ) - return soft_tokens + return soft_tokens - def tpumtjgenerate_warper_callback(scores) -> "np.array": - scores_shape = scores.shape - scores_list = scores.tolist() - vars.lua_koboldbridge.logits = vars.lua_state.table() - for r, row in enumerate(scores_list): - vars.lua_koboldbridge.logits[r+1] = vars.lua_state.table(*row) - vars.lua_koboldbridge.vocab_size = scores_shape[-1] + def tpumtjgenerate_warper_callback(scores) -> "np.array": + scores_shape = scores.shape + scores_list = scores.tolist() + vars.lua_koboldbridge.logits = vars.lua_state.table() + for r, row in enumerate(scores_list): + vars.lua_koboldbridge.logits[r+1] = vars.lua_state.table(*row) + vars.lua_koboldbridge.vocab_size = scores_shape[-1] - execute_genmod() + execute_genmod() - scores = np.array( - tuple(tuple(row.values()) for row in vars.lua_koboldbridge.logits.values()), - dtype=scores.dtype, - ) - assert scores.shape == scores_shape + scores = np.array( + tuple(tuple(row.values()) for row in vars.lua_koboldbridge.logits.values()), + dtype=scores.dtype, + ) + assert scores.shape == scores_shape - return scores - - def tpumtjgenerate_stopping_callback(generated, n_generated, excluded_world_info) -> Tuple[List[set], bool, bool]: - vars.generated_tkns += 1 + return scores + + def tpumtjgenerate_stopping_callback(generated, n_generated, excluded_world_info) -> Tuple[List[set], bool, bool]: + vars.generated_tkns += 1 - assert len(excluded_world_info) == len(generated) - regeneration_required = vars.lua_koboldbridge.regeneration_required - halt = vars.abort or not 
vars.lua_koboldbridge.generating or vars.generated_tkns >= vars.genamt - vars.lua_koboldbridge.regeneration_required = False + assert len(excluded_world_info) == len(generated) + regeneration_required = vars.lua_koboldbridge.regeneration_required + halt = vars.abort or not vars.lua_koboldbridge.generating or vars.generated_tkns >= vars.genamt + vars.lua_koboldbridge.regeneration_required = False - global past + global past - for i in range(vars.numseqs): - vars.lua_koboldbridge.generated[i+1][vars.generated_tkns] = int(generated[i, tpu_mtj_backend.params["seq"] + n_generated - 1].item()) + for i in range(vars.numseqs): + vars.lua_koboldbridge.generated[i+1][vars.generated_tkns] = int(generated[i, tpu_mtj_backend.params["seq"] + n_generated - 1].item()) + + if(not vars.dynamicscan or halt): + return excluded_world_info, regeneration_required, halt - if(not vars.dynamicscan or halt): + for i, t in enumerate(generated): + decoded = utils.decodenewlines(tokenizer.decode(past[i])) + utils.decodenewlines(tokenizer.decode(t[tpu_mtj_backend.params["seq"] : tpu_mtj_backend.params["seq"] + n_generated])) + _, found = checkworldinfo(decoded, force_use_txt=True, actions=vars._actions) + found -= excluded_world_info[i] + if(len(found) != 0): + regeneration_required = True + break return excluded_world_info, regeneration_required, halt - for i, t in enumerate(generated): - decoded = utils.decodenewlines(tokenizer.decode(past[i])) + utils.decodenewlines(tokenizer.decode(t[tpu_mtj_backend.params["seq"] : tpu_mtj_backend.params["seq"] + n_generated])) - _, found = checkworldinfo(decoded, force_use_txt=True, actions=vars._actions) - found -= excluded_world_info[i] - if(len(found) != 0): - regeneration_required = True - break - return excluded_world_info, regeneration_required, halt + def tpumtjgenerate_compiling_callback() -> None: + print(colors.GREEN + "TPU backend compilation triggered" + colors.END) + vars.compiling = True - def tpumtjgenerate_compiling_callback() -> None: - print(colors.GREEN + "TPU backend compilation triggered" + colors.END) - vars.compiling = True + def tpumtjgenerate_stopped_compiling_callback() -> None: + vars.compiling = False + + def tpumtjgenerate_settings_callback() -> dict: + return { + "top_p": float(vars.top_p), + "temp": float(vars.temp), + "top_k": int(vars.top_k), + "tfs": float(vars.tfs), + "repetition_penalty": float(vars.rep_pen), + "rpslope": float(vars.rep_pen_slope), + "rprange": int(vars.rep_pen_range), + } - def tpumtjgenerate_stopped_compiling_callback() -> None: - vars.compiling = False + # If we're running Colab or OAI, we still need a tokenizer. 
+ if(vars.model == "Colab"): + from transformers import GPT2TokenizerFast + tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-2.7B", cache_dir="cache/") + loadsettings() + elif(vars.model == "OAI"): + from transformers import GPT2TokenizerFast + tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache/") + loadsettings() + # Load the TPU backend if requested + elif(vars.use_colab_tpu or vars.model == "TPUMeshTransformerGPTJ"): + print("{0}Initializing Mesh Transformer JAX, please wait...{1}".format(colors.PURPLE, colors.END)) + if vars.model == "TPUMeshTransformerGPTJ" and (not vars.custmodpth or not os.path.isdir(vars.custmodpth)): + raise FileNotFoundError(f"The specified model path {repr(vars.custmodpth)} is not the path to a valid folder") + import tpu_mtj_backend + tpu_mtj_backend.vars = vars + tpu_mtj_backend.warper_callback = tpumtjgenerate_warper_callback + tpu_mtj_backend.stopping_callback = tpumtjgenerate_stopping_callback + tpu_mtj_backend.compiling_callback = tpumtjgenerate_compiling_callback + tpu_mtj_backend.stopped_compiling_callback = tpumtjgenerate_stopped_compiling_callback + tpu_mtj_backend.settings_callback = tpumtjgenerate_settings_callback + vars.allowsp = True + loadmodelsettings() + loadsettings() + tpu_mtj_backend.load_model(vars.custmodpth, hf_checkpoint=vars.model != "TPUMeshTransformerGPTJ" and vars.use_colab_tpu, **vars.modelconfig) + vars.modeldim = int(tpu_mtj_backend.params["d_model"]) + tokenizer = tpu_mtj_backend.tokenizer + else: + loadsettings() + + lua_startup() + # Load scripts + load_lua_scripts() - def tpumtjgenerate_settings_callback() -> dict: - return { - "top_p": float(vars.top_p), - "temp": float(vars.temp), - "top_k": int(vars.top_k), - "tfs": float(vars.tfs), - "repetition_penalty": float(vars.rep_pen), - "rpslope": float(vars.rep_pen_slope), - "rprange": int(vars.rep_pen_range), - } - - # If we're running Colab or OAI, we still need a tokenizer. 
- if(vars.model == "Colab"): - from transformers import GPT2TokenizerFast - tokenizer = GPT2TokenizerFast.from_pretrained("EleutherAI/gpt-neo-2.7B", cache_dir="cache/") - loadsettings() - elif(vars.model == "OAI"): - from transformers import GPT2TokenizerFast - tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", cache_dir="cache/") - loadsettings() - # Load the TPU backend if requested - elif(vars.use_colab_tpu or vars.model == "TPUMeshTransformerGPTJ"): - print("{0}Initializing Mesh Transformer JAX, please wait...{1}".format(colors.PURPLE, colors.END)) - if vars.model == "TPUMeshTransformerGPTJ" and (not vars.custmodpth or not os.path.isdir(vars.custmodpth)): - raise FileNotFoundError(f"The specified model path {repr(vars.custmodpth)} is not the path to a valid folder") - import tpu_mtj_backend - tpu_mtj_backend.vars = vars - tpu_mtj_backend.warper_callback = tpumtjgenerate_warper_callback - tpu_mtj_backend.stopping_callback = tpumtjgenerate_stopping_callback - tpu_mtj_backend.compiling_callback = tpumtjgenerate_compiling_callback - tpu_mtj_backend.stopped_compiling_callback = tpumtjgenerate_stopped_compiling_callback - tpu_mtj_backend.settings_callback = tpumtjgenerate_settings_callback - vars.allowsp = True - loadmodelsettings() - loadsettings() - tpu_mtj_backend.load_model(vars.custmodpth, hf_checkpoint=vars.model != "TPUMeshTransformerGPTJ" and vars.use_colab_tpu, **vars.modelconfig) - vars.modeldim = int(tpu_mtj_backend.params["d_model"]) - tokenizer = tpu_mtj_backend.tokenizer - else: - loadsettings() + final_startup() + set_aibusy(False) # Set up Flask routes @app.route('/') @@ -1614,29 +1609,69 @@ def download(): #============================ LUA API =============================# -if(path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") - js = json.load(file) - if("userscripts" in js): - vars.userscripts = [] - for userscript in js["userscripts"]: - if type(userscript) is not str: - continue - userscript = userscript.strip() - if len(userscript) != 0 and all(q not in userscript for q in ("..", ":")) and all(userscript[0] not in q for q in ("/", "\\")) and os.path.exists(fileops.uspath(userscript)): - vars.userscripts.append(userscript) - if("corescript" in js and type(js["corescript"]) is str and all(q not in js["corescript"] for q in ("..", ":")) and all(js["corescript"][0] not in q for q in ("/", "\\"))): - vars.corescript = js["corescript"] - else: - vars.corescript = "default.lua" - file.close() +_bridged = {} +F = TypeVar("F", bound=Callable) +def lua_startup(): + global _bridged + global F + global bridged + if(path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): + file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") + js = json.load(file) + if("userscripts" in js): + vars.userscripts = [] + for userscript in js["userscripts"]: + if type(userscript) is not str: + continue + userscript = userscript.strip() + if len(userscript) != 0 and all(q not in userscript for q in ("..", ":")) and all(userscript[0] not in q for q in ("/", "\\")) and os.path.exists(fileops.uspath(userscript)): + vars.userscripts.append(userscript) + if("corescript" in js and type(js["corescript"]) is str and all(q not in js["corescript"] for q in ("..", ":")) and all(js["corescript"][0] not in q for q in ("/", "\\"))): + vars.corescript = js["corescript"] + else: + vars.corescript = "default.lua" + file.close() + + 
#==================================================================# + # Lua runtime startup + #==================================================================# + + print("", end="", flush=True) + print(colors.PURPLE + "Initializing Lua Bridge... " + colors.END, end="", flush=True) + + # Set up Lua state + vars.lua_state = lupa.LuaRuntime(unpack_returned_tuples=True) + + # Load bridge.lua + bridged = { + "corescript_path": os.path.join(os.path.dirname(os.path.realpath(__file__)), "cores"), + "userscript_path": os.path.join(os.path.dirname(os.path.realpath(__file__)), "userscripts"), + "config_path": os.path.join(os.path.dirname(os.path.realpath(__file__)), "userscripts"), + "lib_paths": vars.lua_state.table(os.path.join(os.path.dirname(os.path.realpath(__file__)), "lualibs"), os.path.join(os.path.dirname(os.path.realpath(__file__)), "extern", "lualibs")), + "vars": vars, + } + for kwarg in _bridged: + bridged[kwarg] = _bridged[kwarg] + try: + vars.lua_kobold, vars.lua_koboldcore, vars.lua_koboldbridge = vars.lua_state.globals().dofile(os.path.join(os.path.dirname(os.path.realpath(__file__)), "bridge.lua"))( + vars.lua_state.globals().python, + bridged, + ) + except lupa.LuaError as e: + print(colors.RED + "ERROR!" + colors.END) + vars.lua_koboldbridge.obliterate_multiverse() + print("{0}{1}{2}".format(colors.RED, "***LUA ERROR***: ", colors.END), end="", file=sys.stderr) + print("{0}{1}{2}".format(colors.RED, str(e).replace("\033", ""), colors.END), file=sys.stderr) + exit(1) + print(colors.GREEN + "OK!" + colors.END) + + def lua_log_format_name(name): return f"[{name}]" if type(name) is str else "CORE" -_bridged = {} -F = TypeVar("F", bound=Callable) def bridged_kwarg(name=None): + global F def _bridged_kwarg(f: F): _bridged[name if name is not None else f.__name__[4:] if f.__name__[:4] == "lua_" else f.__name__] = f return f @@ -2172,42 +2207,6 @@ def execute_outmod(): for k in vars.lua_deleted: inlinedelete(k) -#==================================================================# -# Lua runtime startup -#==================================================================# - -print("", end="", flush=True) -print(colors.PURPLE + "Initializing Lua Bridge... " + colors.END, end="", flush=True) - -# Set up Lua state -vars.lua_state = lupa.LuaRuntime(unpack_returned_tuples=True) - -# Load bridge.lua -bridged = { - "corescript_path": os.path.join(os.path.dirname(os.path.realpath(__file__)), "cores"), - "userscript_path": os.path.join(os.path.dirname(os.path.realpath(__file__)), "userscripts"), - "config_path": os.path.join(os.path.dirname(os.path.realpath(__file__)), "userscripts"), - "lib_paths": vars.lua_state.table(os.path.join(os.path.dirname(os.path.realpath(__file__)), "lualibs"), os.path.join(os.path.dirname(os.path.realpath(__file__)), "extern", "lualibs")), - "vars": vars, -} -for kwarg in _bridged: - bridged[kwarg] = _bridged[kwarg] -try: - vars.lua_kobold, vars.lua_koboldcore, vars.lua_koboldbridge = vars.lua_state.globals().dofile(os.path.join(os.path.dirname(os.path.realpath(__file__)), "bridge.lua"))( - vars.lua_state.globals().python, - bridged, - ) -except lupa.LuaError as e: - print(colors.RED + "ERROR!" + colors.END) - vars.lua_koboldbridge.obliterate_multiverse() - print("{0}{1}{2}".format(colors.RED, "***LUA ERROR***: ", colors.END), end="", file=sys.stderr) - print("{0}{1}{2}".format(colors.RED, str(e).replace("\033", ""), colors.END), file=sys.stderr) - exit(1) -print(colors.GREEN + "OK!" 
+ colors.END) - -# Load scripts -load_lua_scripts() - #============================ METHODS =============================# @@ -2528,6 +2527,12 @@ def get_message(msg): load_lua_scripts() unloaded, loaded = getuslist() sendUSStatItems() + elif(msg['cmd'] == 'list_model'): + sendModelSelection(menu=msg['data']) + elif(msg['cmd'] == 'load_model'): + load_model(use_gpu=msg['use_gpu'], key=msg['key']) + elif(msg['cmd'] == 'selectmodel'): + vars.model = msg['data'] elif(msg['cmd'] == 'loadselect'): vars.loadselect = msg["data"] elif(msg['cmd'] == 'spselect'): @@ -3793,10 +3798,16 @@ def refresh_settings(): def set_aibusy(state): if(state): vars.aibusy = True - emit('from_server', {'cmd': 'setgamestate', 'data': 'wait'}, broadcast=True) + try: + emit('from_server', {'cmd': 'setgamestate', 'data': 'wait'}, broadcast=True) + except: + pass else: vars.aibusy = False - emit('from_server', {'cmd': 'setgamestate', 'data': 'ready'}, broadcast=True) + try: + emit('from_server', {'cmd': 'setgamestate', 'data': 'ready'}, broadcast=True) + except: + pass #==================================================================# # @@ -5088,51 +5099,52 @@ def randomGameRequest(topic, memory=""): vars.memory = memory emit('from_server', {'cmd': 'setmemory', 'data': vars.memory}, broadcast=True) -# Prevent tokenizer from taking extra time the first time it's used -def __preempt_tokenizer(): - if("tokenizer" not in globals()): - return - utils.decodenewlines(tokenizer.decode([25678, 559])) - tokenizer.encode(utils.encodenewlines("eunoia")) -threading.Thread(target=__preempt_tokenizer).start() - -# Load soft prompt specified by the settings file, if applicable -if(path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): - file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") - js = json.load(file) - if(vars.allowsp and "softprompt" in js and type(js["softprompt"]) is str and all(q not in js["softprompt"] for q in ("..", ":")) and (len(js["softprompt"]) == 0 or all(js["softprompt"][0] not in q for q in ("/", "\\")))): - spRequest(js["softprompt"]) - else: - vars.spfilename = "" - file.close() - -# Precompile TPU backend if required -if(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ",)): - soft_tokens = tpumtjgetsofttokens() - if(vars.dynamicscan or (not vars.nogenmod and vars.has_genmod)): - threading.Thread( - target=tpu_mtj_backend.infer_dynamic, - args=(np.tile(np.uint32((23403, 727, 20185)), (vars.numseqs, 1)),), - kwargs={ - "soft_embeddings": vars.sp, - "soft_tokens": soft_tokens, - "gen_len": 1, - "use_callback": False, - "numseqs": vars.numseqs, - "excluded_world_info": list(set() for _ in range(vars.numseqs)), - }, - ).start() - else: - threading.Thread( - target=tpu_mtj_backend.infer_static, - args=(np.uint32((23403, 727, 20185)),), - kwargs={ - "soft_embeddings": vars.sp, - "soft_tokens": soft_tokens, - "gen_len": 1, - "numseqs": vars.numseqs, - }, - ).start() +def final_startup(): + # Prevent tokenizer from taking extra time the first time it's used + def __preempt_tokenizer(): + if("tokenizer" not in globals()): + return + utils.decodenewlines(tokenizer.decode([25678, 559])) + tokenizer.encode(utils.encodenewlines("eunoia")) + threading.Thread(target=__preempt_tokenizer).start() + + # Load soft prompt specified by the settings file, if applicable + if(path.exists("settings/" + getmodelname().replace('/', '_') + ".settings")): + file = open("settings/" + getmodelname().replace('/', '_') + ".settings", "r") + js = json.load(file) + if(vars.allowsp and 
"softprompt" in js and type(js["softprompt"]) is str and all(q not in js["softprompt"] for q in ("..", ":")) and (len(js["softprompt"]) == 0 or all(js["softprompt"][0] not in q for q in ("/", "\\")))): + spRequest(js["softprompt"]) + else: + vars.spfilename = "" + file.close() + + # Precompile TPU backend if required + if(vars.use_colab_tpu or vars.model in ("TPUMeshTransformerGPTJ",)): + soft_tokens = tpumtjgetsofttokens() + if(vars.dynamicscan or (not vars.nogenmod and vars.has_genmod)): + threading.Thread( + target=tpu_mtj_backend.infer_dynamic, + args=(np.tile(np.uint32((23403, 727, 20185)), (vars.numseqs, 1)),), + kwargs={ + "soft_embeddings": vars.sp, + "soft_tokens": soft_tokens, + "gen_len": 1, + "use_callback": False, + "numseqs": vars.numseqs, + "excluded_world_info": list(set() for _ in range(vars.numseqs)), + }, + ).start() + else: + threading.Thread( + target=tpu_mtj_backend.infer_static, + args=(np.uint32((23403, 727, 20185)),), + kwargs={ + "soft_embeddings": vars.sp, + "soft_tokens": soft_tokens, + "gen_len": 1, + "numseqs": vars.numseqs, + }, + ).start() def send_debug(): if vars.debug: @@ -5175,6 +5187,11 @@ def send_debug(): if __name__ == "__main__": print("{0}\nStarting webserver...{1}".format(colors.GREEN, colors.END), flush=True) + general_startup() + #show_select_model_list() + vars.model = "ReadOnly" + load_model() + # Start Flask/SocketIO (Blocking, so this must be last method!) #socketio.run(app, host='0.0.0.0', port=5000) diff --git a/static/application.js b/static/application.js index 311665796..14173e21e 100644 --- a/static/application.js +++ b/static/application.js @@ -7,6 +7,7 @@ var socket; // UI references for jQuery var connect_status; +var button_loadmodel; var button_newgame; var button_rndgame; var button_save; @@ -55,6 +56,7 @@ var savepins; var topic; var saveas_accept; var saveas_close; +var loadmodelpopup; var loadpopup; var loadcontent; var load_accept; @@ -890,6 +892,17 @@ function sendSaveAsRequest() { socket.send({'cmd': 'saveasrequest', 'data': {"name": saveasinput.val(), "pins": savepins.val()}}); } +function showLoadModelPopup() { + loadmodelpopup.removeClass("hidden"); + loadmodelpopup.addClass("flex"); +} + +function hideLoadModelPopup() { + loadmodelpopup.removeClass("flex"); + loadmodelpopup.addClass("hidden"); + loadmodelcontent.html(""); +} + function showLoadPopup() { loadpopup.removeClass("hidden"); loadpopup.addClass("flex"); @@ -923,6 +936,46 @@ function hideUSPopup() { spcontent.html(""); } + +function buildLoadModelList(ar) { + disableButtons([load_model_accept]); + loadmodelcontent.html(""); + var i; + for(i=0; i\ +
" + if(ar[i][3]) { + html = html + "" + } else { + html = html + "
" + } + html = html + "
\ +
\ +
"+ar[i][0]+"
\ +
"+ar[i][2]+"
\ +
\ + " + loadmodelcontent.append(html); + if(ar[i][3]) { + $("#loadmodel"+i).off("click").on("click", (function () { + return function () { + socket.send({'cmd': 'list_model', 'data': $(this).attr("name")}); + disableButtons([load_model_accept]); + } + })(i)); + } else { + $("#loadmodel"+i).off("click").on("click", (function () { + return function () { + socket.send({'cmd': 'selectmodel', 'data': $(this).attr("name")}); + highlightLoadLine($(this)); + enableButtons([load_model_accept]); + } + })(i)); + } + } +} + function buildLoadList(ar) { disableButtons([load_accept]); loadcontent.html(""); @@ -1771,6 +1824,7 @@ $(document).ready(function(){ // Bind UI references connect_status = $('#connectstatus'); + button_loadmodel = $('#btn_loadmodel'); button_newgame = $('#btn_newgame'); button_rndgame = $('#btn_rndgame'); button_save = $('#btn_save'); @@ -1823,9 +1877,13 @@ $(document).ready(function(){ saveas_accept = $("#btn_saveasaccept"); saveas_close = $("#btn_saveasclose"); loadpopup = $("#loadcontainer"); + loadmodelpopup = $("#loadmodelcontainer"); loadcontent = $("#loadlistcontent"); + loadmodelcontent = $("#loadmodellistcontent"); load_accept = $("#btn_loadaccept"); load_close = $("#btn_loadclose"); + load_model_accept = $("#btn_loadmodelaccept"); + load_model_close = $("#btn_loadmodelclose"); sppopup = $("#spcontainer"); spcontent = $("#splistcontent"); sp_accept = $("#btn_spaccept"); @@ -2313,6 +2371,18 @@ $(document).ready(function(){ } else { debug_area.addClass("hidden"); } + } else if(msg.cmd == 'show_model_menu') { + if(msg.menu == 'gpt2list') { + $("#use_gpu_div").removeClass("hidden") + } else { + $("#use_gpu_div").addClass("hidden") + } + if(msg.menu == 'apilist') { + $("#modelkey").removeClass("hidden") + } else { + $("#modelkey").addClass("hidden") + } + buildLoadModelList(msg.data); } }); @@ -2511,12 +2581,23 @@ $(document).ready(function(){ hideLoadPopup(); }); + load_model_close.on("click", function(ev) { + hideLoadModelPopup(); + }); + load_accept.on("click", function(ev) { hideMessage(); newly_loaded = true; socket.send({'cmd': 'loadrequest', 'data': ''}); hideLoadPopup(); }); + + load_model_accept.on("click", function(ev) { + hideMessage(); + socket.send({'cmd': 'load_model', 'use_gpu': $('#use_gpu')[0].checked, 'key': $('#modelkey')[0].value}); + loadmodelcontent.html(""); + hideLoadModelPopup(); + }); sp_close.on("click", function(ev) { hideSPPopup(); @@ -2540,6 +2621,11 @@ $(document).ready(function(){ hideUSPopup(); }); + button_loadmodel.on("click", function(ev) { + showLoadModelPopup(); + socket.send({'cmd': 'list_model', 'data': 'mainmenu'}); + }); + button_newgame.on("click", function(ev) { if(connected) { showNewStoryPopup(); diff --git a/templates/index.html b/templates/index.html index 6db5c093c..e53fd02e5 100644 --- a/templates/index.html +++ b/templates/index.html @@ -33,6 +33,12 @@ +