From 88800517bed240d437e9374743ae67b6d3a9e7ce Mon Sep 17 00:00:00 2001 From: lucidrains Date: Sat, 21 Dec 2024 08:59:54 -0800 Subject: [PATCH] note --- README.md | 2 +- x_transformers/x_transformers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6e0a2e68..6137949c 100644 --- a/README.md +++ b/README.md @@ -2240,7 +2240,7 @@ ids_out, num_out, is_number_mask = model.generate(start_ids, start_nums, 17) } ``` -``` +```bibtex @article{Yang2017BreakingTS, title = {Breaking the Softmax Bottleneck: A High-Rank RNN Language Model}, author = {Zhilin Yang and Zihang Dai and Ruslan Salakhutdinov and William W. Cohen}, diff --git a/x_transformers/x_transformers.py b/x_transformers/x_transformers.py index f9b7cae5..807fbf60 100644 --- a/x_transformers/x_transformers.py +++ b/x_transformers/x_transformers.py @@ -1650,7 +1650,7 @@ def __init__( unet_skips = False, num_residual_streams = 1, reinject_input = False, # seen first in DEQ paper https://arxiv.org/abs/1909.01377, but later used in a number of papers trying to achieve depthwise generalization https://arxiv.org/abs/2410.03020v1 - add_value_residual = False, # resformer from Zhou et al - https://arxiv.org/abs/2410.17897v1 + add_value_residual = False, # resformer from Zhou et al - https://arxiv.org/abs/2410.17897v1 - further corroboration by https://arxiv.org/abs/2412.15113 (faster emergence of ICL) - looks like this setting may be becoming a necessity for every transformer soon learned_value_residual_mix = True, # seeing big improvements when the value residual mix value is learned per token - credit goes to @faresobeid for taking the first step with learned scalar mix, then @Blinkdl for taking it a step further with data dependent. here we will use per token learned rel_pos_kwargs: dict = dict(), **kwargs