fix vocab_parallel_embedding sharding
Signed-off-by: NickLucche <[email protected]>
NickLucche committed Jan 20, 2025
1 parent 4001ea1 commit 1191485
Showing 4 changed files with 48 additions and 4 deletions.
42 changes: 42 additions & 0 deletions tests/models/decoder_only/language/test_models.py
@@ -87,3 +87,45 @@ def print_model(model):
         name_0="hf",
         name_1="vllm",
     )
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        pytest.param("cognitivecomputations/TinyDolphin-2.8-1.1b"),  # testing VocabParallelEmbedding crash
+    ])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("tp", [2])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_tp_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    tp: int,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
+
+    with vllm_runner(model, dtype=dtype, tensor_parallel_size=tp) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        def print_model(model):
+            print(model)
+
+        vllm_model.apply_model(print_model)
+
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
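The test above drives model loading with tensor_parallel_size=2, where VocabParallelEmbedding has to split its embedding table row-wise across ranks; that load path is what previously crashed. As orientation only, here is a minimal sketch of that row-wise shard selection. It is not part of the commit, and the vocabulary size, hidden size, and padding unit below are illustrative assumptions rather than values taken from the model.

import torch

# Illustrative numbers only; not taken from the commit or the model card.
vocab_size, hidden_size, tp_size = 32001, 2048, 2

# Pad the vocabulary so it splits evenly across ranks (a 64-row padding
# unit is assumed here).
padded_vocab = ((vocab_size + 63) // 64) * 64   # 32064
shard_rows = padded_vocab // tp_size            # 16032 rows per rank

checkpoint_weight = torch.randn(vocab_size, hidden_size)

for rank in range(tp_size):
    start_idx = rank * shard_rows
    # Rows actually present in the (unpadded) checkpoint for this rank;
    # the remainder of the shard is padding.
    rows = max(0, min(shard_rows, vocab_size - start_idx))
    shard = checkpoint_weight.narrow(0, start_idx, rows)
    print(rank, tuple(shard.shape))  # rank 0: (16032, 2048), rank 1: (15969, 2048)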
2 changes: 1 addition & 1 deletion vllm/config.py
@@ -277,7 +277,7 @@ def __init__(self,
         self.max_logprobs = max_logprobs
         self.disable_sliding_window = disable_sliding_window
         self.skip_tokenizer_init = skip_tokenizer_init
-
+        # breakpoint()
         hf_config = get_config(self.model, trust_remote_code, revision,
                                 code_revision, config_format)

4 changes: 2 additions & 2 deletions vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -355,7 +355,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         elif isinstance(param, UninitializedParameter):
             shape = list(loaded_weight.shape)
             if output_dim is not None:
-                shape[output_dim] = shape[output_dim] // self.tp_size
+                shape[output_dim] = self.num_embeddings_per_partition
             param.materialize(tuple(shape), dtype=loaded_weight.dtype)

         # If parameter does not have output dim, then it should
@@ -381,7 +381,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         else:
             assert loaded_weight.shape[output_dim] == self.org_vocab_size

-        # Copy the data.
+        # Copy the data. Select chunk corresponding to current shard.
         loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)

         if current_platform.is_hpu():
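The functional change in this file is the shape used to materialize the lazily initialized embedding parameter. Dividing the checkpoint's vocabulary dimension by tp_size ignores the padding that makes the vocabulary divisible across ranks, so the materialized parameter can be smaller than the shard later copied into it. A minimal sketch of that mismatch follows, assuming (purely for illustration) a 32001-token checkpoint vocabulary, padding to a multiple of 64, and tensor_parallel_size=2; none of these numbers come from the commit.

vocab_size, tp_size = 32001, 2                          # assumed, for illustration
padded_vocab = ((vocab_size + 63) // 64) * 64           # 32064, with an assumed 64-row padding unit
num_embeddings_per_partition = padded_vocab // tp_size  # 16032 rows per rank

# Old behaviour: size the lazy parameter from the raw checkpoint shape.
buggy_rows = vocab_size // tp_size                      # 16000

assert buggy_rows != num_embeddings_per_partition       # 16000 vs 16032
# Copying a 16032-row shard into a parameter materialized with only
# 16000 rows fails, which is consistent with the crash the new test exercises.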
4 changes: 3 additions & 1 deletion vllm/transformers_utils/config.py
@@ -206,7 +206,7 @@ def get_config(
             token=HF_TOKEN,
             **kwargs,
         )
-
+        # config_dict["model_type"] = "granite"
         # Use custom model class if it's in our registry
         model_type = config_dict.get("model_type")
         if model_type in _CONFIG_REGISTRY:
@@ -228,6 +228,7 @@ def get_config(
                     token=HF_TOKEN,
                     **kwargs,
                 )
+                # config.model_type = 'granite'
             except ValueError as e:
                 if (not trust_remote_code
                         and "requires you to execute the configuration file"
@@ -252,6 +253,7 @@ def get_config(
         if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
             raise RuntimeError(
                 f"Can't get gguf config for {config.model_type}.")
+        # model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES['granite']
         model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type]
         config.update({"architectures": [model_type]})
