
Commit

Some renaming
ElizaWszola committed Jun 25, 2024
1 parent d3665fa commit 4bcfde6
Showing 3 changed files with 5 additions and 4 deletions.
1 change: 1 addition & 0 deletions vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -331,6 +331,7 @@ def get_default_config(
     }
     return config
 
+
 def fused_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
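
Only the start of fused_topk's signature is visible in the hunk above. As background, here is a minimal pure-PyTorch sketch of the expert-selection step that a fused top-k routine of this shape typically performs (softmax over the gating logits, then top-k). The topk and renormalize parameters and the name fused_topk_reference are illustrative assumptions; the real vLLM function dispatches to fused CUDA kernels rather than this eager code.

import torch

def fused_topk_reference(hidden_states: torch.Tensor,
                         gating_output: torch.Tensor,
                         topk: int,
                         renormalize: bool):
    # hidden_states is accepted only to mirror the visible signature; the
    # routing decision itself depends on the gating logits.
    scores = torch.softmax(gating_output.float(), dim=-1)
    topk_weights, topk_ids = torch.topk(scores, topk, dim=-1)
    if renormalize:
        # Rescale the selected expert weights so they sum to 1 per token.
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
    return topk_weights, topk_ids
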
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -33,7 +33,7 @@ def get_scale_perms(num_bits)
             [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
     return scale_perm, scale_perm_single
 
-def marlin_permute_scales_2(s, size_k, size_n, group_size, num_bits):
+def marlin_permute_scales_numbits(s, size_k, size_n, group_size, num_bits):
     scale_perm, scale_perm_single = get_scale_perms(num_bits)
     if group_size < size_k and group_size != -1:
         s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
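
The hunk above shows only fragments of get_scale_perms and the renamed marlin_permute_scales_numbits. Below is a self-contained sketch of how the pair plausibly fits together; the scale_perm construction, the else branch, and the final reshape are reconstructions modeled on the upstream vLLM Marlin utilities, not lines taken from this commit, and the permutations shown correspond to the 4-bit case.

import torch

def get_scale_perms(num_bits: int):
    # Permutation applied to grouped scales for the Marlin weight layout.
    # num_bits is accepted to match the renamed API; this sketch only shows
    # the 4-bit permutation (assumption).
    scale_perm = []
    for i in range(8):
        scale_perm.extend([i + 8 * j for j in range(8)])
    # Permutation applied when there is a single scale group per channel.
    scale_perm_single = []
    for i in range(4):
        scale_perm_single.extend(
            [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]])
    return scale_perm, scale_perm_single


def marlin_permute_scales_numbits(s: torch.Tensor, size_k: int, size_n: int,
                                  group_size: int, num_bits: int) -> torch.Tensor:
    scale_perm, scale_perm_single = get_scale_perms(num_bits)
    if group_size < size_k and group_size != -1:
        # Grouped scales: permute within blocks of len(scale_perm) entries.
        s = s.reshape((-1, len(scale_perm)))[:, scale_perm]
    else:
        # Channel-wise (single-group) scales use the smaller permutation.
        s = s.reshape((-1, len(scale_perm_single)))[:, scale_perm_single]
    return s.reshape((-1, size_n)).contiguous()
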
6 changes: 3 additions & 3 deletions vllm/model_executor/models/mixtral_quant.py
@@ -53,7 +53,7 @@
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import SamplerOutput
 
-from vllm.model_executor.layers.quantization.utils.marlin_utils import marlin_permute_scales_2
+from vllm.model_executor.layers.quantization.utils.marlin_utils import marlin_permute_scales_numbits
 
 
 class MixtralMLP(nn.Module):
@@ -158,15 +158,15 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         g_idx_sort_idx = torch.empty(0, dtype=torch.int, device=w13_qw.device)
         w13_qw = ops.gptq_marlin_repack(w13_qw, g_idx_sort_idx, size_k,
                                         size_n, self.quant_config.weight_bits)
-        w13_s = marlin_permute_scales_2(w13_s, size_k, size_n,
+        w13_s = marlin_permute_scales_numbits(w13_s, size_k, size_n,
                                         self.quant_config.group_size,
                                         self.quant_config.weight_bits)
 
         size_k = w2_qw.shape[0] * 8
         size_n = w2_qw.shape[1]
         w2_qw = ops.gptq_marlin_repack(w2_qw, g_idx_sort_idx, size_k,
                                        size_n, self.quant_config.weight_bits)
-        w2_s = marlin_permute_scales_2(w2_s, size_k, size_n,
+        w2_s = marlin_permute_scales_numbits(w2_s, size_k, size_n,
                                        self.quant_config.group_size,
                                        self.quant_config.weight_bits)

Expand Down

4 comments on commit 4bcfde6

@github-actions

bigger_is_better

Benchmark suite (Current: 4bcfde6, Previous: 9b2e107):
request_throughput: Current 2.4831267624141535 prompts/s
token_throughput: Current 953.5206767670351 tokens/s
Configuration: VLLM Engine throughput - synthetic; model NousResearch/Llama-2-7b-chat-hf; max_model_len 4096; benchmark_throughput with use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000; GPU NVIDIA L4 x 1; vllm 0.5.0; Python 3.11.4 [GCC 11.3.0]; torch 2.3.0+cu121

This comment was automatically generated by workflow using github-action-benchmark.
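
For reference, the configuration recorded above roughly corresponds to a manual run of vLLM's throughput benchmark script; the script path and flag names below are assumptions inferred from the recorded parameters, not commands taken from this page.

import subprocess

# Hypothetical reproduction of the benchmarked configuration (path and flags assumed).
cmd = [
    "python", "benchmarks/benchmark_throughput.py",
    "--model", "NousResearch/Llama-2-7b-chat-hf",
    "--max-model-len", "4096",
    "--input-len", "256",
    "--output-len", "128",
    "--num-prompts", "1000",
]
subprocess.run(cmd, check=True)
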

@github-actions

bigger_is_better

Benchmark suite (Current: 4bcfde6, Previous: 9b2e107, Ratio):
request_throughput: Current 2.4531545551203555 prompts/s, Previous 2.477025972502708 prompts/s, Ratio 1.01
token_throughput: Current 942.0113491662165 tokens/s, Previous 951.1779734410399 tokens/s, Ratio 1.01
Configuration: VLLM Engine throughput - synthetic; model NousResearch/Llama-2-7b-chat-hf; max_model_len 4096; benchmark_throughput with use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000; GPU NVIDIA L4 x 1; vllm 0.5.0; Python 3.10.12 [GCC 11.3.0]; torch 2.3.0+cu121

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

bigger_is_better

Benchmark suite (Current: 4bcfde6, Previous: 9b2e107):
request_throughput: Current 2.4472341907639117 prompts/s
token_throughput: Current 939.7379292533421 tokens/s
Configuration: VLLM Engine throughput - synthetic; model NousResearch/Llama-2-7b-chat-hf; max_model_len 4096; benchmark_throughput with use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000; GPU NVIDIA L4 x 1; vllm 0.5.0; Python 3.8.17 [GCC 11.3.0]; torch 2.3.0+cu121

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

bigger_is_better

Benchmark suite (Current: 4bcfde6, Previous: 9b2e107):
request_throughput: Current 2.452244381869479 prompts/s
token_throughput: Current 941.66184263788 tokens/s
Configuration: VLLM Engine throughput - synthetic; model NousResearch/Llama-2-7b-chat-hf; max_model_len 4096; benchmark_throughput with use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000; GPU NVIDIA L4 x 1; vllm 0.5.0; Python 3.9.17 [GCC 11.3.0]; torch 2.3.0+cu121

This comment was automatically generated by workflow using github-action-benchmark.
