Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Misc] Add Gamma-Distribution Request Generation Support for Serving Benchmark. #10105

Merged
merged 3 commits into from
Nov 7, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 50 additions & 5 deletions benchmarks/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,17 +297,41 @@ def sample_random_requests(
async def get_request(
input_requests: List[Tuple[str, int, int]],
request_rate: float,
burstiness: float = 1.0,
) -> AsyncGenerator[Tuple[str, int, int], None]:
"""
Asynchronously generates requests at a specified rate
with OPTIONAL burstiness.

Args:
input_requests:
A list of input requests, each represented as a tuple.
request_rate:
The rate at which requests are generated (requests/s).
burstiness (optional):
The burstiness factor of the request generation.
Only takes effect when request_rate is not inf.
Default value is 1, which follows a Poisson process.
Otherwise, the request intervals follow a gamma distribution.
A lower burstiness value (0 < burstiness < 1) results
in more bursty requests, while a higher burstiness value
(burstiness > 1) results in a more uniform arrival of requests.
"""
input_requests = iter(input_requests)

# Calculate scale parameter theta to maintain the desired request_rate.
theta = 1.0 / (request_rate * burstiness)
spliii marked this conversation as resolved.
Show resolved Hide resolved

for request in input_requests:
yield request

if request_rate == float("inf"):
# If the request rate is infinity, then we don't need to wait.
continue

# Sample the request interval from the exponential distribution.
interval = np.random.exponential(1.0 / request_rate)
# Sample the request interval from the gamma distribution.
# If burstiness is 1, it follows exponential distribution.
interval = np.random.gamma(shape=burstiness, scale=theta)
# The next request will be sent after the interval.
await asyncio.sleep(interval)

Expand Down Expand Up @@ -426,6 +450,7 @@ async def benchmark(
logprobs: Optional[int],
best_of: int,
request_rate: float,
burstiness: float,
disable_tqdm: bool,
profile: bool,
selected_percentile_metrics: List[str],
Expand Down Expand Up @@ -480,7 +505,13 @@ async def benchmark(
if profile_output.success:
print("Profiler started")

if burstiness == 1.0:
distribution = "Poisson process"
else:
distribution = "Gamma distribution"

print(f"Traffic request rate: {request_rate}")
print(f"Burstiness factor: {burstiness} ({distribution})")
print(f"Maximum request concurrency: {max_concurrency}")

pbar = None if disable_tqdm else tqdm(total=len(input_requests))
Expand All @@ -502,7 +533,7 @@ async def limited_request_func(request_func_input, pbar):

benchmark_start_time = time.perf_counter()
tasks: List[asyncio.Task] = []
async for request in get_request(input_requests, request_rate):
async for request in get_request(input_requests, request_rate, burstiness):
prompt, prompt_len, output_len, mm_content = request
request_func_input = RequestFuncInput(model=model_id,
prompt=prompt,
Expand Down Expand Up @@ -769,6 +800,7 @@ def main(args: argparse.Namespace):
logprobs=args.logprobs,
best_of=args.best_of,
request_rate=args.request_rate,
burstiness=args.burstiness,
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
Expand Down Expand Up @@ -807,6 +839,7 @@ def main(args: argparse.Namespace):
# Traffic
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf")
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency

# Merge with benchmark result
Expand Down Expand Up @@ -922,8 +955,20 @@ def main(args: argparse.Namespace):
default=float("inf"),
help="Number of requests per second. If this is inf, "
"then all the requests are sent at time 0. "
"Otherwise, we use Poisson process to synthesize "
"the request arrival times.",
"Otherwise, we use Poisson process or gamma distribution "
"to synthesize the request arrival times.",
)
parser.add_argument(
"--burstiness",
type=float,
default=1.0,
help="Burstiness factor of the request generation. "
"Only take effect when request_rate is not inf. "
"Default value is 1, which follows Poisson process. "
"Otherwise, the request intervals follow a gamma distribution. "
"A lower burstiness value (0 < burstiness < 1) results in more "
"bursty requests. A higher burstiness value (burstiness > 1) "
"results in a more uniform arrival of requests.",
)
parser.add_argument("--seed", type=int, default=0)
parser.add_argument(
Expand Down