"""Convert a local BF16 model into Marlin-supported quant formats via llm-compressor."""

from __future__ import annotations

import gc
import os
import sys

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

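# Prefer a local llm-compressor checkout over any installed copy: the imports
# below resolve against this source tree when it exists.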
LLM_COMPRESSOR_SRC = "/home/quixi/marlin-cdna/llm-compressor/src"
if os.path.isdir(LLM_COMPRESSOR_SRC):
    sys.path.insert(0, LLM_COMPRESSOR_SRC)

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import (
    GPTQModifier,
    QuantizationModifier,
)

# Source checkpoint (BF16) and the root directory for the quantized outputs.
MODEL_PATH = "/home/quixi/models/Llama-3.2-1B"
OUTPUT_ROOT = "/home/quixi/models"

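# Calibration settings for the data-dependent recipes (GPTQ and AWQ).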
CALIB_DATASET_ID = "HuggingFaceH4/ultrachat_200k"
CALIB_DATASET_SPLIT = "train_sft"
NUM_CALIBRATION_SAMPLES = 128
MAX_SEQUENCE_LENGTH = 512


def _load_tokenized_dataset(tokenizer):
    """Build the calibration set: sampled chats rendered with the model's chat template."""
    ds = load_dataset(
        CALIB_DATASET_ID,
        split=f"{CALIB_DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
    ).shuffle(seed=42)

    def preprocess(example):
        return {
            "text": tokenizer.apply_chat_template(
                example["messages"],
                tokenize=False,
            )
        }

    ds = ds.map(preprocess)

    def tokenize(sample):
        return tokenizer(
            sample["text"],
            padding=False,
            max_length=MAX_SEQUENCE_LENGTH,
            truncation=True,
            # The chat template already inserted special tokens above.
            add_special_tokens=False,
        )

    # Drop the raw columns so only token ids and masks reach the calibrator.
    return ds.map(tokenize, remove_columns=ds.column_names)


def _load_model_and_tokenizer():
    # dtype="auto" loads the weights in the dtype stored in the checkpoint (BF16 here).
    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, dtype="auto")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    if torch.cuda.is_available():
        model.to("cuda")
    return model, tokenizer


def _cleanup(model, tokenizer):
    """Free model memory between recipes so each run starts from a clean slate."""
    del model
    del tokenizer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _run_recipe(
    name: str,
    recipe,
    *,
    save_compressed: bool,
    use_calibration: bool,
) -> str:
    """Apply one quantization recipe to a fresh copy of the model and save it."""
    print(f"\n=== Quantizing {name} ===")
    model, tokenizer = _load_model_and_tokenizer()

    oneshot_kwargs = {"model": model, "recipe": recipe}
    if use_calibration:
        ds = _load_tokenized_dataset(tokenizer)
        oneshot_kwargs.update(
            dataset=ds,
            max_seq_length=MAX_SEQUENCE_LENGTH,
            num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        )

    oneshot(**oneshot_kwargs)

    base_name = os.path.basename(MODEL_PATH.rstrip("/"))
    save_dir = os.path.join(OUTPUT_ROOT, f"{base_name}-{name}")
    os.makedirs(save_dir, exist_ok=True)

    # save_compressed=True explicitly requests the packed compressed-tensors
    # checkpoint format; otherwise the model is saved through the default path.
    if save_compressed:
        model.save_pretrained(save_dir, save_compressed=True)
    else:
        model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)

    _cleanup(model, tokenizer)
    return save_dir


def main():
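    # 4-bit weights, 16-bit activations via GPTQ (calibrated).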
    _run_recipe(
        "W4A16-GPTQ",
        GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"]),
        save_compressed=True,
        use_calibration=True,
    )

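    # 4-bit asymmetric weights via AWQ (calibrated).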
    _run_recipe(
        "W4A16-AWQ",
        AWQModifier(
            targets=["Linear"],
            scheme="W4A16_ASYM",
            ignore=["lm_head"],
            duo_scaling="both",
        ),
        save_compressed=True,
        use_calibration=True,
    )

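    # 8-bit weights, 16-bit activations via GPTQ (calibrated).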
    _run_recipe(
        "W8A16-GPTQ",
        GPTQModifier(targets="Linear", scheme="W8A16", ignore=["lm_head"]),
        save_compressed=True,
        use_calibration=True,
    )

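    # FP8 weights with dynamic activation scales; needs no calibration data.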
    _run_recipe(
        "FP8-Dynamic",
        QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]),
        save_compressed=False,
        use_calibration=False,
    )

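    # 4-bit NVFP4 weights, 16-bit activations; data-free.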
    _run_recipe(
        "NVFP4A16",
        QuantizationModifier(targets="Linear", scheme="NVFP4A16", ignore=["lm_head"]),
        save_compressed=True,
        use_calibration=False,
    )

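    # 4-bit MXFP4 microscaling format; data-free.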
    _run_recipe(
        "MXFP4",
        QuantizationModifier(targets="Linear", scheme="MXFP4", ignore=["lm_head"]),
        save_compressed=True,
        use_calibration=False,
    )


if __name__ == "__main__":
    main()