| |
| """ |
| Extract text-only LLM from HyperCLOVAX-SEED-Think-32B VLM. |
| Converts to LLaMA-compatible format for standard inference engines. |
| |
| Usage: |
| python extract_llm.py --input ./HyperCLOVAX-SEED-Think-32B --output ./HyperCLOVAX-SEED-Text-Think-32B |
| |
| Requirements: |
| pip install safetensors torch tqdm |
| """ |
|
|
import argparse
import json
import os
import shutil
import sys
from collections import defaultdict
from pathlib import Path

import torch
from safetensors import safe_open
from safetensors.torch import save_file
from tqdm import tqdm
|
|
|
|
def load_weight_index(model_path: Path) -> dict:
    """Read and parse the model's safetensors index JSON."""
    index_file = model_path / "model.safetensors.index.json"
    return json.loads(index_file.read_text())
|
|
|
|
def extract_llm_weights(model_path: Path, output_path: Path):
    """
    Extract the text-only LLM weights from the VLM checkpoint.

    Key mapping (the "model.language_model." prefix is stripped):
        - model.language_model.model.*   -> model.*
        - model.language_model.lm_head.* -> lm_head.*

    All vision encoder and MM projector weights (any key without that
    prefix) are excluded.

    Args:
        model_path: Directory containing the original sharded VLM checkpoint.
        output_path: Directory to write the extracted, re-sharded weights to.

    Returns:
        List of the renamed tensor keys that were extracted.
    """
    output_path.mkdir(parents=True, exist_ok=True)

    weight_index = load_weight_index(model_path)
    weight_map = weight_index["weight_map"]

    # Map new (stripped) key -> (original key, shard file it lives in).
    # Fix: the previous version used str.replace(), which substitutes
    # *every* occurrence of the prefix string (not only the leading one),
    # and duplicated the same logic across three branches. Slicing off the
    # prefix handles every case and only touches the start of the key:
    #   "model.language_model.model.X"   -> "model.X"
    #   "model.language_model.lm_head.X" -> "lm_head.X"
    prefix = "model.language_model."
    llm_weights = {}
    for key, shard_file in weight_map.items():
        if not key.startswith(prefix):
            continue  # vision encoder / MM projector tensor: skip
        new_key = key[len(prefix):]
        llm_weights[new_key] = (key, shard_file)

    print(f"Found {len(llm_weights)} LLM weight tensors")
    print(f"Excluded {len(weight_map) - len(llm_weights)} vision/projector tensors")

    # Group wanted tensors by the shard that contains them so that each
    # shard file is opened exactly once.
    shard_to_weights = defaultdict(list)
    for new_key, (old_key, shard_file) in llm_weights.items():
        shard_to_weights[shard_file].append((old_key, new_key))

    all_tensors = {}
    # defaultdict keys are already unique; no need to wrap in set() first.
    shard_files = sorted(shard_to_weights.keys())

    print(f"\nLoading weights from {len(shard_files)} shards...")
    for shard_file in tqdm(shard_files, desc="Loading shards"):
        shard_path = model_path / shard_file
        with safe_open(shard_path, framework="pt", device="cpu") as f:
            for old_key, new_key in shard_to_weights[shard_file]:
                all_tensors[new_key] = f.get_tensor(old_key)

    print(f"\nTotal tensors extracted: {len(all_tensors)}")

    total_size = sum(t.numel() * t.element_size() for t in all_tensors.values())
    print(f"Total size: {total_size / 1e9:.2f} GB")

    # Cap each output shard at 5 GiB.
    max_shard_size = 5 * 1024 * 1024 * 1024

    print("\nSaving extracted weights...")
    save_sharded_safetensors(all_tensors, output_path, max_shard_size)

    return list(all_tensors.keys())
|
|
|
|
def save_sharded_safetensors(tensors: dict, output_path: Path, max_shard_size: int):
    """Write *tensors* as size-limited safetensors shards plus an index file."""
    # Greedy packing in sorted-key order: start a new shard whenever adding
    # the next tensor would push the current one past max_shard_size.
    buckets = []
    bucket = {}
    bucket_bytes = 0
    for name in sorted(tensors):
        t = tensors[name]
        nbytes = t.numel() * t.element_size()
        if bucket and bucket_bytes + nbytes > max_shard_size:
            buckets.append(bucket)
            bucket = {}
            bucket_bytes = 0
        bucket[name] = t
        bucket_bytes += nbytes
    if bucket:
        buckets.append(bucket)

    total_shards = len(buckets)
    total_size = sum(t.numel() * t.element_size() for t in tensors.values())

    weight_map = {}
    for idx, bucket in enumerate(tqdm(buckets, desc="Saving shards"), start=1):
        shard_name = f"model-{idx:05d}-of-{total_shards:05d}.safetensors"
        save_file(bucket, output_path / shard_name)
        for name in bucket:
            weight_map[name] = shard_name

    # Index maps each tensor key to the shard file that holds it.
    index_path = output_path / "model.safetensors.index.json"
    with open(index_path, "w") as f:
        json.dump(
            {"metadata": {"total_size": total_size}, "weight_map": weight_map},
            f,
            indent=2,
        )

    print(f"Saved {total_shards} shards to {output_path}")
|
|
|
|
def create_llama_config(original_config_path: Path, output_path: Path):
    """
    Build a LlamaForCausalLM-compatible config.json (plus a
    generation_config.json) from the VLM config's text_config section.

    Note: HyperCLOVAX uses attention_multiplier ≈ 1/sqrt(head_dim)
    which matches standard LLaMA scaled dot-product attention.

    Returns the LLaMA config dict that was written.
    """
    with open(original_config_path, "r") as f:
        text_config = json.load(f)["text_config"]

    # Every field falls back to the 32B model's known defaults.
    get = text_config.get
    llama_config = {
        "architectures": ["LlamaForCausalLM"],
        "attention_bias": get("attention_bias", False),
        "attention_dropout": get("attention_dropout", 0.0),
        "bos_token_id": get("bos_token_id", 128000),
        "eos_token_id": get("eos_token_id", 128001),
        "head_dim": get("head_dim", 128),
        "hidden_act": get("hidden_act", "silu"),
        "hidden_size": get("hidden_size", 5120),
        "initializer_range": get("initializer_range", 0.006),
        "intermediate_size": get("intermediate_size", 24192),
        "max_position_embeddings": get("max_position_embeddings", 131072),
        "mlp_bias": get("mlp_bias", False),
        "model_type": "llama",
        "num_attention_heads": get("num_attention_heads", 40),
        "num_hidden_layers": get("num_hidden_layers", 72),
        "num_key_value_heads": get("num_key_value_heads", 8),
        "pad_token_id": get("pad_token_id", 0),
        "pretraining_tp": 1,
        "rms_norm_eps": get("rms_norm_eps", 1e-05),
        "rope_scaling": get("rope_scaling", None),
        "rope_theta": get("rope_theta", 50000000),
        "tie_word_embeddings": get("tie_word_embeddings", False),
        "torch_dtype": "bfloat16",
        "transformers_version": "4.52.4",
        "use_cache": True,
        "vocab_size": get("vocab_size", 128256),
    }

    config_path = output_path / "config.json"
    with open(config_path, "w") as f:
        json.dump(llama_config, f, indent=2)

    print(f"Saved LLaMA config to {config_path}")

    # Default sampling parameters for generation.
    gen_config = {
        "bos_token_id": llama_config["bos_token_id"],
        "eos_token_id": llama_config["eos_token_id"],
        "pad_token_id": llama_config["pad_token_id"],
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "max_length": 4096,
    }
    gen_config_path = output_path / "generation_config.json"
    with open(gen_config_path, "w") as f:
        json.dump(gen_config, f, indent=2)

    return llama_config
|
|
|
|
def copy_tokenizer_files(original_path: Path, output_path: Path):
    """Copy whichever known tokenizer files exist in the original model dir."""
    candidates = (
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "added_tokens.json",
        "vocab.json",
        "merges.txt",
        "chat_template.jinja",
    )

    copied = []
    for fname in candidates:
        source = original_path / fname
        if not source.exists():
            continue  # optional file; skip silently
        shutil.copy2(source, output_path / fname)
        copied.append(fname)

    print(f"Copied tokenizer files: {copied}")
|
|
|
|
def main():
    """CLI entry point: validate the input dir, then run the three steps."""
    parser = argparse.ArgumentParser(
        description="Extract text-only LLM from HyperCLOVAX-SEED-Think-32B VLM",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Example:
# Download original VLM
huggingface-cli download naver-hyperclovax/HyperCLOVAX-SEED-Think-32B \\
--local-dir ./HyperCLOVAX-SEED-Think-32B

# Extract text-only LLM
python extract_llm.py \\
--input ./HyperCLOVAX-SEED-Think-32B \\
--output ./HyperCLOVAX-SEED-Text-Think-32B
""",
    )
    parser.add_argument(
        "--input",
        "-i",
        type=Path,
        required=True,
        help="Path to original HyperCLOVAX-SEED-Think-32B VLM",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=Path,
        required=True,
        help="Output path for extracted text-only LLM",
    )
    opts = parser.parse_args()

    # Fail fast when the input checkpoint is missing or incomplete.
    if not opts.input.exists():
        print(f"Error: Input path does not exist: {opts.input}")
        return 1
    if not (opts.input / "model.safetensors.index.json").exists():
        print(f"Error: model.safetensors.index.json not found in {opts.input}")
        return 1

    banner = "=" * 60
    print(banner)
    print("HyperCLOVAX VLM → Text-only LLM Extraction")
    print(banner)
    print(f"Input: {opts.input}")
    print(f"Output: {opts.output}")

    print("\n[Step 1] Extracting LLM weights...")
    extract_llm_weights(opts.input, opts.output)

    print("\n[Step 2] Creating LLaMA-compatible config...")
    config = create_llama_config(opts.input / "config.json", opts.output)

    print("\n[Step 3] Copying tokenizer files...")
    copy_tokenizer_files(opts.input, opts.output)

    print("\n" + banner)
    print("Extraction complete!")
    print(f"Output: {opts.output}")
    print(banner)

    print("\nModel summary:")
    print(" - Architecture: LlamaForCausalLM")
    summary_fields = (
        ("Hidden size", "hidden_size"),
        ("Layers", "num_hidden_layers"),
        ("Attention heads", "num_attention_heads"),
        ("KV heads", "num_key_value_heads"),
        ("Vocab size", "vocab_size"),
        ("Max context", "max_position_embeddings"),
    )
    for label, key in summary_fields:
        print(f" - {label}: {config[key]}")

    print("\nYou can now use the model with vLLM, transformers, or other LLaMA-compatible frameworks.")
    return 0
|
|
|
|
| if __name__ == "__main__": |
| exit(main()) |
|
|