# LLMCompress/models.py at main · rotask/LLMCompress · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import torch
import logging
from transformers import AutoModelForCausalLM, AutoTokenizer, MixtralForCausalLM, BitsAndBytesConfig, GPT2LMHeadModel, GPT2Tokenizer

def _make_bnb_config():
    """
    Create the 4-bit NF4 quantization settings shared by the non-GPT-2 models.

    The config object is built on each call instead of once at import time.
    Per the original author's note, this laziness lets CPU-only environments
    (no bitsandbytes installed) still load GPT-2 for smoke tests, because the
    GPT-2 path never calls this function.

    Returns:
        BitsAndBytesConfig: 4-bit loading with double quantization, the
        "nf4" quantization type and bfloat16 as the compute dtype.
    """
    nf4_settings = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    return BitsAndBytesConfig(**nf4_settings)


def get_model_and_tokenizer(model_name):
    """
    Load a supported causal language model together with its tokenizer.

    Each supported name maps to a single Hugging Face repository id that is
    used for both the model weights and the tokenizer. Every model is loaded
    with ``device_map="auto"``. All models except "gpt2" additionally receive
    the 4-bit config from ``_make_bnb_config()``, and all of those except
    "Nemo" request the ``flash_attention_2`` attention implementation.

    Args:
        model_name (str): One of "Mixtral", "gpt2", "Yi", "Nemo", "llama_2",
            "llama_3", "llama_3.1" or "Mistral_7B" (matched exactly).

    Returns:
        tuple: (model, tokenizer) for the specified model.

    Raises:
        ValueError: If an unsupported model name is provided. The same
            message is logged at ERROR level before the exception is raised.
    """
    # One row per supported model:
    #   (public name, hub repo id, 4-bit quantized?, FlashAttention-2?)
    catalog = (
        ("Mixtral", "mistralai/Mixtral-8x7B-v0.1", True, True),
        ("gpt2", "gpt2", False, False),
        ("Yi", "01-ai/Yi-34B", True, True),
        ("Nemo", "mistralai/Mistral-Nemo-Base-2407", True, False),
        ("llama_2", "meta-llama/Llama-2-7b-hf", True, True),
        ("llama_3", "meta-llama/Meta-Llama-3-8B", True, True),
        ("llama_3.1", "meta-llama/Meta-Llama-3.1-8B", True, True),
        ("Mistral_7B", "mistralai/Mistral-7B-Instruct-v0.3", True, True),
    )

    # Scan in declaration order; the for/else falls through to the error
    # path only when no row matched.
    for known_name, repo_id, quantized, flash_attn in catalog:
        if model_name == known_name:
            break
    else:
        logging.error(f"Unsupported model: {model_name}")
        raise ValueError(f"Unsupported model: {model_name}")

    # Mixtral and GPT-2 use dedicated classes; everything else goes through
    # the Auto* factories. Classes are resolved only after a successful
    # match so the rejection path above touches no model-library names.
    if known_name == "Mixtral":
        model_cls, tokenizer_cls = MixtralForCausalLM, AutoTokenizer
    elif known_name == "gpt2":
        model_cls, tokenizer_cls = GPT2LMHeadModel, GPT2Tokenizer
    else:
        model_cls, tokenizer_cls = AutoModelForCausalLM, AutoTokenizer

    load_kwargs = {}
    if quantized:
        # Built lazily: the GPT-2 row never constructs a quantization config.
        load_kwargs["quantization_config"] = _make_bnb_config()
    if flash_attn:
        load_kwargs["attn_implementation"] = "flash_attention_2"
    load_kwargs["device_map"] = "auto"

    model = model_cls.from_pretrained(repo_id, **load_kwargs)
    tokenizer = tokenizer_cls.from_pretrained(repo_id)
    return model, tokenizer