vocab : add midm-2.0 model pre-tokenizer (#14626)

This commit is contained in:
Dowon 2025-07-11 16:36:04 +09:00 committed by GitHub
parent 0aedae00e6
commit 576c82eda2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 6 additions and 1 deletions

View File

@@ -833,6 +833,9 @@ class TextModel(ModelBase):
if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
    # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
    res = "falcon-h1"
if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
# ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
res = "midm-2.0"
if res is None:
    logger.warning("\n")

View File

@@ -129,6 +129,7 @@ models = [
{"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
{"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
{"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
]
# some models are known to be broken upstream, so we will skip them as exceptions

View File

@@ -1524,7 +1524,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "llama-bpe"||
tokenizer_pre == "falcon3" ||
tokenizer_pre == "falcon-h1" ||
tokenizer_pre == "pixtral" ||
tokenizer_pre == "midm-2.0") {
pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
ignore_merges = true;
add_bos = true;