From 576c82eda210ca0111c04f5256bf77897a4d4cc4 Mon Sep 17 00:00:00 2001 From: Dowon Date: Fri, 11 Jul 2025 16:36:04 +0900 Subject: [PATCH] vocab : add midm-2.0 model pre-tokenizer (#14626) --- convert_hf_to_gguf.py | 3 +++ convert_hf_to_gguf_update.py | 1 + src/llama-vocab.cpp | 3 ++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 52aa87d6a..3d5e7e5a4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -833,6 +833,9 @@ class TextModel(ModelBase): if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b": # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base res = "falcon-h1" + if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4": + # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct + res = "midm-2.0" if res is None: logger.warning("\n") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index b8cb6027d..9f9b88da8 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -129,6 +129,7 @@ models = [ {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", }, {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", }, {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", }, + {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 10823b183..02cdc244a 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1524,7 +1524,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "llama-bpe"|| tokenizer_pre == "falcon3" || tokenizer_pre == "falcon-h1" || - tokenizer_pre == "pixtral") { + tokenizer_pre == "pixtral" || + tokenizer_pre == "midm-2.0") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; add_bos = true;