# GLM-4-9B-Chat-1M-int4
Int4-quantized weights for GLM-4-9B-Chat-1M, supporting a context length of up to 1M tokens.
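
How the int4 weights were produced is not stated here. As one illustration, a 4-bit checkpoint in this style can be made with the bitsandbytes integration in transformers; a minimal sketch, in which the base model ID `THUDM/glm-4-9b-chat-1m` and every config value are assumptions:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Assumed recipe: NF4 weight quantization with bf16 compute. Saving 4-bit
# weights requires bitsandbytes >= 0.41.3.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # quantize linear layers to 4 bits
    bnb_4bit_quant_type="nf4",              # NF4 data type for the weights
    bnb_4bit_compute_dtype=torch.bfloat16,  # run matmuls in bf16
)
model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4-9b-chat-1m",  # assumed base model
    quantization_config=quant_config,
    trust_remote_code=True,
)
model.save_pretrained("glm-4-9b-chat-1m-int4")  # writes the quantized weights
```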
## Running the model

Gradio demo:
```python
import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
from modelscope import AutoModelForCausalLM, AutoTokenizer
import gradio

device = "cuda"  # intended inference device

# Load the tokenizer and the int4-quantized weights from the local checkpoint.
# A pre-quantized checkpoint is normally placed on the GPU by the quantization
# config stored with it, so no explicit .to(device) call is made here.
tokenizer = AutoTokenizer.from_pretrained(
    r".\glm-4-9b-chat-1m-int4", trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    r".\glm-4-9b-chat-1m-int4",
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).eval()

def generate(query, history):
    # Stream partial replies through the checkpoint's custom stream_chat
    # helper (shipped with the trust_remote_code modeling file). The Gradio
    # chat history is ignored here, so each turn is answered independently.
    with torch.no_grad():
        for response, _ in model.stream_chat(tokenizer, query):
            yield response

gradio.ChatInterface(generate).launch(inbrowser=True)
```
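
`stream_chat` is a convenience method shipped in this checkpoint's remote-code modeling file; if your copy does not expose it, the standard chat path from the official GLM-4 model card works as well. A minimal non-streaming sketch (the sample prompt and the `max_new_tokens` value are arbitrary choices):

```python
import torch
from modelscope import AutoModelForCausalLM, AutoTokenizer

device = "cuda"
path = r".\glm-4-9b-chat-1m-int4"

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).eval()

# Build the prompt with the GLM-4 chat template and move it to the GPU.
inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello, what can you do?"}],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
).to(device)

with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Strip the prompt tokens and decode only the newly generated reply.
reply = tokenizer.decode(
    output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(reply)
```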