Using local Tongyi Qianwen (Qwen) 7B/14B and GLM4 with LangChain

About GLM4-9B: after downloading it locally, loading it into LangChain also fails with an error, the same way Qwen-7B-Chat does. For GLM, refer to this solution: self-llm/GLM-4/02-GLM-4-9B-chat langchain 接入.md at master · datawhalechina/self-llm · GitHub

    from langchain.llms.base import LLM
    from typing import Any, List, Optional, Dict
    from langchain.callbacks.manager import CallbackManagerForLLMRun
    from transformers import AutoTokenizer, AutoModelForCausalLM
    import torch

    class ChatGLM4_LLM(LLM):
        # Custom LLM class based on a locally downloaded ChatGLM4 model
        tokenizer: AutoTokenizer = None
        model: AutoModelForCausalLM = None
        gen_kwargs: dict = None

        def __init__(self, mode_name_or_path: str, gen_kwargs: dict = None):
            super().__init__()
            print("Loading model from local path...")
            self.tokenizer = AutoTokenizer.from_pretrained(
                mode_name_or_path, trust_remote_code=True
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                mode_name_or_path,
                torch_dtype=torch.bfloat16,
                trust_remote_code=True,
                device_map="auto"
            ).eval()
            print("Finished loading the local model")
            if gen_kwargs is None:
                gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
            self.gen_kwargs = gen_kwargs

        def _call(self, prompt: str, stop: Optional[List[str]] = None,
                  run_manager: Optional[CallbackManagerForLLMRun] = None,
                  **kwargs: Any) -> str:
            messages = [{"role": "user", "content": prompt}]
            model_inputs = self.tokenizer.apply_chat_template(
                messages, tokenize=True, return_tensors="pt",
                return_dict=True, add_generation_prompt=True
            )
            generated_ids = self.model.generate(**model_inputs, **self.gen_kwargs)
            # Strip the prompt tokens, keeping only the newly generated part
            generated_ids = [
                output_ids[len(input_ids):]
                for input_ids, output_ids in zip(model_inputs["input_ids"], generated_ids)
            ]
            response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
            return response

        @property
        def _identifying_params(self) -> Dict[str, Any]:
            """Return a dict identifying this LLM; important for caching and tracing."""
            return {
                "model_name": "glm-4-9b-chat",
                "max_length": self.gen_kwargs.get("max_length"),
                "do_sample": self.gen_kwargs.get("do_sample"),
                "top_k": self.gen_kwargs.get("top_k"),
            }

        @property
        def _llm_type(self) -> str:
            return "glm-4-9b-chat"
    from LLM import ChatGLM4_LLM

    gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
    llm = ChatGLM4_LLM(mode_name_or_path="/root/autodl-tmp/ZhipuAI/glm-4-9b-chat", gen_kwargs=gen_kwargs)
    print(llm.invoke("你是谁"))
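Since ChatGLM4_LLM subclasses LangChain's LLM base class, it also composes with prompt templates like any other model. Below is a minimal sketch (not from the referenced tutorial) that mirrors the Qwen chain used later in this post, reusing the same local model path:

    from langchain_core.prompts import ChatPromptTemplate
    from LLM import ChatGLM4_LLM

    llm = ChatGLM4_LLM(mode_name_or_path="/root/autodl-tmp/ZhipuAI/glm-4-9b-chat")

    # LLM subclasses implement the Runnable interface, so they can be piped
    prompt = ChatPromptTemplate.from_template("请编写一篇关于{topic}的中文小故事,不超过100字")
    chain = prompt | llm
    print(chain.invoke({"topic": "小白兔"}))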

 

Using a model already downloaded locally with LangChain; here we use Tongyi Qianwen (Qwen).

GPU memory: 24 GB

Models: Qwen1.5-7B-Chat, Qwen-7B-Chat

We first tried Qwen-7B-Chat, but it throws an error and cannot be used.

Looking into the error, this kind of model is not supported. But the list of supported models has an entry containing "Qwen", so it seemed it should be supported; searching Hugging Face for that entry, "Qwen2", turned up the corresponding Qwen1.5-7B-Chat model:

https://huggingface.co/Qwen/Qwen1.5-7B-Chat
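If the weights are not on disk yet, they can be fetched from Hugging Face first. A minimal sketch, assuming huggingface_hub is installed and reusing the local directory from the code further below (adjust the path to your environment):

    from huggingface_hub import snapshot_download

    # Download the whole Qwen1.5-7B-Chat repo into a local directory
    snapshot_download(
        repo_id="Qwen/Qwen1.5-7B-Chat",
        local_dir="/root/autodl-tmp/Qwen1___5-7B-Chat",
    )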

It is essentially an early public release. In short, LangChain's support for directly loading a local Tongyi Qianwen model is currently not great. Ollama is an alternative (see the sketch below), but its model downloads are very slow and frequently fail.
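For reference, the Ollama route looks roughly like this. A minimal sketch, assuming Ollama is running locally and a Qwen chat tag (here qwen:7b-chat, an assumption) has already been pulled:

    # Shell, first: ollama pull qwen:7b-chat   (slow, may need retries)
    from langchain_community.llms import Ollama

    llm = Ollama(model="qwen:7b-chat")
    print(llm.invoke("你是谁"))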

With Qwen1.5-7B-Chat, the model does load successfully and produces output, but it is very, very slow (a possible speed-up is sketched after the code below).

    from transformers import AutoTokenizer, AutoModelForCausalLM
    from transformers import pipeline
    from langchain import HuggingFacePipeline
    from langchain_core.prompts import ChatPromptTemplate
    import torch

    model_path = "/root/autodl-tmp/Qwen1___5-7B-Chat"
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # device_map="auto" already places the model on the GPU,
    # so no extra .to(device) call is needed (or wanted) here
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        trust_remote_code=True
    ).eval()

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        # max_length=4096,
        # max_tokens=4096,
        max_new_tokens=512,
        top_p=1,
        repetition_penalty=1.15
    )

    llama_model = HuggingFacePipeline(pipeline=pipe)
    prompt = ChatPromptTemplate.from_template("请编写一篇关于{topic}的中文小故事,不超过100字")
    chain = prompt | llama_model
    res = chain.invoke({"topic": "小白兔"})
    print(res)
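A likely reason for the slowness: without an explicit torch_dtype, from_pretrained loads the weights in float32, and a 7B model in float32 (roughly 28 GB) does not fit in 24 GB of GPU memory, so device_map="auto" offloads part of it to the CPU. A hedged tweak, assuming the GPU supports bfloat16, is to load in half precision:

    import torch
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "/root/autodl-tmp/Qwen1___5-7B-Chat",
        device_map="auto",
        torch_dtype=torch.bfloat16,  # ~14 GB instead of ~28 GB, fits on one 24 GB card
        trust_remote_code=True
    ).eval()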

Qwen-14B-Chat can be run as well.

To pin specific GPUs, CUDA_VISIBLE_DEVICES must be set at the very top of the script, before the LangChain and transformers imports:

    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = "5,6"

    from transformers import AutoTokenizer, AutoModelForCausalLM
    from transformers import pipeline
    from langchain import HuggingFacePipeline
    from langchain_core.prompts import ChatPromptTemplate

    tokenizer = AutoTokenizer.from_pretrained("/home/qwen-14b-chat/",
                                              trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained("/home/qwen-14b-chat/",
                                                 device_map="auto",
                                                 trust_remote_code=True).eval()

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        # max_length=4096,
        # max_tokens=4096,
        max_new_tokens=512,
        top_p=1,
        repetition_penalty=1.15
    )

    llama_model = HuggingFacePipeline(pipeline=pipe)
    prompt = ChatPromptTemplate.from_template("请编写一篇关于{topic}的中文小故事,不超过100字")
    chain = prompt | llama_model
    res = chain.invoke({"topic": "小白兔"})
    print(res)
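To confirm where the model actually landed (inside the process the two visible GPUs are renumbered cuda:0 and cuda:1), the placement chosen by device_map="auto" can be inspected; a small hedged check:

    import torch

    # Which layer went to which device after device_map="auto"
    print(model.hf_device_map)

    # Rough per-GPU memory usage in GB
    for i in range(torch.cuda.device_count()):
        print(f"cuda:{i}", round(torch.cuda.memory_allocated(i) / 1024**3, 2), "GB allocated")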