from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig from config import Config import torch class LLMEngine: def __init__(self): self.model = None self.tokenizer = None self.config = Config() def initialize(self): device = "cuda" if torch.cuda.is_available() else "cpu" dtype = torch.float16 if device == "cuda" else torch.float32 self.tokenizer = AutoTokenizer.from_pretrained( self.config.LLM_MODEL, trust_remote_code=True, ) self.model = AutoModelForCausalLM.from_pretrained( self.config.LLM_MODEL, torch_dtype=dtype, device_map="auto", trust_remote_code=True, ) def generate(self, user_text: str, system_prompt: str = None) -> str: if not self.model: self.initialize() if system_prompt is None: system_prompt = "Ты полезный ассистент. Отвечай на русском языке кратко и по делу." messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_text}, ] text = self.tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True, ) inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device) with torch.no_grad(): outputs = self.model.generate( **inputs, max_new_tokens=self.config.LLM_MAX_TOKENS, temperature=self.config.LLM_TEMPERATURE, do_sample=True, top_p=0.9, repetition_penalty=1.1, ) generated = outputs[0][inputs["input_ids"].shape[1]:] response = self.tokenizer.decode(generated, skip_special_tokens=True) return response.strip()