from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import torch
import os

# Load model and tokenizer from the Hugging Face Hub
model_name = "Qwen/Qwen2.5-1.5B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # fallback in case the tokenizer defines no pad token
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")  # place weights on the available GPU(s)

# Prepare dataset: each .txt file becomes one training example
content_dir = "./content"
texts = []
for txt_file in os.listdir(content_dir):
    if txt_file.endswith(".txt"):
        with open(os.path.join(content_dir, txt_file), "r", encoding="utf-8") as tf:
            # Join all non-empty lines of the file into one text
            text = " ".join(line.strip() for line in tf.readlines() if line.strip())
            texts.append(text)

dataset = Dataset.from_dict({"text": texts})
print(f"Dataset size: {len(dataset)}")  # Should be ~300

def tokenize_function(examples):
    # Tokenize the text
    tokenized = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)
    # Labels for causal LM: copy input_ids, but mask padding with -100 so it is ignored by the loss
    tokenized["labels"] = [
        [token if mask == 1 else -100 for token, mask in zip(ids, attn)]
        for ids, attn in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Configure LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

# Training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_qwen2_5_1_5b",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=5,
    learning_rate=2e-4,
    save_steps=50,
    logging_steps=10,
    fp16=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Fine-tune
trainer.train()

# Save (this stores only the LoRA adapter weights, not the full base model)
model.save_pretrained("./fine_tuned_qwen2_5_1_5b")
tokenizer.save_pretrained("./fine_tuned_qwen2_5_1_5b")
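
# --- Optional usage sketch: reload the saved adapter for inference ---
# Minimal illustration, not part of the training run above. It assumes the adapter
# directory written by save_pretrained(); the prompt text and generation settings
# below are arbitrary placeholders, not values from the original setup.
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
tuned_model = PeftModel.from_pretrained(base_model, "./fine_tuned_qwen2_5_1_5b")
tuned_model.eval()

prompt = "Write a short note about the training data:"  # placeholder prompt (assumption)
inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)
with torch.no_grad():
    output_ids = tuned_model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))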