RAG 시스템 실전 구축 (v14)

RAG 시스템 실전 구축 (v14) 실제로 구현할 수 있는 RAG 시스템을 빠르게 구축하는 가이드 1. RAG 기초 개념 RAG (Retrieval-Augmented Generation)는 검색 증강 생성 기법으로, 대규모 언어 모델(LLM)이 외부 정보를 검색하고 이를 기반으로 응답을 생성하는 아키텍처입니다. 검색 → 증강 → 생성 루프: 1. 사용자 질의를 벡터화 2. 벡터 데이터베이스에서 유사한 문서 검색 3. 검색된 문서들을 LLM 프롬프트에 포함 4. LLM이 검색된 정보를 기반으로 응답 생성 2. 청킹 전략 2.1 의미 청킹 (Semantic Chunking) from langchain.text_splitter import RecursiveCharacterTextSplitter import tiktoken def semantic_chunking ( text , chunk_size = 500 , chunk_overlap = 50 ): # 의미 단위로 청킹 splitter = RecursiveCharacterTextSplitter ( chunk_size = chunk_size , chunk_overlap = chunk_overlap , separators = [ " \n\n " , " \n " , " " , "" ] ) return splitter . split_text ( text ) # 예시 사용 chunks = semantic_chunking ( " 문서 내용... " , chunk_size = 1000 ) 2.2 재귀적 청킹 (Recursive Chunking) from langchain.text_splitter import RecursiveCharacterTextSplitter def recursive_chunking ( text ): # 긴 텍스트를 재귀적으로 분할 text_splitter = RecursiveCharacterTextSplitter ( chunk_size = 1000 , chunk_overlap = 200 , separators = [ " \n\n " , " \n " , " " , "" ] ) return text_splitter . split_text ( text ) 2.3 에이전트 기반 청킹 import re from typing import List def agentic_chunking ( text : str , max_chunk_size : int = 1000 ) -> List [ str ]: """ 문장 단위로 청킹하고, 문법적 의미를 고려하여 최적화 """ # 문장 단위 분할 sentences = re . split ( r ' [.!?]+ ' , text ) chunks = [] current_chunk = "" for sentence in sentences : if len ( current_chunk + sentence ) < max_chunk_size : current_chunk += sentence + " . " else : if current_chunk : chunks . append ( current_chunk . strip ()) current_chunk = sentence + " . " if current_chunk : chunks . append ( current_chunk . strip ()) return chunks 3. 임베딩 모델 선택 및 비교 3.1 다양한 임베딩 모델 비교 from sentence_transformers import SentenceTransformer from langchain.embeddings import HuggingFaceEmbeddings import numpy as np # 1. Sentence Transformers (유명한 오픈소스 모델) def get_sentence_transformer_embeddings (): model = SentenceTransformer ( ' all-MiniLM-L6-v2 ' ) return model # 2. 
HuggingFace 임베딩 (커스터마이징 가능) def get_huggingface_embeddings (): embeddings = HuggingFaceEmbeddings ( model_name = " sentence-transformers/all-MiniLM-L6-v2 " , model_kwargs = { " device " : " cuda " } ) return embeddings # 3. OpenAI 임베딩 (정확도 높지만 비용 발생) def get_openai_embeddings (): from langchain_openai import OpenAIEmbeddings return OpenAIEmbeddings ( model = " text-embedding-3-small " ) 3.2 성능 비교 코드 import time from sklearn.metrics.pairwise import cosine_similarity def benchmark_embeddings ( embedding_model , texts , query ): # 임베딩 생성 시간 측정 start_time = time . time () embeddings = embedding_model . encode ( texts ) encode_time = time . time () - start_time # 질의 임베딩 query_embedding = embedding_model . encode ([ query ]) # 유사도 계산 similarities = cosine_similarity ( query_embedding , embeddings )[ 0 ] return { ' encode_time ' : encode_time , ' similarities ' : similarities , ' top_matches ' : np . argsort ( similarities )[ - 3 :][:: - 1 ] } # 사용 예시 # results = benchmark_embeddings(model, chunks, "질의 내용") 4. 벡터 데이터베이스 비교 4.1 Chroma (가볍고 빠른 로컬 저장소) import chromadb from chromadb.config import Settings # Chroma 설정 chroma_client = chromadb . Client ( Settings ( chroma_db_impl = " duckdb+parquet " , persist_directory = " ./chroma_db " ) ) collection = chroma_client . get_or_create_collection ( " rag_collection " ) # 문서 추가 def add_documents_chroma ( documents , embeddings ): collection . add ( documents = documents , embeddings = embeddings , ids = [ str ( i ) for i in range ( len ( documents ))] ) # 검색 def search_chroma ( query_embedding , n_results = 5 ): results = collection . query ( query_embeddings = [ query_embedding ], n_results = n_results ) return results 4.2 Qdrant (고성능 클라우드 기반) from qdrant_client import QdrantClient from qdrant_client.models import Filter , FieldCondition , MatchValue client = QdrantClient ( path = " ./qdrant_storage " ) # 인덱스 생성 def create_qdrant_index (): client . 
recreate_collection ( collection_name = " rag_collection " , vectors_config = { " size " : 384 , " distance " : " Cosine " } ) # 검색 def search_qdrant ( query_vector , n_results = 5 ): results = client . search ( collection_name = " rag_collection " , query_vector = query_vector , limit = n_results ) return results 4.3 pgvector (PostgreSQL 확장) import psycopg2 from psycopg2.extras import Json def create_pgvector_table (): conn = psycopg2 . connect ( " postgresql://user:pass@localhost/db " ) cur = conn . cursor () # pgvector 테이블 생성 cur . execute ( """ CREATE TABLE IF NOT EXISTS rag_documents ( id UUID PRIMARY KEY, content TEXT, embedding VECTOR(384) ); """ ) # 인덱스 생성 cur . execute ( """ CREATE INDEX IF NOT EXISTS idx_embedding ON rag_documents USING ivfflat (embedding vector_cosine_ops); """ ) conn . commit () cur . close () conn . close () def search_pgvector ( query_embedding , n_results = 5 ): conn = psycopg2 . connect ( " postgresql://user:pass@localhost/db " ) cur = conn . cursor () cur . execute ( """ SELECT content FROM rag_documents ORDER BY embedding <-> %s LIMIT %s; """ , ( query_embedding , n_results )) results = cur . fetchall () cur . close () conn . close () return [ r [ 0 ] for r in results ] 5. 
RAG 전체 파이프라인 구현 python import os from sentence_transformers import SentenceTransformer from langchain.text_splitter import RecursiveCharacterTextSplitter from chromadb import Client import numpy as np class RAGPipeline: def __init__(self, model_name="all-MiniLM-L6-v2"): self.embedder = SentenceTransformer(model_name) self.text_splitter = RecursiveCharacterTextSplitter( chunk_size=1000, chunk_overlap=200 ) self.chroma_client = Client() self.collection = self.chroma_client.get_or_create_collection("rag_docs") def add_document(self, text: str, doc_id: str): """문서 추가""" # 청킹 chunks = self.text_splitter.split_text(text) # 임베딩 embeddings = self.embedder.encode(chunks) # Chroma에 저장 self.collection.add( documents=chunks, embeddings=embeddings.tolist(), ids=[f"{doc_id}_{i}" for i in range(len(chunks))] ) def search(self, query: str, top_k: int = 3): """검색""" query_embedding = self.embedder.encode([query]) results = self.collection.query( query_embeddings=query_embedding.tolist(), n_results=top_k ) return results['documents'][0] if results['documents'] else [] def generate_response(self, query: str, context: list, model_name: str = "gpt-3.5-turbo"): """응답 생성 (예시)""" context_text = "\n".join(context) prompt = f""" 주어진 문맥을 기반으로 질문에 답하세요. 문맥: {context_text} 질문: {query} 답변: """ # 실제 구현에서는 LLM 호출 return f"질문: {query}\n --- 📥 Get the full guide on Gumroad: https://gumroad.com/l/auto ($7)

RAG 시스템 실전 구축 (v14)

요약

핵심 포인트

댓글