本章概述
文档处理与数据预处理是RAG系统的基础环节，直接影响检索质量和生成效果。本章将深入介绍多格式文档加载、文本清理标准化、智能分块策略、元数据管理等核心技术。
学习目标
- 掌握多种文档格式的加载和解析方法
- 学习文本清理和标准化技术
- 了解智能分块策略和优化方法
- 熟悉元数据管理和索引构建
- 掌握数据质量评估和优化技巧
1. 多格式文档加载
1.1 文档加载器框架
# src/data/loaders.py - 文档加载器
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional, Union
from pathlib import Path
import mimetypes
from dataclasses import dataclass
from datetime import datetime
@dataclass
class Document:
    """A piece of loaded text together with its descriptive metadata.

    Attributes:
        content: Text of the document.
        metadata: Arbitrary key/value information about the document.
        doc_id: Identifier. When omitted, a 12-character hex fingerprint of
            ``content`` plus ``str(metadata)`` is generated.
        source: Optional origin label supplied by the caller.
        created_at: Timestamp. When omitted, the local time at construction
            is used.
    """

    content: str
    metadata: Dict[str, Any]
    doc_id: Optional[str] = None
    source: Optional[str] = None
    created_at: Optional[datetime] = None

    def __post_init__(self):
        """Fill in ``created_at`` and ``doc_id`` when the caller left them unset."""
        if self.created_at is None:
            self.created_at = datetime.now()
        if self.doc_id is None:
            # Local import keeps this class usable without a module-level
            # hashlib import.
            import hashlib

            # MD5 serves only as a short fingerprint here, not as a security
            # measure. str(metadata) depends on key insertion order, so the
            # same pairs in a different order produce a different id.
            fingerprint = self.content + str(self.metadata)
            self.doc_id = hashlib.md5(fingerprint.encode()).hexdigest()[:12]
class BaseDocumentLoader(ABC):
    """Abstract base for loaders that turn a single file into documents.

    Subclasses implement :meth:`load`. The constructor records the path and
    takes a snapshot of file-system metadata that subclasses copy into every
    document they produce.
    """

    def __init__(self, file_path: Union[str, Path]):
        """Store ``file_path`` as a ``Path`` and collect its file metadata.

        Args:
            file_path: Location of the file to load.

        Raises:
            FileNotFoundError: If the path does not exist; the file is
                stat-ed immediately to build the metadata.
        """
        self.file_path = Path(file_path)
        self.metadata = self._extract_file_metadata()

    def _extract_file_metadata(self) -> Dict[str, Any]:
        """Return name, path, size, extension, MIME type and timestamps.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        try:
            stat = self.file_path.stat()
        except FileNotFoundError as err:
            # Without this, a missing file surfaces as a raw OS error from the
            # constructor, before validate_file() can report it. Use the same
            # message validate_file() uses.
            raise FileNotFoundError(f"文件不存在: {self.file_path}") from err
        return {
            "file_name": self.file_path.name,
            "file_path": str(self.file_path),
            "file_size": stat.st_size,
            "file_type": self.file_path.suffix.lower(),
            # guess_type() returns (type, encoding); type is None when unknown.
            "mime_type": mimetypes.guess_type(str(self.file_path))[0],
            # NOTE: st_ctime is the creation time on Windows but the last
            # metadata-change time on Unix; the key name is kept for
            # compatibility with existing consumers.
            "created_time": datetime.fromtimestamp(stat.st_ctime),
            "modified_time": datetime.fromtimestamp(stat.st_mtime),
        }

    @abstractmethod
    def load(self) -> List["Document"]:
        """Read the file and return the documents extracted from it."""
        pass

    def validate_file(self) -> bool:
        """Check that the path is an existing, non-empty regular file.

        Returns:
            ``True`` when every check passes.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the path is not a regular file, or the file is
                empty (size 0).
        """
        if not self.file_path.exists():
            raise FileNotFoundError(f"文件不存在: {self.file_path}")
        if not self.file_path.is_file():
            raise ValueError(f"路径不是文件: {self.file_path}")
        if self.file_path.stat().st_size == 0:
            raise ValueError(f"文件为空: {self.file_path}")
        return True
class TextLoader(BaseDocumentLoader):
    """Loader for plain-text files; yields the whole file as one document."""

    def __init__(self, file_path: Union[str, Path], encoding: str = "utf-8"):
        """Set up the loader.

        Args:
            file_path: Location of the text file.
            encoding: Text encoding used to decode the file.
        """
        super().__init__(file_path)
        self.encoding = encoding

    def load(self) -> List[Document]:
        """Read the file and wrap its text in a single ``Document``.

        Returns:
            A one-element list. The document metadata is the file metadata
            plus ``encoding``, ``line_count``, ``char_count`` and
            ``word_count``.

        Raises:
            FileNotFoundError, ValueError: Propagated from ``validate_file``.
            RuntimeError: If the file cannot be read or decoded; the
                underlying exception is attached as ``__cause__``.
        """
        self.validate_file()
        try:
            content = self.file_path.read_text(encoding=self.encoding)
        except Exception as e:
            # Chain the cause so the real I/O or decode error is not lost.
            raise RuntimeError(f"加载文本文件失败: {e}") from e

        # Copy so the per-document additions do not leak into self.metadata.
        metadata = self.metadata.copy()
        metadata.update({
            "encoding": self.encoding,
            # NOTE: counts newline characters plus one, so a file ending in
            # a trailing newline reports one more than its visible lines.
            "line_count": content.count('\n') + 1,
            "char_count": len(content),
            # Whitespace-delimited tokens.
            "word_count": len(content.split()),
        })
        return [Document(
            content=content,
            metadata=metadata,
            source=str(self.file_path),
        )]
class PDFLoader(BaseDocumentLoader):
    """Loader for PDF files; yields one document per page that has text."""

    def load(self) -> List[Document]:
        """Extract text page by page using PyPDF2.

        Returns:
            One ``Document`` per page whose extracted text is not blank.
            Each document's metadata is the file metadata plus
            ``page_number`` (1-based), ``total_pages``, ``char_count`` and
            ``word_count``; its ``source`` is ``"<path>#page_<n>"``.

        Raises:
            FileNotFoundError, ValueError: Propagated from ``validate_file``.
            RuntimeError: If PyPDF2 is not installed, or if reading or
                parsing the PDF fails; the underlying exception is attached
                as ``__cause__``.
        """
        self.validate_file()

        # Import separately from the parsing step so that only a missing
        # PyPDF2 produces the "please install" message.
        try:
            import PyPDF2
        except ImportError as err:
            raise RuntimeError("请安装PyPDF2库: pip install PyPDF2") from err

        try:
            documents = []
            with open(self.file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                total_pages = len(pdf_reader.pages)
                for page_number, page in enumerate(pdf_reader.pages, start=1):
                    # Defensive: treat a missing extraction result as empty
                    # text rather than failing on len(None).
                    text = page.extract_text() or ""
                    if not text.strip():
                        # Pages with no extractable text are skipped.
                        continue
                    page_metadata = self.metadata.copy()
                    page_metadata.update({
                        "page_number": page_number,
                        "total_pages": total_pages,
                        "char_count": len(text),
                        "word_count": len(text.split()),
                    })
                    documents.append(Document(
                        content=text,
                        metadata=page_metadata,
                        source=f"{self.file_path}#page_{page_number}",
                    ))
            return documents
        except Exception as e:
            raise RuntimeError(f"加载PDF文件失败: {e}") from e
class DocumentLoaderFactory:
    """Select and instantiate a loader based on a file's extension."""

    # Lower-cased extension (including the dot) -> loader class.
    _loaders = {
        '.txt': TextLoader,
        '.md': TextLoader,
        '.pdf': PDFLoader,
    }

    @classmethod
    def create_loader(cls, file_path: Union[str, Path], **kwargs):
        """Build the loader registered for ``file_path``'s extension.

        Args:
            file_path: Location of the file; only its suffix is inspected
                here, matched case-insensitively.
            **kwargs: Forwarded unchanged to the loader's constructor
                (for example ``encoding`` for ``TextLoader``).

        Returns:
            An instance of the matching loader class.

        Raises:
            ValueError: If no loader is registered for the extension.
        """
        path = Path(file_path)
        extension = path.suffix.lower()
        if extension not in cls._loaders:
            raise ValueError(f"不支持的文件类型: {extension}")
        return cls._loaders[extension](path, **kwargs)
2. 文本清理与标准化
2.1 文本预处理器
```python
# src/data/processors.py - 文本预处理
import re
import unicodedata
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import html
from datetime import datetime
import logging
from tqdm import tqdm
@dataclass
class ProcessingConfig:
    """Switches and custom rules that configure text preprocessing.

    Every boolean flag defaults to ``True``. ``custom_replacements`` and
    ``custom_regex_patterns`` default to ``None`` and are replaced by a
    fresh empty dict / list in ``__post_init__``, so instances never share
    a mutable default.
    """

    # Basic cleanup
    remove_extra_whitespace: bool = True
    normalize_unicode: bool = True
    fix_encoding: bool = True

    # HTML handling
    remove_html_tags: bool = True
    decode_html_entities: bool = True

    # Special-character handling
    remove_control_chars: bool = True
    normalize_quotes: bool = True
    normalize_dashes: bool = True

    # Line-break handling
    normalize_line_breaks: bool = True
    remove_empty_lines: bool = True

    # Language-specific options
    language: str = "zh"  # zh, en, auto
    remove_non_printable: bool = True

    # User-defined rules
    custom_replacements: Optional[Dict[str, str]] = None
    custom_regex_patterns: Optional[List[tuple]] = None

    def __post_init__(self):
        """Swap ``None`` rule containers for fresh empty ones."""
        if self.custom_replacements is None:
            self.custom_replacements = {}
        if self.custom_regex_patterns is None:
            self.custom_regex_patterns = []
class TextProcessor: “”“文本预处理器”“”
def __init__(self, config: ProcessingConfig = None):
self.config = config or ProcessingConfig()
self._compile_patterns()
def _compile_patterns(self):
"""编译正则表达式模式"""
# HTML标签
self.html_tag_pattern = re.compile(r'<[^>]+>')
# 控制字符
self.control_char_pattern = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')
# 多个空白字符
self.whitespace_pattern = re.compile(r'\s+')
# 多个换行符
self.multiple_newlines_pattern = re.compile(r'\n\s*\n')
# 引号标准化
self.quote_patterns = [
(re.compile(r'[