本章概述

文档处理与数据预处理是RAG系统的基础环节,直接影响检索质量和生成效果。本章将深入介绍多格式文档加载、文本清理标准化、智能分块策略、元数据管理等核心技术。

学习目标

  • 掌握多种文档格式的加载和解析方法
  • 学习文本清理和标准化技术
  • 了解智能分块策略和优化方法
  • 熟悉元数据管理和索引构建
  • 掌握数据质量评估和优化技巧

1. 多格式文档加载

1.1 文档加载器框架

# src/data/loaders.py - 文档加载器
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional, Union
from pathlib import Path
import mimetypes
from dataclasses import dataclass
from datetime import datetime

@dataclass
class Document:
    """A piece of loaded text together with its descriptive metadata.

    Attributes:
        content: The document text.
        metadata: Arbitrary key/value information describing the document.
        doc_id: Identifier; when omitted it is derived from the first 12 hex
            characters of the MD5 of ``content`` plus ``str(metadata)``.
        source: Optional origin of the document.
        created_at: Timestamp; defaults to ``datetime.now()`` when omitted.
    """
    content: str
    metadata: Dict[str, Any]
    doc_id: Optional[str] = None
    source: Optional[str] = None
    created_at: Optional[datetime] = None

    def __post_init__(self):
        """Populate ``created_at`` and ``doc_id`` if the caller left them unset."""
        if self.created_at is None:
            self.created_at = datetime.now()

        # An explicitly supplied identifier always wins.
        if self.doc_id is not None:
            return

        import hashlib

        fingerprint_source = self.content + str(self.metadata)
        digest = hashlib.md5(fingerprint_source.encode()).hexdigest()
        self.doc_id = digest[:12]

class BaseDocumentLoader(ABC):
    """Abstract base class for loaders that read one file into documents.

    Subclasses implement :meth:`load`. The constructor stores the path and
    takes a snapshot of the file's filesystem metadata in ``self.metadata``.
    """

    def __init__(self, file_path: Union[str, Path]):
        """Store the path and collect its filesystem metadata.

        Args:
            file_path: Location of the file to load.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        self.file_path = Path(file_path)
        self.metadata = self._extract_file_metadata()

    def _extract_file_metadata(self) -> Dict[str, Any]:
        """Return name, path, size, type and timestamps of the file.

        Raises:
            FileNotFoundError: If the file does not exist. The message matches
                the one produced by :meth:`validate_file`.
        """
        try:
            stat = self.file_path.stat()
        except FileNotFoundError as exc:
            # The constructor runs before validate_file() can be called, so
            # report a missing file with the same message validate_file() uses
            # instead of leaking the raw OS error text.
            raise FileNotFoundError(f"文件不存在: {self.file_path}") from exc

        # st_ctime is the last metadata change on Unix, not the creation time;
        # prefer st_birthtime on platforms that expose it.
        created_timestamp = getattr(stat, "st_birthtime", stat.st_ctime)

        return {
            "file_name": self.file_path.name,
            "file_path": str(self.file_path),
            "file_size": stat.st_size,
            "file_type": self.file_path.suffix.lower(),
            "mime_type": mimetypes.guess_type(str(self.file_path))[0],
            "created_time": datetime.fromtimestamp(created_timestamp),
            "modified_time": datetime.fromtimestamp(stat.st_mtime)
        }

    @abstractmethod
    def load(self) -> List[Document]:
        """Read the file and return its content as a list of documents."""
        pass

    def validate_file(self) -> bool:
        """Check that the path is an existing, non-empty regular file.

        Returns:
            ``True`` when every check passes.

        Raises:
            FileNotFoundError: If the path does not exist.
            ValueError: If the path is not a regular file, or the file has
                a size of zero bytes.
        """
        if not self.file_path.exists():
            raise FileNotFoundError(f"文件不存在: {self.file_path}")

        if not self.file_path.is_file():
            raise ValueError(f"路径不是文件: {self.file_path}")

        if self.file_path.stat().st_size == 0:
            raise ValueError(f"文件为空: {self.file_path}")

        return True

class TextLoader(BaseDocumentLoader):
    """Loader that reads an entire text file into a single document."""

    def __init__(self, file_path: Union[str, Path], encoding: str = "utf-8"):
        """Create a loader for ``file_path``.

        Args:
            file_path: Location of the text file.
            encoding: Text encoding used to decode the file.
        """
        super().__init__(file_path)
        self.encoding = encoding

    def load(self) -> List[Document]:
        """Read the file and return it as a one-element list of documents.

        The document metadata extends the file metadata with the encoding
        and line, character and whitespace-separated word counts.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the path is not a file or the file is empty.
            RuntimeError: If reading or decoding the file fails; the original
                exception is attached as ``__cause__``.
        """
        self.validate_file()

        try:
            with open(self.file_path, 'r', encoding=self.encoding) as f:
                content = f.read()

            metadata = {
                **self.metadata,
                "encoding": self.encoding,
                # Number of newline characters plus one.
                "line_count": content.count('\n') + 1,
                "char_count": len(content),
                "word_count": len(content.split()),
            }

            return [Document(
                content=content,
                metadata=metadata,
                source=str(self.file_path)
            )]

        except Exception as e:
            # Chain explicitly so the traceback shows the root cause.
            raise RuntimeError(f"加载文本文件失败: {e}") from e

class PDFLoader(BaseDocumentLoader):
    """Loader that turns each non-blank page of a PDF into a document."""

    def load(self) -> List[Document]:
        """Extract text page by page and return one document per page.

        Pages whose extracted text is empty or whitespace-only are skipped.
        Each document's metadata extends the file metadata with the 1-based
        page number, the total page count and character/word counts.

        Raises:
            FileNotFoundError: If the file does not exist.
            ValueError: If the path is not a file or the file is empty.
            RuntimeError: If PyPDF2 is not installed or the PDF cannot be
                read; the original exception is attached as ``__cause__``.
        """
        self.validate_file()

        try:
            import PyPDF2

            documents = []

            with open(self.file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # The page count does not change while iterating; read it once.
                total_pages = len(pdf_reader.pages)

                for page_number, page in enumerate(pdf_reader.pages, start=1):
                    # Defensive: treat a missing extraction result as empty
                    # text rather than failing on len(None).
                    text = page.extract_text() or ""

                    # Skip blank pages before building any metadata for them.
                    if not text.strip():
                        continue

                    page_metadata = {
                        **self.metadata,
                        "page_number": page_number,
                        "total_pages": total_pages,
                        "char_count": len(text),
                        "word_count": len(text.split()),
                    }

                    documents.append(Document(
                        content=text,
                        metadata=page_metadata,
                        source=f"{self.file_path}#page_{page_number}"
                    ))

            return documents

        except ImportError as e:
            raise RuntimeError("请安装PyPDF2库: pip install PyPDF2") from e
        except Exception as e:
            raise RuntimeError(f"加载PDF文件失败: {e}") from e

class DocumentLoaderFactory:
    """Chooses a loader class based on a file's extension."""

    # Lower-cased file suffix -> loader class.
    _loaders = {'.txt': TextLoader, '.md': TextLoader, '.pdf': PDFLoader}

    @classmethod
    def create_loader(cls, file_path: Union[str, Path], **kwargs):
        """Instantiate the loader registered for the suffix of ``file_path``.

        Args:
            file_path: Location of the file to load.
            **kwargs: Extra keyword arguments forwarded to the loader class.

        Returns:
            An instance of the loader class registered for the suffix.

        Raises:
            ValueError: If no loader is registered for the suffix.
        """
        path = Path(file_path)
        suffix = path.suffix.lower()

        loader_cls = cls._loaders.get(suffix)
        if loader_cls is None:
            raise ValueError(f"不支持的文件类型: {suffix}")

        return loader_cls(path, **kwargs)

2. 文本清理与标准化

2.1 文本预处理器

```python
# src/data/processors.py - 文本预处理

import re
import unicodedata
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
import html
from datetime import datetime
import logging
from tqdm import tqdm

@dataclass
class ProcessingConfig:
    """Configuration switches and custom rules for text preprocessing.

    Each boolean field enables or disables the step its name describes; the
    code that interprets these flags lives outside this class. The two
    ``custom_*`` fields accept ``None`` and are replaced with fresh empty
    containers in ``__post_init__``, so instances never share a mutable
    default.
    """
    # Basic cleanup
    remove_extra_whitespace: bool = True
    normalize_unicode: bool = True
    fix_encoding: bool = True

    # HTML handling
    remove_html_tags: bool = True
    decode_html_entities: bool = True

    # Special character handling
    remove_control_chars: bool = True
    normalize_quotes: bool = True
    normalize_dashes: bool = True

    # Line break handling
    normalize_line_breaks: bool = True
    remove_empty_lines: bool = True

    # Language-specific
    language: str = "zh"  # zh, en, auto
    remove_non_printable: bool = True

    # Custom rules
    custom_replacements: Optional[Dict[str, str]] = None
    custom_regex_patterns: Optional[List[tuple]] = None

    def __post_init__(self):
        """Replace ``None`` custom-rule fields with empty containers."""
        if self.custom_replacements is None:
            self.custom_replacements = {}
        if self.custom_regex_patterns is None:
            self.custom_regex_patterns = []

class TextProcessor: “”“文本预处理器”“”

def __init__(self, config: ProcessingConfig = None):
    self.config = config or ProcessingConfig()
    self._compile_patterns()

def _compile_patterns(self):
    """编译正则表达式模式"""
    # HTML标签
    self.html_tag_pattern = re.compile(r'<[^>]+>')

    # 控制字符
    self.control_char_pattern = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')

    # 多个空白字符
    self.whitespace_pattern = re.compile(r'\s+')

    # 多个换行符
    self.multiple_newlines_pattern = re.compile(r'\n\s*\n')

    # 引号标准化
    self.quote_patterns = [
        (re.compile(r'[