1. 自然语言处理概述
自然语言处理(Natural Language Processing, NLP)是人工智能的一个重要分支,旨在让计算机能够理解、处理和生成人类语言。在Text2SQL任务中,NLP技术用于理解用户的自然语言查询意图,并将其转换为结构化的SQL语句。
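例如,下面这一对问题与SQL语句就是Text2SQL要完成的映射(其中表名employees和列名salary仅为示意):
自然语言: "What is the average salary of employees?"
目标SQL: SELECT AVG(salary) FROM employees;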
1.1 NLP的发展历程
规则基础时期(1950s-1980s)
- 特点:基于手工编写的语法规则和词典
- 代表:ELIZA、SHRDLU等早期系统
- 局限:规则复杂,难以处理语言的多样性
统计方法时期(1990s-2000s)
- 特点:基于统计模型和机器学习
- 代表:隐马尔可夫模型、条件随机场
- 优势:能够从数据中学习,处理不确定性
深度学习时期(2010s-至今)
- 特点:基于神经网络的端到端学习
- 代表:RNN、LSTM、Transformer、BERT、GPT
- 突破:大幅提升了NLP任务的性能
1.2 NLP的核心任务
- 词法分析:分词、词性标注、命名实体识别
- 句法分析:语法解析、依存分析
- 语义分析:语义角色标注、语义解析
- 语用分析:指代消解、情感分析
- 生成任务:机器翻译、文本摘要、对话生成
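下面用一个人工标注的示意例子(并非某个工具的真实输出)粗略展示这些分析层次落在一条Text2SQL查询上的形态,各项任务的代码实现见后续小节:
query = "Show employees in New York with salary greater than 50000"
analysis = {
    '词法分析': ['Show', 'employees', 'in', 'New', 'York', 'with', 'salary', 'greater', 'than', '50000'],
    '命名实体': {'New York': '地点', '50000': '数值'},
    '句法分析': "'salary greater than 50000' 构成一个比较条件短语",
    '语义解析': "SELECT * FROM employees WHERE city = 'New York' AND salary > 50000"  # 示意
}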
2. 文本预处理
2.1 分词(Tokenization)
分词是将连续的文本分割成有意义的词汇单元的过程。
英文分词
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
# 句子分割
text = "What is the average salary? Show me all employees."
sentences = sent_tokenize(text)
print(sentences)
# ['What is the average salary?', 'Show me all employees.']
# 词汇分割
words = word_tokenize(sentences[0])
print(words)
# ['What', 'is', 'the', 'average', 'salary', '?']
中文分词
import jieba
# 中文分词
text = "查询所有员工的平均工资"
words = jieba.lcut(text)
print(words)
# ['查询', '所有', '员工', '的', '平均', '工资']
子词分词(Subword Tokenization)
from transformers import AutoTokenizer
# 使用BERT分词器
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
text = "What is the average salary?"
tokens = tokenizer.tokenize(text)
print(tokens)
# ['what', 'is', 'the', 'average', 'salary', '?']
# 转换为ID
token_ids = tokenizer.encode(text)
print(token_ids)
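上面这句话中的词都在BERT词表内,没有体现出子词切分;换一个词表外的词,就能看到WordPiece把它拆成带##前缀的子词(具体切分以实际加载的词表为准):
tokens = tokenizer.tokenize("word embeddings")
print(tokens)
# 通常会得到类似 ['word', 'em', '##bed', '##ding', '##s'] 的子词序列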
2.2 文本清洗
import re
import string
def clean_text(text):
    """文本清洗函数"""
    # 转换为小写
    text = text.lower()
    # 移除标点符号
    text = text.translate(str.maketrans('', '', string.punctuation))
    # 移除多余空格
    text = re.sub(r'\s+', ' ', text).strip()
    # 移除数字(可选)
    # text = re.sub(r'\d+', '', text)
    return text
# 示例
original = "What's the AVERAGE salary???"
cleaned = clean_text(original)
print(f"原文: {original}")
print(f"清洗后: {cleaned}")
2.3 停用词处理
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# 获取英文停用词
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    """移除停用词"""
    words = word_tokenize(text.lower())
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)
# 示例
text = "What is the average salary of all employees?"
filtered = remove_stopwords(text)
print(f"原文: {text}")
print(f"移除停用词后: {filtered}")
# 原文: What is the average salary of all employees?
# 移除停用词后: average salary employees ?
2.4 词干提取和词形还原
词干提取(Stemming)
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
words = ['running', 'runs', 'ran', 'runner']
stemmed = [stemmer.stem(word) for word in words]
print(f"原词: {words}")
print(f"词干: {stemmed}")
# 原词: ['running', 'runs', 'ran', 'runner']
# 词干: ['run', 'run', 'ran', 'runner']
词形还原(Lemmatization)
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(word):
    """获取词性"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)
words = ['running', 'runs', 'ran', 'better', 'good']
lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
print(f"原词: {words}")
print(f"词形还原: {lemmatized}")
3. 词汇表示
3.1 独热编码(One-Hot Encoding)
import numpy as np
from sklearn.preprocessing import LabelEncoder
# 构建词汇表
vocab = ['what', 'is', 'the', 'average', 'salary']
vocab_size = len(vocab)
# 创建词汇到索引的映射
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
def one_hot_encode(word, vocab_size, word_to_idx):
    """独热编码"""
    vector = np.zeros(vocab_size)
    if word in word_to_idx:
        vector[word_to_idx[word]] = 1
    return vector
# 示例
word = 'average'
vector = one_hot_encode(word, vocab_size, word_to_idx)
print(f"词汇: {word}")
print(f"独热编码: {vector}")
3.2 词袋模型(Bag of Words)
from sklearn.feature_extraction.text import CountVectorizer
# 文档集合
documents = [
    "What is the average salary",
    "Show me all employees",
    "What is the maximum salary"
]
# 创建词袋模型
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)
# 获取特征名称
feature_names = vectorizer.get_feature_names_out()
print("词汇表:", feature_names)
print("词袋矩阵:")
print(bow_matrix.toarray())
3.3 TF-IDF
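TF-IDF在词频统计的基础上,用逆文档频率对到处出现的常见词降权:一个词在当前文档中出现得越多、同时在整个语料中越少见,权重就越高。基本形式为:
tf-idf(t, d) = tf(t, d) × idf(t),其中 idf(t) = log(N / df(t))
这里N是文档总数,df(t)是包含词t的文档数。注意sklearn的TfidfVectorizer默认使用平滑IDF(log((1+N)/(1+df(t))) + 1)并对结果向量做L2归一化,具体数值与上式会略有出入。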
from sklearn.feature_extraction.text import TfidfVectorizer
# 创建TF-IDF向量化器
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print("TF-IDF矩阵:")
print(tfidf_matrix.toarray())
# 查看特定文档的TF-IDF值
feature_names = tfidf_vectorizer.get_feature_names_out()
for i, doc in enumerate(documents):
    print(f"\n文档 {i+1}: {doc}")
    tfidf_scores = tfidf_matrix[i].toarray()[0]
    for j, score in enumerate(tfidf_scores):
        if score > 0:
            print(f"  {feature_names[j]}: {score:.4f}")
3.4 词向量(Word Embeddings)
Word2Vec
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
# 准备训练数据
sentences = [
    "what is the average salary",
    "show me all employees",
    "what is the maximum salary",
    "list all departments",
    "count employees in each department"
]
# 分词
tokenized_sentences = [word_tokenize(sent.lower()) for sent in sentences]
# 训练Word2Vec模型
model = Word2Vec(sentences=tokenized_sentences,
                 vector_size=100,  # 向量维度
                 window=5,         # 上下文窗口大小
                 min_count=1,      # 最小词频
                 workers=4)        # 线程数
# 获取词向量
word = 'salary'
if word in model.wv:
    vector = model.wv[word]
    print(f"'{word}'的词向量维度: {vector.shape}")
    print(f"向量前5个元素: {vector[:5]}")
# 查找相似词
similar_words = model.wv.most_similar('salary', topn=3)
print(f"与'salary'最相似的词: {similar_words}")
预训练词向量
import gensim.downloader as api
# 下载预训练的词向量(需要网络连接)
# word_vectors = api.load("glove-wiki-gigaword-100")
# 使用预训练词向量
# vector = word_vectors['salary']
# similar_words = word_vectors.most_similar('salary')
4. 语言模型
4.1 N-gram语言模型
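N-gram模型假设一个词只依赖它前面的n-1个词,并用计数做最大似然估计。以下面实现的二元模型(bigram)为例:
P(w_i | w_{i-1}) ≈ count(w_{i-1}, w_i) / count(w_{i-1})
下面probability方法实现的就是这个未平滑的估计;实际系统中通常还需要拉普拉斯平滑、回退等手段处理训练数据中未出现过的组合。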
from collections import defaultdict, Counter
import re
class NGramLanguageModel:
    def __init__(self, n=2):
        self.n = n
        self.ngrams = defaultdict(Counter)
        self.vocab = set()
    def train(self, texts):
        """训练N-gram模型"""
        for text in texts:
            # 添加句子开始和结束标记
            tokens = ['<s>'] * (self.n - 1) + text.lower().split() + ['</s>']
            self.vocab.update(tokens)
            # 生成N-gram
            for i in range(len(tokens) - self.n + 1):
                context = tuple(tokens[i:i+self.n-1])
                next_word = tokens[i+self.n-1]
                self.ngrams[context][next_word] += 1
    def probability(self, context, word):
        """计算条件概率"""
        context = tuple(context)
        if context not in self.ngrams:
            return 0.0
        total_count = sum(self.ngrams[context].values())
        word_count = self.ngrams[context][word]
        return word_count / total_count if total_count > 0 else 0.0
    def generate_next_word(self, context):
        """生成下一个词"""
        context = tuple(context)
        if context not in self.ngrams:
            return None
        # 选择概率最高的词
        return self.ngrams[context].most_common(1)[0][0]
# 训练示例
texts = [
    "what is the average salary",
    "show me the maximum salary",
    "what is the minimum salary",
    "display all employee names"
]
model = NGramLanguageModel(n=2) # 二元语法
model.train(texts)
# 测试
context = ['what']
next_word = model.generate_next_word(context)
print(f"给定上下文 {context},下一个词可能是: {next_word}")
# 计算概率
prob = model.probability(['what'], 'is')
print(f"P(is|what) = {prob:.4f}")
4.2 神经语言模型
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
class SimpleRNNLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(SimpleRNNLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(0.2)
    def forward(self, x, hidden=None):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded, hidden)
        output = self.dropout(output)
        output = self.linear(output)
        return output, hidden
# 模型参数
vocab_size = 1000
embedding_dim = 128
hidden_dim = 256
# 创建模型
model = SimpleRNNLanguageModel(vocab_size, embedding_dim, hidden_dim)
print(f"模型参数数量: {sum(p.numel() for p in model.parameters())}")
5. 序列标注
5.1 命名实体识别(NER)
import spacy
# 加载spaCy模型
nlp = spacy.load("en_core_web_sm")
def extract_entities(text):
    """提取命名实体"""
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        entities.append({
            'text': ent.text,
            'label': ent.label_,
            'start': ent.start_char,
            'end': ent.end_char
        })
    return entities
# 示例
text = "Show me employees from New York with salary greater than 50000"
entities = extract_entities(text)
print("命名实体:")
for entity in entities:
    print(f"  {entity['text']} -> {entity['label']}")
5.2 词性标注(POS Tagging)
import nltk
from nltk.tokenize import word_tokenize
def pos_tagging(text):
    """词性标注"""
    tokens = word_tokenize(text)
    pos_tags = nltk.pos_tag(tokens)
    return pos_tags
# 示例
text = "What is the average salary of employees?"
pos_tags = pos_tagging(text)
print("词性标注:")
for word, pos in pos_tags:
    print(f"  {word} -> {pos}")
5.3 依存句法分析
import spacy
nlp = spacy.load("en_core_web_sm")
def dependency_parsing(text):
    """依存句法分析"""
    doc = nlp(text)
    dependencies = []
    for token in doc:
        dependencies.append({
            'text': token.text,
            'pos': token.pos_,
            'dep': token.dep_,
            'head': token.head.text,
            'children': [child.text for child in token.children]
        })
    return dependencies
# 示例
text = "Show me the average salary"
deps = dependency_parsing(text)
print("依存关系:")
for dep in deps:
    print(f"  {dep['text']} ({dep['pos']}) --{dep['dep']}--> {dep['head']}")
6. 语义表示
6.1 语义角色标注(SRL)
# 使用AllenNLP进行语义角色标注
# pip install allennlp allennlp-models
from allennlp.predictors.predictor import Predictor
# 加载预训练的SRL模型
# predictor = Predictor.from_path(
# "https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.03.24.tar.gz"
# )
def semantic_role_labeling(text):
    """语义角色标注"""
    # result = predictor.predict(sentence=text)
    # return result
    pass
# 示例(需要网络连接下载模型)
# text = "John gave Mary a book"
# srl_result = semantic_role_labeling(text)
6.2 语义解析
语义解析是将自然语言转换为机器可理解的语义表示的过程,这正是Text2SQL的核心任务。
class SemanticParser:
    """简单的语义解析器示例"""
    def __init__(self):
        # 定义语义模板
        self.templates = {
            'select_all': {
                'pattern': r'(show|display|list).*all.*(\w+)',
                'sql_template': 'SELECT * FROM {table}'
            },
            'count': {
                'pattern': r'(count|number of).*(\w+)',
                'sql_template': 'SELECT COUNT(*) FROM {table}'
            },
            'average': {
                'pattern': r'average.*(\w+)',
                'sql_template': 'SELECT AVG({column}) FROM {table}'
            }
        }
    def parse(self, text):
        """解析自然语言查询"""
        import re
        text = text.lower()
        for intent, template in self.templates.items():
            match = re.search(template['pattern'], text)
            if match:
                return {
                    'intent': intent,
                    'sql_template': template['sql_template'],
                    'entities': match.groups()
                }
        return None
# 示例
parser = SemanticParser()
queries = [
    "Show me all employees",
    "Count the number of departments",
    "What is the average salary"
]
for query in queries:
    result = parser.parse(query)
    print(f"查询: {query}")
    print(f"解析结果: {result}")
    print()
7. 预训练语言模型
7.1 BERT
from transformers import BertTokenizer, BertModel
import torch
# 加载预训练的BERT模型
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def get_bert_embeddings(text):
    """获取BERT词向量"""
    # 分词和编码
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    # 获取模型输出
    with torch.no_grad():
        outputs = model(**inputs)
    # 获取最后一层的隐藏状态
    last_hidden_states = outputs.last_hidden_state
    return last_hidden_states
# 示例
text = "What is the average salary?"
embeddings = get_bert_embeddings(text)
print(f"BERT嵌入维度: {embeddings.shape}")
print(f"序列长度: {embeddings.shape[1]}")
print(f"隐藏维度: {embeddings.shape[2]}")
7.2 GPT
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
# 加载GPT-2模型
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# 设置pad_token
tokenizer.pad_token = tokenizer.eos_token
def generate_text(prompt, max_length=50):
    """使用GPT-2生成文本"""
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            num_return_sequences=1,
            do_sample=True,        # 启用采样,temperature参数才会生效
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text
# 示例
prompt = "The SQL query to find all employees is"
generated = generate_text(prompt)
print(f"提示: {prompt}")
print(f"生成: {generated}")
8. Text2SQL中的NLP应用
8.1 查询意图识别
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
class QueryIntentClassifier:
    """查询意图分类器"""
    def __init__(self):
        self.pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
            ('classifier', MultinomialNB())
        ])
        # 定义意图类别
        self.intents = {
            'select': 'SELECT查询',
            'count': '计数查询',
            'aggregate': '聚合查询',
            'filter': '过滤查询',
            'sort': '排序查询'
        }
    def train(self, texts, labels):
        """训练分类器"""
        self.pipeline.fit(texts, labels)
    def predict(self, text):
        """预测查询意图"""
        intent = self.pipeline.predict([text])[0]
        confidence = max(self.pipeline.predict_proba([text])[0])
        return intent, confidence
# 训练数据示例
train_texts = [
    "Show me all employees",
    "List all departments",
    "How many employees are there",
    "Count the number of departments",
    "What is the average salary",
    "Find the maximum salary",
    "Show employees with salary > 50000",
    "Find employees in IT department",
    "Sort employees by salary",
    "Order departments by name"
]
train_labels = [
    'select', 'select',
    'count', 'count',
    'aggregate', 'aggregate',
    'filter', 'filter',
    'sort', 'sort'
]
# 训练和测试
classifier = QueryIntentClassifier()
classifier.train(train_texts, train_labels)
# 测试
test_queries = [
    "Display all products",
    "How many orders were placed",
    "What is the total revenue"
]
for query in test_queries:
    intent, confidence = classifier.predict(query)
    print(f"查询: {query}")
    print(f"意图: {intent} (置信度: {confidence:.3f})")
    print()
8.2 实体抽取
import re
from typing import List, Dict
class SQLEntityExtractor:
    """SQL实体抽取器"""
    def __init__(self):
        # 定义实体模式
        self.patterns = {
            'table_names': [
                r'\b(?:employees?|departments?|products?|orders?|customers?)\b',
                r'\b(?:users?|accounts?|transactions?)\b'
            ],
            'column_names': [
                r'\b(?:name|salary|age|id|email|phone)\b',
                r'\b(?:price|quantity|date|status)\b'
            ],
            'operators': [
                r'\b(?:greater than|more than)\b|>',
                r'\b(?:less than)\b|<',
                r'\b(?:equal to|equals?)\b|='
            ],
            'aggregations': [
                r'\b(?:average|avg|mean)\b',
                r'\b(?:sum|total)\b',
                r'\b(?:count|number)\b',
                r'\b(?:max|maximum)\b',
                r'\b(?:min|minimum)\b'
            ],
            'values': [
                r'\b\d+\.\d+\b',  # 小数
                r'\b\d+\b',       # 整数
                r"'[^']*'",       # 单引号字符串
                r'"[^"]*"'        # 双引号字符串
            ]
        }
    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """抽取实体"""
        text = text.lower()
        entities = {}
        for entity_type, patterns in self.patterns.items():
            entities[entity_type] = []
            for pattern in patterns:
                matches = re.findall(pattern, text)
                entities[entity_type].extend(matches)
        # 去重
        for entity_type in entities:
            entities[entity_type] = list(set(entities[entity_type]))
        return entities
# 示例
extractor = SQLEntityExtractor()
queries = [
    "Show me all employees with salary greater than 50000",
    "What is the average age of customers",
    "Count the number of orders in 2023"
]
for query in queries:
    entities = extractor.extract_entities(query)
    print(f"查询: {query}")
    print("抽取的实体:")
    for entity_type, values in entities.items():
        if values:
            print(f"  {entity_type}: {values}")
    print()
9. 评估指标
9.1 文本相似度
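本节实现两种常用相似度:余弦相似度比较两个向量的方向,Jaccard相似度比较两个词集合的重合程度:
cosine(u, v) = (u · v) / (||u|| × ||v||)
Jaccard(A, B) = |A ∩ B| / |A ∪ B|
下面的实现中,余弦相似度作用在两段文本的TF-IDF向量上,Jaccard作用在去重后的词集合上。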
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
def calculate_text_similarity(text1, text2, method='cosine'):
    """计算文本相似度"""
    if method == 'cosine':
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        return similarity
    elif method == 'jaccard':
        set1 = set(text1.lower().split())
        set2 = set(text2.lower().split())
        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))
        return intersection / union if union > 0 else 0
# 示例
text1 = "Show me all employees"
text2 = "Display all workers"
cosine_sim = calculate_text_similarity(text1, text2, 'cosine')
jaccard_sim = calculate_text_similarity(text1, text2, 'jaccard')
print(f"文本1: {text1}")
print(f"文本2: {text2}")
print(f"余弦相似度: {cosine_sim:.4f}")
print(f"Jaccard相似度: {jaccard_sim:.4f}")
9.2 BLEU分数
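BLEU通过统计候选文本与参考文本的n-gram重合程度来打分,基本形式为(通常取N=4、权重均匀):
BLEU = BP × exp( Σ_{n=1}^{N} w_n × log p_n )
其中p_n是n-gram精确率,w_n是对应权重,BP是对过短候选的长度惩罚(brevity penalty)。NLTK的sentence_bleu默认按1~4元均匀权重计算,对很短或没有高阶n-gram重合的句子可能给出警告并返回接近0的分数。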
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.tokenize import word_tokenize
def calculate_bleu(reference, candidate):
    """计算BLEU分数"""
    # 分词
    ref_tokens = word_tokenize(reference.lower())
    cand_tokens = word_tokenize(candidate.lower())
    # 计算BLEU分数
    bleu_score = sentence_bleu([ref_tokens], cand_tokens)
    return bleu_score
# 示例
reference = "SELECT * FROM employees WHERE salary > 50000"
candidate = "SELECT * FROM employees WHERE salary > 50000"
bleu = calculate_bleu(reference, candidate)
print(f"参考: {reference}")
print(f"候选: {candidate}")
print(f"BLEU分数: {bleu:.4f}")
10. 实践项目
10.1 简单的Text2SQL预处理器
class Text2SQLPreprocessor:
    """Text2SQL预处理器"""
    def __init__(self):
        self.entity_extractor = SQLEntityExtractor()
        self.intent_classifier = QueryIntentClassifier()
        # 数据库schema信息
        self.schema = {
            'tables': ['employees', 'departments', 'projects'],
            'columns': {
                'employees': ['id', 'name', 'salary', 'department_id'],
                'departments': ['id', 'name', 'location'],
                'projects': ['id', 'name', 'budget', 'department_id']
            }
        }
    def preprocess(self, query):
        """预处理查询"""
        # 1. 文本清洗
        cleaned_query = self.clean_text(query)
        # 2. 实体抽取
        entities = self.entity_extractor.extract_entities(cleaned_query)
        # 3. Schema链接
        linked_entities = self.link_to_schema(entities)
        # 4. 构建结构化表示
        structured_query = {
            'original_query': query,
            'cleaned_query': cleaned_query,
            'entities': linked_entities,
            'schema_info': self.get_relevant_schema(linked_entities)
        }
        return structured_query
    def clean_text(self, text):
        """文本清洗"""
        # 基本清洗
        text = text.strip().lower()
        # 标准化同义词
        synonyms = {
            'show': 'select',
            'display': 'select',
            'list': 'select',
            'workers': 'employees',
            'staff': 'employees'
        }
        for synonym, standard in synonyms.items():
            text = text.replace(synonym, standard)
        return text
    def link_to_schema(self, entities):
        """将实体链接到数据库schema"""
        linked = {}
        # 链接表名
        for table in entities.get('table_names', []):
            if table in self.schema['tables']:
                linked.setdefault('tables', []).append(table)
        # 链接列名
        for column in entities.get('column_names', []):
            for table, columns in self.schema['columns'].items():
                if column in columns:
                    linked.setdefault('columns', []).append({
                        'column': column,
                        'table': table
                    })
        return linked
    def get_relevant_schema(self, linked_entities):
        """获取相关的schema信息"""
        relevant_schema = {}
        # 获取相关表的完整信息
        for table in linked_entities.get('tables', []):
            relevant_schema[table] = self.schema['columns'][table]
        return relevant_schema
# 示例使用
preprocessor = Text2SQLPreprocessor()
query = "Show me all employees with salary greater than 50000"
result = preprocessor.preprocess(query)
print("预处理结果:")
for key, value in result.items():
    print(f"{key}: {value}")
总结
本章详细介绍了自然语言处理的基础知识,这些是理解和实现Text2SQL系统的重要基础:
- 文本预处理:分词、清洗、标准化等基础操作
- 词汇表示:从独热编码到词向量的各种表示方法
- 语言模型:N-gram模型和神经语言模型
- 序列标注:NER、POS、依存分析等任务
- 语义表示:语义角色标注和语义解析
- 预训练模型:BERT、GPT等现代NLP模型
- 实际应用:在Text2SQL中的具体应用
- 评估方法:文本相似度、BLEU等评估指标
掌握这些NLP基础知识后,您将能够更好地理解Text2SQL系统如何处理自然语言输入,为后续的模型架构设计和实现打下坚实基础。在下一章中,我们将深入学习Text2SQL的具体模型架构。