1. SQL语言概述
SQL(Structured Query Language,结构化查询语言)是用于管理关系型数据库的标准语言。理解SQL是掌握Text2SQL技术的基础,因为我们的目标是将自然语言转换为正确的SQL语句。
1.1 SQL的特点
- 声明式语言:描述想要什么结果,而不是如何获得结果
- 标准化:遵循ANSI/ISO标准,具有良好的可移植性
- 功能强大:支持数据查询、插入、更新、删除等操作
- 易于学习:语法接近自然语言,相对容易掌握
1.2 SQL的分类
- DDL(Data Definition Language):数据定义语言
- CREATE、ALTER、DROP等
- DML(Data Manipulation Language):数据操作语言
- SELECT、INSERT、UPDATE、DELETE等
- DCL(Data Control Language):数据控制语言
- GRANT、REVOKE等
- TCL(Transaction Control Language):事务控制语言
- COMMIT、ROLLBACK等
2. 数据库基础概念
2.1 关系型数据库
关系型数据库基于关系模型,数据以表格形式存储:
- 表(Table):数据的集合,由行和列组成
- 行(Row/Record):表中的一条记录
- 列(Column/Field):表中的一个属性
- 主键(Primary Key):唯一标识表中每一行的列
- 外键(Foreign Key):引用其他表主键的列
2.2 数据类型
数值类型
INT, INTEGER -- 整数
FLOAT, REAL -- 浮点数
DECIMAL(p,s) -- 定点数
NUMERIC(p,s) -- 数值型
字符串类型
CHAR(n) -- 定长字符串
VARCHAR(n) -- 变长字符串
TEXT -- 长文本
日期时间类型
DATE -- 日期
TIME -- 时间
DATETIME -- 日期时间
TIMESTAMP -- 时间戳
布尔类型
BOOLEAN -- 布尔值(TRUE/FALSE)
3. 基础SQL查询
3.1 SELECT语句结构
SELECT [DISTINCT] column_list
FROM table_name
[WHERE condition]
[GROUP BY column_list]
[HAVING condition]
[ORDER BY column_list [ASC|DESC]]
[LIMIT number];
3.2 简单查询
查询所有列
SELECT * FROM employees;
查询指定列
SELECT first_name, last_name, salary
FROM employees;
使用别名
SELECT first_name AS 名字,
last_name AS 姓氏,
salary AS 工资
FROM employees;
去重查询
SELECT DISTINCT department_id
FROM employees;
3.3 条件查询(WHERE子句)
比较操作符
-- 等于
SELECT * FROM employees WHERE salary = 50000;
-- 不等于
SELECT * FROM employees WHERE salary != 50000;
SELECT * FROM employees WHERE salary <> 50000;
-- 大于、小于
SELECT * FROM employees WHERE salary > 50000;
SELECT * FROM employees WHERE salary < 50000;
SELECT * FROM employees WHERE salary >= 50000;
SELECT * FROM employees WHERE salary <= 50000;
逻辑操作符
-- AND
SELECT * FROM employees
WHERE salary > 50000 AND department_id = 10;
-- OR
SELECT * FROM employees
WHERE salary > 80000 OR department_id = 20;
-- NOT
SELECT * FROM employees
WHERE NOT department_id = 10;
范围查询
-- BETWEEN
SELECT * FROM employees
WHERE salary BETWEEN 40000 AND 60000;
-- IN
SELECT * FROM employees
WHERE department_id IN (10, 20, 30);
-- NOT IN
SELECT * FROM employees
WHERE department_id NOT IN (10, 20);
模糊查询
-- LIKE
SELECT * FROM employees
WHERE first_name LIKE 'J%'; -- 以J开头
SELECT * FROM employees
WHERE first_name LIKE '%son'; -- 以son结尾
SELECT * FROM employees
WHERE first_name LIKE '%oh%'; -- 包含oh
SELECT * FROM employees
WHERE first_name LIKE 'J_hn'; -- J后面任意一个字符然后hn
空值查询
-- IS NULL
SELECT * FROM employees
WHERE commission_pct IS NULL;
-- IS NOT NULL
SELECT * FROM employees
WHERE commission_pct IS NOT NULL;
3.4 排序(ORDER BY)
-- 升序排序(默认)
SELECT * FROM employees
ORDER BY salary;
-- 降序排序
SELECT * FROM employees
ORDER BY salary DESC;
-- 多列排序
SELECT * FROM employees
ORDER BY department_id ASC, salary DESC;
3.5 限制结果数量
-- MySQL/PostgreSQL
SELECT * FROM employees
ORDER BY salary DESC
LIMIT 10;
-- SQL Server
SELECT TOP 10 * FROM employees
ORDER BY salary DESC;
-- Oracle
SELECT * FROM employees
WHERE ROWNUM <= 10
ORDER BY salary DESC;
4. 聚合函数和分组
4.1 常用聚合函数
-- 计数
SELECT COUNT(*) FROM employees; -- 总行数
SELECT COUNT(commission_pct) FROM employees; -- 非空值计数
-- 求和
SELECT SUM(salary) FROM employees;
-- 平均值
SELECT AVG(salary) FROM employees;
-- 最大值和最小值
SELECT MAX(salary), MIN(salary) FROM employees;
4.2 分组查询(GROUP BY)
-- 按部门分组统计
SELECT department_id, COUNT(*) as emp_count
FROM employees
GROUP BY department_id;
-- 按部门分组计算平均工资
SELECT department_id, AVG(salary) as avg_salary
FROM employees
GROUP BY department_id;
-- 多列分组
SELECT department_id, job_id, COUNT(*)
FROM employees
GROUP BY department_id, job_id;
4.3 分组过滤(HAVING)
-- 查找员工数量大于5的部门
SELECT department_id, COUNT(*) as emp_count
FROM employees
GROUP BY department_id
HAVING COUNT(*) > 5;
-- 查找平均工资大于50000的部门
SELECT department_id, AVG(salary) as avg_salary
FROM employees
GROUP BY department_id
HAVING AVG(salary) > 50000;
5. 多表查询
5.1 连接类型
内连接(INNER JOIN)
SELECT e.first_name, e.last_name, d.department_name
FROM employees e
INNER JOIN departments d ON e.department_id = d.department_id;
左外连接(LEFT JOIN)
SELECT e.first_name, e.last_name, d.department_name
FROM employees e
LEFT JOIN departments d ON e.department_id = d.department_id;
右外连接(RIGHT JOIN)
SELECT e.first_name, e.last_name, d.department_name
FROM employees e
RIGHT JOIN departments d ON e.department_id = d.department_id;
全外连接(FULL OUTER JOIN)
SELECT e.first_name, e.last_name, d.department_name
FROM employees e
FULL OUTER JOIN departments d ON e.department_id = d.department_id;
交叉连接(CROSS JOIN)
SELECT e.first_name, d.department_name
FROM employees e
CROSS JOIN departments d;
5.2 多表连接
SELECT e.first_name, e.last_name,
d.department_name,
l.city, l.country_id
FROM employees e
INNER JOIN departments d ON e.department_id = d.department_id
INNER JOIN locations l ON d.location_id = l.location_id;
5.3 自连接
-- 查找每个员工及其经理的姓名
SELECT e.first_name as employee_name,
m.first_name as manager_name
FROM employees e
LEFT JOIN employees m ON e.manager_id = m.employee_id;
6. 子查询
6.1 标量子查询
-- 查找工资高于平均工资的员工
SELECT first_name, last_name, salary
FROM employees
WHERE salary > (SELECT AVG(salary) FROM employees);
6.2 列子查询
-- 查找在特定部门工作的员工
SELECT first_name, last_name
FROM employees
WHERE department_id IN (
SELECT department_id
FROM departments
WHERE location_id = 1700
);
6.3 行子查询
-- 查找与特定员工同部门同职位的其他员工
SELECT first_name, last_name
FROM employees
WHERE (department_id, job_id) = (
SELECT department_id, job_id
FROM employees
WHERE employee_id = 100
);
6.4 表子查询
-- 查找每个部门工资最高的员工
SELECT e.first_name, e.last_name, e.salary, e.department_id
FROM employees e
INNER JOIN (
SELECT department_id, MAX(salary) as max_salary
FROM employees
GROUP BY department_id
) dept_max ON e.department_id = dept_max.department_id
AND e.salary = dept_max.max_salary;
6.5 相关子查询
-- 查找工资高于本部门平均工资的员工
SELECT first_name, last_name, salary, department_id
FROM employees e1
WHERE salary > (
SELECT AVG(salary)
FROM employees e2
WHERE e2.department_id = e1.department_id
);
6.6 EXISTS子查询
-- 查找有员工的部门
SELECT department_name
FROM departments d
WHERE EXISTS (
SELECT 1
FROM employees e
WHERE e.department_id = d.department_id
);
7. 高级SQL特性
7.1 窗口函数
-- 排名函数
SELECT first_name, last_name, salary,
ROW_NUMBER() OVER (ORDER BY salary DESC) as row_num,
RANK() OVER (ORDER BY salary DESC) as rank_num,
DENSE_RANK() OVER (ORDER BY salary DESC) as dense_rank_num
FROM employees;
-- 分区排名
SELECT first_name, last_name, salary, department_id,
RANK() OVER (PARTITION BY department_id ORDER BY salary DESC) as dept_rank
FROM employees;
-- 聚合窗口函数
SELECT first_name, last_name, salary,
SUM(salary) OVER (ORDER BY salary) as running_total,
AVG(salary) OVER (ORDER BY salary ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as moving_avg
FROM employees;
7.2 公用表表达式(CTE)
-- 简单CTE
WITH high_salary_emp AS (
SELECT first_name, last_name, salary, department_id
FROM employees
WHERE salary > 50000
)
SELECT * FROM high_salary_emp
WHERE department_id = 10;
-- 递归CTE(查找组织层次结构)
WITH RECURSIVE emp_hierarchy AS (
-- 锚点:顶级经理
SELECT employee_id, first_name, last_name, manager_id, 1 as level
FROM employees
WHERE manager_id IS NULL
UNION ALL
-- 递归部分
SELECT e.employee_id, e.first_name, e.last_name, e.manager_id, eh.level + 1
FROM employees e
INNER JOIN emp_hierarchy eh ON e.manager_id = eh.employee_id
)
SELECT * FROM emp_hierarchy;
7.3 CASE表达式
-- 简单CASE
SELECT first_name, last_name, salary,
CASE
WHEN salary < 30000 THEN '低'
WHEN salary < 60000 THEN '中'
ELSE '高'
END as salary_level
FROM employees;
-- 搜索CASE
SELECT first_name, last_name,
CASE department_id
WHEN 10 THEN 'Administration'
WHEN 20 THEN 'Marketing'
WHEN 30 THEN 'Purchasing'
ELSE 'Other'
END as department_name
FROM employees;
8. 数据库设计原则
8.1 范式理论
第一范式(1NF)
- 每个列都是不可分割的原子值
- 每行都是唯一的
第二范式(2NF)
- 满足1NF
- 非主键列完全依赖于主键
第三范式(3NF)
- 满足2NF
- 非主键列不依赖于其他非主键列
8.2 实体关系设计
实体(Entity)
- 现实世界中可以区别的对象
- 例如:员工、部门、项目
属性(Attribute)
- 实体的特征
- 例如:员工的姓名、年龄、工资
关系(Relationship)
- 实体之间的联系
- 一对一(1:1)
- 一对多(1:N)
- 多对多(M:N)
8.3 索引设计
-- 创建索引
CREATE INDEX idx_employee_lastname ON employees(last_name);
-- 复合索引
CREATE INDEX idx_employee_dept_salary ON employees(department_id, salary);
-- 唯一索引
CREATE UNIQUE INDEX idx_employee_email ON employees(email);
9. Text2SQL中的SQL模式
9.1 常见查询模式
简单选择
-- 自然语言:"显示所有员工的姓名"
SELECT first_name, last_name FROM employees;
条件过滤
-- 自然语言:"显示工资大于50000的员工"
SELECT * FROM employees WHERE salary > 50000;
聚合统计
-- 自然语言:"每个部门有多少员工?"
SELECT department_id, COUNT(*)
FROM employees
GROUP BY department_id;
排序查询
-- 自然语言:"按工资从高到低显示员工"
SELECT * FROM employees ORDER BY salary DESC;
连接查询
-- 自然语言:"显示员工姓名和所在部门名称"
SELECT e.first_name, e.last_name, d.department_name
FROM employees e
JOIN departments d ON e.department_id = d.department_id;
9.2 复杂查询模式
嵌套查询
-- 自然语言:"显示工资最高的员工"
SELECT * FROM employees
WHERE salary = (SELECT MAX(salary) FROM employees);
多条件查询
-- 自然语言:"显示IT部门工资大于60000的员工"
SELECT e.*
FROM employees e
JOIN departments d ON e.department_id = d.department_id
WHERE d.department_name = 'IT' AND e.salary > 60000;
10. 实践练习
10.1 创建示例数据库
-- 创建部门表
CREATE TABLE departments (
department_id INT PRIMARY KEY,
department_name VARCHAR(50) NOT NULL,
location_id INT
);
-- 创建员工表
CREATE TABLE employees (
employee_id INT PRIMARY KEY,
first_name VARCHAR(50),
last_name VARCHAR(50),
email VARCHAR(100) UNIQUE,
salary DECIMAL(10,2),
department_id INT,
hire_date DATE,
FOREIGN KEY (department_id) REFERENCES departments(department_id)
);
-- 插入示例数据
INSERT INTO departments VALUES
(10, 'Administration', 1700),
(20, 'Marketing', 1800),
(30, 'Purchasing', 1700),
(40, 'Human Resources', 2400);
INSERT INTO employees VALUES
(100, 'Steven', 'King', 'sking@company.com', 24000, 10, '2003-06-17'),
(101, 'Neena', 'Kochhar', 'nkochhar@company.com', 17000, 10, '2005-09-21'),
(102, 'Lex', 'De Haan', 'ldehaan@company.com', 17000, 20, '2001-01-13'),
(103, 'Alexander', 'Hunold', 'ahunold@company.com', 9000, 30, '2006-01-03');
10.2 练习题目
- 查询所有员工的基本信息
- 查询工资大于15000的员工
- 查询每个部门的员工数量
- 查询工资最高的前3名员工
- 查询每个部门的平均工资
- 查询员工姓名和部门名称
- 查询没有员工的部门
- 查询工资高于平均工资的员工
总结
本章详细介绍了SQL基础知识和数据库设计原则,这些是理解Text2SQL技术的重要基础:
- SQL语法结构:SELECT语句的完整语法和各个子句的作用
- 查询技巧:条件过滤、排序、分组、聚合等常用操作
- 多表操作:各种连接类型和子查询的使用
- 高级特性:窗口函数、CTE、CASE表达式等
- 设计原则:数据库范式和实体关系设计
- 实际应用:Text2SQL中常见的SQL模式
掌握这些SQL知识后,您将能够更好地理解Text2SQL系统需要生成的目标SQL语句,为后续的自然语言处理和模型训练打下坚实基础。在下一章中,我们将学习自然语言处理的基础知识。