-- 1. 在有索引的列上使用DISTINCT效果更好 SELECTDISTINCT customer_id FROM orders; -- customer_id有索引
-- 2. 尽量减少DISTINCT的列数 SELECTDISTINCT city FROM customers; -- 好于下面的写法 -- SELECT DISTINCT city, address FROM customers; -- 如果不必要,避免多列
-- 3. 结合LIMIT使用,避免处理过多数据 SELECTDISTINCT category FROM products LIMIT 10;
-- 4. 使用EXISTS替代DISTINCT IN子查询(某些情况下性能更好) -- 不推荐: SELECT*FROM customers WHERE city IN (SELECTDISTINCT city FROM stores); -- 推荐: SELECT*FROM customers c WHEREEXISTS (SELECT1FROM stores s WHERE s.city = c.city);
二、GROUP BY去重操作
(一)GROUP BY基本去重
GROUP BY不仅可以用于分组统计,也是一种强大的去重工具。相比DISTINCT,GROUP BY提供了更多的灵活性:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-- 使用GROUP BY实现去重 SELECT city FROM customers GROUPBY city;
-- 等价于DISTINCT的写法 SELECTDISTINCT city FROM customers;
-- GROUP BY的优势:可以同时进行统计 SELECT city, COUNT(*) as customer_count FROM customers GROUPBY city;
-- 多列分组去重 SELECT department_id, job_title, COUNT(*) as employee_count FROM employees GROUPBY department_id, job_title;
-- 1. 先查看重复数据 SELECT email, COUNT(*) as duplicate_count FROM customers GROUPBY email HAVINGCOUNT(*) >1;
-- 2. 使用ROW_NUMBER()标记重复数据 SELECT id, name, email, -- PARTITION BY email:按邮箱分组,相同邮箱的记录在同一组 -- ORDER BY id:在每个邮箱组内按ID升序排列 -- 结果:每个邮箱组内的记录被分配序号1,2,3... ROW_NUMBER() OVER (PARTITIONBY email ORDERBY id) as row_num FROM customers;
-- 3. 删除重复数据(保留row_num=1的记录) DELETE c1 FROM customers c1 INNER JOIN ( SELECT id, ROW_NUMBER() OVER (PARTITION BY email ORDER BY id) as row_num FROM customers ) c2 ON c1.id = c2.id WHERE c2.row_num > 1;
-- 使用相关子查询实现相同功能(性能可能较差) SELECT u.user_id, u.user_name, o.order_id, o.product_name, o.order_date, o.amount FROM users u INNERJOIN orders o ON u.user_id = o.user_id WHERE o.order_date = ( -- 子查询:找到该用户的最新订单日期 SELECTMAX(order_date) FROM orders o2 WHERE o2.user_id = u.user_id );
-- 注意:如果同一天有多条记录,上述查询可能返回多条结果 -- 需要进一步处理: SELECT u.user_id, u.user_name, o.order_id, o.product_name, o.order_date, o.amount FROM users u INNERJOIN orders o ON u.user_id = o.user_id WHERE (o.order_date, o.order_id) = ( -- 使用复合条件确保唯一性 SELECT order_date, MAX(order_id) FROM orders o2 WHERE o2.user_id = u.user_id AND o2.order_date = ( SELECTMAX(order_date) FROM orders o3 WHERE o3.user_id = u.user_id ) );
(四)使用UNION去重
UNION操作符会自动去除重复行,而UNION ALL则保留所有行:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-- UNION自动去重 SELECT city FROM customers_north UNION SELECT city FROM customers_south;
-- UNION ALL保留重复 SELECT city FROM customers_north UNIONALL SELECT city FROM customers_south;
-- 复杂的UNION去重查询 SELECT'VIP客户'as customer_type, name, email FROM vip_customers UNION SELECT'普通客户'as customer_type, name, email FROM regular_customers ORDERBY customer_type, name;
(五)临时表去重方法
对于大量数据的去重操作,有时使用临时表会更高效:
1 2 3 4 5 6 7 8 9 10 11 12 13 14
-- 创建临时表存储去重结果 CREATE TEMPORARY TABLE temp_unique_customers AS SELECTDISTINCT customer_id, name, email FROM customers;
-- 清空原表 TRUNCATETABLE customers;
-- 将去重数据插入回原表 INSERT INTO customers (customer_id, name, email) SELECT customer_id, name, email FROM temp_unique_customers;
-- 删除临时表 DROP TEMPORARY TABLE temp_unique_customers;