SQL: my workflow
SQL
My workflow on exploring new datasets
Summary on one table
- get to know what variables (columns) are available in the table -> print first n rows (potential 1)
- number of rows with
COUNT(*), combined with conditions
-- prints first row
SELECT * FROM data LIMIT 1
-- prints nrow
SELECT COUNT(*) FROM data
-- number of the first names that start with "C"
SELECT COUNT(*) FROM data WHERE first_name LIKE "C%" Subsetting with IN syntax. It also works on texts.
SELECT *
FROM patients
WHERE patient_id IN (1, 45, 534, 879, 1000)grouping by ID, then count records per ID (no need to compile the inner data first)
SELECT
patient_id,
COUNT(*) AS total_admissions
FROM admissions
WHERE patient_id = 579;Data quality
check missing (NULL)
Numerial column
- range of values, with
maxandmin - combine with values derived from inside the table, say the maximum of a column
SELECT
first_name,
last_name,
height
FROM patients
WHERE height = (
SELECT max(height)
FROM patients
)Date
select year that starts from a certain value, like 2020
SELECT count(*)
FROM patients
WHERE birth_date like "2010%"
-- the following should also work
-- WHERE YEAR(birth_date) = '2010'
-- WHERE EXTRACT(YEAR from birth_date) = '2010'
-- WHERE substring(birth_date, 4) = '2010'Text
- unique categories
- text length with
LEN() - get the number of unique values
- select count directly
WITHclause
SELECT DISTINCT xvar FROM data
-- solution 1
SELECT COUNT(allergies) FROM (select distinct allergies
FROM patients
WHERE allergies IS NOT NULL)
-- solution 2 (with clause)
WITH alls AS (select distinct allergies
FROM patients
WHERE allergies IS NOT NULL)
SELECT COUNT(allergies) FROM alls
-- get the id and firstname where it starts and end with s, with length greater or equal to 6
SELECT patient_id, first_name
FROM patients
WHERE first_name LIKE "S%s"
AND LEN(first_name)>=6Multiple tables: JOIN
SELECT first_name, last_name, province_name
FROM patients LEFT JOIN province_names
ON patients.province_id = province_names.province_idSubquery
WITH and HAVING
WITH alls AS (select distinct allergies
FROM patients
WHERE allergies IS NOT NULL)
SELECT COUNT(allergies) FROM allsWITH tmp AS (
select patient_id, diagnosis, count(*) as freq
from admissions
group by patient_id, diagnosis)
select patient_id, diagnosis from tmp where freq >1
-- equivalent to
SELECT
patient_id,
diagnosis
FROM admissions
GROUP BY
patient_id,
diagnosis
HAVING COUNT(*) > 1Column-wise presentation
SELECT does not need to always end with FROM at the very end, it can combine results from other selects.
select
(select count(*) FROM patients where gender = 'M') as male_count,
(select count(*) FROM patients where gender = 'F') as female_count