Home 리텐션 - 희소 코호트 다루기
Post
Cancel

리텐션 - 희소 코호트 다루기

ID별 첫 번째 임기 시작 날짜와 state

1
2
3
4
5
SELECT
	DISTINCT(id_bioguide),
	MIN(term_start) OVER(PARTITION BY id_bioguide) AS first_term,
	FIRST_VALUE(state) OVER(PARTITION BY id_bioguide ORDER BY term_start) AS first_state
FROM legislators_terms;


state, gender, period별 코호트 숫자

  • 위 쿼리를 서브쿼리로 사용한다.
  • 이전 게시글에서와 마찬가지로 여러 테이블을 JOIN 한다. (상세한 JOIN 조건들에 대해서는 이전 글들을 참고)
  • A 테이블에서 계산한 first_term이 1917-01-01부터 1999-12-31까지에 해당하는 테이블만을 조회한다.
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    
    SELECT
      A.first_state,
      D.gender,
      COALESCE(DATE_PART('year', AGE(C.date, A.first_term)), 0) AS period,
      COUNT(DISTINCT(A.id_bioguide)) AS cohort_retained
    FROM
    (
      SELECT
          DISTINCT(id_bioguide),
          MIN(term_start) OVER(PARTITION BY id_bioguide) AS first_term,
          FIRST_VALUE(state) OVER(PARTITION BY id_bioguide ORDER BY term_start) AS first_state
      FROM legislators_terms
    ) AS A JOIN legislators_terms AS B ON A.id_bioguide = B.id_bioguide
      LEFT JOIN date_dim AS C ON C.date BETWEEN B.term_start AND B.term_end
          AND C.month_name = 'December' AND C.day_of_month = 31
      JOIN legislators AS D ON A.id_bioguide = D.id_bioguide
    WHERE A.first_term BETWEEN '1917-01-01' AND '1999-12-31'
    GROUP BY 1, 2, 3;
    


state, gender, period별 코호트 숫자와 비율

  • 위 테이블을 서브쿼리로 사용한다.
  • state, gender별로 period를 오름차순 정렬했을 때 처음 나타나는 코호트 숫자와 비율을 계산한다.
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    
    SELECT
      first_state,
      gender,
      period,
      cohort_retained,
      FIRST_VALUE(cohort_retained) OVER(PARTITION BY first_state, gender ORDER BY period) AS cohort_size,
      ROUND(cohort_retained * 1.0 / FIRST_VALUE(cohort_retained) OVER(PARTITION BY first_state, gender ORDER BY period), 2) AS pct_retained
    FROM
    (
      SELECT
          A.first_state,
          D.gender,
          COALESCE(DATE_PART('year', AGE(C.date, A.first_term)), 0) AS period,
          COUNT(DISTINCT(A.id_bioguide)) AS cohort_retained
      FROM
      (
          SELECT
              DISTINCT(id_bioguide),
              MIN(term_start) OVER(PARTITION BY id_bioguide) AS first_term,
              FIRST_VALUE(state) OVER(PARTITION BY id_bioguide ORDER BY term_start) AS first_state
          FROM legislators_terms
      ) AS A JOIN legislators_terms AS B ON A.id_bioguide = B.id_bioguide
          LEFT JOIN date_dim AS C ON C.date BETWEEN B.term_start AND B.term_end
              AND C.month_name = 'December' AND C.day_of_month = 31
          JOIN legislators AS D ON A.id_bioguide = D.id_bioguide
      WHERE A.first_term BETWEEN '1917-01-01' AND '1999-12-31'
      GROUP BY 1, 2, 3
    ) AS sub;
    


gender, state별 코호트 숫자

1
2
3
4
5
6
7
8
9
10
11
12
13
14
SELECT
	B.gender,
	A.first_state,
	COUNT(DISTINCT(A.id_bioguide)) AS cohort_size
FROM
(
	SELECT
		DISTINCT(id_bioguide),
		MIN(term_start) OVER(PARTITION BY id_bioguide) AS first_term,
		FIRST_VALUE(state) OVER(PARTITION BY id_bioguide ORDER BY term_start) AS first_state
	FROM legislators_terms
) AS A JOIN legislators AS B ON A.id_bioguide = B.id_bioguide
WHERE A.first_term BETWEEN '1917-01-01' AND '1999-12-31'
GROUP BY 1, 2;


이전 테이블과 period JOIN

  • 해당 테이블을 T1이라고 가정하자.
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    
    SELECT B.gender, B.first_state, C.period, B.cohort_size
    FROM
    (
      SELECT B.gender, A.first_state, COUNT(DISTINCT(A.id_bioguide)) AS cohort_size
      FROM
      (
          SELECT
              DISTINCT(id_bioguide),
              MIN(term_start) OVER(PARTITION BY id_bioguide) AS first_term,
              FIRST_VALUE(state) OVER(PARTITION BY id_bioguide ORDER BY term_start ASC) AS first_state
          FROM legislators_terms
      ) AS A JOIN legislators AS B ON A.id_bioguide = B.id_bioguide
      WHERE A.first_term BETWEEN '1917-01-01' AND '1999-12-31'
      GROUP BY 1, 2
    ) AS B JOIN (SELECT generate_series AS period
              FROM generate_series(0, 20, 1)) AS C ON 1 = 1;
    


state, gender, period별 코호트 숫자

  • 해당 테이블은 이전 게시글에서 다뤘던 내용과 같은 쿼리이다.
  • 해당 테이블을 T2라고 가정하자.
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    
    SELECT
      D.first_state,
      G.gender,
      COALESCE(DATE_PART('year', AGE(F.date, D.first_term)), 0) AS period,
      COUNT(DISTINCT(D.id_bioguide)) AS cohort_retained
    FROM
    (
      SELECT
          DISTINCT(id_bioguide),
          MIN(term_start) OVER(PARTITION BY id_bioguide) AS first_term,
          FIRST_VALUE(state) OVER(PARTITION BY id_bioguide ORDER BY term_start ASC) AS first_state
      FROM legislators_terms
    ) AS D JOIN legislators_terms AS E ON D.id_bioguide = E.id_bioguide
      LEFT JOIN date_dim AS F ON F.date BETWEEN E.term_start AND E.term_end
          AND F.month_name = 'December' AND F.day_of_month = 31
      JOIN legislators AS G ON D.id_bioguide = G.id_bioguide
    WHERE D.first_term BETWEEN '1917-01-01' AND '1999-12-31'
    GROUP BY 1, 2, 3;
    


위 두 테이블 T1과 T2 JOIN

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
SELECT
	aa.gender,
	aa.first_state,
	aa.period,
	aa.cohort_size,
	COALESCE(bb.cohort_retained, 0) AS cohort_retained,
	ROUND(COALESCE(bb.cohort_retained, 0) * 1.0 / aa.cohort_size, 2) AS pct_retained
FROM
(
	SELECT B.gender, B.first_state, C.period, B.cohort_size
	FROM
	(
		SELECT B.gender, A.first_state, COUNT(DISTINCT(A.id_bioguide)) AS cohort_size
		FROM
		(
			SELECT
				DISTINCT(id_bioguide),
				MIN(term_start) OVER(PARTITION BY id_bioguide) AS first_term,
				FIRST_VALUE(state) OVER(PARTITION BY id_bioguide ORDER BY term_start ASC) AS first_state
			FROM legislators_terms
		) AS A JOIN legislators AS B ON A.id_bioguide = B.id_bioguide
		WHERE A.first_term BETWEEN '1917-01-01' AND '1999-12-31'
		GROUP BY 1, 2
	) AS B JOIN (SELECT generate_series AS period
				FROM generate_series(0, 20, 1)) AS C ON 1 = 1
) AS aa 
	LEFT JOIN
(
	SELECT
	D.first_state,
	G.gender,
	COALESCE(DATE_PART('year', AGE(F.date, D.first_term)), 0) AS period,
	COUNT(DISTINCT(D.id_bioguide)) AS cohort_retained
FROM
(
	SELECT
		DISTINCT(id_bioguide),
		MIN(term_start) OVER(PARTITION BY id_bioguide) AS first_term,
		FIRST_VALUE(state) OVER(PARTITION BY id_bioguide ORDER BY term_start ASC) AS first_state
	FROM legislators_terms
) AS D JOIN legislators_terms AS E ON D.id_bioguide = E.id_bioguide
	LEFT JOIN date_dim AS F ON F.date BETWEEN E.term_start AND E.term_end
		AND F.month_name = 'December' AND F.day_of_month = 31
	JOIN legislators AS G ON D.id_bioguide = G.id_bioguide
WHERE D.first_term BETWEEN '1917-01-01' AND '1999-12-31'
GROUP BY 1, 2, 3
) AS bb ON aa.gender = bb.gender AND aa.first_state = bb.first_state AND aa.period = bb.period
ORDER BY 1, 2, 3;


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
SELECT
	gender, first_state, cohort_size,
	MAX(CASE WHEN period=0 THEN pct_retained END) AS yr0,
	MAX(CASE WHEN period=2 THEN pct_retained END) AS yr2,
	MAX(CASE WHEN period=4 THEN pct_retained END) AS yr4,
	MAX(CASE WHEN period=6 THEN pct_retained END) AS yr6,
	MAX(CASE WHEN period=8 THEN pct_retained END) AS yr8,
	MAX(CASE WHEN period=10 THEN pct_retained END) AS yr10
FROM
(
	SELECT
		aa.gender,
		aa.first_state,
		aa.period,
		aa.cohort_size,
		COALESCE(bb.cohort_retained, 0) AS cohort_retained,
		ROUND(COALESCE(bb.cohort_retained, 0) * 1.0 / aa.cohort_size, 2) AS pct_retained
	FROM
	(
		SELECT B.gender, B.first_state, C.period, B.cohort_size
		FROM
		(
			SELECT B.gender, A.first_state, COUNT(DISTINCT(A.id_bioguide)) AS cohort_size
			FROM
			(
				SELECT
					DISTINCT(id_bioguide),
					MIN(term_start) OVER(PARTITION BY id_bioguide) AS first_term,
					FIRST_VALUE(state) OVER(PARTITION BY id_bioguide ORDER BY term_start ASC) AS first_state
				FROM legislators_terms
			) AS A JOIN legislators AS B ON A.id_bioguide = B.id_bioguide
			WHERE A.first_term BETWEEN '1917-01-01' AND '1999-12-31'
			GROUP BY 1, 2
		) AS B JOIN (SELECT generate_series AS period
					FROM generate_series(0, 20, 1)) AS C ON 1 = 1
	) AS aa 
		LEFT JOIN
	(
		SELECT
		D.first_state,
		G.gender,
		COALESCE(DATE_PART('year', AGE(F.date, D.first_term)), 0) AS period,
		COUNT(DISTINCT(D.id_bioguide)) AS cohort_retained
	FROM
	(
		SELECT
			DISTINCT(id_bioguide),
			MIN(term_start) OVER(PARTITION BY id_bioguide) AS first_term,
			FIRST_VALUE(state) OVER(PARTITION BY id_bioguide ORDER BY term_start ASC) AS first_state
		FROM legislators_terms
	) AS D JOIN legislators_terms AS E ON D.id_bioguide = E.id_bioguide
		LEFT JOIN date_dim AS F ON F.date BETWEEN E.term_start AND E.term_end
			AND F.month_name = 'December' AND F.day_of_month = 31
		JOIN legislators AS G ON D.id_bioguide = G.id_bioguide
	WHERE D.first_term BETWEEN '1917-01-01' AND '1999-12-31'
	GROUP BY 1, 2, 3
	) AS bb ON aa.gender = bb.gender AND aa.first_state = bb.first_state AND aa.period = bb.period
) AS result
GROUP BY 1, 2, 3;

This post is licensed under CC BY 4.0 by the author.