All Articles

Django ArrayAgg, multiple aggregation, Subquery

for 루프 안의 Django ORM 때문에 로딩이 너무 오래 걸리는 페이지가 있었다. 이 페이지는 클라이언트와 서버 모두 총체적 난국 이었다. 클라이언트는 pagination 처리 조차 되어 있지 않아, 한 두 번의 api 통신으로 모든 데이터를 가져오는 구조다.

client, server 모두 고쳐야 했지만, CTO님은 일단 resource가 한정적이니, 서버만 일부 고쳐서 개선해보자고 하여 그렇게 진행하게 된 작업.

문제의 for loop 함수

def cuttingContents(contents):
    result = []
    for c in contents:
        data = dict({
            'id': c.id,
            'created_at': localtime(c.created_at).strftime('%Y-%m-%d'),
            'subject': c.subject,
            'content_type': c.content_type,
            'teacher': None,
            'tag': c.tag,
            'count': 0
        })
        categories = c.categories.filter(parent__isnull=False)
        if c.content_type == Content.TYPE_VIDEO_LECTURE:
            if c.teachers:
                data['teacher'] = [ {'id': o.id, 'name': o.name} for o in c.teachers.all() ]
            data['count'] = c.extra.lectures.filter(is_deleted=False).count()
            categories = c.categories.filter(id__in=total_video_categories)
            data['review_count'] = c.public_reviews.count()
            data['review_score'] = c.review_avg_score
            view_count_date = (timezone.now() - timedelta(days=180)).strftime('%Y-%m-%d 00:00:00')
            data['view_count'] = VideoViewCount.objects.filter(video=c.video.id, created_at__gte=view_count_date).count()
            weekago = (timezone.now() - timedelta(days=3)).strftime('%Y-%m-%d 00:00:00')
            data['look_count'] = VideoViewCount.objects.filter(video=c.video.id, created_at__gte=weekago).count()

        if c.content_type == Content.TYPE_EXAM:
            categories = [c.exam.company]

        data['categories'] = [ {'id': o.id, 'name': o.name} for o in categories ]

        result.append(data)
    return result

해당 페이지 api의 모든 로직을 변경할 필요는 없었다. 문제의 for loop 함수만 잘 고친다면 말이다.

새로 만든 함수

def get_data_of_contents(contents, user_content_id, content_type):
    exclude_added_usercontents = """
        AND NOT ("content_content"."id" IN (SELECT U0."id" FROM "content_content" U0
        INNER JOIN "payment_usercontent" U1 ON (U0."id" = U1."content_id")
        WHERE (U1."parent_id" = %s
        AND U1."expired_at" > %s AT TIME ZONE 'utc' AT TIME ZONE 'kst')))"""
    current_time = timezone.now()
    base_query = contents.query
    query = """
        WITH "base_table" AS (
            {base_query} {exclude_added_usercontents}
        )
        SELECT "base_table"."id", "base_table"."subject", "base_table"."content_type",
        "base_table"."tag", ("base_table"."created_at" AT TIME ZONE 'Asia/Seoul')::date AS "created_date", """\
        .format(base_query=base_query, exclude_added_usercontents=exclude_added_usercontents)

    if content_type == Content.TYPE_VIDEO_LECTURE:
        review_count = """
            SELECT COUNT(*) AS "review_count"
            FROM "content_content"
            INNER JOIN "review_userreview" ON ("content_content"."id" = "review_userreview"."content_id")
            WHERE ("review_userreview"."is_public" = True AND "review_userreview"."content_id" = "base_table"."id")"""

        review_score = """
            SELECT AVG("review_userreview"."score") AS "review_score"
            FROM "content_content"
            INNER JOIN "review_userreview" ON ("content_content"."id" = "review_userreview"."content_id")
            WHERE ("review_userreview"."is_public" = True AND "review_userreview"."content_id" = "base_table"."id")"""

        lecture_count = """
            SELECT COUNT(*) as "count"
            FROM "content_content"
            INNER JOIN "video_videocontent" ON ("content_content"."id" = "video_videocontent"."content_id")
            INNER JOIN "video_videocontentlecture" ON ("video_videocontent"."id" = "video_videocontentlecture"."video_content_id")
            WHERE ("video_videocontentlecture"."is_deleted" = False AND "video_videocontent"."content_id" = "base_table"."id") """

        number_of_view = """
            SELECT COUNT(*) as "view_count"
            FROM "content_content"
            INNER JOIN "video_videocontent" ON ("content_content"."id" = "video_videocontent"."content_id")
            INNER JOIN "video_videoviewcount" ON ("video_videocontent"."id" = "video_videoviewcount"."video_id")
            WHERE "video_videoviewcount"."created_at" >= %s AT TIME ZONE 'utc' AT TIME ZONE 'kst'
            AND "video_videocontent"."content_id" = "base_table"."id" """

        number_of_view_per_week = """
            SELECT COUNT(*) as "look_count"
            FROM "content_content"
            INNER JOIN "video_videocontent" ON ("content_content"."id" = "video_videocontent"."content_id")
            INNER JOIN "video_videoviewcount" ON ("video_videocontent"."id" = "video_videoviewcount"."video_id")
            WHERE "video_videoviewcount"."created_at" >= %s AT TIME ZONE 'utc' AT TIME ZONE 'kst'
            AND "video_videocontent"."content_id" = "base_table"."id" """

        categories = """
            SELECT ARRAY_AGG("content_contentcategory"."name") AS "categories"
            FROM "content_content"
            INNER JOIN "content_content_categories" ON ("content_content"."id" = "content_content_categories"."content_id")
            INNER JOIN "content_contentcategory" ON ("content_content_categories"."contentcategory_id" = "content_contentcategory"."id")
            WHERE ("content_content_categories"."contentcategory_id" IN %s
            AND "content_content_categories"."content_id" = "base_table"."id") """

        teachers = """
            SELECT ARRAY_AGG("content_content_teachers"."teacher_id") AS "teacher"
            FROM "content_content"
            INNER JOIN "content_content_teachers" ON ("content_content"."id" = "content_content_teachers"."content_id")
            WHERE "content_content_teachers"."content_id" = "base_table"."id" """

        query += """
            ({review_count}), ({review_score}), ({lecture_count}),
            ({number_of_view}), ({number_of_view_per_week}), ({categories}), ({teachers})
            FROM "base_table"
            GROUP BY "base_table"."id", "base_table"."created_at",
            "base_table"."subject", "base_table"."content_type", "base_table"."tag"
            ORDER BY "base_table"."created_at" DESC; """\
            .format(
                review_count=review_count, review_score=review_score, lecture_count=lecture_count,
                number_of_view=number_of_view, number_of_view_per_week=number_of_view_per_week,
                categories=categories, teachers=teachers)

        view_count_date = (current_time - timedelta(days=180)).strftime('%Y-%m-%d 00:00:00')
        weekago = (current_time - timedelta(days=7)).strftime('%Y-%m-%d 00:00:00')
        total_video_categories = 12, 6, 11, 7, 32, 24, 31, 23, 30, 22, 29, 21, 28, 20, 19, 27, 209, 3
        params = [user_content_id, current_time, view_count_date, weekago, total_video_categories]

    else:
        if content_type == Content.TYPE_EXAM:
            categories = """
                SELECT ARRAY_AGG("content_company"."name") AS "categories"
                FROM "content_content"
                INNER JOIN "exam_examcontent" ON ("content_content"."id" = "exam_examcontent"."content_id")
                INNER JOIN "content_company" ON ("exam_examcontent"."company_id" = "content_company"."id")
                WHERE ("exam_examcontent"."content_id" = "base_table"."id") """

        elif content_type == Content.TYPE_DOCUMENT:
            categories = """
                SELECT ARRAY_AGG("content_contentcategory"."name") AS "categories"
                FROM "content_content"
                INNER JOIN "content_content_categories" ON ("content_content"."id" = "content_content_categories"."content_id")
                INNER JOIN "content_contentcategory" ON ("content_content_categories"."contentcategory_id" = "content_contentcategory"."id")
                WHERE ("content_contentcategory"."key" IN ('job', 'industry', 'introduce', 'specialism', 'company')
                AND "content_contentcategory"."service" = 1
                AND "content_contentcategory"."content_type" = 2
                AND "content_contentcategory"."parent_id" IS NOT NULL
                AND "content_content_categories"."content_id" = "base_table"."id") """

        params = [user_content_id, current_time]
        query += """
            ({categories}) FROM "base_table"
            GROUP BY "base_table"."id", "base_table"."created_at",
            "base_table"."subject", "base_table"."content_type", "base_table"."tag"
            ORDER BY "base_table"."created_at" DESC; """\
            .format(categories=categories)

    with connection.cursor() as cursor:
        cursor.execute(query, params)
        result = dictfetchall(cursor)

    return result

문제의 함수는 또 다른 ORM의 결과값인 QuerySet을 인자값으로 받고 있었다.

Django Subquery()를 썼다면 인자값으로 받은 QuerySetSubquery를 결합하여 더 짧고 가시적인 코드를 짤 수 있었을 것이다.

https://docs.djangoproject.com/en/2.1/ref/models/expressions/#subquery-expressions

허나, Subquery()는 Django 1.11 버젼부터 도입된 거라 사용할 수 없었다. 별 수 없이 sql문을 짜기로 방향을 틀었다.

base_query = contents.query 와 같이 인자값으로 받은 QuerySetquery를 호출하여 장고 쿼리셋을 sql문으로 바꾸기로 했다. QuerySet에 포함됐던 일부 로직을 제거하고, sql문에 작성했다.

문자열 포맷팅으로 query를 완성해갔다.

Postgresql ARRAY_AGG

이번 작업으로 postgresql의 ARRAY_AGG 기능을 알 수 있었다. 이 기능으로 sql의 결과를 array로 받을 수 있다.

이는 장고에도 구현되어 있다.

https://docs.djangoproject.com/en/3.0/ref/contrib/postgres/aggregates/#arrayagg

Combining multiple aggregations

count든 average든 하나의 쿼리에서 하지 않고, 여러 개의 서브 쿼리로 만든 이유는?

여러 개의 aggregation을 annotate()를 사용하여 하나의 쿼리에 담으면 결과에 문제가 생길 수 있다. Combining multiple aggregations with annotate() will yield the wrong results because joins are used instead of subqueries

https://docs.djangoproject.com/en/2.1/topics/db/aggregation/#combining-multiple-aggregations