#30158: Subquery Expressions Incorrectly Added to Group by
-------------------------------------+-------------------------------------
               Reporter:             |          Owner:  nobody
  JonnyWaffles                       |
                   Type:             |         Status:  new
  Uncategorized                      |
              Component:  Database   |        Version:  2.1
  layer (models, ORM)                |
               Severity:  Normal     |       Keywords:  subquery, group_by
           Triage Stage:             |      Has patch:  0
  Unreviewed                         |
    Needs documentation:  0          |    Needs tests:  0
Patch needs improvement:  0          |  Easy pickings:  0
                  UI/UX:  0          |
-------------------------------------+-------------------------------------
 Hi friends,

 My Django/SQL skills are not good enough to properly isolate the problem
 independently of my use case detailed below. I believe the problem is
 Subqueries being forced in to the group_by clause because they are select
 expressions. Per the below if I remove the subqueries manually from sql
 group_by, my query runs perfectly. I tried to manually edit the
 qs.query.group_by, but because it is created by compiler.get_group_by() I
 cannot fix the ORM query group by clause.

 Are Subquery expressions always supposed to be included in group_by? If
 this is desired behavior is it possible to toggle it off so the ORM can
 produce the accurate query?

 {{{
 """
 Problem Statement: The individual annotations work fine when run
 independently, but when chained the query takes 5 minutes. This is due to
 the final group by clause unexpectedly receiving the Subquery as extra
 fields.
 """
 ​
 # relevant models and querysets
 class ClaimQuerySet(models.QuerySet):
     def annotate_all(self):
         """Adds ``results``, ``latest_note_text``, and
 ``latest_assessment_text`` to the queryset."""
         return
 
self.annotate_latest_results().annotate_most_recent_note().annotate_most_recent_assessment()
 ​
     def prefetch_all(self, annotate_sum=True):
         return self.prefetch_notes().prefetch_latest_results(annotate_sum)
 ​
     def prefetch_latest_results(self, annotate_sum: bool=True):
         """Prefetches the most result :class:`RulesEngineResult` object
 and optionally
         preload its :attr:`RulesEngineResult.score`.
 ​
         Args:
             annotate_sum:
         """
         latest_runs = self.latest_runs
         if annotate_sum:
             latest_runs =
 latest_runs.annotate(_score=Sum('results__value'))
         return self.prefetch_related(Prefetch(
             'rules_engine_results', queryset=latest_runs,
 to_attr='_last_run')
         )
 ​
     def prefetch_notes(self):
         """Prefetches all related notes and assessments."""
         return self.prefetch_related('notes', 'assessments')
 ​
     @property
     def latest_runs(self):
         """Shortcut for
 :attr:`RulesEngineResultQuerySet.get_latest_runs`"""
         return RulesEngineResult.objects.get_latest_runs()
 ​
     def annotate_latest_results(self) -> 'ClaimQuerySet':
         """Annotates the queryset with a new field ``results`` whose value
 is the Sum
         of the last :attr:`RulesEngineResult.results` for the claim.
         """
         # Only Sum on runs in the above set.
         filter_q = Q(rules_engine_results__in=self.latest_runs)
         # noinspection PyTypeChecker
         return
 self.annotate(results=Sum('rules_engine_results__results__value',
 filter=filter_q))
 ​
     def annotate_most_recent_note(self) -> 'ClaimQuerySet':
         """Annotates the queryset with a field ``latest_note_text`` whose
 value is the last
         entered :attr:`Note.text` for the claim or ``None`` if there are
 no associated notes.
         """
         return self._annotate_most_recent_basenote(Note,
 'latest_note_text')
 ​
     def annotate_most_recent_assessment(self) -> 'ClaimQuerySet':
         """Annotates the queryset with a field ``latest_assessment_text``
 whose value is the last
         entered :attr:`Assessment.text` for the claim or ``None`` if there
 are no associated assessments.
         """
         return self._annotate_most_recent_basenote(Assessment,
 'latest_assessment_text')
 ​
     def _annotate_most_recent_basenote(self, model: Type['BaseNote'],
 field_name: str) -> 'ClaimQuerySet':
         newest =
 model.objects.filter(claim=OuterRef('id')).order_by('-created')
         annotate_kwargs = {
             field_name: Subquery(newest.values('text')[:1])
         }
         # noinspection PyTypeChecker
         return self.annotate(**annotate_kwargs)
 ​
 ​
 class Claim(BaseClaim):
     """Concrete :class:`~mkl.fraud_django.models.BaseClaim` for
     :mod:`~mkl.fraud_django.workers_comp` claims.
     """
     objects = ClaimQuerySet.as_manager()
     first_rules_engine_run = models.DateField()
 ​
     @property
     def latest_note(self) -> 'Note':
         """Returns the latest :class:`Note`."""
         return self.notes.latest()
 ​
     @property
     def latest_assessment(self) -> 'Assessment':
         """Retrieves the latest :class:`Assessment`."""
         return self.assessments.latest()
 ​
     @property
     def latest_rulesengine_run(self) -> 'RulesEngineResult':
         """Returns the most most recent run.
 ​
         .. note::
 ​
             Use :meth:`ClaimQuerySet.prefetch_latest_results` to
             prefetch the last_run, falls back on querying for the latest
 value.
 ​
             Note, if used in a prefetched queryset the value could be
 stale.
         """
         return self._get_latest(RulesEngineResult, '_last_run')
 ​
     def _get_latest(self, model: Type[models.Model], cache_attr: str):
         """Handler to return None if a latest related object does not
 exist,
         checks the cache first."""
         if hasattr(self, cache_attr):
             try:
                 return getattr(self, cache_attr)[0]
             except IndexError:
                 return None
         try:
             return model.objects.filter(claim=self).latest()
         except model.DoesNotExist:
             return None
 ​
     def __unicode__(self):
         return self.claim_number
 ​
 ​
 class BaseNote(models.Model):
     """Abstract Base Model for both Notes and Assessments.
 ​
     Use this base for any claim related editable field whose
     historical data is important.
 ​
     On the claim we can write functions to retrieve the latest.
 ​
     .. note:: The related name will be the class name lower case with an
 's'.
 ​
     Attributes:
         text (str): The user provided content
         created (datetime.datetime): Created time stamp
         claim (:class:`Claim`): The claim related to the note.
     """
     id = models.AutoField(primary_key=True)
     text = models.TextField(max_length=1000)
     created = models.DateTimeField(auto_now_add=True)
     claim = models.ForeignKey('Claim', on_delete=models.PROTECT,
 related_name='%(class)ss')
 ​
     class Meta:
         abstract = True
         get_latest_by = 'created'
         ordering = ('-created',)
 ​
 ​
 class Note(BaseNote):
     """Concrete class for Notes, related_name will become ``notes``."""
 ​
 ​
 class Assessment(BaseNote):
     """Concrete class for Assessment, related_name will become
 ``assessments``."""
     CHOICES = (
         ('01', 'Will open a case'),
         ('02', 'Will not open a case'),
         ('03', 'Previously opened'),
         ('04', "Appears suspicious but won't open"),
         ('05', 'Not enough info to determine'),
         ('06', 'Existing vendor request'),
     )
     text = models.CharField(max_length=1000, choices=CHOICES)
 ​
     def get_choice_value(self) -> str:
         """Returns the value as the choice human readable text."""
         db_text = self.text
         return dict(self.CHOICES)[db_text]
 ​
 ​
 class RuleResult(models.Model):
     """The result of running the engine for a particular claim against a
 :class:`Rule`.
 ​
     Attributes:
         rule: The rule to be checked
         value: The numeric weight of the result
         result: The rules engine result of all rules run against the claim
     """
     id = models.AutoField(primary_key=True)
     rule = models.ForeignKey('Rule', on_delete=models.PROTECT)
     value = models.IntegerField()
     result = models.ForeignKey('RulesEngineResult',
 on_delete=models.PROTECT, related_name='results')
 ​
 ​
 class RulesEngineResultQuerySet(models.QuerySet):
     def get_latest_runs(self):
         """Filters to only the most recent
 :class:`RulesEngineResult`\s."""
         annotated = self.annotate(
             latest=Max('claim__rules_engine_results__created')
         )
         return annotated.filter(created=F('latest'))
 ​
 ​
 class RulesEngineResult(models.Model):
     """
     RulesEngine run result.
 ​
     Attributes:
         claim (:class:`Claim`): The claim run through the RulesEngine.
         results (List[:class:`RuleResult`]): Collection of results for
 each rule.
     """
     id = models.AutoField(primary_key=True)
     created = models.DateTimeField(auto_now_add=True)
     claim = models.ForeignKey('Claim', on_delete=models.PROTECT,
 related_name='rules_engine_results')
     objects = RulesEngineResultQuerySet.as_manager()
 ​
     class Meta:
         get_latest_by = 'created'
 ​
     @property
     def score(self) -> int:
         """Returns the aggregate score of all related results. Checks
 prefetched cache first."""
         if hasattr(self, '_score'):
             return self._score
         d = self.results.aggregate(score=models.Sum('value'))
         return d['score']
 ​
 ​
 """
 Individual Query rendering
 """
 # Recent Note
 qs = models.Claim.objects.annotate_most_recent_note()
 ​
 SELECT "workers_comp_claim"."id",
        "workers_comp_claim"."claim_number",
        "workers_comp_claim"."first_rules_engine_run",
        (SELECT U0."text"
         FROM "workers_comp_note" U0
         WHERE U0."claim_id" = ("workers_comp_claim"."id")
         ORDER BY U0."created" DESC
         LIMIT 1) AS "latest_note_text"
 FROM "workers_comp_claim"
 ​
 # Recent Assessment
 qs = models.Claim.objects.annotate_most_recent_assessment()
 ​
 SELECT "workers_comp_claim"."id",
        "workers_comp_claim"."claim_number",
        "workers_comp_claim"."first_rules_engine_run",
        (SELECT U0."text"
         FROM "workers_comp_assessment" U0
         WHERE U0."claim_id" = ("workers_comp_claim"."id")
         ORDER BY U0."created" DESC
         LIMIT 1) AS "latest_assessment_text"
 FROM "workers_comp_claim"
 ​
 # Latest Results (Run)
 qs = models.Claim.objects.annotate_latest_results()
 ​
 SELECT "workers_comp_claim"."id",
        "workers_comp_claim"."claim_number",
        "workers_comp_claim"."first_rules_engine_run",
        SUM("workers_comp_ruleresult"."value") FILTER (WHERE
 "workers_comp_rulesengineresult"."id" IN (SELECT U0."id"
 FROM "workers_comp_rulesengineresult" U0
 INNER JOIN "workers_comp_claim" U1 ON (U0."claim_id" = U1."id")
 LEFT OUTER JOIN "workers_comp_rulesengineresult" U2 ON (U1."id" =
 U2."claim_id")
 GROUP BY U0."id"
 HAVING U0."created" = (MAX(U2."created")))) AS "results"
 FROM "workers_comp_claim"
        LEFT OUTER JOIN "workers_comp_rulesengineresult"
                        ON ("workers_comp_claim"."id" =
 "workers_comp_rulesengineresult"."claim_id")
        LEFT OUTER JOIN "workers_comp_ruleresult"
                        ON ("workers_comp_rulesengineresult"."id" =
 "workers_comp_ruleresult"."result_id")
 GROUP BY "workers_comp_claim"."id"
 ​
 ​
 """
 When chained the query renders incorrectly like this
 """
 qs =
 
models.Claim.objects.annotate_latest_results().annotate_most_recent_note().annotate_most_recent_assessment()
 ​
 SELECT "workers_comp_claim"."id",
        "workers_comp_claim"."claim_number",
        "workers_comp_claim"."first_rules_engine_run",
        SUM("workers_comp_ruleresult"."value") FILTER (WHERE
 "workers_comp_rulesengineresult"."id" IN (SELECT U0."id"
 FROM "workers_comp_rulesengineresult" U0
 INNER JOIN "workers_comp_claim" U1 ON (U0."claim_id" = U1."id")
 LEFT OUTER JOIN "workers_comp_rulesengineresult" U2 ON (U1."id" =
 U2."claim_id")
 GROUP BY U0."id"
 HAVING U0."created" = (MAX(U2."created")))) AS "results",
        (SELECT U0."text"
         FROM "workers_comp_note" U0
         WHERE U0."claim_id" = ("workers_comp_claim"."id")
         ORDER BY U0."created" DESC
         LIMIT 1)
 AS "latest_note_text",
        (SELECT U0."text"
         FROM "workers_comp_assessment" U0
         WHERE U0."claim_id" = ("workers_comp_claim"."id")
         ORDER BY U0."created" DESC
         LIMIT 1)
 AS "latest_assessment_text"
 FROM "workers_comp_claim"
        LEFT OUTER JOIN "workers_comp_rulesengineresult"
                        ON ("workers_comp_claim"."id" =
 "workers_comp_rulesengineresult"."claim_id")
        LEFT OUTER JOIN "workers_comp_ruleresult"
                        ON ("workers_comp_rulesengineresult"."id" =
 "workers_comp_ruleresult"."result_id")
 GROUP BY "workers_comp_claim"."id", (SELECT U0."text"
                                      FROM "workers_comp_note" U0
                                      WHERE U0."claim_id" =
 ("workers_comp_claim"."id")
                                      ORDER BY U0."created" DESC
                                      LIMIT 1), (SELECT U0."text"
                                                 FROM
 "workers_comp_assessment" U0
                                                 WHERE U0."claim_id" =
 ("workers_comp_claim"."id")
                                                 ORDER BY U0."created" DESC
                                                 LIMIT 1)
 ​
 ​
 """
 Why is Django performing the group by with the Subqueries? How do I make
 it render correctly like this:
 """
 SELECT "workers_comp_claim"."id",
        "workers_comp_claim"."claim_number",
        "workers_comp_claim"."first_rules_engine_run",
        SUM("workers_comp_ruleresult"."value") FILTER (WHERE
 "workers_comp_rulesengineresult"."id" IN (SELECT U0."id"
 FROM "workers_comp_rulesengineresult" U0
 INNER JOIN "workers_comp_claim" U1 ON (U0."claim_id" = U1."id")
 LEFT OUTER JOIN "workers_comp_rulesengineresult" U2 ON (U1."id" =
 U2."claim_id")
 GROUP BY U0."id"
 HAVING U0."created" = (MAX(U2."created")))) AS "results",
        (SELECT U0."text"
         FROM "workers_comp_note" U0
         WHERE U0."claim_id" = ("workers_comp_claim"."id")
         ORDER BY U0."created" DESC
         LIMIT 1)
 AS "latest_note_text",
        (SELECT U0."text"
         FROM "workers_comp_assessment" U0
         WHERE U0."claim_id" = ("workers_comp_claim"."id")
         ORDER BY U0."created" DESC
         LIMIT 1)
 AS "latest_assessment_text"
 FROM "workers_comp_claim"
        LEFT OUTER JOIN "workers_comp_rulesengineresult"
                        ON ("workers_comp_claim"."id" =
 "workers_comp_rulesengineresult"."claim_id")
        LEFT OUTER JOIN "workers_comp_ruleresult"
                        ON ("workers_comp_rulesengineresult"."id" =
 "workers_comp_ruleresult"."result_id")
 GROUP BY "workers_comp_claim"."id";
 }}}

-- 
Ticket URL: <https://code.djangoproject.com/ticket/30158>
Django <https://code.djangoproject.com/>
The Web framework for perfectionists with deadlines.

-- 
You received this message because you are subscribed to the Google Groups 
"Django updates" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to django-updates+unsubscr...@googlegroups.com.
To post to this group, send email to django-updates@googlegroups.com.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/django-updates/055.66744c40ff4a14b234f369a840a3c5e4%40djangoproject.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to