From a547e6867527ca16628a3fb1cf3ef6f785210a31 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jdavis@postgresql.org>
Date: Mon, 7 Sep 2020 13:31:59 -0700
Subject: [PATCH] Adjust cost model for HashAgg that spills to disk.

Tomas Vondra observed that the IO behavior for HashAgg tends to be
worse than for Sort. Penalize HashAgg IO costs accordingly.

Also, account for the CPU effort of spilling the tuples and reading
them back.

Discussion: https://postgr.es/m/20200906212112.nzoy5ytrzjjodpfh@development
Reviewed-by: Tomas Vondra
Backpatch-through: 13
---
 src/backend/optimizer/path/costsize.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index fda4b2c6e8..cd3716d494 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -2416,6 +2416,7 @@ cost_agg(Path *path, PlannerInfo *root,
 		double		pages;
 		double		pages_written = 0.0;
 		double		pages_read = 0.0;
+		double		spill_cost;
 		double		hashentrysize;
 		double		nbatches;
 		Size		mem_limit;
@@ -2453,9 +2454,21 @@ cost_agg(Path *path, PlannerInfo *root,
 		pages = relation_byte_size(input_tuples, input_width) / BLCKSZ;
 		pages_written = pages_read = pages * depth;
 
+		/*
+		 * HashAgg has somewhat worse IO behavior than Sort on typical
+		 * hardware/OS combinations. Account for this with a generic penalty.
+		 */
+		pages_read *= 2.0;
+		pages_written *= 2.0;
+
 		startup_cost += pages_written * random_page_cost;
 		total_cost += pages_written * random_page_cost;
 		total_cost += pages_read * seq_page_cost;
+
+		/* account for CPU cost of spilling a tuple and reading it back */
+		spill_cost = depth * input_tuples * 2.0 * cpu_tuple_cost;
+		startup_cost += spill_cost;
+		total_cost += spill_cost;
 	}
 
 	/*