Skip to content

Commit e30d0d8

Browse files
committed
WAL-log inplace update before revealing it to other sessions.
A buffer lock won't stop a reader having already checked tuple visibility. If a vac_update_datfrozenid() and then a crash happened during inplace update of a relfrozenxid value, datfrozenxid could overtake relfrozenxid. That could lead to "could not access status of transaction" errors. Back-patch to v12 (all supported versions). In v14 and earlier, this also back-patches the assertion removal from commit 7fcf2fa. Discussion: https://postgr.es/m/20240620012908.92.nmisch@google.com
1 parent 4cf948c commit e30d0d8

File tree

3 files changed

+46
-18
lines changed

3 files changed

+46
-18
lines changed

src/backend/access/heap/README.tuplock

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,4 @@ Inplace updates create an exception to the rule that tuple data won't change
203203
under a reader holding a pin. A reader of a heap_fetch() result tuple may
204204
witness a torn read. Current inplace-updated fields are aligned and are no
205205
wider than four bytes, and current readers don't need consistency across
206-
fields. Hence, they get by with just fetching each field once. XXX such a
207-
caller may also read a value that has not reached WAL; see
208-
systable_inplace_update_finish().
206+
fields. Hence, they get by with just fetching each field once.

src/backend/access/heap/heapam.c

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6135,13 +6135,18 @@ heap_inplace_update_and_unlock(Relation relation,
61356135
HeapTupleHeader htup = oldtup->t_data;
61366136
uint32 oldlen;
61376137
uint32 newlen;
6138+
char *dst;
6139+
char *src;
61386140

61396141
Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
61406142
oldlen = oldtup->t_len - htup->t_hoff;
61416143
newlen = tuple->t_len - tuple->t_data->t_hoff;
61426144
if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
61436145
elog(ERROR, "wrong tuple length");
61446146

6147+
dst = (char *) htup + htup->t_hoff;
6148+
src = (char *) tuple->t_data + tuple->t_data->t_hoff;
6149+
61456150
/*
61466151
* Construct shared cache inval if necessary. Note that because we only
61476152
* pass the new version of the tuple, this mustn't be used for any
@@ -6160,15 +6165,15 @@ heap_inplace_update_and_unlock(Relation relation,
61606165
*/
61616166
PreInplace_Inval();
61626167

6163-
/* NO EREPORT(ERROR) from here till changes are logged */
6164-
START_CRIT_SECTION();
6165-
6166-
memcpy((char *) htup + htup->t_hoff,
6167-
(char *) tuple->t_data + tuple->t_data->t_hoff,
6168-
newlen);
6169-
61706168
/*----------
6171-
* XXX A crash here can allow datfrozenxid() to get ahead of relfrozenxid:
6169+
* NO EREPORT(ERROR) from here till changes are complete
6170+
*
6171+
* Our buffer lock won't stop a reader having already pinned and checked
6172+
* visibility for this tuple. Hence, we write WAL first, then mutate the
6173+
* buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
6174+
* checkpoint delay makes that acceptable. With the usual order of
6175+
* changes, a crash after memcpy() and before XLogInsert() could allow
6176+
* datfrozenxid to overtake relfrozenxid:
61726177
*
61736178
* ["D" is a VACUUM (ONLY_DATABASE_STATS)]
61746179
* ["R" is a VACUUM tbl]
@@ -6178,31 +6183,57 @@ heap_inplace_update_and_unlock(Relation relation,
61786183
* D: raise pg_database.datfrozenxid, XLogInsert(), finish
61796184
* [crash]
61806185
* [recovery restores datfrozenxid w/o relfrozenxid]
6186+
*
6187+
* Like in MarkBufferDirtyHint() subroutine XLogSaveBufferForHint(), copy
6188+
* the buffer to the stack before logging. Here, that facilitates a FPI
6189+
* of the post-mutation block before we accept other sessions seeing it.
61816190
*/
6182-
6183-
MarkBufferDirty(buffer);
6191+
Assert(!MyPgXact->delayChkpt);
6192+
START_CRIT_SECTION();
6193+
MyPgXact->delayChkpt = true;
61846194

61856195
/* XLOG stuff */
61866196
if (RelationNeedsWAL(relation))
61876197
{
61886198
xl_heap_inplace xlrec;
6199+
PGAlignedBlock copied_buffer;
6200+
char *origdata = (char *) BufferGetBlock(buffer);
6201+
Page page = BufferGetPage(buffer);
6202+
uint16 lower = ((PageHeader) page)->pd_lower;
6203+
uint16 upper = ((PageHeader) page)->pd_upper;
6204+
uintptr_t dst_offset_in_block;
6205+
RelFileNode rnode;
6206+
ForkNumber forkno;
6207+
BlockNumber blkno;
61896208
XLogRecPtr recptr;
61906209

61916210
xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
61926211

61936212
XLogBeginInsert();
61946213
XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
61956214

6196-
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6197-
XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
6215+
/* register block matching what buffer will look like after changes */
6216+
memcpy(copied_buffer.data, origdata, lower);
6217+
memcpy(copied_buffer.data + upper, origdata + upper, BLCKSZ - upper);
6218+
dst_offset_in_block = dst - origdata;
6219+
memcpy(copied_buffer.data + dst_offset_in_block, src, newlen);
6220+
BufferGetTag(buffer, &rnode, &forkno, &blkno);
6221+
Assert(forkno == MAIN_FORKNUM);
6222+
XLogRegisterBlock(0, &rnode, forkno, blkno, copied_buffer.data,
6223+
REGBUF_STANDARD);
6224+
XLogRegisterBufData(0, src, newlen);
61986225

61996226
/* inplace updates aren't decoded atm, don't log the origin */
62006227

62016228
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
62026229

6203-
PageSetLSN(BufferGetPage(buffer), recptr);
6230+
PageSetLSN(page, recptr);
62046231
}
62056232

6233+
memcpy(dst, src, newlen);
6234+
6235+
MarkBufferDirty(buffer);
6236+
62066237
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
62076238

62086239
/*
@@ -6215,6 +6246,7 @@ heap_inplace_update_and_unlock(Relation relation,
62156246
*/
62166247
AtInplace_Inval();
62176248

6249+
MyPgXact->delayChkpt = false;
62186250
END_CRIT_SECTION();
62196251
UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
62206252

src/backend/access/transam/xloginsert.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -268,8 +268,6 @@ XLogRegisterBlock(uint8 block_id, RelFileNode *rnode, ForkNumber forknum,
268268
{
269269
registered_buffer *regbuf;
270270

271-
/* This is currently only used to WAL-log a full-page image of a page */
272-
Assert(flags & REGBUF_FORCE_IMAGE);
273271
Assert(begininsert_called);
274272

275273
if (block_id >= max_registered_block_id)

0 commit comments

Comments
 (0)