Skip to content

Commit a584bc5

Browse files
committed
Fix O(N^2) behavior in pg_dump when many objects are in dependency loops.
Combining the loop workspace with the record of already-processed objects might have been a cute trick, but it behaves horridly if there are many dependency loops to repair: the time spent in the first step of findLoop() grows as O(N^2). Instead use a separate flag array indexed by dump ID, which we can check in constant time. The length of the workspace array is now never more than the actual length of a dependency chain, which should be reasonably short in all cases of practical interest. The code is noticeably easier to understand this way, too. Per gripe from Mike Roest. Since this is a longstanding performance bug, backpatch to all supported versions.
1 parent 80c40f0 commit a584bc5

File tree

1 file changed

+59
-60
lines changed

1 file changed

+59
-60
lines changed

src/bin/pg_dump/pg_dump_sort.c

Lines changed: 59 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -104,11 +104,11 @@ static bool TopoSort(DumpableObject **objs,
104104
static void addHeapElement(int val, int *heap, int heapLength);
105105
static int removeHeapElement(int *heap, int heapLength);
106106
static void findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs);
107-
static bool findLoop(DumpableObject *obj,
107+
static int findLoop(DumpableObject *obj,
108108
DumpId startPoint,
109+
bool *processed,
109110
DumpableObject **workspace,
110-
int depth,
111-
int *newDepth);
111+
int depth);
112112
static void repairDependencyLoop(DumpableObject **loop,
113113
int nLoop);
114114
static void describeDumpableObject(DumpableObject *obj,
@@ -478,58 +478,52 @@ static void
478478
findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
479479
{
480480
/*
481-
* We use a workspace array, the initial part of which stores objects
482-
* already processed, and the rest of which is used as temporary space to
483-
* try to build a loop in. This is convenient because we do not care
484-
* about loops involving already-processed objects (see notes above); we
485-
* can easily reject such loops in findLoop() because of this
486-
* representation. After we identify and process a loop, we can add it to
487-
* the initial part of the workspace just by moving the boundary pointer.
488-
*
489-
* When we determine that an object is not part of any interesting loop,
490-
* we also add it to the initial part of the workspace. This is not
491-
* necessary for correctness, but saves later invocations of findLoop()
492-
* from uselessly chasing references to such an object.
493-
*
494-
* We make the workspace large enough to hold all objects in the original
495-
* universe. This is probably overkill, but it's provably enough space...
481+
* We use two data structures here. One is a bool array processed[],
482+
* which is indexed by dump ID and marks the objects already processed
483+
* during this invocation of findDependencyLoops(). The other is a
484+
* workspace[] array of DumpableObject pointers, in which we try to build
485+
* lists of objects constituting loops. We make workspace[] large enough
486+
* to hold all the objects, which is huge overkill in most cases but could
487+
* theoretically be necessary if there is a single dependency chain
488+
* linking all the objects.
496489
*/
490+
bool *processed;
497491
DumpableObject **workspace;
498-
int initiallen;
499492
bool fixedloop;
500493
int i;
501494

495+
processed = (bool *) calloc(getMaxDumpId() + 1, sizeof(bool));
502496
workspace = (DumpableObject **) malloc(totObjs * sizeof(DumpableObject *));
503-
if (workspace == NULL)
497+
if (processed == NULL || workspace == NULL)
504498
exit_horribly(NULL, modulename, "out of memory\n");
505-
initiallen = 0;
506499
fixedloop = false;
507500

508501
for (i = 0; i < nObjs; i++)
509502
{
510503
DumpableObject *obj = objs[i];
511-
int newlen;
504+
int looplen;
505+
int j;
512506

513-
workspace[initiallen] = NULL; /* see test below */
507+
looplen = findLoop(obj, obj->dumpId, processed, workspace, 0);
514508

515-
if (findLoop(obj, obj->dumpId, workspace, initiallen, &newlen))
509+
if (looplen > 0)
516510
{
517-
/* Found a loop of length newlen - initiallen */
518-
repairDependencyLoop(&workspace[initiallen], newlen - initiallen);
519-
/* Add loop members to workspace */
520-
initiallen = newlen;
511+
/* Found a loop, repair it */
512+
repairDependencyLoop(workspace, looplen);
521513
fixedloop = true;
514+
/* Mark loop members as processed */
515+
for (j = 0; j < looplen; j++)
516+
processed[workspace[j]->dumpId] = true;
522517
}
523518
else
524519
{
525520
/*
526-
* Didn't find a loop, but add this object to workspace anyway,
527-
* unless it's already present. We piggyback on the test that
528-
* findLoop() already did: it won't have tentatively added obj to
529-
* workspace if it's already present.
521+
* There's no loop starting at this object, but mark it processed
522+
* anyway. This is not necessary for correctness, but saves later
523+
* invocations of findLoop() from uselessly chasing references to
524+
* such an object.
530525
*/
531-
if (workspace[initiallen] == obj)
532-
initiallen++;
526+
processed[obj->dumpId] = true;
533527
}
534528
}
535529

@@ -538,45 +532,51 @@ findDependencyLoops(DumpableObject **objs, int nObjs, int totObjs)
538532
exit_horribly(NULL, modulename, "could not identify dependency loop\n");
539533

540534
free(workspace);
535+
free(processed);
541536
}
542537

543538
/*
544539
* Recursively search for a circular dependency loop that doesn't include
545-
* any existing workspace members.
540+
* any already-processed objects.
546541
*
547542
* obj: object we are examining now
548543
* startPoint: dumpId of starting object for the hoped-for circular loop
549-
* workspace[]: work array for previously processed and current objects
544+
* processed[]: flag array marking already-processed objects
545+
* workspace[]: work array in which we are building list of loop members
550546
* depth: number of valid entries in workspace[] at call
551-
* newDepth: if successful, set to new number of workspace[] entries
552547
*
553-
* On success, *newDepth is set and workspace[] entries depth..*newDepth-1
554-
* are filled with pointers to the members of the loop.
548+
* On success, the length of the loop is returned, and workspace[] is filled
549+
* with pointers to the members of the loop. On failure, we return 0.
555550
*
556551
* Note: it is possible that the given starting object is a member of more
557552
* than one cycle; if so, we will find an arbitrary one of the cycles.
558553
*/
559-
static bool
554+
static int
560555
findLoop(DumpableObject *obj,
561556
DumpId startPoint,
557+
bool *processed,
562558
DumpableObject **workspace,
563-
int depth,
564-
int *newDepth)
559+
int depth)
565560
{
566561
int i;
567562

568563
/*
569-
* Reject if obj is already present in workspace. This test serves three
570-
* purposes: it prevents us from finding loops that overlap
571-
* previously-processed loops, it prevents us from going into infinite
572-
* recursion if we are given a startPoint object that links to a cycle
573-
* it's not a member of, and it guarantees that we can't overflow the
574-
* allocated size of workspace[].
564+
* Reject if obj is already processed. This test prevents us from finding
565+
* loops that overlap previously-processed loops.
566+
*/
567+
if (processed[obj->dumpId])
568+
return 0;
569+
570+
/*
571+
* Reject if obj is already present in workspace. This test prevents us
572+
* from going into infinite recursion if we are given a startPoint object
573+
* that links to a cycle it's not a member of, and it guarantees that we
574+
* can't overflow the allocated size of workspace[].
575575
*/
576576
for (i = 0; i < depth; i++)
577577
{
578578
if (workspace[i] == obj)
579-
return false;
579+
return 0;
580580
}
581581

582582
/*
@@ -590,10 +590,7 @@ findLoop(DumpableObject *obj,
590590
for (i = 0; i < obj->nDeps; i++)
591591
{
592592
if (obj->dependencies[i] == startPoint)
593-
{
594-
*newDepth = depth;
595-
return true;
596-
}
593+
return depth;
597594
}
598595

599596
/*
@@ -602,18 +599,20 @@ findLoop(DumpableObject *obj,
602599
for (i = 0; i < obj->nDeps; i++)
603600
{
604601
DumpableObject *nextobj = findObjectByDumpId(obj->dependencies[i]);
602+
int newDepth;
605603

606604
if (!nextobj)
607605
continue; /* ignore dependencies on undumped objects */
608-
if (findLoop(nextobj,
609-
startPoint,
610-
workspace,
611-
depth,
612-
newDepth))
613-
return true;
606+
newDepth = findLoop(nextobj,
607+
startPoint,
608+
processed,
609+
workspace,
610+
depth);
611+
if (newDepth > 0)
612+
return newDepth;
614613
}
615614

616-
return false;
615+
return 0;
617616
}
618617

619618
/*

0 commit comments

Comments
 (0)