29 patches for repository /home/simonmar/ghc-HEAD: Fri Sep 3 14:49:16 BST 2010 Simon Marlow * add comment about locking rules for storage data structures Tue Sep 7 10:11:09 BST 2010 Simon Marlow * comment fix Tue Oct 12 16:50:42 BST 2010 Simon Marlow * sanity checking fix for PAPs Tue Oct 12 16:51:01 BST 2010 Simon Marlow * add a missing forwarding-ptr check to the generic apply code Wed Oct 13 16:48:24 BST 2010 Simon Marlow * bugfix (debug output) Fri Oct 15 11:00:54 BST 2010 Simon Marlow * add assertion Tue Oct 19 11:59:53 BST 2010 Simon Marlow * fix for +RTS -qg Tue Oct 19 14:07:52 BST 2010 Simon Marlow * comment Thu Oct 28 15:01:43 BST 2010 Simon Marlow * merge Thu Oct 28 15:02:40 BST 2010 Simon Marlow * small sanity fix Thu Oct 28 15:03:41 BST 2010 Simon Marlow * merge Mon Nov 1 13:51:23 GMT 2010 Simon Marlow * relax ASSERT_PARTIAL_CAPABILITY_INVARIANTS Fri Nov 5 14:03:13 GMT 2010 Simon Marlow * Fix parallel GC (duh) Wed Jan 5 15:29:16 GMT 2011 Simon Marlow * commented-out code to implement multiple allocations in todo_block_full Mon Jan 17 11:28:02 GMT 2011 Simon Marlow * Local GC Tue Jan 18 12:33:26 GMT 2011 Simon Marlow * fix an assertion Tue Jan 18 12:33:54 GMT 2011 Simon Marlow * allocate 4 blocks at a time to reduce contention and overhead Tue Jan 18 12:34:03 GMT 2011 Simon Marlow * sanity check fix Tue Jan 18 12:34:19 GMT 2011 Simon Marlow * sweep fix Thu Jan 20 12:39:23 GMT 2011 Simon Marlow * more comments on the bdescr type Thu Jan 20 12:39:48 GMT 2011 Simon Marlow * don't use the global g0, use cap->r.rG0 instead Thu Jan 20 12:42:35 GMT 2011 Simon Marlow * fix slop calculation Thu Jan 20 12:42:47 GMT 2011 Simon Marlow * 80 columnize Thu Jan 20 15:05:18 GMT 2011 Simon Marlow * use allocatePrim for MUT_ARR_PTRS_FROZEN Thu Jan 20 16:09:28 GMT 2011 Simon Marlow * PLAN updates Thu Jan 20 16:09:48 GMT 2011 Simon Marlow * fix for globalising BLOCKING_QUEUEs Thu Jan 20 16:10:18 GMT 2011 Simon Marlow * allocate prim blocks from the block allocator, not the nursery Fri Jan 21 12:49:16 GMT 2011 Simon Marlow * add -HA, fixedAllHeapSizeSuggestion for measurements Tue Jan 25 12:53:03 GMT 2011 Simon Marlow * fix live/slop stats New patches: [add comment about locking rules for storage data structures Simon Marlow **20100903134916 Ignore-this: 4adbcb37aeadff1975bb232c823b4636 ] hunk ./rts/sm/Storage.c 36 #include "ffi.h" -/* - * All these globals require sm_mutex to access in THREADED_RTS mode. +/* ----------------------------------------------------------------------------- + Storage manager state + ------------------------------------------------------------------------- */ + +/* LOCKING RULES + * + * protected by sm_mutex: + * - the block allocator + * - caf_list + * - revertible_caf_list + * - keepCAFs + * - exec_block + * + * protected by gen->sync: + * - all generation structs except when gen->is_local + * + * during parallel GC, the rules are modified: + * - the block allocator requires gc_alloc_block_sync, not sm_mutex + * - even local generations require gen->sync + * + * owned by a Capability: + * - local generations + * - nurseries */ StgClosure *caf_list = NULL; StgClosure *revertible_caf_list = NULL; [comment fix Simon Marlow **20100907091109 Ignore-this: 43c9d5c5503c4e97b3558290c9d5eda8 ] { hunk ./rts/Task.c 321 taskTimeStamp (Task *task USED_IF_THREADS) { #if defined(THREADED_RTS) - Ticks currentElapsedTime, currentUserTime, elapsedGCTime; + Ticks currentElapsedTime, currentUserTime; currentUserTime = getThreadCPUTime(); currentElapsedTime = getProcessElapsedTime(); hunk ./rts/Task.c 326 - // XXX this is wrong; we want elapsed GC time since the - // Task started. - elapsedGCTime = stat_getElapsedGCTime(); - task->mut_time = currentUserTime - task->muttimestart - task->gc_time; task->mut_etime = hunk ./rts/Task.c 329 - currentElapsedTime - task->elapsedtimestart - elapsedGCTime; + currentElapsedTime - task->elapsedtimestart - task->gc_etime; hunk ./rts/Task.c 331 + if (task->gc_time < 0) { task->gc_time = 0; } + if (task->gc_etime < 0) { task->gc_etime = 0; } if (task->mut_time < 0) { task->mut_time = 0; } if (task->mut_etime < 0) { task->mut_etime = 0; } #endif hunk ./rts/Task.c 338 } +void +taskDoneGC (Task *task, Ticks cpu_time, Ticks elapsed_time) +{ + task->gc_time += cpu_time; + task->gc_etime += elapsed_time; +} + #if defined(THREADED_RTS) void hunk ./rts/Task.c 409 // A worker always gets a fresh Task structure. task = newTask(rtsTrue); - // The lock here is to synchronise with taskStart(), to make sure + // The lock here is to synchronise with workerStart(), to make sure // that we have finished setting up the Task structure before the // worker thread reads it. ACQUIRE_LOCK(&task->lock); } [sanity checking fix for PAPs Simon Marlow **20101012155042 Ignore-this: d04ece2a88190cdbe0293db9aa79e97d ] { hunk ./rts/sm/Sanity.c 215 StgClosure *fun; StgClosure *p; StgFunInfoTable *fun_info; + const StgInfoTable *info; fun = UNTAG_CLOSURE(tagged_fun); ASSERT(LOOKS_LIKE_CLOSURE_PTR(fun)); hunk ./rts/sm/Sanity.c 219 - fun_info = get_fun_itbl(fun); + + info = fun->header.info; + if (IS_FORWARDING_PTR(info)) { + fun = (StgClosure*)UN_FORWARDING_PTR(info); + info = fun->header.info; + } + fun_info = FUN_INFO_PTR_TO_STRUCT(info); p = (StgClosure *)payload; switch (fun_info->f.fun_type) { } [add a missing forwarding-ptr check to the generic apply code Simon Marlow **20101012155101 Ignore-this: 277bc429ed31d1cba0d0be2a14099194 ] hunk ./utils/genapply/GenApply.hs 505 -- Functions can be tagged, so we untag them! text "R1 = UNTAG(R1);", text "info = %INFO_PTR(R1);", + -- check for forwarding pointers + text "if (IS_FORWARDING_PTR(info)) { R1 = UN_FORWARDING_PTR(info); info = %INFO_PTR(R1); }", -- if fast == 1: -- print " goto *lbls[info->type];"; [bugfix (debug output) Simon Marlow **20101013154824 Ignore-this: f163a7c3f90ed1ab5e1c49ae195f8d59 ] hunk ./rts/Trace.c 140 [ThreadYielding] = "yielding", [ThreadBlocked] = "blocked", [ThreadFinished] = "finished", - [THREAD_SUSPENDED_FOREIGN_CALL] = "suspended while making a foreign call" + [THREAD_SUSPENDED_FOREIGN_CALL] = "suspended while making a foreign call", + [6 + BlockedOnMVar] = "blocked on an MVar", + [6 + BlockedOnBlackHole] = "blocked on a black hole", + [6 + BlockedOnRead] = "blocked on a read operation", + [6 + BlockedOnWrite] = "blocked on a write operation", + [6 + BlockedOnDelay] = "blocked on a delay operation", + [6 + BlockedOnSTM] = "blocked on STM", + [6 + BlockedOnDoProc] = "blocked on asyncDoProc", + [6 + BlockedOnCCall] = "blocked on a foreign call", + [6 + BlockedOnCCall_NoUnblockExc] = "blocked on a foreign call", + [6 + BlockedOnMsgThrowTo] = "blocked on throwTo", + [6 + ThreadMigrating] = "migrating", + [6 + BlockedOnMsgGlobalise] = "blocked waiting for data to be globalised" }; #endif [add assertion Simon Marlow **20101015100054 Ignore-this: ef7bcd17f97c6fbc08d690915e408915 ] hunk ./rts/Schedule.c 413 ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task); ASSERT(t->cap == cap); ASSERT(t->bound ? t->bound->task->cap == cap : 1); + ASSERT(isGlobal((StgClosure*)t) || Bdescr((P_)t)->gen_ix == cap->no); prev_what_next = t->what_next; [fix for +RTS -qg Simon Marlow **20101019105953 Ignore-this: e1ef4bf4a6ee2a8c53687c6dc673b4f5 ] hunk ./rts/sm/GC.c 723 // zero the scavenged static object list if (major_gc) { nat i; - for (i = 0; i < n_gc_threads; i++) { - zero_static_object_list(gc_threads[i]->scavenged_static_objects); + if (gc_type == GC_SEQ) { + zero_static_object_list(gct->scavenged_static_objects); + } else { + for (i = 0; i < n_gc_threads; i++) { + zero_static_object_list(gc_threads[i]->scavenged_static_objects); + } } } [comment Simon Marlow **20101019130752 Ignore-this: a8ff155f3175ef2590446fede80a5838 ] hunk ./rts/sm/GC.c 1349 /* ---------------------------------------------------------------------------- - Initialise a generation that is *not* to be collected + Save the mutable lists in saved_mut_lists ------------------------------------------------------------------------- */ static void [merge Simon Marlow **20101028140143 Ignore-this: bdb191cfb48a6a67468eb7b2ec83398c ] hunk ./rts/Trace.c 149 [6 + BlockedOnSTM] = "blocked on STM", [6 + BlockedOnDoProc] = "blocked on asyncDoProc", [6 + BlockedOnCCall] = "blocked on a foreign call", - [6 + BlockedOnCCall_NoUnblockExc] = "blocked on a foreign call", + [6 + BlockedOnCCall_Interruptible] = "blocked on a foreign call (interruptible)", [6 + BlockedOnMsgThrowTo] = "blocked on throwTo", [6 + ThreadMigrating] = "migrating", [6 + BlockedOnMsgGlobalise] = "blocked waiting for data to be globalised" [small sanity fix Simon Marlow **20101028140240 Ignore-this: 3b4c821ca6ffa3b89b646e9d599b06b9 ] { hunk ./rts/sm/Sanity.c 595 // be on the mutable list. if (tso->dirty) { ASSERT(Bdescr((P_)tso)->gen_no == 0 || (tso->flags & TSO_MARKED)); - tso->flags &= ~TSO_MARKED; } hunk ./rts/sm/Sanity.c 596 + tso->flags &= ~TSO_MARKED; } } } } [merge Simon Marlow **20101028140341 Ignore-this: 4d5424899a1ccb50a653bce0310665 ] hunk ./compiler/ghci/RtClosureInspect.hs 205 | i' == aP_CODE = AP | i == AP_STACK = AP | i' == pAP_CODE = PAP - | i == MUT_VAR_CLEAN || i == MUT_VAR_DIRTY= MutVar i' + | i == MUT_VAR_LOCAL || i == MUT_VAR_GLOBAL= MutVar i' | i == MVAR_CLEAN || i == MVAR_DIRTY = MVar i' | otherwise = Other i' where i' = fromIntegral i [relax ASSERT_PARTIAL_CAPABILITY_INVARIANTS Simon Marlow **20101101135123 Ignore-this: 3b8a3bc87500044bb75c0b07e760725c ] hunk ./rts/Capability.h 143 #define ASSERT_PARTIAL_CAPABILITY_INVARIANTS(cap,task) \ ASSERT(cap->run_queue_hd == END_TSO_QUEUE ? \ cap->run_queue_tl == END_TSO_QUEUE : 1); \ - ASSERT(myTask() == task); \ - ASSERT_TASK_ID(task); + if (task != NULL) { \ + ASSERT(myTask() == task); \ + ASSERT_TASK_ID(task); \ + } // Converts a *StgRegTable into a *Capability. // [Fix parallel GC (duh) Simon Marlow **20101105140313 Ignore-this: c5d4d9599a818544496d349f12116791 ] { hunk ./rts/sm/Scav.c 1841 #if defined(THREADED_RTS) if (work_stealing) { + int i; // look for work to steal hunk ./rts/sm/Scav.c 1843 - for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) { - if ((bd = steal_todo_block(g)) != NULL) { + for (i = total_generations-1; i >= 0; i--) { + if (all_generations[i].is_local) continue; + if ((bd = steal_todo_block(i)) != NULL) { scavenge_block(bd); did_something = rtsTrue; break; } [commented-out code to implement multiple allocations in todo_block_full Simon Marlow **20110105152916 Ignore-this: cd930a2dfef31e33eecb46e410762f6c ] { hunk ./rts/sm/GCUtils.c 53 } -#if 0 static void allocBlocks_sync(nat n, bdescr **hd, bdescr **tl, hunk ./rts/sm/GCUtils.c 55 - nat gen_no, step *stp, - StgWord32 flags) + generation *gen, StgWord32 flags) { bdescr *bd; nat i; hunk ./rts/sm/GCUtils.c 59 - ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bd = allocGroup(n); + + if (gct->gc_type == GC_LOCAL) { + bd = allocGroup_lock(n); + } else { + ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); + bd = allocGroup(n); + RELEASE_SPIN_LOCK(&gc_alloc_block_sync); + } + for (i = 0; i < n; i++) { bd[i].blocks = 1; hunk ./rts/sm/GCUtils.c 70 - bd[i].gen_no = gen_no; - bd[i].step = stp; + initBdescr(&bd[i], gen, gen->to); bd[i].flags = flags; bd[i].link = &bd[i+1]; bd[i].u.scan = bd[i].free = bd[i].start; hunk ./rts/sm/GCUtils.c 77 } *hd = bd; *tl = &bd[n-1]; - RELEASE_SPIN_LOCK(&gc_alloc_block_sync); } hunk ./rts/sm/GCUtils.c 78 -#endif void freeChain_sync(bdescr *bd) hunk ./rts/sm/GCUtils.c 246 StgPtr alloc_todo_block (gen_workspace *ws, nat size) { - bdescr *bd/*, *hd, *tl */; + bdescr *bd; // Grab a part block if we have one, and it has enough room bd = ws->part_list; hunk ./rts/sm/GCUtils.c 260 { // blocks in to-space get the BF_EVACUATED flag. -// allocBlocks_sync(16, &hd, &tl, -// ws->step->gen_no, ws->step, BF_EVACUATED); -// -// tl->link = ws->part_list; -// ws->part_list = hd->link; -// ws->n_part_blocks += 15; -// -// bd = hd; - if (size > BLOCK_SIZE_W) { bd = allocGroup_sync((lnat)BLOCK_ROUND_UP(size*sizeof(W_)) / BLOCK_SIZE); hunk ./rts/sm/GCUtils.c 264 } else { +#if 1 bd = allocBlock_sync(); hunk ./rts/sm/GCUtils.c 266 +#else + bdescr *hd, *tl; + + allocBlocks_sync(16, &hd, &tl, ws->gen, BF_EVACUATED); + tl->link = ws->part_list; + ws->part_list = hd->link; + ws->n_part_blocks += 15; + + bd = hd; +#endif } initBdescr(bd, ws->gen, ws->gen->to); bd->flags = BF_EVACUATED; } [Local GC Simon Marlow **20110117112802 Ignore-this: 6674dd927a1f87fdb495769f814f6202 ] { addfile ./PLAN hunk ./PLAN 1 +----------------------------------------------------------------------------- +-- Plan + + - allocate multiple blocks at the same time to reduce sm_mutex + contention. This caused crashes before when I tried it - why? + + - fix crashes on the 24-core machine. + + - measure CnC properly + + - test a multicore HTTP server + + - try to improve the message-queue locking. Multiple writers should + be non-blocking. + + - parTree: do we have deep stacks? + + - Push down local_gc_lock further? Do we have contention problems? + Measure. + + - BLOCKING_QUEUE can point to IND_LOCAL instead of the BLACKHOLE, + assertion at Message.c:300 fails + + - LinSolv has been broken by WEAK sparks + - transclos only seems to parallelise when not optimised :-( + + - Don't steal another spark if we're waiting for a reply to + MSG_GLOBALISE on the first one (or maybe limit the number of + outstanding MSG_GLOBALISE messages before we stop stealing). + + - running messageGlobalise eagerly is bad if the target is a BLACKHOLE + + - check the perf of stage2 (need to compile it optimised) + + - sort out where we allocate prim memory from. Using the nursery is + a bit of a hack, and the nursery has to be resized after each + collection. + + - run tests, stabilise + + - calcAllocated: take into account allocations from allocatePrim(). + + - threadStackOverflow: hacky -1s to account for global flags + + - PROFILING??? + - selector thunks: must check for forwarding ptr before entering + + - is HEAP_ALLOCED_GC() safe to call during local GC? + + - no need to maintain the global heap invariant for non-THREADED_RTS + and when -N1. + + - sparks: we should look through the IND_LOCAL when deciding whether + to fizzle a spark during GC. + + - STM invariants + + - Now can we do pinning of arbitrary objects? + - a BF_PINNED page gets mark-swept + - we want to support unpinning too: how can we tell when to unpin + the page again? + +----------------------------------------------------------------------------- +Tuning + + - ray: using parBuffer 1000 much better than parBuffer 200 for -N8, + and we beat HEAD now. + + - for some reason, allowing the inbox of a capability to accumulate + messages (up to 50) before context-switching it works much better + than eagerly context-switching. + + - instead of setting failed_to_evac, just globalise_evac + immediately. + + - sparks: can we do better than just globalising all sparks? + e.g. separate the spark pool into local and global pools + + - if a Cap is idle, it should probably do a local GC, otherwise it + will keep receiving messages requesting globalisation. + + - options for global mark bits: + - word before each object + - bitmap at the beginning of the block + - low bit in the info pointer + + - GC'ing IND_LOCAL: + - can't squash it to IND when scavenging, because although the IND + may point to the global gen, it might refer transitively to + something in the local gen, so this is not safe until after + globalise_mut_list. + - in globalise_mut_list, we want to squash IND_LOCALs on the mut + list; might this need two passes? + + - checking whether a closure pointer is global is too complex: + + EXTERN_INLINE rtsBool isGlobal (StgClosure *p) + { + bdescr *bd; + bd = Bdescr((P_)p); + return bd->gen_no != 0 || ((bd->flags & BF_PRIM) && isGlobalPrim(p)); + } + + (used in globalise_wrt, checkMutableList, + dirty_MUT_VAR, dirty_MUT_ARR, recordClosureMutated_) + + how can we make this easier? + + 1. use MUT_VAR_GLOBAL iff the MUT_VAR is in the global gen, + otherwise use MUT_VAR_LOCAL. Only MUT_VAR_GLOBAL writes need to + publish. Inline dirty_MUT_VAR. + + 2. similarly for TSO: !tso->dirty means global and clean. + + 3. can omit the BF_PRIM check for objects we know to be prim. + + - scavenge_mark_stack should probably set evac_gen_ix to the old gen + if the marked closure is global. + + - the first time we sweep a prim block that has a global object in it, + mark the block BF_GLOBAL so we don't sweep it again? + + - we could inline part of allocatePrim if newMVar, newMutVar + performance is important. + + - An alternative: allocate prim objects in the nursery as normal, + but pin the whole block if they become global. Other private + objects would be copied out of the block as normal. The pinned + block then becomes part of the global heap (if we did Immix + marking then we could reclaim the free parts of the block). + + We still have to mark individual objects as global/local, and we + can't use word-before-object. + + - fix gc_bench: 38% slower + +----------------------------------------------------------------------------- +Things to measure + + - sendMessage to an idle processor: do the message eagerly or not? + + - INBOX_THRESHOLD setting + + - not eagerly promoting THUNKS: + - on average 3% slower over nofib/gc, constraints +13%, power +11%. + - also slower with -A1m (similar results) + - power is the biggest culprit, because it does a lot of updates + where the value is a constructor with 2 thunk fields, so we make + 2 IND_LOCALs rather than 1. + + - single-threaded GC perf: most things got faster, gc_bench?? + + - publish vs. globalise vs. globalise-to-depth vs. globalise-but-not-thunks + - for updates + - for MUT_VAR / MUT_ARR / TVAR / MVAR + - IND_LOCAL on the mut list during GC + - sparks + + - right now we're not promoting anything from the local prim heap + unless it is referred to by the global heap. Is this the best + option? Alternative: mark closures global in evacuate(). + + - different choices of where to store mark bits. + + - sparks: + - keep a local spark pool and steal by messages, compared to + a global spark pool with a work-stealing queue. + - separate the spark pool into local and global pools + + - allocating MSG_BLACKHOLE globally immediately, or globalise on + demand. + + - code size impact of extra tag & fwd ptr checking + - mutator impact (just the codeGen changes, no GC changes) + +----------------------------------------------------------------------------- +-- Truly asynchronous globalisation + + * Add a new per-CPU lock in the gc_thread: local_gc_lock. This protects + the local heap during a GC: we allow concurrent globalise(), but a + local GC gets exclusive access to the heap. + - take it when doing a GC_LOCAL + - take it when globalising from another CPU. + - take it in globalise_TSO: in here we set the TSO_GLOBALISE flag + on the TSO, we don't want concurrent globalises to find this flag + set. + + * we need a separate set of mut_lists to use when globalising data + from another processor's local heap. These mut_lists are locked by + local_gc_lock. + - add cap->ext_mut_lists + - add recordMutableExt() in Capability.h + + * add appropriate write_barrier()s in Globalise.c:copy_tag() etc. + + * in globalise_maybe_record_mutable(), check whether we're recording + a mutable object that points to another cap's local heap, and use + ext_mut_lists if so. + + * in atomicModifyIORef, writeIORef, we cannot be sure that the IORef + is not being globalised while we are modifying it, so we don't know + for sure whether we should globalise the value or not. How to + solve this race condition? + +----------------------------------------------------------------------------- +-- Plan, later + +* strange behaviour: iograph 30000 +RTS -H300m + 612 MB total memory in use (190 MB lost due to fragmentation) ??? + +* MUT_VAR_CLEAN and MUT_VAR_DIRTY aren't being used. Decide what to + do about them. (similarly for other clean/dirty pairs?) + +* Can we do globalise_capability_mut_lists() in the worker threads for + GC_PAR, rather than having the main thread do all of them? The + problem is that mut_list entries might be on the wrong mut_list. + +* Clean up mutable arrays: + - card marking is unnecessary for an array in the global heap, because + we publish/globalise every pointer written into it + +* leaving an IND_LOCAL every time we write to a mutable object in the + global gen is bad, because we'll accumulate floating garbage from + all the old values. Can we do anything at all here? + - for objects that already have a read barrier (e.g. MVar), we + could do better. + +* maybe we should have MUT_VAR_PUBLIC / MUT_VAR_PRIVATE? + +* Push down SM lock in GC.c + * GLOBALS that need locking in GC: + * threads lists attached to old generations (modified in tidyThreadList) + * gen->weak_ptrs lists + * gen->large_objects lists + * resurrectUnreachableThreads() can be pulled outside the lock, it only + touches the threads lists of generations collected. + +* XXX we broke LDV profiling in CgTailCall +* XXX we probably don't need to duplicate the whole of Scav.c/Evac.c +* XXX slop calculation is wrong: 4,294,967,128 bytes maximum slop +* XXX major_gc is global, we rely on it always being rtsFalse for local GC +* XXX what to do about heapSizeSuggestion and resize_nursery: should + we resize the current nursery accoring to the amount of stuff that + was retained? +* XXX -G1 is not compatible with -N2 and greater, becauser there would + be no global heap. +* XXX should make publish() faster by inlining the allocation + + * ToDo: check whether the interpreter looks at the info table of a + tagged closure + +- do more sanity checking: + - check for proper BF_EVACUATED flags on blocks + +----------------------------------------------------------------------------- +-- STM + + TRecHeader (MUT_PRIM) -> { TRecHeader enclosing, + TRecChunk, + InvariantCheckQueue } + + * TRecHeaders must be visible to all Capabilities, because a TVar is + locked by writing a pointer to the current TRecHeader into the + tvar->current_value field. Other Capabilities need to examine + this closure to check whether the TVar is locked. + + Hence, we allocate all TRecHeaders in the global heap. Their + contents are PRIVATE, they may contain global->local pointers, and + they must be on the mutable list. + + TRecChunk (TREC_CHUNK) -> + { TRecChunk prev_chunk, + TRecEntry -> { TVar, expected, new } + } + + * We consider the TRecChunk to be PRIVATE. They can contain + global->local pointers, and must be on the correct mutable list, + but they do not have to be allocated in the global heap. + + TVar (MUT_PRIM) -> { value, TVarWatchQueue } + + * When we write to a TVar, we must globalise as usual. Both TVars + and TVarWatchQueues are MUT_PRIM, so they both get eagerly promoted. + + TVarWatchQueue (MUT_PRIM) -> { closure, prev, next } + + * When creating a TVarWatchQueue we first need to check whether to + allocate it globally or locally, depending on the TVar. + + AtomicInvariant (MUT_PRIM) -> { code, TrecHeader } + + * TODO + + InvariantCheckQueue (MUT_PRIM) -> { AtomicInvariant, TrecHeader, next } + + * TODO + +----------------------------------------------------------------------------- +-- BLOCKING_QUEUE structures + +typedef struct StgBlockingQueue_ { + StgHeader header; + struct StgBlockingQueue_ *link; // here so it looks like an IND + StgClosure *bh; // the BLACKHOLE + StgTSO *owner; + struct MessageBlackHole_ *queue; +} StgBlockingQueue; + + - BLOCKING_QUEUE and MSG_BLACKHOLE always get eagerly promoted, so we + never end up with global->local pointers in the link and queue + fields. + + - BLOCKING_QUEUE may be promoted ahead of BLACKHOLE, so the BH field + may be global->local. But: a BLOCKING_QUEUE object is only pointed + to by the BH and the TSO that owns it, and the TSO must belong to + the owner of the BH, so this should be safe. + +----------------------------------------------------------------------------- +-- Remembered set overhaul + +If we're going to support more than one local generation, we may want +to simplify the remembered set handling. Here are some thoughts on +that: + + - All remembered sets contain only TSO / IND_LOCAL objects + - every update of an old-gen thunk promotes the transitive closure + - no more recordMutable + - every GC must do globalise_mut_list for all mut lists + - write barriers: + - MUT_VAR/MUT_ARR: publish the written pointer + - TSO: as before, TSO can be in the remembered set + - MVAR + - blocking on a BLACKHOLE (modifies the BH, or the BQ) + - sending a message (modifies its link) + +----------------------------------------------------------------------------- +-- Allocate MSG_BLACKHOLE locally or globally? (also MSG_THROWTO) + + We have a choice here; + 1. allocate the MSG_BLACKHOLE in global memory, + and globalise CurrentTSO immediately. + + 2. allocate the MSG_BLACKHOLE in local memory, leaving it + to messageBlackHole to globalise it (and CurrentTSO) if + necessary. + + We need the MSG_BLACKHOLE to be global if: + - the BLACKHOLE is already in global memory, or + - the MSG_BLACKHOLE will be sent to another CPU + + For now we do (2), ToDo: try (1) and measure + +----------------------------------------------------------------------------- +-- Invariants + +1. Pointers from the global heap to the local heap are allowed only in + two places: IND_LOCAL, or the stack of a TSO + + 1a. TSO stacks point to at most one local heap. + + +2. every global object that points into a Capability's local heap must + be in that Capability's remembered set (cap->mut_lists[g]). + + This means that + * GC_LOCAL need only consult the local mut lists. + * MAINTAINING THE INVARIANT: + * GC_LCCAL needs to do nothing special: the invariant is + automatically maintained. + * GC_SEQ needs to add entries to the appropriate mut list + * GC_PAR may need to save up entries for adding to the appropriate + list. + +3. Objects in a Capability's remembered set may only point to the + local heap of that Capability or the global heap. + + So when doing a local GC, we only visit objects in the global heap + or our local heap. + +----------------------------------------------------------------------------- +-- Why this design? + +1. Why have IND_LOCAL rather than always promoting the transitive + closure to maintain the global heap invariant? + + - we can have some closures that we do not globalise + - closures with identity, e.g. primitive arrays and suchlike + - mutable objects can still be allocated in the local heap + - we probably want to try to keep mutable closures in the private + heap, otherwise we have to do globalisation/publish on every + write. For this we might want to have two generations in the + local heap. + + - might be able to reduce the latency of globalisation requests + from another CPU, by not promoting the whole transitive closure. + + - we can use a shared public spark pool and work-stealing without + globalising the entire transitive closure of its contents. + +----------------------------------------------------------------------------- +-- How to globalise different closure types [GLOBALISE] + +Two operations: + + StgClosure * publish (StgClosure *p) + // make an object representing P in the global heap + + void globalise (StgClosure **p) + // move P from local to global storage + +Every time we write a pointer P into an object in the old generation, +if P points to the local heap, we must write publish(P) instead. This +includes updates. (we cannot overwrite with an IND_LOCAL, because a +race condition between two processors updating the same thunk could +leave an IND_LOCAL pointing to the wrong local heap). + +If we receive a message requesting that we globalise a pointer P, then +we try to copy the transitive closure of the pointer into the global +heap, this is globalise(P). + +class B: BLACKHOLE +class T: THUNK/PAP/AP, untagged +class C: CONSTR, tagged +class F: FUN, tagged or untagged +class L: large or pinned unpointed +class T: TSOs +class U: small unpointed (MVar#, MutVar#, Array#, Weak#, BCO#, + StableName#, TVar#, PRIM, MUT_PRIM) + +publish: + B: IND_LOCAL + T: globalise, or IND_LOCAL + C: globalise, or IND_LOCAL + F: globalise, or IND_LOCAL + L: globalise, or IND_LOCAL + T: globalise, or IND_LOCAL + U: NO + +globalise: + + B: NO (pointed to by stack, will be updated) + T: copy and replace with IND, publish children + C: copy and replace with forwarding ptr, publish children + F: copy and replace with forwarding ptr, publish children + L: re-link the block, publish children + T: copy, replacing with ThreadRelocated, publish children + U: NO + +Note that we can neither publish nor globalise U, hence objects that +point to U can only be published, not globalised. + +Since we cannot globalise U, what happens to the closures that point to U? + + WE NEVER GLOBALISE A CLOSURE THAT POINTS TO U + + Thunks, functions and constructors that point to U are flagged in + their info tables (info->flags & HAS_UNLIFTED_FIELDS), and if + globalise() sees one of these it refuses to globalise the closure. + + +What about during GC? + + U CLOSURES ARE EAGERLY PROMOTED DURING GC + + This is to avoid the situation where we have promoted a closure + that points to U, but not the U itself. There is no way to fix + this up post-GC to reestablish the local heap invariant, so instead + we avoid it happening by automatically promoting U closures. + + L->U: we can promote L without moving it, so everything's fine. + +Alternative: + + do *not* eagerly promote C->U, F->U or T->U. We can guarantee + that the C->U, F->U, T->U pointers are not back-pointers, so as long + as we do not eagerly promote the C or F or T closure, we can be sure + that they stay forward-pointers, and hence we never get into a + state where C/F/T has been promoted but not U. + +Should we change ENTER() to check for forwarding pointers? why has +this not gone wrong so far? - because forwarded closures are always +tagged, so ENTER just returns. + +----------------------------------------------------------------------------- +How to globalise large-family constructors? + +Problem is that to get the constructor tag we have to read the info +table, but we want to globalise constructors by replacing the info +pointer with a forwarding pointer. + + 1. don't tag large-family constructors at all, + case code checks closure type before entering + - extra 2 memory refs on entry :-( + 1a. like 1, but code to check closure type is at the return pt, + so can share loading of info ptr. + - 1 extra memory ref + - extra test/branch in common case for returns + - need to distinguish tagged from untagged CONSTR in globalise() :-( + 2. tag large-family constructors as before + info table identifies large-family somehow + globalise code replaces with an IND + case return code checks for a special tag value (eg. 0) for IND, + and re-enters + - extra code for re-entry, extra tag in info table :-( + 3. tag large-family constructors as before + case join point checks for forwarding ptr + - extra test/branch at return pt for large-family constrs + 4. globalising a large-family constructor requires building a new + info table instance in the heap. The new itbl has type + CON_GLOBALISED, and contains the address of the relocated + closure. + - complicated, but no runtime overhead + +we currently do (3). + +----------------------------------------------------------------------------- +Globalising the mutable list after GC + +An object may contain back-pointers after scavenging it; in this case +the GC puts a reference to the object on the appropriate mutable list. + +In order to respect the global heap invariant, we must ensure that the +mutable lists of the global generation(s) only contain TSO or +IND_LOCAL objects. + +GC always promotes U objects (see [GLOBALISE]), so that directly after +GC globalising a pointer is always possible. So after GC we can +always globalise all the objects on the mutable list, except for TSO +and IND_LOCAL. + +The remaining problem is whether we might have TSO or IND_LOCAL +objects on the *wrong* mutable list, i.e. pointing into the wrong +local heap. + + TSO: this can certainly happen, we must move the TSO to the correct + mutable list. + + IND_LOCAL: this cannot happen, since local GC begins with all + IND_LOCALs pointing to the right local heap, and global GC always + shorts out the IND_LOCALs. + +----------------------------------------------------------------------------- +More generations + +We would like to be able to expand the heap structure to allow more +generations both local and global. Here I collect notes about the +ramifications of doing so: + +1. More local generations + + - write barriers are more complicated: if the object is local but + in a non-zero generation, then we use the ordinary write barrier + (add it to the mutable list), otherwise globalise. + + - we probably want to keep data in the local old-generation unless + it is required to be global, so the local old-gen's dest points + to itself. Might even want to mark-sweep or compact these. + +2. More global generations + + - mutable lists may contain not only TSO and IND_LOCAL, but also + objects that point to younger global generations. + + + - recordClosureMutated_: gen is wrong if we have >2 gens + +----------------------------------------------------------------------------- +* TSO fields + + The only TSO fields that may be read by another Capability are + + tso->cap + - to determine whether the TSO belongs to the current cap or not + + A TSO is special in that it may hold global->local pointers either + in its metadata (e.g. tso->bq) or on its stack. The TSO itself must + still reside in the global heap if it is referenced from there, but + clients other than the owning capability can read only the tso->cap + field. + + TSO fields are subject to the write barrier as usual. A TSO is + marked dirty (tso->dirty) if any of its fields may contain + global->local pointers. If only the tso->_link and + tso->block_info.prev fields are dirty, then instead of setting + tso->dirty, the TSO_LINK_DIRTY bit may be set in tso->flags. When + marking a TSO dirty it is placed on the mutable list of the owning + Capability. + +* ThreadRelocated + + When a TSO is moved, the old tso has tso->what_next set to + ThreadRelocated, and tso->_link points to the new instance. + tso->cap in a ThreadRelocated may still be read, but it might be + incorrect (the TSO may have migrated to another Capability). This + may lead to a message being sent to the wrong Capability, which + shouldn't cause any problems other than a message having to be + redirected. + +* XXX + + The MVar operations still modify a TSO directly, but I think + harmlessly. They modify + + ** tso->_link + To set it to END_TSO_QUEUE, which is fine + + ** the stack + To pass the value back for takeMVar, but if the MVar is global + then the TSO and the value will be global too. + +----------------------------------------------------------------------------- +-- Weak pointers. + + - per-generation weak pointer lists: (WE DO THIS ONE) + + - old-gen weak pointers would need to be on the mut list if they + pointed into a younger gen + - would need to globalise_scavenge them on the mut list + - key/value/finalizer could end up being IND_LOCALs + - link could never be an IND_LOCAL, because the link should + always point only to WEAKs in the same gen. + - in global GC, the leader thread calls traverseWeakPtrList + - in local GC, the thread traverses the weak ptr list for its + local gen(s) + + - alternative: all WEAK objects are in the global heap + + - no need to traverse WEAK objects during local GC + - finalizers do not start until a global GC happens + - must globalise the fields of a WEAK when creating one + - probably terrible for memo tables etc. + - probably terrible for ForeignPtrs + +----------------------------------------------------------------------------- +Shorting out IND_LOCAL + + - better not short out IND_LOCAL during global GC, as then a TSO + might end up pointing to more than one local heap. Instead, the + best we can do is turn them into IND during scavenge if we find + they no longer point to the local heap. + +----------------------------------------------------------------------------- +ALTERNATIVE IDEA + +Primitive and/or mutable objects are allocated in a special area of +the global heap (one per HEC), and marked as private or global. e.g. + + MUT_VAR_PRIVATE, MUT_VAR_GLOBAL + +Thus we can globalise one of these objects by marking it global and +globalising its children. All objects can therefore be globalised +(with the possible exception of BLACKHOLEs). + +During a local GC, we must use mark/sweep for this area of the global +heap. Global objects are treated as marked, and we can free any +unreachable private objects (as in Domani et. al. ISMM'00). In GHC we +would either free complete blocks only, or implement Immix-style +region marking. + +Compared to the other scheme: + + - no need for eager promotion of primitive objects! (well actually, + primitive objects do get eagerly promoted, but without moving + them). + + - we can globalise everything, globalise is simpler. + + - write barriers are a bit simpler and cheaper: check info pointer, + maybe globalise (well actually, currently the check is worse + because it is a bdescr->gen check plus isGlobal). + + - we need to duplicate all info tables for primitive objects, but + private/global can be a bit in the flags field of the info table, + so don't need to duplicate all the closure types too. + + - we need to set up the per-Capability primitive areas of the global + generation, and arrange that they get marked/swept. Sweeping has + to traverse the primitive heap linearly looking for global objects. + + - TSOs don't need to be globalised, they can be implicitly global, + similarly for BLOCKING_QUEUE and other private objects. + + - Evac: + - in local GC: + if the closure is global, we're done + if the closure is private, mark it and push it on the mark stack. + - in global GC: + copy the closure as normal. + + Even though the prim areas are local heap, we don't have to + record pointers from the global heap to the prim heap in the + remembered set. Instead we transitively mark all the global + objects in the prim area, so that they don't get discarded by + local GC. + + +What about clean/dirty? + + - for TSO: still need clean/dirty + + - for MUT_VAR and others: no need to do this, as the write barrier + globalises. Though we could also have muliple local generations, + and then we would need + + MUT_VAR_CLEAN, MUT_VAR_DIRTY, MUT_VAR_GLOBAL + + and a 3-way write barrier. + + +----------------------------------------------------------------------------- +Sources of overhead + + - checking for forwarding pointers in apply code, PAP code, large + family constructor cases. + + - slower allocation of primitive objects + + - not promoting prim objects in the local heap, so they get + repeatedly scanned during local GC. + + - thread migration is more expensive + +----------------------------------------------------------------------------- +Related work + +- DLG +- "Thread-local heaps for Java" (Domani et. al.) + (source of idea for mark-sweep with a bit to represent local/global) +- "Thread-specific heaps for Multi-threaded programs" (Steensgaard) +- "Optimisations in a private-nursery based collector" + (forces local GC to avoid violating local heap invariant; mutables + are allowed in the young gen) + +To look at: + +- "Pillar: A parallel implementation language" hunk ./compiler/cmm/CLabel.hs 60 mkSplitMarkerLabel, mkDirty_MUT_VAR_Label, + mkDirty_MUT_ARR_Label, mkUpdInfoLabel, mkBHUpdInfoLabel, mkIndStaticInfoLabel, hunk ./compiler/cmm/CLabel.hs 65 mkMainCapabilityLabel, + mkMUT_VAR_GLOBAL_infoLabel, mkMAP_FROZEN_infoLabel, hunk ./compiler/cmm/CLabel.hs 67 - mkMAP_DIRTY_infoLabel, + mkMAP_GLOBAL_infoLabel, mkEMPTY_MVAR_infoLabel, mkTopTickyCtrLabel, hunk ./compiler/cmm/CLabel.hs 405 -- Constructing Cmm Labels mkSplitMarkerLabel = CmmLabel rtsPackageId (fsLit "__stg_split_marker") CmmCode mkDirty_MUT_VAR_Label = CmmLabel rtsPackageId (fsLit "dirty_MUT_VAR") CmmCode +mkDirty_MUT_ARR_Label = CmmLabel rtsPackageId (fsLit "dirty_MUT_ARR") CmmCode mkUpdInfoLabel = CmmLabel rtsPackageId (fsLit "stg_upd_frame") CmmInfo mkBHUpdInfoLabel = CmmLabel rtsPackageId (fsLit "stg_bh_upd_frame" ) CmmInfo mkIndStaticInfoLabel = CmmLabel rtsPackageId (fsLit "stg_IND_STATIC") CmmInfo hunk ./compiler/cmm/CLabel.hs 410 mkMainCapabilityLabel = CmmLabel rtsPackageId (fsLit "MainCapability") CmmData +mkMUT_VAR_GLOBAL_infoLabel = CmmLabel rtsPackageId (fsLit "stg_MUT_VAR_GLOBAL") CmmInfo mkMAP_FROZEN_infoLabel = CmmLabel rtsPackageId (fsLit "stg_MUT_ARR_PTRS_FROZEN0") CmmInfo hunk ./compiler/cmm/CLabel.hs 412 -mkMAP_DIRTY_infoLabel = CmmLabel rtsPackageId (fsLit "stg_MUT_ARR_PTRS_DIRTY") CmmInfo +mkMAP_GLOBAL_infoLabel = CmmLabel rtsPackageId (fsLit "stg_MUT_ARR_PTRS_GLOBAL") CmmInfo mkEMPTY_MVAR_infoLabel = CmmLabel rtsPackageId (fsLit "stg_EMPTY_MVAR") CmmInfo mkTopTickyCtrLabel = CmmLabel rtsPackageId (fsLit "top_ct") CmmData mkCAFBlackHoleInfoTableLabel = CmmLabel rtsPackageId (fsLit "stg_CAF_BLACKHOLE") CmmInfo hunk ./compiler/codeGen/CgCase.lhs 402 ; cgPrimAlts GCMayHappen alt_type reg alts } ; lbl <- emitReturnTarget (idName bndr) abs_c - ; returnFC (CaseAlts lbl Nothing bndr) } + ; returnFC (CaseAlts lbl bndr 0) } cgEvalAlts cc_slot bndr (UbxTupAlt _) [(con,args,_,rhs)] = -- Unboxed tuple case hunk ./compiler/codeGen/CgCase.lhs 424 ; unbxTupleHeapCheck live_regs ptrs nptrs noStmts (cgExpr rhs) } ; lbl <- emitReturnTarget (idName bndr) abs_c - ; returnFC (CaseAlts lbl Nothing bndr) } + ; returnFC (CaseAlts lbl bndr 0) } cgEvalAlts cc_slot bndr alt_type alts = -- Algebraic and polymorphic case hunk ./compiler/codeGen/CgCase.lhs 442 ; (alts, mb_deflt) <- cgAlgAlts GCMayHappen cc_slot alt_type alts - ; (lbl, branches) <- emitAlgReturnTarget (idName bndr) - alts mb_deflt fam_sz + ; lbl <- emitAlgReturnTarget (idName bndr) alts mb_deflt fam_sz hunk ./compiler/codeGen/CgCase.lhs 444 - ; returnFC (CaseAlts lbl branches bndr) } + ; returnFC (CaseAlts lbl bndr fam_sz) } where fam_sz = case alt_type of AlgAlt tc -> tyConFamilySize tc hunk ./compiler/codeGen/CgClosure.lhs 571 ; hp_offset <- allocDynClosure bh_cl_info use_cc blame_cc [(tso,fixedHdrSize)] ; hp_rel <- getHpRelOffset hp_offset - -- Call the RTS function newCAF to add the CAF to the CafList - -- so that the garbage collector can find them + -- Call the RTS function newCAF. This: + -- - creates an IND_LOCAL to point to the local BH + -- - updates the CAF with an IND_STATIC pointing to the IND_LOCAL + -- - adds the CAF to the CAF list if necessary -- This must be done *before* the info table pointer is overwritten, -- because the old info table ptr is needed for reversion ; emitRtsCallWithVols rtsPackageId (fsLit "newCAF") hunk ./compiler/codeGen/CgClosure.lhs 579 [ CmmHinted (CmmReg (CmmGlobal BaseReg)) AddrHint, - CmmHinted (CmmReg nodeReg) AddrHint ] + CmmHinted (CmmReg nodeReg) AddrHint, + CmmHinted hp_rel AddrHint ] [node] False -- node is live, so save it. hunk ./compiler/codeGen/CgClosure.lhs 584 - -- Overwrite the closure with a (static) indirection - -- to the newly-allocated black hole - ; stmtsC [ CmmStore (cmmRegOffW nodeReg off_indirectee) hp_rel - , CmmStore (CmmReg nodeReg) ind_static_info ] - ; returnFC hp_rel } where bh_cl_info :: ClosureInfo hunk ./compiler/codeGen/CgCon.lhs 47 import Type import PrelInfo import Outputable -import ListSetOps import Util import Module import FastString hunk ./compiler/codeGen/CgCon.lhs 244 -- The binding below forces the masking out of the tag bits -- when accessing the constructor field. bind_arg (arg, offset) = bindNewToUntagNode arg offset (mkLFArgument arg) (tagForCon con) - (_, args_w_offsets) = layOutDynConstr con (addIdReps args) + (_, _, args_w_offsets) = mkVirtHeapOffsets False{-not a thunk-} + (addIdReps args) -- ASSERT(not (isUnboxedTupleCon con)) return () mapCs bind_arg args_w_offsets hunk ./compiler/codeGen/CgCon.lhs 319 | opt_SccProfilingOn = build_it_then enter_it | otherwise = ASSERT( amodes `lengthIs` dataConRepArity con ) - do { EndOfBlockInfo _ sequel <- getEndOfBlockInfo - ; case sequel of - CaseAlts _ (Just (alts, deflt_lbl)) bndr - -> -- Ho! We know the constructor so we can - -- go straight to the right alternative - case assocMaybe alts (dataConTagZ con) of { - Just join_lbl -> build_it_then (jump_to join_lbl); - Nothing - -- Special case! We're returning a constructor to the default case - -- of an enclosing case. For example: - -- - -- case (case e of (a,b) -> C a b) of - -- D x -> ... - -- y -> ...... - -- - -- In this case, - -- if the default is a non-bind-default (ie does not use y), - -- then we should simply jump to the default join point; - - | isDeadBinder bndr -> performReturn (jump_to deflt_lbl) - | otherwise -> build_it_then (jump_to deflt_lbl) } - - _otherwise -- The usual case - -> build_it_then emitReturnInstr - } + build_it_then emitReturnInstr where enter_it = stmtsC [ CmmAssign nodeReg (cmmUntag (CmmReg nodeReg)), CmmJump (entryCode (closureInfoPtr (CmmReg nodeReg))) [] ] hunk ./compiler/codeGen/CgExpr.lhs 344 where lf_info = mkSelectorLFInfo bndr offset_into_int (isUpdatable upd_flag) - (_, params_w_offsets) = layOutDynConstr con (addIdReps params) + + (_, _, params_w_offsets) = mkVirtHeapOffsets False{-not a thunk-} + (addIdReps params) + -- Just want the layout maybe_offset = assocMaybe params_w_offsets selectee Just the_offset = maybe_offset hunk ./compiler/codeGen/CgInfoTbls.hs 264 -> [(ConTagZ, CgStmts)] -- Tagged branches -> Maybe CgStmts -- Default branch (if any) -> Int -- family size - -> FCode (CLabel, SemiTaggingStuff) + -> FCode CLabel emitAlgReturnTarget name branches mb_deflt fam_sz hunk ./compiler/codeGen/CgInfoTbls.hs 267 - = do { blks <- getCgStmts $ - -- is the constructor tag in the node reg? - if isSmallFamily fam_sz - then do -- yes, node has constr. tag - let tag_expr = cmmConstrTag1 (CmmReg nodeReg) - branches' = [(tag+1,branch)|(tag,branch)<-branches] - emitSwitch tag_expr branches' mb_deflt 1 fam_sz - else do -- no, get tag from info table - let -- Note that ptr _always_ has tag 1 - -- when the family size is big enough - untagged_ptr = cmmRegOffB nodeReg (-1) - tag_expr = getConstrTag (untagged_ptr) - emitSwitch tag_expr branches mb_deflt 0 (fam_sz - 1) - ; lbl <- emitReturnTarget name blks - ; return (lbl, Nothing) } - -- Nothing: the internal branches in the switch don't have - -- global labels, so we can't use them at the 'call site' + = do + blks <- getCgStmts $ + -- is the constructor tag in the node reg? + if isSmallFamily fam_sz + then do -- yes, node has constr. tag + let tag_expr = cmmConstrTag1 (CmmReg nodeReg) + branches' = [(tag+1,branch)|(tag,branch)<-branches] + emitSwitch tag_expr branches' mb_deflt 1 fam_sz + else do -- no, get tag from info table + redo <- newLabelC + is_fwd <- newLabelC + labelC redo + ip <- newTemp bWord + tag <- newTemp bWord + let iptr = CmmReg (CmmLocal ip) + let untagged_ptr = cmmRegOffB nodeReg (-1) + let one = CmmLit (mkIntCLit 1) + stmtC $ CmmAssign (CmmLocal ip) (closureInfoPtr untagged_ptr) + -- check for a forwarding pointer; if it is, we have to + -- follow it to the real closure + let + cond = CmmMachOp mo_wordNe [ + CmmMachOp mo_wordAnd [iptr, one], + CmmLit zeroCLit + ] + tag_expr = CmmMachOp (MO_UU_Conv halfWordWidth wordWidth) [ + infoTableConstrTag (infoTable iptr) + ] + stmtC $ CmmCondBranch cond is_fwd + emitSwitch tag_expr branches mb_deflt 0 (fam_sz - 1) + labelC is_fwd + -- iptr is already tagged with 1, no need to do any arithmetic + stmtC $ CmmAssign nodeReg iptr + stmtC $ CmmBranch redo + -- + emitReturnTarget name blks -------------------------------- emitReturnInstr :: Code hunk ./compiler/codeGen/CgMonad.lhs 30 forkLabelledCode, forkClosureBody, forkStatics, forkAlts, forkEval, forkEvalHelp, forkProc, codeOnly, - SemiTaggingStuff, ConTagZ, + ConTagZ, EndOfBlockInfo(..), setEndOfBlockInfo, getEndOfBlockInfo, hunk ./compiler/codeGen/CgMonad.lhs 172 = OnStack -- Continuation is on the stack | CaseAlts - CLabel -- Jump to this; if the continuation is for a vectored - -- case this might be the label of a return vector - SemiTaggingStuff + CLabel -- Jump to this Id -- The case binder, only used to see if it's dead hunk ./compiler/codeGen/CgMonad.lhs 174 - -type SemiTaggingStuff - = Maybe -- Maybe[1] we don't have any semi-tagging stuff... - ([(ConTagZ, CmmLit)], -- Alternatives - CmmLit) -- Default (will be a can't happen RTS label if can't happen) + Int -- family size type ConTagZ = Int -- A *zero-indexed* contructor tag hunk ./compiler/codeGen/CgPrimOp.hs 131 emitPrimOp [res] ReadMutVarOp [mutv] _ = stmtC (CmmAssign (CmmLocal res) (cmmLoadIndexW mutv fixedHdrSize gcWord)) -emitPrimOp [] WriteMutVarOp [mutv,var] live +emitPrimOp [] WriteMutVarOp [mutv,val] live = do hunk ./compiler/codeGen/CgPrimOp.hs 133 - stmtC (CmmStore (cmmOffsetW mutv fixedHdrSize) var) vols <- getVolatileRegs live hunk ./compiler/codeGen/CgPrimOp.hs 134 + join <- newLabelC + + -- store the value in a register, in case it is a non-trivial expression + tmpl <- newTemp gcWord + let tmp = CmmReg (CmmLocal tmpl) + stmtC (CmmAssign (CmmLocal tmpl) val) + + -- save the address of the mut var in a temporary, becuase it may not + -- be present in 'live' and hence won't be automatically saved across + -- the foreign call below (live only contains live-in-alts, not + -- live-in-whole-case, when the primop is the scrutinee of a case). + mutl <- newTemp gcWord + let mut = CmmReg (CmmLocal mutl) + stmtC (CmmAssign (CmmLocal mutl) mutv) + + stmtC (CmmCondBranch (CmmMachOp mo_wordNe [ + closureInfoPtr mut, + CmmLit (CmmLabel mkMUT_VAR_GLOBAL_infoLabel) ]) + join) emitForeignCall' PlayRisky hunk ./compiler/codeGen/CgPrimOp.hs 154 - [{-no results-}] - (CmmCallee (CmmLit (CmmLabel mkDirty_MUT_VAR_Label)) - CCallConv) + [CmmHinted tmpl AddrHint] + (CmmCallee (CmmLit (CmmLabel mkDirty_MUT_VAR_Label)) CCallConv) [ (CmmHinted (CmmReg (CmmGlobal BaseReg)) AddrHint) hunk ./compiler/codeGen/CgPrimOp.hs 157 - , (CmmHinted mutv AddrHint) ] + , (CmmHinted tmp AddrHint) ] (Just vols) NoC_SRT -- No SRT b/c we do PlayRisky CmmMayReturn hunk ./compiler/codeGen/CgPrimOp.hs 161 + labelC join + stmtC (CmmStore (cmmOffsetW mut fixedHdrSize) tmp) -- #define sizzeofByteArrayzh(r,a) \ -- r = ((StgArrWords *)(a))->bytes hunk ./compiler/codeGen/CgPrimOp.hs 231 emitPrimOp [r] ReadArrayOp [obj,ix] _ = doReadPtrArrayOp r obj ix emitPrimOp [r] IndexArrayOp [obj,ix] _ = doReadPtrArrayOp r obj ix -emitPrimOp [] WriteArrayOp [obj,ix,v] _ = doWritePtrArrayOp obj ix v +emitPrimOp [] WriteArrayOp [obj,ix,v] live = doWritePtrArrayOp obj ix v live -- IndexXXXoffAddr hunk ./compiler/codeGen/CgPrimOp.hs 588 doWriteByteArrayOp _ _ _ _ = panic "CgPrimOp: doWriteByteArrayOp" -doWritePtrArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> Code -doWritePtrArrayOp addr idx val - = do mkBasicIndexedWrite arrPtrsHdrSize Nothing bWord addr idx val - stmtC (setInfo addr (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel))) - -- the write barrier. We must write a byte into the mark table: - -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N] - stmtC $ CmmStore ( - cmmOffsetExpr - (cmmOffsetExprW (cmmOffsetB addr arrPtrsHdrSize) - (loadArrPtrsSize addr)) - (CmmMachOp mo_wordUShr [idx, - CmmLit (mkIntCLit mUT_ARR_PTRS_CARD_BITS)]) - ) (CmmLit (CmmInt 1 W8)) +doWritePtrArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> StgLiveVars -> Code +doWritePtrArrayOp addr idx val live + = do vols <- getVolatileRegs live + join <- newLabelC + + -- store the value in a register, in case it is a non-trivial expression + tmpl <- newTemp gcWord + let tmp = CmmReg (CmmLocal tmpl) + stmtC (CmmAssign (CmmLocal tmpl) val) + + -- save the address of the array in a temporary, becuase it may not + -- be present in 'live' and hence won't be automatically saved across + -- the foreign call below (live only contains live-in-alts, not + -- live-in-whole-case, when the primop is the scrutinee of a case). + arrl <- newTemp gcWord + let arr = CmmReg (CmmLocal arrl) + stmtC (CmmAssign (CmmLocal arrl) addr) + stmtC (CmmCondBranch (CmmMachOp mo_wordNe [ + closureInfoPtr arr, + CmmLit (CmmLabel mkMAP_GLOBAL_infoLabel) ]) + join) + emitForeignCall' PlayRisky + [CmmHinted tmpl AddrHint] + (CmmCallee (CmmLit (CmmLabel mkDirty_MUT_ARR_Label)) CCallConv) + [ (CmmHinted (CmmReg (CmmGlobal BaseReg)) AddrHint) + , (CmmHinted tmp AddrHint) ] + (Just vols) + NoC_SRT -- No SRT b/c we do PlayRisky + CmmMayReturn + labelC join + mkBasicIndexedWrite arrPtrsHdrSize Nothing bWord arr idx tmp + +-- -- the write barrier. We must write a byte into the mark table: +-- -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N] +-- stmtC (setInfo addr (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel))) +-- stmtC $ CmmStore ( +-- cmmOffsetExpr +-- (cmmOffsetExprW (cmmOffsetB addr arrPtrsHdrSize) +-- (loadArrPtrsSize addr)) +-- (CmmMachOp mo_wordUShr [idx, +-- CmmLit (mkIntCLit mUT_ARR_PTRS_CARD_BITS)]) +-- ) (CmmLit (CmmInt 1 W8)) loadArrPtrsSize :: CmmExpr -> CmmExpr loadArrPtrsSize addr = CmmLoad (cmmOffsetB addr off) bWord hunk ./compiler/codeGen/CgTailCall.lhs 185 fun_name = idName fun_id lf_info = cgIdInfoLF fun_info fun_has_cafs = idCafInfo fun_id - untag_node = CmmAssign nodeReg (cmmUntag (CmmReg nodeReg)) -- Test if closure is a constructor maybeSwitchOnCons enterClosure eob | EndOfBlockInfo _ (CaseAlts lbl _ _) <- eob, hunk ./compiler/codeGen/CgTailCall.lhs 197 ; stmtC (CmmCondBranch (cmmIsTagged (CmmReg nodeReg)) is_constr) -- No, enter the closure. - ; enterClosure + ; _ <- enterClosure ; labelC is_constr ; stmtC (CmmJump (entryCode $ CmmLit (CmmLabel lbl)) []) } hunk ./compiler/codeGen/CgTailCall.lhs 222 -} -- No case expression involved, enter the closure. | otherwise - = do { stmtC untag_node - ; enterClosure + = do { is_constr <- newLabelC + -- Is the pointer tagged? + -- Yes, jump to switch statement + ; stmtC (CmmCondBranch (cmmIsTagged (CmmReg nodeReg)) + is_constr) + -- No, enter the closure. + ; _ <- enterClosure + ; labelC is_constr + ; emitReturnInstr } where --cond1 tag = cmmULtWord tag lowCons hunk ./compiler/codeGen/StgCmmBind.hs 210 body@(StgCase (StgApp scrutinee [{-no args-}]) _ _ _ _ -- ignore uniq, etc. (AlgAlt _) - [(DataAlt con, params, _use_mask, + [(DataAlt _, params, _use_mask, (StgApp selectee [{-no args-}]))]) | the_fv == scrutinee -- Scrutinee is the only free variable && maybeToBool maybe_offset -- Selectee is a component of the tuple hunk ./compiler/codeGen/StgCmmBind.hs 227 where lf_info = mkSelectorLFInfo bndr offset_into_int (isUpdatable upd_flag) - (_, params_w_offsets) = layOutDynConstr con (addIdReps params) - -- Just want the layout + (_, _, params_w_offsets) = mkVirtHeapOffsets False{-not a thunk-} + (addIdReps params) maybe_offset = assocMaybe params_w_offsets (NonVoid selectee) Just the_offset = maybe_offset offset_into_int = the_offset - fixedHdrSize hunk ./compiler/codeGen/StgCmmBind.hs 278 ; c_srt <- getSRTInfo srt ; let name = idName bndr descr = closureDescription mod_name name + fv_infos = addIdReps (map stripNV reduced_fvs) fv_details :: [(NonVoid Id, VirtualHpOffset)] (tot_wds, ptr_wds, fv_details) hunk ./compiler/codeGen/StgCmmBind.hs 281 - = mkVirtHeapOffsets (isLFThunk lf_info) - (addIdReps (map stripNV reduced_fvs)) + = mkVirtHeapOffsets (isLFThunk lf_info) fv_infos closure_info = mkClosureInfo False -- Not static bndr lf_info tot_wds ptr_wds c_srt descr hunk ./compiler/codeGen/StgCmmCon.hs 220 = ASSERT(not (isUnboxedTupleCon con)) mapM bind_arg args_w_offsets where - (_, args_w_offsets) = layOutDynConstr con (addIdReps args) + (_, _, args_w_offsets) = mkVirtHeapOffsets False{-not a thunk-} + (addIdReps args) tag = tagForCon con hunk ./compiler/codeGen/StgCmmPrim.hs 637 doWritePtrArrayOp :: CmmExpr -> CmmExpr -> CmmExpr -> FCode () doWritePtrArrayOp addr idx val = do mkBasicIndexedWrite arrPtrsHdrSize Nothing addr idx val - emit (setInfo addr (CmmLit (CmmLabel mkMAP_DIRTY_infoLabel))) + emit (setInfo addr (CmmLit (CmmLabel undefined {-TODO: mkMAP_DIRTY_infoLabel-}))) -- the write barrier. We must write a byte into the mark table: -- bits8[a + header_size + StgMutArrPtrs_size(a) + x >> N] emit $ mkStore ( hunk ./includes/Cmm.h 112 #define UNTAG(p) (p & ~TAG_MASK) #define GETTAG(p) (p & TAG_MASK) +#define IS_FORWARDING_PTR(p) (((p) & 1) != 0) +#define UN_FORWARDING_PTR(p) ((p) - 1) + #if SIZEOF_INT == 4 #define CInt bits32 #elif SIZEOF_INT == 8 hunk ./includes/Cmm.h 302 case \ IND, \ IND_PERM, \ - IND_STATIC: \ + IND_STATIC: \ { \ P1 = StgInd_indirectee(P1); \ goto again; \ hunk ./includes/Cmm.h 389 // allocate() - this includes many of the primops. #define MAYBE_GC(liveness,reentry) \ if (bdescr_link(CurrentNursery) == NULL || \ - generation_n_new_large_words(W_[g0]) >= CLong[large_alloc_lim]) { \ - R9 = liveness; \ + generation_n_new_large_words(StgRegTable_rG0(BaseReg)) >= CLong[large_alloc_lim]) { \ + R9 = liveness; \ R10 = reentry; \ HpAlloc = 0; \ jump stg_gc_gen_hp; \ hunk ./includes/Cmm.h 437 /* Debugging macros */ #define LOOKS_LIKE_INFO_PTR(p) \ ((p) != NULL && \ - LOOKS_LIKE_INFO_PTR_NOT_NULL(p)) + (IS_FORWARDING_PTR(p) || \ + LOOKS_LIKE_INFO_PTR_NOT_NULL(p))) #define LOOKS_LIKE_INFO_PTR_NOT_NULL(p) \ ( (TO_W_(%INFO_TYPE(%STD_INFO(p))) != INVALID_OBJECT) && \ hunk ./includes/Cmm.h 474 #define mutArrPtrsCardWords(n) \ ROUNDUP_BYTES_TO_WDS(((n) + (1 << MUT_ARR_PTRS_CARD_BITS) - 1) >> MUT_ARR_PTRS_CARD_BITS) +#define isGlobalPrim(bd,p) \ + (TO_W_(bdescr_gen_ix(bd)) >= TO_W_(CInt[global_gen_ix]) || (W_[(p) - WDS(1)] != 0)) + #if defined(PROFILING) || (!defined(THREADED_RTS) && defined(DEBUG)) #define OVERWRITING_CLOSURE(c) foreign "C" overwritingClosure(c "ptr") #else hunk ./includes/mkDerivedConstants.c 230 field_offset(StgRegTable, rHpAlloc); struct_field(StgRegTable, rRet); struct_field(StgRegTable, rNursery); + struct_field(StgRegTable, rG0); def_offset("stgEagerBlackholeInfo", FUN_OFFSET(stgEagerBlackholeInfo)); def_offset("stgGCEnter1", FUN_OFFSET(stgGCEnter1)); hunk ./includes/mkDerivedConstants.c 238 field_offset(Capability, r); field_offset(Capability, lock); + struct_field(Capability, no); struct_field(Capability, mut_lists); struct_field(Capability, context_switch); struct_field(Capability, sparks); hunk ./includes/mkDerivedConstants.c 247 struct_field(bdescr, free); struct_field(bdescr, blocks); struct_field(bdescr, gen_no); + struct_field(bdescr, gen_ix); struct_field(bdescr, link); struct_size(generation); hunk ./includes/mkDerivedConstants.c 251 - struct_field(generation, mut_list); struct_field(generation, n_new_large_words); hunk ./includes/mkDerivedConstants.c 252 + struct_field(generation, weak_ptrs); struct_size(CostCentreStack); struct_field(CostCentreStack, ccsID); hunk ./includes/rts/Constants.h 218 /* Win32 only: */ #define BlockedOnDoProc 7 -/* Only relevant for PAR: */ - /* blocked on a remote closure represented by a Global Address: */ -#define BlockedOnGA 8 - /* same as above but without sending a Fetch message */ -#define BlockedOnGA_NoSend 9 /* Only relevant for THREADED_RTS: */ hunk ./includes/rts/Constants.h 219 -#define BlockedOnCCall 10 -#define BlockedOnCCall_Interruptible 11 - /* same as above but permit killing the worker thread */ +#define BlockedOnCCall 8 +#define BlockedOnCCall_Interruptible 9 + /* same as above but permit interrupting the call */ /* Involved in a message sent to tso->msg_cap */ hunk ./includes/rts/Constants.h 224 -#define BlockedOnMsgThrowTo 12 +#define BlockedOnMsgThrowTo 10 /* The thread is not on any run queues, but can be woken up by tryWakeupThread() */ hunk ./includes/rts/Constants.h 228 -#define ThreadMigrating 13 +#define ThreadMigrating 11 + +/* Involved in a message sent to tso->msg_cap */ +#define BlockedOnMsgGlobalise 12 /* * These constants are returned to the scheduler by a thread that has hunk ./includes/rts/EventLogFormat.h 143 * #define ThreadFinished 5 */ #define THREAD_SUSPENDED_FOREIGN_CALL 6 +/* + * 7-20 are the tso->why_blocked values (+6) + * + * #define BlockedOnMVar 7 + * #define BlockedOnBlackHole 8 + * #define BlockedOnRead 9 + * #define BlockedOnWrite 10 + * #define BlockedOnDelay 11 + * #define BlockedOnSTM 12 + * #define BlockedOnDoProc 13 + * #define BlockedOnCCall 14 + * #define BlockedOnCCall_NoUnblockExc 15 + * #define BlockedOnMsgThrowTo 16 + * #define ThreadMigrating 17 + * #define BlockedOnMsgGlobalise 18 + */ #ifndef EVENTLOG_CONSTANTS_ONLY hunk ./includes/rts/Flags.h 77 rtsBool squeeze; /* 'z' stack squeezing & lazy blackholing */ rtsBool hpc; /* 'c' coverage */ rtsBool sparks; /* 'r' */ + rtsBool mallocleaks; /* 'k' */ }; struct COST_CENTRE_FLAGS { hunk ./includes/rts/OSThreads.h 56 barf("multiple ACQUIRE_LOCK: %s %d", __FILE__,__LINE__); \ } +// Returns zero if the lock was acquired. +EXTERN_INLINE int TRY_ACQUIRE_LOCK(pthread_mutex_t *mutex); +EXTERN_INLINE int TRY_ACQUIRE_LOCK(pthread_mutex_t *mutex) +{ + LOCK_DEBUG_BELCH("TRY_ACQUIRE_LOCK", mutex); + return pthread_mutex_trylock(mutex); +} + #define RELEASE_LOCK(mutex) \ LOCK_DEBUG_BELCH("RELEASE_LOCK", mutex); \ if (pthread_mutex_unlock(mutex) != 0) { \ hunk ./includes/rts/OSThreads.h 128 #else -#define ACQUIRE_LOCK(mutex) EnterCriticalSection(mutex) -#define RELEASE_LOCK(mutex) LeaveCriticalSection(mutex) +#define ACQUIRE_LOCK(mutex) EnterCriticalSection(mutex) +#define TRY_ACQUIRE_LOCK(mutex) (TryEnterCriticalSection(mutex) != 0) +#define RELEASE_LOCK(mutex) LeaveCriticalSection(mutex) // I don't know how to do this. TryEnterCriticalSection() doesn't do // the right thing. hunk ./includes/rts/prof/LDV.h 19 #ifdef PROFILING +void LDV_recordDead (StgClosure *c, nat size); + /* retrieves the LDV word from closure c */ #define LDVW(c) (((StgClosure *)(c))->header.prof.hp.ldvw) hunk ./includes/rts/prof/LDV.h 38 #else +#define LDV_RECORD_DEAD(c,size) \ + LDV_recordDead((StgClosure *)(p), size); + #define LDV_RECORD_CREATE(c) \ LDVW((c)) = ((StgWord)RTS_DEREF(era) << LDV_SHIFT) | LDV_STATE_CREATE hunk ./includes/rts/prof/LDV.h 48 #else /* !PROFILING */ -#define LDV_RECORD_CREATE(c) /* nothing */ +#define LDV_RECORD_CREATE(c) /* nothing */ +#define LDV_RECORD_DEAD(c,size) /* nothing */ #endif /* PROFILING */ hunk ./includes/rts/storage/Block.h 61 } u; struct generation_ *gen; /* generation */ - struct generation_ *dest; /* destination gen */ hunk ./includes/rts/storage/Block.h 62 - StgWord32 blocks; /* no. of blocks (if grp head, 0 otherwise) */ + StgWord16 gen_no; // gen->no, cached + StgWord16 gen_ix; // gen->ix, cached + StgWord16 dest_ix; // ix of destination generation hunk ./includes/rts/storage/Block.h 66 - StgWord16 gen_no; StgWord16 flags; /* block flags, see below */ hunk ./includes/rts/storage/Block.h 67 + + StgWord32 blocks; /* no. of blocks (if grp head, 0 otherwise) */ + #if SIZEOF_VOID_P == 8 StgWord32 _padding[2]; #else hunk ./includes/rts/storage/Block.h 106 #define BF_KNOWN 128 /* Block was swept in the last generation */ #define BF_SWEPT 256 +/* contains primitive objects */ +#define BF_PRIM 512 +/* contains global objects */ +#define BF_GLOBAL 1024 /* Finding the block descriptor for a given block -------------------------- */ hunk ./includes/rts/storage/ClosureMacros.h 132 SET_HDR(c,info,costCentreStack); \ (c)->bytes = n_bytes; -// Use when changing a closure from one kind to another -#define OVERWRITE_INFO(c, new_info) \ - OVERWRITING_CLOSURE((StgClosure *)(c)); \ - SET_INFO((c), (new_info)); \ - LDV_RECORD_CREATE(c); - /* ----------------------------------------------------------------------------- How to get hold of the static link field for a static closure. -------------------------------------------------------------------------- */ hunk ./includes/rts/storage/ClosureMacros.h 344 return pap_sizeW((StgPAP *)p); case IND: case IND_PERM: + case IND_LOCAL: return sizeofW(StgInd); case ARR_WORDS: return arr_words_sizeW((StgArrWords *)p); hunk ./includes/rts/storage/ClosureMacros.h 348 - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: case MUT_ARR_PTRS_FROZEN: case MUT_ARR_PTRS_FROZEN0: return mut_arr_ptrs_sizeW((StgMutArrPtrs*)p); hunk ./includes/rts/storage/ClosureMacros.h 463 #define OVERWRITING_CLOSURE(c) /* nothing */ #endif -#ifdef PROFILING -void LDV_recordDead (StgClosure *c, nat size); -#endif +#define OVERWRITE_PRIM_CLOSURE(c,new_info,old_size,new_size) \ + overwritingPrimClosure((c),old_size,new_size); \ + SET_INFO((c), (new_info)); \ + LDV_RECORD_CREATE(c); EXTERN_INLINE void overwritingClosure (StgClosure *p); EXTERN_INLINE void overwritingClosure (StgClosure *p) hunk ./includes/rts/storage/ClosureMacros.h 473 { nat size, i; -#if defined(PROFILING) +#if defined(PROFILING) && !defined(DEBUG) if (era <= 0) return; #endif hunk ./includes/rts/storage/ClosureMacros.h 480 size = closure_sizeW(p); // For LDV profiling, we need to record the closure as dead -#if defined(PROFILING) - LDV_recordDead((StgClosure *)(p), size); -#endif + LDV_RECORD_DEAD(p,size); for (i = 0; i < size - sizeofW(StgThunkHeader); i++) { ((StgThunk *)(p))->payload[i] = 0; hunk ./includes/rts/storage/ClosureMacros.h 487 } } +// slop in the prim area is marked with '-size' +EXTERN_INLINE void fillPrimSlop (StgClosure *p, + StgWord old_size, StgWord new_size); +EXTERN_INLINE void fillPrimSlop (StgClosure *p, + StgWord old_size, StgWord new_size) +{ + nat i, slop; + slop = old_size - new_size; + // this might be an object in the prim heap, or it might be in the + // global heap. For the prim heap we write a word that says how + // many words of slop to skip over; for the global heap we fill + // the remaining words with zero. + if (slop > 0) { + *((P_)p + new_size) = slop + 2; + for (i = 1; i < slop; i++) { + *((P_)p+ new_size + i) = 0; + } + } +} + + +EXTERN_INLINE void overwritingPrimClosure (StgClosure *p, + nat old_size, nat new_size); +EXTERN_INLINE void overwritingPrimClosure (StgClosure *p, + nat old_size, nat new_size) +{ +#if defined(PROFILING) && !defined(DEBUG) + if (era <= 0) return; +#endif + + // For LDV profiling, we need to record the closure as dead + LDV_RECORD_DEAD(p,old_size); + + fillPrimSlop(p, old_size, new_size); + +} + +// used occasionally to fill slop when other methods won't do +INLINE_HEADER void +zeroSlop (StgPtr p, nat words) +{ + nat i; + for (i = 0; i < words; i++) { + *(p + i) = 0; + } +} + #endif /* RTS_STORAGE_CLOSUREMACROS_H */ hunk ./includes/rts/storage/ClosureTypes.h 29 #define CONSTR_1_1 5 #define CONSTR_0_2 6 #define CONSTR_STATIC 7 -#define CONSTR_NOCAF_STATIC 8 +#define CONSTR_NOCAF_STATIC 8 #define FUN 9 #define FUN_1_0 10 #define FUN_0_1 11 hunk ./includes/rts/storage/ClosureTypes.h 52 #define IND 28 #define IND_PERM 29 #define IND_STATIC 30 -#define RET_BCO 31 -#define RET_SMALL 32 -#define RET_BIG 33 -#define RET_DYN 34 -#define RET_FUN 35 -#define UPDATE_FRAME 36 -#define CATCH_FRAME 37 -#define UNDERFLOW_FRAME 38 -#define STOP_FRAME 39 -#define BLOCKING_QUEUE 40 -#define BLACKHOLE 41 -#define MVAR_CLEAN 42 -#define MVAR_DIRTY 43 -#define ARR_WORDS 44 -#define MUT_ARR_PTRS_CLEAN 45 -#define MUT_ARR_PTRS_DIRTY 46 -#define MUT_ARR_PTRS_FROZEN0 47 -#define MUT_ARR_PTRS_FROZEN 48 -#define MUT_VAR_CLEAN 49 -#define MUT_VAR_DIRTY 50 -#define WEAK 51 -#define PRIM 52 -#define MUT_PRIM 53 -#define TSO 54 -#define STACK 55 -#define TREC_CHUNK 56 -#define ATOMICALLY_FRAME 57 -#define CATCH_RETRY_FRAME 58 -#define CATCH_STM_FRAME 59 -#define WHITEHOLE 60 -#define N_CLOSURE_TYPES 61 +#define IND_LOCAL 31 +#define RET_BCO 32 +#define RET_SMALL 33 +#define RET_BIG 34 +#define RET_DYN 35 +#define RET_FUN 36 +#define UPDATE_FRAME 37 +#define CATCH_FRAME 38 +#define UNDERFLOW_FRAME 39 +#define STOP_FRAME 40 +#define BLOCKING_QUEUE 41 +#define BLACKHOLE 42 +#define MVAR_CLEAN 43 +#define MVAR_DIRTY 44 +#define ARR_WORDS 45 +#define MUT_ARR_PTRS_LOCAL 46 +#define MUT_ARR_PTRS_GLOBAL 47 +#define MUT_ARR_PTRS_FROZEN0 48 +#define MUT_ARR_PTRS_FROZEN 49 +#define MUT_VAR_LOCAL 50 +#define MUT_VAR_GLOBAL 51 +#define WEAK 52 +#define PRIM 53 +#define MUT_PRIM 54 +#define TSO 55 +#define STACK 56 +#define TREC_CHUNK 57 +#define ATOMICALLY_FRAME 58 +#define CATCH_RETRY_FRAME 59 +#define CATCH_STM_FRAME 60 +#define WHITEHOLE 61 +#define N_CLOSURE_TYPES 62 #endif /* RTS_STORAGE_CLOSURETYPES_H */ hunk ./includes/rts/storage/Closures.h 127 StgHeader header; StgClosure *indirectee; StgClosure *static_link; - StgInfoTable *saved_info; + const StgInfoTable *saved_info; } StgIndStatic; typedef struct StgBlockingQueue_ { hunk ./includes/rts/storage/Closures.h 243 #define BCO_BITMAP_SIZEW(bco) ((BCO_BITMAP_SIZE(bco) + BITS_IN(StgWord) - 1) \ / BITS_IN(StgWord)) + /* ----------------------------------------------------------------------------- Dynamic stack frames for generic heap checks. hunk ./includes/rts/storage/Closures.h 392 typedef struct StgTRecChunk_ { StgHeader header; struct StgTRecChunk_ *prev_chunk; - StgWord next_entry_idx; + StgHalfWord next_entry_idx; + StgHalfWord cap_no; // which Capability this TRec belongs to TRecEntry entries[TREC_CHUNK_NUM_ENTRIES]; } StgTRecChunk; hunk ./includes/rts/storage/Closures.h 417 struct StgTRecHeader_ *enclosing_trec; StgTRecChunk *current_chunk; StgInvariantCheckQueue *invariants_to_check; - TRecState state; + StgHalfWord state; + StgHalfWord cap_no; // which Capability this TRec belongs to }; typedef struct { hunk ./includes/rts/storage/Closures.h 471 StgClosure *bh; } MessageBlackHole; +typedef struct MessageGlobalise_ { + StgHeader header; + struct Message_ *link; + StgTSO *tso; + StgClosure *req; // closure to globalise +} MessageGlobalise; + #endif /* RTS_STORAGE_CLOSURES_H */ hunk ./includes/rts/storage/GC.h 63 typedef struct generation_ { unsigned int no; // generation number + unsigned int ix; // generation "index", i.e the + // offset in the all_generations array + + unsigned int is_local; // true if a local generation + unsigned int cap; // capability this gen is local to bdescr * blocks; // blocks in this gen unsigned int n_blocks; // number of blocks hunk ./includes/rts/storage/GC.h 78 unsigned long n_new_large_words; // words of new large objects // (for allocation stats) + bdescr * prim_blocks; // blocks of primitive objects + unsigned int n_prim_blocks; // these blocks are marked/swept, + unsigned int n_prim_words; // not copied. + unsigned int max_blocks; // max blocks hunk ./includes/rts/storage/GC.h 83 - bdescr *mut_list; // mut objects in this gen (not G0) StgTSO * threads; // threads in this gen // linked via global_link hunk ./includes/rts/storage/GC.h 86 + + StgWeak * weak_ptrs; // WEAK objects in this gen + // linked via w->link + struct generation_ *to; // destination gen for live objects // stats information hunk ./includes/rts/storage/GC.h 101 // Fields below are used during GC only #if defined(THREADED_RTS) - char pad[128]; // make sure the following is + StgWord8 pad[128]; // make sure the following is // on a separate cache line. hunk ./includes/rts/storage/GC.h 103 - SpinLock sync_large_objects; // lock for large_objects - // and scavenged_large_objects + SpinLock sync; // lock for large_objects, + // blocks, n_blocks, n_words, + // scavenged_large_objects #endif int mark; // mark (not copy)? (old gen only) hunk ./includes/rts/storage/GC.h 118 unsigned int n_old_blocks; // number of blocks in from-space unsigned int live_estimate; // for sweeping: estimate of live data - bdescr * saved_mut_list; - - bdescr * part_blocks; // partially-full scanned blocks - unsigned int n_part_blocks; // count of above - bdescr * scavenged_large_objects; // live large objs after GC (d-link) unsigned int n_scavenged_large_blocks; // size (not count) of above hunk ./includes/rts/storage/GC.h 126 StgTSO * old_threads; } generation; -extern generation * generations; -extern generation * g0; -extern generation * oldest_gen; +extern nat total_generations; // size of all_generations +extern generation * all_generations; // indexed by gen->ix +extern generation * old_generations; // indexed by gen->no +extern generation * g0; // == &old_generations[0] +extern generation * oldest_gen; // == &old_generations[G-1] +extern generation * global_gen; // == &all_generations[global_gen_ix] +extern nat global_gen_ix; // ix of first non-local gen +extern nat global_gen_no; // no of first non-local gen /* ----------------------------------------------------------------------------- Generic allocation hunk ./includes/rts/storage/GC.h 165 StgPtr allocate ( Capability *cap, lnat n ); StgPtr allocatePinned ( Capability *cap, lnat n ); +StgPtr allocatePrim ( Capability *cap, lnat n ); /* memory allocator for executable memory */ void * allocateExec(unsigned int len, void **exec_addr); hunk ./includes/rts/storage/GC.h 185 The CAF table - used to let us revert CAFs in GHCi -------------------------------------------------------------------------- */ -void newCAF (StgRegTable *reg, StgClosure *); -void newDynCAF (StgRegTable *reg, StgClosure *); +void newCAF (StgRegTable *reg, StgClosure *, StgClosure *); +void newDynCAF (StgRegTable *reg, StgClosure *, StgClosure *); void revertCAFs (void); // Request that all CAFs are retained indefinitely. hunk ./includes/rts/storage/GC.h 206 and is put on the mutable list. -------------------------------------------------------------------------- */ -void dirty_MUT_VAR(StgRegTable *reg, StgClosure *p); +StgClosure *dirty_MUT_VAR(StgRegTable *reg, StgClosure *p); + +// similarly, the barrier for arrays: +StgClosure *dirty_MUT_ARR (StgRegTable *reg, StgClosure *p); + +// and for MVars: +void dirty_MVAR(StgRegTable *reg, StgClosure *p); + +/* --------------------------------------------------------------------------- + Globalisation + ------------------------------------------------------------------------- */ + +INLINE_HEADER rtsBool +isLocal (StgClosure *p) +{ + return Bdescr((P_)p)->gen->is_local; +} + +/* --------------------------------------------------------------------------- + Misc + ------------------------------------------------------------------------- */ /* set to disable CAF garbage collection in GHCi. */ /* (needed when dynamic libraries are used). */ hunk ./includes/rts/storage/GC.h 234 INLINE_HEADER void initBdescr(bdescr *bd, generation *gen, generation *dest) { - bd->gen = gen; - bd->gen_no = gen->no; - bd->dest = dest; + bd->gen = gen; + bd->gen_no = gen->no; + bd->gen_ix = gen->ix; + bd->dest_ix = dest->ix; +} + +/* ----------------------------------------------------------------------------- + Primitive heap area + + For now, just use a word before the primitive object to indicate + global/private. Later try using a bitmap. + -------------------------------------------------------------------------- */ + +EXTERN_INLINE rtsBool isGlobalPrim (StgClosure *p); +EXTERN_INLINE rtsBool isGlobalPrim (StgClosure *p) +{ + return (rtsBool) *((P_)p - 1); +} + +EXTERN_INLINE rtsBool isGlobal (StgClosure *p); +EXTERN_INLINE rtsBool isGlobal (StgClosure *p) +{ + bdescr *bd; + bd = Bdescr((P_)p); + return bd->gen_no != 0 || ((bd->flags & BF_PRIM) && isGlobalPrim(p)); +} + +INLINE_HEADER void setGlobal (StgClosure *p) +{ + *((P_)p - 1) = 1; } #endif /* RTS_STORAGE_GC_H */ hunk ./includes/rts/storage/InfoTables.h 239 - the constructor tag In a FUN/THUNK - a bitmap of SRT entries + In an IND_LOCAL + - the owning Capability */ #ifdef TABLES_NEXT_TO_CODE hunk ./includes/rts/storage/TSO.h 169 typedef struct StgStack_ { StgHeader header; + StgTSO *tso; // owner StgWord32 stack_size; // stack size in *words* StgWord32 dirty; // non-zero => dirty StgPtr sp; // current stack pointer hunk ./includes/stg/MiscClosures.h 85 RTS_RET(stg_apply_interp); +#if IN_STG_CODE +extern W_ stg_IND_LOCAL_tbl[]; +#else +extern DLL_IMPORT_RTS const StgInfoTable *stg_IND_LOCAL_tbl[]; +#endif + +RTS_ENTRY(stg_IND_LOCAL0); +RTS_ENTRY(stg_IND_LOCAL1); +RTS_ENTRY(stg_IND_LOCAL2); +RTS_ENTRY(stg_IND_LOCAL3); +RTS_ENTRY(stg_IND_LOCAL4); +RTS_ENTRY(stg_IND_LOCAL5); +RTS_ENTRY(stg_IND_LOCAL6); +RTS_ENTRY(stg_IND_LOCAL7); +RTS_ENTRY(stg_IND_LOCAL8); +RTS_ENTRY(stg_IND_LOCAL9); +RTS_ENTRY(stg_IND_LOCAL10); +RTS_ENTRY(stg_IND_LOCAL11); +RTS_ENTRY(stg_IND_LOCAL12); +RTS_ENTRY(stg_IND_LOCAL13); +RTS_ENTRY(stg_IND_LOCAL14); +RTS_ENTRY(stg_IND_LOCAL15); +RTS_ENTRY(stg_IND_LOCAL16); +RTS_ENTRY(stg_IND_LOCAL17); +RTS_ENTRY(stg_IND_LOCAL18); +RTS_ENTRY(stg_IND_LOCAL19); +RTS_ENTRY(stg_IND_LOCAL20); +RTS_ENTRY(stg_IND_LOCAL21); +RTS_ENTRY(stg_IND_LOCAL22); +RTS_ENTRY(stg_IND_LOCAL23); +RTS_ENTRY(stg_IND_LOCAL24); +RTS_ENTRY(stg_IND_LOCAL25); +RTS_ENTRY(stg_IND_LOCAL26); +RTS_ENTRY(stg_IND_LOCAL27); +RTS_ENTRY(stg_IND_LOCAL28); +RTS_ENTRY(stg_IND_LOCAL29); +RTS_ENTRY(stg_IND_LOCAL30); +RTS_ENTRY(stg_IND_LOCAL31); +RTS_ENTRY(stg_IND_LOCAL32); +RTS_ENTRY(stg_IND_LOCAL33); +RTS_ENTRY(stg_IND_LOCAL34); +RTS_ENTRY(stg_IND_LOCAL35); +RTS_ENTRY(stg_IND_LOCAL36); +RTS_ENTRY(stg_IND_LOCAL37); +RTS_ENTRY(stg_IND_LOCAL38); +RTS_ENTRY(stg_IND_LOCAL39); +RTS_ENTRY(stg_IND_LOCAL40); +RTS_ENTRY(stg_IND_LOCAL41); +RTS_ENTRY(stg_IND_LOCAL42); +RTS_ENTRY(stg_IND_LOCAL43); +RTS_ENTRY(stg_IND_LOCAL44); +RTS_ENTRY(stg_IND_LOCAL45); +RTS_ENTRY(stg_IND_LOCAL46); +RTS_ENTRY(stg_IND_LOCAL47); +RTS_ENTRY(stg_IND_LOCAL48); +RTS_ENTRY(stg_IND_LOCAL49); +RTS_ENTRY(stg_IND_LOCAL50); +RTS_ENTRY(stg_IND_LOCAL51); +RTS_ENTRY(stg_IND_LOCAL52); +RTS_ENTRY(stg_IND_LOCAL53); +RTS_ENTRY(stg_IND_LOCAL54); +RTS_ENTRY(stg_IND_LOCAL55); +RTS_ENTRY(stg_IND_LOCAL56); +RTS_ENTRY(stg_IND_LOCAL57); +RTS_ENTRY(stg_IND_LOCAL58); +RTS_ENTRY(stg_IND_LOCAL59); +RTS_ENTRY(stg_IND_LOCAL60); +RTS_ENTRY(stg_IND_LOCAL61); +RTS_ENTRY(stg_IND_LOCAL62); +RTS_ENTRY(stg_IND_LOCAL63); +RTS_ENTRY(stg_IND_LOCAL64); + RTS_ENTRY(stg_IND); RTS_ENTRY(stg_IND_direct); hunk ./includes/stg/MiscClosures.h 159 +RTS_ENTRY(stg_IND_noenter); RTS_ENTRY(stg_IND_STATIC); RTS_ENTRY(stg_IND_PERM); RTS_ENTRY(stg_BLACKHOLE); hunk ./includes/stg/MiscClosures.h 168 RTS_ENTRY(stg_WHITEHOLE); RTS_ENTRY(stg_BLOCKING_QUEUE_CLEAN); RTS_ENTRY(stg_BLOCKING_QUEUE_DIRTY); +RTS_ENTRY(stg_STUB_BLOCKING_QUEUE); RTS_FUN(stg_BCO); RTS_ENTRY(stg_EVACUATED); hunk ./includes/stg/MiscClosures.h 181 RTS_ENTRY(stg_STACK); RTS_ENTRY(stg_ARR_WORDS); RTS_ENTRY(stg_MUT_ARR_WORDS); -RTS_ENTRY(stg_MUT_ARR_PTRS_CLEAN); -RTS_ENTRY(stg_MUT_ARR_PTRS_DIRTY); +RTS_ENTRY(stg_MUT_ARR_PTRS_LOCAL); +RTS_ENTRY(stg_MUT_ARR_PTRS_GLOBAL); RTS_ENTRY(stg_MUT_ARR_PTRS_FROZEN); RTS_ENTRY(stg_MUT_ARR_PTRS_FROZEN0); hunk ./includes/stg/MiscClosures.h 185 -RTS_ENTRY(stg_MUT_VAR_CLEAN); -RTS_ENTRY(stg_MUT_VAR_DIRTY); +RTS_ENTRY(stg_MUT_VAR_LOCAL); +RTS_ENTRY(stg_MUT_VAR_GLOBAL); RTS_ENTRY(stg_END_TSO_QUEUE); RTS_ENTRY(stg_MSG_TRY_WAKEUP); RTS_ENTRY(stg_MSG_THROWTO); hunk ./includes/stg/MiscClosures.h 191 RTS_ENTRY(stg_MSG_BLACKHOLE); +RTS_ENTRY(stg_STUB_MSG_BLACKHOLE); RTS_ENTRY(stg_MSG_NULL); hunk ./includes/stg/MiscClosures.h 193 +RTS_ENTRY(stg_MSG_GLOBALISE); RTS_ENTRY(stg_MVAR_TSO_QUEUE); RTS_ENTRY(stg_catch); RTS_ENTRY(stg_PAP); hunk ./includes/stg/MiscClosures.h 383 RTS_FUN_DECL(stg_gen_block); RTS_FUN_DECL(stg_block_noregs); RTS_FUN_DECL(stg_block_1); -RTS_FUN_DECL(stg_block_blackhole); -RTS_FUN_DECL(stg_block_blackhole_finally); +RTS_FUN_DECL(stg_block_enter); RTS_FUN_DECL(stg_block_takemvar); RTS_RET(stg_block_takemvar); RTS_FUN_DECL(stg_block_putmvar); hunk ./includes/stg/Regs.h 84 StgPtr rHp; StgPtr rHpLim; struct StgTSO_ * rCurrentTSO; - struct nursery_ * rNursery; + struct nursery_ * rNursery; /* nursery struct */ + struct generation_ * rG0; /* the local generation 0 */ struct bdescr_ * rCurrentNursery; /* Hp/HpLim point into this block */ struct bdescr_ * rCurrentAlloc; /* for allocation using allocate() */ StgWord rHpAlloc; /* number of *bytes* being allocated in heap */ hunk ./includes/stg/Types.h 96 typedef StgWord64 StgWord; typedef StgInt32 StgHalfInt; typedef StgWord32 StgHalfWord; +typedef StgInt16 StgQtrInt; +typedef StgWord16 StgQtrWord; #else #if SIZEOF_VOID_P == 4 typedef StgInt32 StgInt; hunk ./includes/stg/Types.h 104 typedef StgWord32 StgWord; typedef StgInt16 StgHalfInt; typedef StgWord16 StgHalfWord; +typedef StgInt8 StgQtrInt; +typedef StgWord8 StgQtrWord; #else #error GHC untested on this architecture: sizeof(void *) != 4 or 8 #endif hunk ./rts/Apply.cmm 123 // Off we go! TICK_ENT_VIA_NODE(); + W_ info; + info = %INFO_PTR(UNTAG(R1)); + if (IS_FORWARDING_PTR(info)) { + R1 = UN_FORWARDING_PTR(info) | GETTAG(R1); + } + #ifdef NO_ARG_REGS jump %GET_ENTRY(UNTAG(R1)); #else hunk ./rts/Apply.cmm 132 - W_ info; info = %GET_FUN_INFO(UNTAG(R1)); W_ type; type = TO_W_(StgFunInfoExtra_fun_type(info)); hunk ./rts/Apply.cmm 205 // Off we go! TICK_ENT_VIA_NODE(); + W_ info; + info = %INFO_PTR(UNTAG(R1)); + if (IS_FORWARDING_PTR(info)) { + R1 = UN_FORWARDING_PTR(info) | GETTAG(R1); + } + #ifdef NO_ARG_REGS jump %GET_ENTRY(UNTAG(R1)); #else hunk ./rts/Apply.cmm 214 - W_ info; info = %GET_FUN_INFO(UNTAG(R1)); W_ type; type = TO_W_(StgFunInfoExtra_fun_type(info)); hunk ./rts/Apply.cmm 280 // Off we go! TICK_ENT_VIA_NODE(); + W_ info; + info = %INFO_PTR(UNTAG(R1)); + if (IS_FORWARDING_PTR(info)) { + R1 = UN_FORWARDING_PTR(info) | GETTAG(R1); + } + #ifdef NO_ARG_REGS jump %GET_ENTRY(UNTAG(R1)); #else hunk ./rts/Apply.cmm 289 - W_ info; info = %GET_FUN_INFO(UNTAG(R1)); W_ type; type = TO_W_(StgFunInfoExtra_fun_type(info)); hunk ./rts/Arena.c 88 arena_blocks += req_blocks; bd->gen_no = 0; + bd->gen_ix = 0; bd->gen = NULL; hunk ./rts/Arena.c 90 - bd->dest = NULL; + bd->dest_ix = 0; bd->flags = 0; bd->free = bd->start; bd->link = arena->current; hunk ./rts/Capability.c 227 cap->returning_tasks_hd = NULL; cap->returning_tasks_tl = NULL; cap->inbox = (Message*)END_TSO_QUEUE; + cap->n_inbox = 0; cap->sparks_created = 0; cap->sparks_dud = 0; cap->sparks_converted = 0; hunk ./rts/Capability.c 250 cap->mut_lists[g] = NULL; } - cap->free_tvar_watch_queues = END_STM_WATCH_QUEUE; cap->free_invariant_check_queues = END_INVARIANT_CHECK_QUEUE; cap->free_trec_chunks = END_STM_CHUNK_LIST; cap->free_trec_headers = NO_TREC; hunk ./rts/Capability.c 387 return; } - if (waiting_for_gc == PENDING_GC_SEQ) { + if (waiting_for_gc == GC_SEQ) { last_free_capability = cap; // needed? debugTrace(DEBUG_sched, "GC pending, set capability %d free", cap->no); return; hunk ./rts/Capability.c 601 { Capability *cap = *pCap; - if (waiting_for_gc == PENDING_GC_PAR) { + if (waiting_for_gc == GC_PAR) { traceEventGcStart(cap); gcWorkerThread(cap); traceEventGcEnd(cap); hunk ./rts/Capability.c 845 ------------------------------------------------------------------------ */ void -markSomeCapabilities (evac_fn evac, void *user, nat i0, nat delta, - rtsBool no_mark_sparks USED_IF_THREADS) +markCapability (evac_fn evac, void *user, Capability *cap, + rtsBool no_mark_sparks USED_IF_THREADS) { hunk ./rts/Capability.c 848 - nat i; - Capability *cap; InCall *incall; // Each GC thread is responsible for following roots from the hunk ./rts/Capability.c 855 // or fewer Capabilities as GC threads, but just in case there // are more, we mark every Capability whose number is the GC // thread's index plus a multiple of the number of GC threads. - for (i = i0; i < n_capabilities; i += delta) { - cap = &capabilities[i]; - evac(user, (StgClosure **)(void *)&cap->run_queue_hd); - evac(user, (StgClosure **)(void *)&cap->run_queue_tl); + evac(user, (StgClosure **)(void *)&cap->run_queue_hd); + evac(user, (StgClosure **)(void *)&cap->run_queue_tl); #if defined(THREADED_RTS) hunk ./rts/Capability.c 858 - evac(user, (StgClosure **)(void *)&cap->inbox); + evac(user, (StgClosure **)(void *)&cap->inbox); #endif hunk ./rts/Capability.c 860 - for (incall = cap->suspended_ccalls; incall != NULL; - incall=incall->next) { - evac(user, (StgClosure **)(void *)&incall->suspended_tso); - } + + for (incall = cap->suspended_ccalls; incall != NULL; + incall = incall->next) { + evac(user, (StgClosure **)(void *)&incall->suspended_tso); + } #if defined(THREADED_RTS) hunk ./rts/Capability.c 867 - if (!no_mark_sparks) { - traverseSparkQueue (evac, user, cap); - } -#endif + if (!no_mark_sparks) { + traverseSparkQueue (evac, user, cap); } hunk ./rts/Capability.c 870 +#endif hunk ./rts/Capability.c 872 -#if !defined(THREADED_RTS) - evac(user, (StgClosure **)(void *)&blocked_queue_hd); - evac(user, (StgClosure **)(void *)&blocked_queue_tl); - evac(user, (StgClosure **)(void *)&sleeping_queue); -#endif + // Free STM structures for this Capability + stmPreGCHook(cap); } void hunk ./rts/Capability.c 879 markCapabilities (evac_fn evac, void *user) { - markSomeCapabilities(evac, user, 0, 1, rtsFalse); + nat n; + for (n = 0; n < n_capabilities; n++) { + markCapability(evac, user, &capabilities[n], rtsFalse); + } } hunk ./rts/Capability.c 884 - -/* ----------------------------------------------------------------------------- - Messages - -------------------------------------------------------------------------- */ - hunk ./rts/Capability.h 97 // Messages, or END_TSO_QUEUE. Message *inbox; + nat n_inbox; +#if 0 + rtsBool await_reply; +#endif SparkPool *sparks; hunk ./rts/Capability.h 199 extern Capability *last_free_capability; // GC indicator, in scope for the scheduler -#define PENDING_GC_SEQ 1 -#define PENDING_GC_PAR 2 +#define GC_SEQ 1 +#define GC_PAR 2 +#define GC_LOCAL 3 extern volatile StgWord waiting_for_gc; // Acquires a capability at a return point. If *cap is non-NULL, then hunk ./rts/Capability.h 215 // void waitForReturnCapability (Capability **cap/*in/out*/, Task *task); -EXTERN_INLINE void recordMutableCap (StgClosure *p, Capability *cap, nat gen); +EXTERN_INLINE void recordMutableCap (Capability *cap, StgClosure *p, nat gen); EXTERN_INLINE void recordClosureMutated (Capability *cap, StgClosure *p); hunk ./rts/Capability.h 218 +EXTERN_INLINE void recordClosureMutated_ (Capability *cap, StgClosure *p); #if defined(THREADED_RTS) hunk ./rts/Capability.h 289 void freeCapabilities (void); // For the GC: -void markSomeCapabilities (evac_fn evac, void *user, nat i0, nat delta, - rtsBool no_mark_sparks); +void markCapability (evac_fn evac, void *user, Capability *cap, + rtsBool no_mark_sparks USED_IF_THREADS); + void markCapabilities (evac_fn evac, void *user); hunk ./rts/Capability.h 293 + void traverseSparkQueues (evac_fn evac, void *user); /* ----------------------------------------------------------------------------- hunk ./rts/Capability.h 311 * -------------------------------------------------------------------------- */ EXTERN_INLINE void -recordMutableCap (StgClosure *p, Capability *cap, nat gen) +recordMutableCap (Capability *cap, StgClosure *p, nat gen_no) { bdescr *bd; hunk ./rts/Capability.h 318 // We must own this Capability in order to modify its mutable list. // ASSERT(cap->running_task == myTask()); // NO: assertion is violated by performPendingThrowTos() - bd = cap->mut_lists[gen]; + bd = cap->mut_lists[gen_no]; if (bd->free >= bd->start + BLOCK_SIZE_W) { bdescr *new_bd; new_bd = allocBlock_lock(); hunk ./rts/Capability.h 324 new_bd->link = bd; bd = new_bd; - cap->mut_lists[gen] = bd; + cap->mut_lists[gen_no] = bd; } *bd->free++ = (StgWord)p; } hunk ./rts/Capability.h 329 -EXTERN_INLINE void -recordClosureMutated (Capability *cap, StgClosure *p) -{ - bdescr *bd; - bd = Bdescr((StgPtr)p); - if (bd->gen_no != 0) recordMutableCap(p,cap,bd->gen_no); -} +/* + * Not currently called, but might be required in the future if we + * have multiple local generations. + */ +// EXTERN_INLINE void +// recordClosureMutated (Capability *cap, StgClosure *p) +// { +// bdescr *bd; +// bd = Bdescr((StgPtr)p); +// if (bd->gen_no != 0) { +// barf("recordClosureMutated", cap); // recordMutableCap(cap,p,bd->gen_no); +// } +// } hunk ./rts/Capability.h 343 +// For TSO and IND_LOCALs: +EXTERN_INLINE void +recordClosureMutated_ (Capability *cap, StgClosure *p) +{ + if (isGlobal(p)) { + recordMutableCap(cap,p,global_gen_no); // XXX gen wrong + } +} #if defined(THREADED_RTS) INLINE_HEADER rtsBool hunk ./rts/ClosureFlags.c 55 [IND] = ( _NS| _IND ), [IND_PERM] = ( _NS| _IND ), [IND_STATIC] = ( _NS|_STA| _IND ), + [IND_LOCAL] = ( _NS| _IND ), [RET_BCO] = ( _BTM ), [RET_SMALL] = ( _BTM| _SRT ), [RET_BIG] = ( _SRT ), hunk ./rts/ClosureFlags.c 70 [MVAR_CLEAN] = (_HNF| _NS| _MUT|_UPT ), [MVAR_DIRTY] = (_HNF| _NS| _MUT|_UPT ), [ARR_WORDS] = (_HNF| _NS| _UPT ), - [MUT_ARR_PTRS_CLEAN] = (_HNF| _NS| _MUT|_UPT ), - [MUT_ARR_PTRS_DIRTY] = (_HNF| _NS| _MUT|_UPT ), + [MUT_ARR_PTRS_LOCAL] = (_HNF| _NS| _MUT|_UPT ), + [MUT_ARR_PTRS_GLOBAL] = (_HNF| _NS| _MUT|_UPT ), [MUT_ARR_PTRS_FROZEN0] = (_HNF| _NS| _MUT|_UPT ), [MUT_ARR_PTRS_FROZEN] = (_HNF| _NS| _UPT ), hunk ./rts/ClosureFlags.c 74 - [MUT_VAR_CLEAN] = (_HNF| _NS| _MUT|_UPT ), - [MUT_VAR_DIRTY] = (_HNF| _NS| _MUT|_UPT ), + [MUT_VAR_LOCAL] = (_HNF| _NS| _MUT|_UPT ), + [MUT_VAR_GLOBAL] = (_HNF| _NS| _MUT|_UPT ), [WEAK] = (_HNF| _NS| _UPT ), [PRIM] = (_HNF| _NS| _UPT ), [MUT_PRIM] = (_HNF| _NS| _MUT|_UPT ), hunk ./rts/ClosureFlags.c 88 [WHITEHOLE] = ( 0 ) }; -#if N_CLOSURE_TYPES != 61 +#if N_CLOSURE_TYPES != 62 #error Closure types changed: update ClosureFlags.c! #endif hunk ./rts/Exception.cmm 311 W_ msg; out = Sp - WDS(1); /* ok to re-use stack space here */ + SAVE_THREAD_STATE(); (msg) = foreign "C" throwTo(MyCapability() "ptr", CurrentTSO "ptr", target "ptr", hunk ./rts/Exception.cmm 316 exception "ptr") [R1,R2]; + + // throwTo may have to globalise the current TSO and the + // target, so make sure we get the new ones: + LOAD_THREAD_STATE(); if (msg == NULL) { jump %ENTRY_CODE(Sp(0)); hunk ./rts/HeapStackCheck.cmm 334 W_ info; W_ type; + info = %INFO_PTR(UNTAG(R1)); + if (IS_FORWARDING_PTR(info)) { + R1 = UN_FORWARDING_PTR(info) | GETTAG(R1); + } info = %GET_FUN_INFO(UNTAG(R1)); // cache the size hunk ./rts/HeapStackCheck.cmm 396 { R1 = Sp(2); Sp_adj(3); + + W_ info; + info = %INFO_PTR(UNTAG(R1)); + if (IS_FORWARDING_PTR(info)) { + R1 = UN_FORWARDING_PTR(info) + GETTAG(R1); + info = %INFO_PTR(UNTAG(R1)); + } + #ifdef NO_ARG_REGS // Minor optimisation: there are no argument registers to load up, // so we can just jump straight to the function's entry point. hunk ./rts/HeapStackCheck.cmm 407 - jump %GET_ENTRY(UNTAG(R1)); + jump %ENTRY_CODE(info); #else W_ info; W_ type; hunk ./rts/HeapStackCheck.cmm 412 - info = %GET_FUN_INFO(UNTAG(R1)); + info = %FUN_INFO(info); type = TO_W_(StgFunInfoExtra_fun_type(info)); if (type == ARG_GEN || type == ARG_GEN_BIG) { jump StgFunInfoExtra_slow_apply(info); hunk ./rts/HeapStackCheck.cmm 640 BLOCK_BUT_FIRST(stg_block_putmvar_finally); } -stg_block_blackhole +stg_block_enter { Sp_adj(-2); Sp(1) = R1; hunk ./rts/Interpreter.c 280 case IND: case IND_PERM: + case IND_LOCAL: case IND_STATIC: { tagged_obj = ((StgInd*)obj)->indirectee; hunk ./rts/LdvProfile.c 66 case STACK: case MVAR_CLEAN: case MVAR_DIRTY: - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: case MUT_ARR_PTRS_FROZEN: case MUT_ARR_PTRS_FROZEN0: case ARR_WORDS: hunk ./rts/LdvProfile.c 72 case WEAK: - case MUT_VAR_CLEAN: - case MUT_VAR_DIRTY: + case MUT_VAR_LOCAL: + case MUT_VAR_GLOBAL: case BCO: case PRIM: case MUT_PRIM: hunk ./rts/LdvProfile.c 118 // because they will perish before the next census at any // rate. case IND: + case IND_LOCAL: // Found a dead closure: record its size LDV_recordDead(c, size); return size; hunk ./rts/LdvProfile.c 235 } else { processNurseryForDead(); for (g = 0; g <= N; g++) { - processHeapForDead(generations[g].old_blocks); - processChainForDead(generations[g].large_objects); + processHeapForDead(old_generations[g].old_blocks); + processChainForDead(old_generations[g].large_objects); } } } hunk ./rts/LdvProfile.h 26 // Invoked when: // 1) Hp is incremented and exceeds HpLim (in Updates.hc). // 2) copypart() is called (in GC.c). -#define LDV_FILL_SLOP(from, howMany) \ +#define LDV_FILL_SLOP(from, howMany) \ if (era > 0) { \ hunk ./rts/LdvProfile.h 28 - int i; \ - for (i = 0;i < (howMany); i++) \ - ((StgWord *)(from))[i] = 0; \ + ZERO_SLOP(from, i); \ } // Informs the LDV profiler that closure c has just been evacuated. hunk ./rts/Linker.c 779 SymI_HasProto(stg_deRefWeakzh) \ SymI_HasProto(stg_deRefStablePtrzh) \ SymI_HasProto(dirty_MUT_VAR) \ + SymI_HasProto(dirty_MUT_ARR) \ SymI_HasProto(stg_forkzh) \ SymI_HasProto(stg_forkOnzh) \ SymI_HasProto(forkProcess) \ hunk ./rts/Linker.c 911 SymI_HasProto(stg_MVAR_DIRTY_info) \ SymI_HasProto(stg_IND_STATIC_info) \ SymI_HasProto(stg_ARR_WORDS_info) \ - SymI_HasProto(stg_MUT_ARR_PTRS_DIRTY_info) \ - SymI_HasProto(stg_MUT_ARR_PTRS_FROZEN_info) \ - SymI_HasProto(stg_MUT_ARR_PTRS_FROZEN0_info) \ + SymI_HasProto(stg_MUT_ARR_PTRS_LOCAL_info) \ + SymI_HasProto(stg_MUT_ARR_PTRS_GLOBAL_info) \ + SymI_HasProto(stg_MUT_ARR_PTRS_FROZEN_info) \ + SymI_HasProto(stg_MUT_ARR_PTRS_FROZEN0_info) \ SymI_HasProto(stg_WEAK_info) \ SymI_HasProto(stg_ap_v_info) \ SymI_HasProto(stg_ap_f_info) \ hunk ./rts/Linker.c 986 SymI_NeedsProto(stg_interp_constr_entry) \ SymI_HasProto(stg_arg_bitmaps) \ SymI_HasProto(large_alloc_lim) \ - SymI_HasProto(g0) \ SymI_HasProto(allocate) \ SymI_HasProto(allocateExec) \ SymI_HasProto(freeExec) \ hunk ./rts/Messages.c 17 #include "Threads.h" #include "RaiseAsync.h" #include "sm/Storage.h" +#include "sm/Globalise.h" +#include "sm/GCThread.h" +#include "WritePolicy.h" /* ---------------------------------------------------------------------------- Send a message to another Capability hunk ./rts/Messages.c 27 #ifdef THREADED_RTS -void sendMessage(Capability *from_cap, Capability *to_cap, Message *msg) +#define INBOX_THRESHOLD 0 + +void sendMessage(Capability *from_cap STG_UNUSED, Capability *to_cap, Message *msg) { ACQUIRE_LOCK(&to_cap->lock); hunk ./rts/Messages.c 39 if (i != &stg_MSG_THROWTO_info && i != &stg_MSG_BLACKHOLE_info && i != &stg_MSG_TRY_WAKEUP_info && - i != &stg_IND_info && // can happen if a MSG_BLACKHOLE is revoked + i != &stg_STUB_MSG_BLACKHOLE_info && // can happen if a MSG_BLACKHOLE is revoked + i != &stg_MSG_GLOBALISE_info && i != &stg_WHITEHOLE_info) { barf("sendMessage: %p", i); } hunk ./rts/Messages.c 45 } -#endif hunk ./rts/Messages.c 46 - msg->link = to_cap->inbox; - to_cap->inbox = msg; - - recordClosureMutated(from_cap,(StgClosure*)msg); +#endif + // not necessary, since all messages are in the global heap: + // recordClosureMutated(from_cap,(StgClosure*)msg); + ASSERT(isGlobal((StgClosure*)msg)); if (to_cap->running_task == NULL) { hunk ./rts/Messages.c 52 - to_cap->running_task = myTask(); +#if 0 + Task *task = myTask(); + to_cap->running_task = task; // precond for releaseCapability_() hunk ./rts/Messages.c 56 + task->cap = to_cap; + executeMessage(to_cap, msg); releaseCapability_(to_cap,rtsFalse); hunk ./rts/Messages.c 59 + task->cap = from_cap; +#else + msg->link = to_cap->inbox; + to_cap->inbox = msg; + to_cap->n_inbox++; + to_cap->running_task = myTask(); + releaseCapability_(to_cap,rtsFalse); +#endif } else { hunk ./rts/Messages.c 68 - contextSwitchCapability(to_cap); + if (to_cap->n_inbox > INBOX_THRESHOLD) { + contextSwitchCapability(to_cap); + } + msg->link = to_cap->inbox; + to_cap->inbox = msg; + to_cap->n_inbox++; } RELEASE_LOCK(&to_cap->lock); hunk ./rts/Messages.c 98 if (i == &stg_MSG_TRY_WAKEUP_info) { StgTSO *tso = ((MessageWakeup *)m)->tso; - debugTraceCap(DEBUG_sched, cap, "message: try wakeup thread %ld", + debugTraceCap(DEBUG_sched, cap, "exec message: try wakeup thread %ld", (lnat)tso->id); tryWakeupThread(cap, tso); } hunk ./rts/Messages.c 114 goto loop; } - debugTraceCap(DEBUG_sched, cap, "message: throwTo %ld -> %ld", + debugTraceCap(DEBUG_sched, cap, "exec message: throwTo %ld -> %ld", (lnat)t->source->id, (lnat)t->target->id); ASSERT(t->source->why_blocked == BlockedOnMsgThrowTo); hunk ./rts/Messages.c 147 } return; } - else if (i == &stg_IND_info || i == &stg_MSG_NULL_info) + else if (i == &stg_MSG_GLOBALISE_info) + { + MessageGlobalise *g = (MessageGlobalise*)m; + StgClosure *p; + const StgInfoTable *info; + + debugTraceCap(DEBUG_sched, cap, "exec message: globalise %p for thread %lu", + g->req, (lnat)g->tso->id); + + p = UNTAG_CLOSURE(g->req); + ASSERT(isGlobal((StgClosure*)p)); + info = get_itbl(p); + + // paranoia + if (info->type == IND_LOCAL && info->srt_bitmap == cap->no) { + info = get_itbl(UNTAG_CLOSURE(((StgInd*)p)->indirectee)); + if (info->type == BLACKHOLE) { + nat r; + r = messageBlackHole(cap,(MessageBlackHole*)m); + if (r != 0) return; + } + ((StgInd*)p)->indirectee = + MSG_GLOB_GLOBALISE(cap, ((StgInd*)p)->indirectee); + SET_INFO(p, &stg_IND_info); + } + + // even if we didn't globalise: wake up the thread. If the + // message came to the wrong place, then the source thread + // will try again and emit a message to the right place. + tryWakeupThread(cap, g->tso); + return; + + } + else if (i == &stg_STUB_MSG_BLACKHOLE_info || i == &stg_MSG_NULL_info) { // message was revoked return; hunk ./rts/Messages.c 224 StgClosure *bh = UNTAG_CLOSURE(msg->bh); StgTSO *owner; - debugTraceCap(DEBUG_sched, cap, "message: thread %d blocking on blackhole %p", + debugTraceCap(DEBUG_sched, cap, "exec message: thread %d blocking on blackhole %p", (lnat)msg->tso->id, msg->bh); info = bh->header.info; hunk ./rts/Messages.c 252 p = UNTAG_CLOSURE((StgClosure*)VOLATILE_LOAD(&((StgInd*)bh)->indirectee)); info = p->header.info; - if (info == &stg_IND_info) + if (info == &stg_STUB_BLOCKING_QUEUE_info) { // This could happen, if e.g. we got a BLOCKING_QUEUE that has // just been replaced with an IND by another thread in hunk ./rts/Messages.c 267 #ifdef THREADED_RTS if (owner->cap != cap) { + msg->link = (MessageBlackHole*)END_TSO_QUEUE; // just make it valid + msg = (MessageBlackHole*)globalise_(cap, (StgClosure*)msg); sendMessage(cap, owner->cap, (Message*)msg); debugTraceCap(DEBUG_sched, cap, "forwarding message to cap %d", owner->cap->no); return 1; hunk ./rts/Messages.c 278 // Capability. msg->tso is the first thread to block on this // BLACKHOLE, so we first create a BLOCKING_QUEUE object. - bq = (StgBlockingQueue*)allocate(cap, sizeofW(StgBlockingQueue)); - + bq = (StgBlockingQueue*)allocatePrim(cap, sizeofW(StgBlockingQueue)); + // initialise the BLOCKING_QUEUE object SET_HDR(bq, &stg_BLOCKING_QUEUE_DIRTY_info, CCS_SYSTEM); bq->bh = bh; hunk ./rts/Messages.c 310 pushOnRunQueue(cap,owner); } + // if the BLACKHOLE is global, the BLOCKING_QUEUE and the TSO + // must also be visible globally. + globaliseWRT(cap, bh, (StgClosure**)&bq); + // point to the BLOCKING_QUEUE from the BLACKHOLE write_barrier(); // make the BQ visible ((StgInd*)bh)->indirectee = (StgClosure *)bq; hunk ./rts/Messages.c 317 - recordClosureMutated(cap,bh); // bh was mutated + // not necessary: if the bh was global, we allocated the BQ globally + // recordClosureMutated(cap,bh); debugTraceCap(DEBUG_sched, cap, "thread %d blocked on thread %d", (lnat)msg->tso->id, (lnat)owner->id); hunk ./rts/Messages.c 338 #ifdef THREADED_RTS if (owner->cap != cap) { + msg->link = (MessageBlackHole*)END_TSO_QUEUE; // just make it valid + msg = (MessageBlackHole*)globalise_(cap, (StgClosure*)msg); sendMessage(cap, owner->cap, (Message*)msg); debugTraceCap(DEBUG_sched, cap, "forwarding message to cap %d", owner->cap->no); return 1; hunk ./rts/Messages.c 348 msg->link = bq->queue; bq->queue = msg; - recordClosureMutated(cap,(StgClosure*)msg); + // no need to do this, the msg will be global if the bq is + // recordClosureMutated(cap,(StgClosure*)msg); if (info == &stg_BLOCKING_QUEUE_CLEAN_info) { bq->header.info = &stg_BLOCKING_QUEUE_DIRTY_info; hunk ./rts/Messages.c 353 - recordClosureMutated(cap,(StgClosure*)bq); + // no need to do this, the bq will be global if the msg is + // recordClosureMutated(cap,(StgClosure*)bq); } debugTraceCap(DEBUG_sched, cap, "thread %d blocked on thread %d", hunk ./rts/Messages.c 372 return 0; // not blocked } + +/* ---------------------------------------------------------------------------- + Requesting globalisation of a closure + -------------------------------------------------------------------------- */ + + +nat +#ifndef THREADED_RTS + GNUC3_ATTRIBUTE(__noreturn__) +#endif +messageGlobalise (Capability *cap USED_IF_THREADS, + StgTSO *tso USED_IF_THREADS, + StgClosure *p USED_IF_THREADS, + nat owner USED_IF_THREADS) +{ +#ifndef THREADED_RTS + + barf("messageGlobalise in non-THREADED_RTS"); + +#else + +#if ASYNC_GLOBALISE + int r; + r = pthread_mutex_trylock(&gc_threads[owner]->local_gc_lock); + if (r == 0) + { + StgClosure *res; + res = globaliseFull_(cap, ((StgInd*)p)->indirectee); + + // If we didn't manage to globalise it (maybe a BLACKHOLE), + // then continue below and send a MSG_GLOBALISE. + if (get_itbl(UNTAG_CLOSURE(res))->type != IND_LOCAL) { + ((StgInd*)p)->indirectee = res; + SET_INFO(p, &stg_IND_info); + RELEASE_LOCK(&gc_threads[owner]->local_gc_lock); + return 0; + } + + RELEASE_LOCK(&gc_threads[owner]->local_gc_lock); + } +#endif + + { + Capability *dest; + MessageGlobalise *msg; + + // No: The owner might be turning it into an IND. Don't look at + // the info table. + // ASSERT(get_itbl(p)->type == IND_LOCAL); + + ASSERT(Bdescr((P_)p)->gen_ix == global_gen_ix); + + dest = &capabilities[owner]; + + debugTraceCap(DEBUG_sched, cap, + "thread %lu requesting globalisation of closure at %p from cap %u", + (lnat)tso->id, p, (nat)dest->no); + + ACQUIRE_LOCK(&dest->lock); + + if (dest->running_task == NULL) + { + Task *task = cap->running_task; + const StgInfoTable *info; + + info = get_itbl(UNTAG_CLOSURE(p)); + + if (info->type == IND_LOCAL && info->srt_bitmap == dest->no) { + + // if the closure is a BLACKHOLE, it cannot be + // globalised, and we have to block anyway. So + // we fall back to sending a message. + info = get_itbl(UNTAG_CLOSURE(((StgInd*)p)->indirectee)); + if (info->type == BLACKHOLE) { + goto message; + } + + // release the lock on the cap, because we're about to + // do a (possibly lengthy) globalise operation. Make + // sure we mark the cap as owned by the current Task + // first, however. + dest->running_task = task; + RELEASE_LOCK(&dest->lock); + + ((StgInd*)p)->indirectee = + globaliseFull_(dest, ((StgInd*)p)->indirectee); + SET_INFO(p, &stg_IND_info); + + releaseCapability(dest); + } + else + { + releaseCapability_(dest,rtsFalse); + RELEASE_LOCK(&dest->lock); + } + + return 0; + } + + message: + msg = (MessageGlobalise*)allocatePrim(cap, sizeofW(MessageGlobalise)); + setGlobal((StgClosure*)msg); + + SET_HDR(msg, &stg_MSG_GLOBALISE_info, CCS_SYSTEM); + msg->tso = tso; + msg->req = p; + + msg->link = dest->inbox; + dest->inbox = (Message*)msg; + dest->n_inbox++; + + if (dest->running_task == NULL) { + releaseCapability_(dest,rtsFalse); + } else { + if (dest->n_inbox > INBOX_THRESHOLD) { + contextSwitchCapability(dest); + } + } + RELEASE_LOCK(&dest->lock); + + tso->block_info.closure = (StgClosure*)msg; + tso->why_blocked = BlockedOnMsgGlobalise; + return 1; + } +#endif +} + hunk ./rts/Messages.h 11 #include "BeginPrivate.h" -nat messageBlackHole(Capability *cap, MessageBlackHole *msg); +nat messageBlackHole (Capability *cap, MessageBlackHole *msg); +nat messageGlobalise (Capability *cap, StgTSO *tso, StgClosure *p, nat owner); #ifdef THREADED_RTS void executeMessage (Capability *cap, Message *m); hunk ./rts/Messages.h 25 INLINE_HEADER void doneWithMsgThrowTo (MessageThrowTo *m) { - OVERWRITING_CLOSURE((StgClosure*)m); + overwritingPrimClosure((StgClosure*)m, + sizeofW(MessageThrowTo), sizeofW(Message)); unlockClosure((StgClosure*)m, &stg_MSG_NULL_info); LDV_RECORD_CREATE(m); } hunk ./rts/PrimOps.cmm 29 * ---------------------------------------------------------------------------*/ #include "Cmm.h" +#include "WritePolicy.h" #ifdef __PIC__ import pthread_mutex_lock; hunk ./rts/PrimOps.cmm 65 n = R1; payload_words = ROUNDUP_BYTES_TO_WDS(n); words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words; - ("ptr" p) = foreign "C" allocate(MyCapability() "ptr",words) []; + ("ptr" p) = foreign "C" allocatePrim(MyCapability() "ptr",words) []; TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0); SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]); StgArrWords_bytes(p) = n; hunk ./rts/PrimOps.cmm 158 // number of words. size = n + mutArrPtrsCardWords(n); words = BYTES_TO_WDS(SIZEOF_StgMutArrPtrs) + size; - ("ptr" arr) = foreign "C" allocate(MyCapability() "ptr",words) [R2]; + ("ptr" arr) = foreign "C" allocatePrim(MyCapability() "ptr",words) [R2]; TICK_ALLOC_PRIM(SIZEOF_StgMutArrPtrs, WDS(n), 0); hunk ./rts/PrimOps.cmm 161 - SET_HDR(arr, stg_MUT_ARR_PTRS_DIRTY_info, W_[CCCS]); + SET_HDR(arr, stg_MUT_ARR_PTRS_LOCAL_info, W_[CCCS]); StgMutArrPtrs_ptrs(arr) = n; StgMutArrPtrs_size(arr) = size; hunk ./rts/PrimOps.cmm 205 // we put it on the mutable list more than once, but it would get scavenged // multiple times during GC, which would be unnecessarily slow. // - if (StgHeader_info(R1) != stg_MUT_ARR_PTRS_FROZEN0_info) { - SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info); - recordMutable(R1, R1); - // must be done after SET_INFO, because it ASSERTs closure_MUTABLE() - RET_P(R1); - } else { - SET_INFO(R1,stg_MUT_ARR_PTRS_DIRTY_info); - RET_P(R1); - } + W_ bd; + bd = Bdescr(R1); + if (isGlobalPrim(bd,R1)) { + SET_INFO(R1,stg_MUT_ARR_PTRS_GLOBAL_info); + } else { + SET_INFO(R1,stg_MUT_ARR_PTRS_LOCAL_info); + } } /* ----------------------------------------------------------------------------- hunk ./rts/PrimOps.cmm 223 W_ mv; /* Args: R1 = initialisation value */ - ALLOC_PRIM( SIZEOF_StgMutVar, R1_PTR, stg_newMutVarzh); - - mv = Hp - SIZEOF_StgMutVar + WDS(1); - SET_HDR(mv,stg_MUT_VAR_DIRTY_info,W_[CCCS]); + MAYBE_GC(R1_PTR, stg_newMutVarzh); + ("ptr" mv) = foreign "C" allocatePrim(MyCapability() "ptr", + BYTES_TO_WDS(SIZEOF_StgMutVar)) [R1]; + + SET_HDR(mv,stg_MUT_VAR_LOCAL_info,W_[CCCS]); StgMutVar_var(mv) = R1; RET_P(mv); hunk ./rts/PrimOps.cmm 298 LDV_RECORD_CREATE(r); StgThunk_payload(r,0) = z; - retry: x = StgMutVar_var(mv); StgThunk_payload(z,1) = x; hunk ./rts/PrimOps.cmm 300 + + // globalise the value we're going to store in the MUT_VAR, if + // necessary. If the MUT_VAR is not global, it must be in our + // local heap, and hence it can't be globalised during the + // operation, so we only need to check once: + // + W_ y_gbl; + if (GET_INFO(mv) == stg_MUT_VAR_GLOBAL_info) { + ("ptr" y_gbl) = foreign "C" dirty_MUT_VAR(BaseReg "ptr", y) []; + // don't forget, z may have moved now: + z = StgThunk_payload(y_gbl,0); + } else { + y_gbl = y; + } + + retry: + #ifdef THREADED_RTS hunk ./rts/PrimOps.cmm 318 - (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y) []; - if (h != x) { goto retry; } + (h) = foreign "C" cas(mv + SIZEOF_StgHeader + OFFSET_StgMutVar_var, x, y_gbl) []; + if (h != x) { + x = StgMutVar_var(mv); + StgThunk_payload(z,1) = x; + goto retry; + } #else hunk ./rts/PrimOps.cmm 325 - StgMutVar_var(mv) = y; + StgMutVar_var(mv) = y_gbl; #endif hunk ./rts/PrimOps.cmm 328 - if (GET_INFO(mv) == stg_MUT_VAR_CLEAN_info) { - foreign "C" dirty_MUT_VAR(BaseReg "ptr", mv "ptr") []; - } - RET_P(r); } hunk ./rts/PrimOps.cmm 349 R3 = stg_NO_FINALIZER_closure; } - ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR & R3_PTR, stg_mkWeakzh ); + MAYBE_GC(R1_PTR & R2_PTR & R3_PTR, stg_mkWeakzh); + + ("ptr" w) = foreign "C" allocatePrim(MyCapability() "ptr", + BYTES_TO_WDS(SIZEOF_StgWeak)) [R1,R2,R3]; hunk ./rts/PrimOps.cmm 354 - w = Hp - SIZEOF_StgWeak + WDS(1); SET_HDR(w, stg_WEAK_info, W_[CCCS]); // We don't care about cfinalizer here. hunk ./rts/PrimOps.cmm 366 StgWeak_cfinalizer(w) = stg_NO_FINALIZER_closure; ACQUIRE_LOCK(sm_mutex); - StgWeak_link(w) = W_[weak_ptr_list]; - W_[weak_ptr_list] = w; + StgWeak_link(w) = generation_weak_ptrs(StgRegTable_rG0(BaseReg)); + generation_weak_ptrs(StgRegTable_rG0(BaseReg)) = w; RELEASE_LOCK(sm_mutex); IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []); hunk ./rts/PrimOps.cmm 395 flag = R5; eptr = R6; - ALLOC_PRIM( SIZEOF_StgWeak, R1_PTR & R2_PTR, stg_mkWeakForeignEnvzh ); + MAYBE_GC(R1_PTR & R2_PTR, stg_mkWeakForeignEnvzh); + ("ptr" w) = foreign "C" allocatePrim(MyCapability() "ptr", + BYTES_TO_WDS(SIZEOF_StgWeak)) [R1,R2]; hunk ./rts/PrimOps.cmm 399 - w = Hp - SIZEOF_StgWeak + WDS(1); SET_HDR(w, stg_WEAK_info, W_[CCCS]); payload_words = 4; hunk ./rts/PrimOps.cmm 403 words = BYTES_TO_WDS(SIZEOF_StgArrWords) + payload_words; - ("ptr" p) = foreign "C" allocate(MyCapability() "ptr", words) []; + ("ptr" p) = foreign "C" allocatePrim(MyCapability() "ptr", words) []; TICK_ALLOC_PRIM(SIZEOF_StgArrWords,WDS(payload_words),0); SET_HDR(p, stg_ARR_WORDS_info, W_[CCCS]); hunk ./rts/PrimOps.cmm 423 StgWeak_cfinalizer(w) = p; ACQUIRE_LOCK(sm_mutex); - StgWeak_link(w) = W_[weak_ptr_list]; - W_[weak_ptr_list] = w; + StgWeak_link(w) = W_[generation_weak_ptrs(StgRegTable_rG0(BaseReg))]; + W_[generation_weak_ptrs(StgRegTable_rG0(BaseReg))] = w; RELEASE_LOCK(sm_mutex); IF_DEBUG(weak, foreign "C" debugBelch(stg_weak_msg,w) []); hunk ./rts/PrimOps.cmm 1143 /* args: none */ W_ mvar; - ALLOC_PRIM ( SIZEOF_StgMVar, NO_PTRS, stg_newMVarzh ); + MAYBE_GC(NO_PTRS, stg_newMVarzh); + ("ptr" mvar) = foreign "C" allocatePrim(MyCapability(), + BYTES_TO_WDS(SIZEOF_StgMVar)) []; hunk ./rts/PrimOps.cmm 1147 - mvar = Hp - SIZEOF_StgMVar + WDS(1); SET_HDR(mvar,stg_MVAR_DIRTY_info,W_[CCCS]); // MVARs start dirty: generation 0 has no mutable list StgMVar_head(mvar) = stg_END_TSO_QUEUE_closure; hunk ./rts/PrimOps.cmm 1159 #define PerformTake(stack, value) \ W_[StgStack_sp(stack) + WDS(1)] = value; \ W_[StgStack_sp(stack) + WDS(0)] = stg_gc_unpt_r1_info; +// no need to globalise_wrt: the if the mvar is global, then the value and tso +// will be too. #define PerformPut(stack,lval) \ StgStack_sp(stack) = StgStack_sp(stack) + WDS(3); \ hunk ./rts/PrimOps.cmm 1168 stg_takeMVarzh { - W_ mvar, val, info, tso, q; + W_ mvar, val, info, tso, q, bd, new_tso; /* args: R1 = MVar closure */ mvar = R1; hunk ./rts/PrimOps.cmm 1179 info = GET_INFO(mvar); #endif - if (info == stg_MVAR_CLEAN_info) { - foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") []; - } + // Not necessary until we have multiple local generations: + // if (info == stg_MVAR_CLEAN_info) { + // foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") []; + // } /* If the MVar is empty, put ourselves on its blocking queue, * and wait until we're woken up. hunk ./rts/PrimOps.cmm 1189 */ if (StgMVar_value(mvar) == stg_END_TSO_QUEUE_closure) { - // Note [mvar-heap-check] We want to do the heap check in the - // branch here, to avoid the conditional in the common case. - // However, we've already locked the MVar above, so we better - // be careful to unlock it again if the the heap check fails. - // Unfortunately we don't have an easy way to inject any code - // into the heap check generated by the code generator, so we - // have to do it in stg_gc_gen (see HeapStackCheck.cmm). - HP_CHK_GEN_TICKY(SIZEOF_StgMVarTSOQueue, R1_PTR, stg_takeMVarzh); - - q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1); - + ("ptr" q) = foreign "C" allocatePrim(MyCapability(), + BYTES_TO_WDS(SIZEOF_StgMVarTSOQueue)); SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM); StgMVarTSOQueue_link(q) = END_TSO_QUEUE; StgMVarTSOQueue_tso(q) = CurrentTSO; hunk ./rts/PrimOps.cmm 1195 + bd = Bdescr(mvar); + if (isGlobalPrim(bd,mvar)) { + ("ptr" new_tso) = foreign "C" globalise_(MyCapability(), q); + } + if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) { StgMVar_head(mvar) = q; } else { hunk ./rts/PrimOps.cmm 1204 StgMVarTSOQueue_link(StgMVar_tail(mvar)) = q; - foreign "C" recordClosureMutated(MyCapability() "ptr", - StgMVar_tail(mvar)) []; + // No need to do this, at least until we have multiple + // local generations: + // foreign "C" recordClosureMutated(MyCapability() "ptr", + // StgMVar_tail(mvar)) []; } StgTSO__link(CurrentTSO) = q; StgTSO_block_info(CurrentTSO) = mvar; hunk ./rts/PrimOps.cmm 1289 RET_NP(0, stg_NO_FINALIZER_closure); } - if (info == stg_MVAR_CLEAN_info) { - foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") []; - } + // Not necessary until we have multiple local generations: + // if (info == stg_MVAR_CLEAN_info) { + // foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") []; + // } /* we got the value... */ val = StgMVar_value(mvar); hunk ./rts/PrimOps.cmm 1341 stg_putMVarzh { - W_ mvar, val, info, tso, q; + W_ mvar, val, info, tso, q, bd, new_tso; /* args: R1 = MVar, R2 = value */ mvar = R1; hunk ./rts/PrimOps.cmm 1353 info = GET_INFO(mvar); #endif - if (info == stg_MVAR_CLEAN_info) { - foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr"); - } + // Not necessary until we have multiple local generations: + // if (info == stg_MVAR_CLEAN_info) { + // foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") []; + // } if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) { hunk ./rts/PrimOps.cmm 1360 - // see Note [mvar-heap-check] above - HP_CHK_GEN_TICKY(SIZEOF_StgMVarTSOQueue, R1_PTR & R2_PTR, stg_putMVarzh); - - q = Hp - SIZEOF_StgMVarTSOQueue + WDS(1); - + ("ptr" q) = foreign "C" allocatePrim(MyCapability(), + BYTES_TO_WDS(SIZEOF_StgMVarTSOQueue)); SET_HDR(q, stg_MVAR_TSO_QUEUE_info, CCS_SYSTEM); StgMVarTSOQueue_link(q) = END_TSO_QUEUE; StgMVarTSOQueue_tso(q) = CurrentTSO; hunk ./rts/PrimOps.cmm 1366 + bd = Bdescr(mvar); + if (isGlobalPrim(bd,mvar)) { + ("ptr" new_tso) = foreign "C" globalise_(MyCapability(), q); + } + if (StgMVar_head(mvar) == stg_END_TSO_QUEUE_closure) { StgMVar_head(mvar) = q; } else { hunk ./rts/PrimOps.cmm 1375 StgMVarTSOQueue_link(StgMVar_tail(mvar)) = q; - foreign "C" recordClosureMutated(MyCapability() "ptr", - StgMVar_tail(mvar)) []; + // No need to do this, at least until we have multiple + // local generations: + // foreign "C" recordClosureMutated(MyCapability() "ptr", + // StgMVar_tail(mvar)) []; } StgTSO__link(CurrentTSO) = q; StgTSO_block_info(CurrentTSO) = mvar; hunk ./rts/PrimOps.cmm 1390 jump stg_block_putmvar; } + bd = Bdescr(mvar); + if (isGlobalPrim(bd,mvar)) { + ("ptr" val) = foreign "C" MVAR_GLOBALISE(MyCapability(), val); + } + q = StgMVar_head(mvar); loop: if (q == stg_END_TSO_QUEUE_closure) { hunk ./rts/PrimOps.cmm 1400 /* No further takes, the MVar is now full. */ StgMVar_value(mvar) = val; - unlockClosure(mvar, stg_MVAR_DIRTY_info); + unlockClosure(mvar, stg_MVAR_DIRTY_info); jump %ENTRY_CODE(Sp(0)); } if (StgHeader_info(q) == stg_IND_info || hunk ./rts/PrimOps.cmm 1427 // indicate that the MVar operation has now completed. StgTSO__link(tso) = stg_END_TSO_QUEUE_closure; + + // if the TSO we're waking up belongs to another cap, then the + // MVAR must be global, because otherwise it couldn't have blocked + // on this MVAR. Hence, the value we're writing must be global + // too, so we don't need a write barrier to update this TSO. + // Indeed, we can't use a write barrier if the TSO belongs to + // another cap, because we would be putting it on the wrong + // mutable list. + // + // if (TO_W_(StgStack_dirty(stack)) == 0) { + // foreign "C" dirty_STACK(MyCapability() "ptr", stack "ptr") []; + // } hunk ./rts/PrimOps.cmm 1440 - if (TO_W_(StgStack_dirty(stack)) == 0) { - foreign "C" dirty_STACK(MyCapability() "ptr", stack "ptr") []; - } - foreign "C" tryWakeupThread(MyCapability() "ptr", tso) []; unlockClosure(mvar, stg_MVAR_DIRTY_info); hunk ./rts/PrimOps.cmm 1449 stg_tryPutMVarzh { - W_ mvar, val, info, tso, q; + W_ mvar, val, info, tso, bd, q; /* args: R1 = MVar, R2 = value */ mvar = R1; hunk ./rts/PrimOps.cmm 1461 info = GET_INFO(mvar); #endif - if (info == stg_MVAR_CLEAN_info) { - foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr"); - } + // Not necessary until we have multiple local generations: + // if (info == stg_MVAR_CLEAN_info) { + // foreign "C" dirty_MVAR(BaseReg "ptr", mvar "ptr") []; + // } if (StgMVar_value(mvar) != stg_END_TSO_QUEUE_closure) { #if defined(THREADED_RTS) hunk ./rts/PrimOps.cmm 1473 RET_N(0); } + bd = Bdescr(mvar); + if (isGlobalPrim(bd,mvar)) { + ("ptr" val) = foreign "C" MVAR_GLOBALISE(MyCapability(), val); + } + q = StgMVar_head(mvar); loop: if (q == stg_END_TSO_QUEUE_closure) { hunk ./rts/PrimOps.cmm 1483 /* No further takes, the MVar is now full. */ StgMVar_value(mvar) = val; - unlockClosure(mvar, stg_MVAR_DIRTY_info); + unlockClosure(mvar, stg_MVAR_DIRTY_info); RET_N(1); } if (StgHeader_info(q) == stg_IND_info || hunk ./rts/PrimOps.cmm 1511 // indicate that the MVar operation has now completed. StgTSO__link(tso) = stg_END_TSO_QUEUE_closure; - if (TO_W_(StgStack_dirty(stack)) == 0) { - foreign "C" dirty_STACK(MyCapability() "ptr", stack "ptr") []; - } - + // if the TSO we're waking up belongs to another cap, then the + // MVAR must be global, because otherwise it couldn't have blocked + // on this MVAR. Hence, the value we're writing must be global + // too, so we don't need a write barrier to update this TSO. + // Indeed, we can't use a write barrier if the TSO belongs to + // another cap, because we would be putting it on the wrong + // mutable list. + // + // if (TO_W_(StgStack_dirty(stack)) == 0) { + // foreign "C" dirty_STACK(MyCapability() "ptr", stack "ptr") []; + // } + foreign "C" tryWakeupThread(MyCapability() "ptr", tso) []; unlockClosure(mvar, stg_MVAR_DIRTY_info); hunk ./rts/PrimOps.cmm 1538 { W_ index, sn_obj; - ALLOC_PRIM( SIZEOF_StgStableName, R1_PTR, stg_makeStableNamezh ); - + MAYBE_GC(R1_PTR, stg_makeStableNamezh); + (index) = foreign "C" lookupStableName(R1 "ptr") []; /* Is there already a StableName for this heap object? hunk ./rts/PrimOps.cmm 1546 * stable_ptr_table is a pointer to an array of snEntry structs. */ if ( snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) == NULL ) { - sn_obj = Hp - SIZEOF_StgStableName + WDS(1); + ("ptr" sn_obj) = foreign "C" allocatePrim(MyCapability() "ptr", + BYTES_TO_WDS(SIZEOF_StgStableName)) [R1]; SET_HDR(sn_obj, stg_STABLE_NAME_info, W_[CCCS]); StgStableName_sn(sn_obj) = index; snEntry_sn_obj(W_[stable_ptr_table] + index*SIZEOF_snEntry) = sn_obj; hunk ./rts/PrimOps.cmm 1596 words = BYTES_TO_WDS(SIZEOF_StgBCO) + BYTE_ARR_WDS(bitmap_arr); bytes = WDS(words); - ALLOC_PRIM( bytes, R1_PTR&R2_PTR&R3_PTR&R5_PTR, stg_newBCOzh ); + MAYBE_GC(R1_PTR & R2_PTR & R3_PTR & R5_PTR, stg_newBCOzh); + ("ptr" bco) = foreign "C" allocatePrim(MyCapability() "ptr", words) + [R1,R2,R3,R5]; hunk ./rts/PrimOps.cmm 1600 - bco = Hp - bytes + WDS(1); SET_HDR(bco, stg_BCO_info, W_[CCCS]); StgBCO_instrs(bco) = R1; hunk ./rts/PrimOps.cmm 1631 // This function is *only* used to wrap zero-arity BCOs in an // updatable wrapper (see ByteCodeLink.lhs). An AP thunk is always // saturated and always points directly to a FUN or BCO. - ASSERT(%INFO_TYPE(%GET_STD_INFO(R1)) == HALF_W_(BCO) && - StgBCO_arity(R1) == HALF_W_(0)); + ASSERT(TO_W_(%INFO_TYPE(%GET_STD_INFO(R1))) == BCO && + TO_W_(StgBCO_arity(R1)) == 0); HP_CHK_GEN_TICKY(SIZEOF_StgAP, R1_PTR, stg_mkApUpd0zh); TICK_ALLOC_UP_THK(0, 0); hunk ./rts/PrimOps.cmm 1655 W_ info, ptrs, nptrs, p, ptrs_arr, nptrs_arr; info = %GET_STD_INFO(UNTAG(R1)); + MAYBE_GC(R1_PTR, stg_unpackClosurezh); + // Some closures have non-standard layout, so we omit those here. W_ type; type = TO_W_(%INFO_TYPE(info)); hunk ./rts/PrimOps.cmm 1684 ptrs_arr_cards = mutArrPtrsCardWords(ptrs); ptrs_arr_sz = SIZEOF_StgMutArrPtrs + WDS(ptrs) + WDS(ptrs_arr_cards); - ALLOC_PRIM (ptrs_arr_sz + nptrs_arr_sz, R1_PTR, stg_unpackClosurezh); - W_ clos; clos = UNTAG(R1); hunk ./rts/PrimOps.cmm 1687 - ptrs_arr = Hp - nptrs_arr_sz - ptrs_arr_sz + WDS(1); - nptrs_arr = Hp - nptrs_arr_sz + WDS(1); + ("ptr" ptrs_arr ) = foreign "C" allocatePrim(MyCapability(), ptrs_arr_sz) []; + ("ptr" nptrs_arr) = foreign "C" allocatePrim(MyCapability(), nptrs_arr_sz) []; SET_HDR(ptrs_arr, stg_MUT_ARR_PTRS_FROZEN_info, W_[CCCS]); StgMutArrPtrs_ptrs(ptrs_arr) = ptrs; hunk ./rts/PrimOps.cmm 1986 stg_noDuplicatezh { + W_ new_tso; STK_CHK_GEN( WDS(1), NO_PTRS, stg_noDuplicatezh ); // leave noDuplicate frame in case the current // computation is suspended and restarted (see above). hunk ./rts/PrimOps.cmm 1995 SAVE_THREAD_STATE(); ASSERT(StgTSO_what_next(CurrentTSO) == ThreadRunGHC::I16); - foreign "C" threadPaused (MyCapability() "ptr", CurrentTSO "ptr") []; + ("ptr" new_tso) = foreign "C" threadPaused(MyCapability() "ptr", CurrentTSO) []; + CurrentTSO = new_tso; if (StgTSO_what_next(CurrentTSO) == ThreadKilled::I16) { jump stg_threadFinished; hunk ./rts/Printer.c 231 debugBelch(")\n"); break; + case IND_LOCAL: + debugBelch("IND_LOCAL("); + printPtr((StgPtr)((StgInd*)obj)->indirectee); + debugBelch(")\n"); + break; + case IND_PERM: debugBelch("IND("); printPtr((StgPtr)((StgInd*)obj)->indirectee); hunk ./rts/Printer.c 313 break; } - case MUT_ARR_PTRS_CLEAN: - debugBelch("MUT_ARR_PTRS_CLEAN(size=%lu)\n", (lnat)((StgMutArrPtrs *)obj)->ptrs); + case MUT_ARR_PTRS_LOCAL: + debugBelch("MUT_ARR_PTRS_LOCAL(size=%lu)\n", (lnat)((StgMutArrPtrs *)obj)->ptrs); break; hunk ./rts/Printer.c 317 - case MUT_ARR_PTRS_DIRTY: - debugBelch("MUT_ARR_PTRS_DIRTY(size=%lu)\n", (lnat)((StgMutArrPtrs *)obj)->ptrs); + case MUT_ARR_PTRS_GLOBAL: + debugBelch("MUT_ARR_PTRS_GLOBAL(size=%lu)\n", (lnat)((StgMutArrPtrs *)obj)->ptrs); break; case MUT_ARR_PTRS_FROZEN: hunk ./rts/Printer.c 333 break; } - case MUT_VAR_CLEAN: + case MUT_VAR_LOCAL: { StgMutVar* mv = (StgMutVar*)obj; hunk ./rts/Printer.c 336 - debugBelch("MUT_VAR_CLEAN(var=%p)\n", mv->var); + debugBelch("MUT_VAR_LOCAL(var=%p)\n", mv->var); break; } hunk ./rts/Printer.c 340 - case MUT_VAR_DIRTY: + case MUT_VAR_GLOBAL: { StgMutVar* mv = (StgMutVar*)obj; hunk ./rts/Printer.c 343 - debugBelch("MUT_VAR_DIRTY(var=%p)\n", mv->var); + debugBelch("MUT_VAR_GLOBAL(var=%p)\n", mv->var); break; } hunk ./rts/Printer.c 907 #endif /* HAVE_BFD_H */ -void findPtr(P_ p, int); /* keep gcc -Wall happy */ - -int searched = 0; - -static int -findPtrBlocks (StgPtr p, bdescr *bd, StgPtr arr[], int arr_size, int i) -{ - StgPtr q, r, end; - for (; bd; bd = bd->link) { - searched++; - for (q = bd->start; q < bd->free; q++) { - if (UNTAG_CLOSURE((StgClosure*)*q) == (StgClosure *)p) { - if (i < arr_size) { - for (r = bd->start; r < bd->free; r = end) { - // skip over zeroed-out slop - while (*r == 0) r++; - if (!LOOKS_LIKE_CLOSURE_PTR(r)) { - debugBelch("%p found at %p, no closure at %p\n", - p, q, r); - break; - } - end = r + closure_sizeW((StgClosure*)r); - if (q < end) { - debugBelch("%p = ", r); - printClosure((StgClosure *)r); - arr[i++] = r; - break; - } - } - if (r >= bd->free) { - debugBelch("%p found at %p, closure?", p, q); - } - } else { - return i; - } - } - } - } - return i; -} - -void -findPtr(P_ p, int follow) -{ - nat g; - bdescr *bd; - const int arr_size = 1024; - StgPtr arr[arr_size]; - int i = 0; - searched = 0; - - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - bd = generations[g].blocks; - i = findPtrBlocks(p,bd,arr,arr_size,i); - bd = generations[g].large_objects; - i = findPtrBlocks(p,bd,arr,arr_size,i); - if (i >= arr_size) return; - } - if (follow && i == 1) { - debugBelch("-->\n"); - findPtr(arr[0], 1); - } -} /* prettyPrintClosure() is for printing out a closure using the data constructor names found in the info tables. Closures are printed in a fashion that resembles hunk ./rts/Printer.c 935 while (type == IND || type == IND_STATIC || + type == IND_LOCAL || type == IND_PERM) { obj = ((StgInd *)obj)->indirectee; hunk ./rts/Printer.c 1048 [AP_STACK] = "AP_STACK", [IND] = "IND", [IND_PERM] = "IND_PERM", + [IND_LOCAL] = "IND_LOCAL", [IND_STATIC] = "IND_STATIC", [RET_BCO] = "RET_BCO", [RET_SMALL] = "RET_SMALL", hunk ./rts/Printer.c 1064 [MVAR_CLEAN] = "MVAR_CLEAN", [MVAR_DIRTY] = "MVAR_DIRTY", [ARR_WORDS] = "ARR_WORDS", - [MUT_ARR_PTRS_CLEAN] = "MUT_ARR_PTRS_CLEAN", - [MUT_ARR_PTRS_DIRTY] = "MUT_ARR_PTRS_DIRTY", + [MUT_ARR_PTRS_LOCAL] = "MUT_ARR_PTRS_LOCAL", + [MUT_ARR_PTRS_GLOBAL] = "MUT_ARR_PTRS_GLOBAL", [MUT_ARR_PTRS_FROZEN0] = "MUT_ARR_PTRS_FROZEN0", [MUT_ARR_PTRS_FROZEN] = "MUT_ARR_PTRS_FROZEN", hunk ./rts/Printer.c 1068 - [MUT_VAR_CLEAN] = "MUT_VAR_CLEAN", - [MUT_VAR_DIRTY] = "MUT_VAR_DIRTY", + [MUT_VAR_LOCAL] = "MUT_VAR_LOCAL", + [MUT_VAR_GLOBAL] = "MUT_VAR_GLOBAL", [WEAK] = "WEAK", [PRIM] = "PRIM", [MUT_PRIM] = "MUT_PRIM", hunk ./rts/ProfHeap.c 879 case CONSTR: case FUN: case IND_PERM: + case IND_LOCAL: case BLACKHOLE: case BLOCKING_QUEUE: case FUN_1_0: hunk ./rts/ProfHeap.c 916 case WEAK: case PRIM: case MUT_PRIM: - case MUT_VAR_CLEAN: - case MUT_VAR_DIRTY: + case MUT_VAR_LOCAL: + case MUT_VAR_GLOBAL: prim = rtsTrue; size = sizeW_fromITBL(info); break; hunk ./rts/ProfHeap.c 939 size = arr_words_sizeW((StgArrWords*)p); break; - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: case MUT_ARR_PTRS_FROZEN: case MUT_ARR_PTRS_FROZEN0: prim = rtsTrue; hunk ./rts/ProfHeap.c 1079 #endif // Traverse the heap, collecting the census info - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - heapCensusChain( census, generations[g].blocks ); + for (g = 0; g < total_generations; g++) { + heapCensusChain( census, all_generations[g].blocks ); // Are we interested in large objects? might be // confusing to include the stack in a heap profile. hunk ./rts/ProfHeap.c 1083 - heapCensusChain( census, generations[g].large_objects ); + heapCensusChain( census, all_generations[g].large_objects ); } // dump out the census info hunk ./rts/RaiseAsync.c 158 { MessageThrowTo *msg; - msg = (MessageThrowTo *) allocate(cap, sizeofW(MessageThrowTo)); + // globalise everything up front. The alternative, to globalise + // later (in throwToSendMsg() and blockedThrowTo()) doesn't work + // because we would be globalising a WHITEHOLE closure. + globalise(cap, (StgClosure**)&source); + globalise(cap, (StgClosure**)&target); + globalise(cap, (StgClosure**)&exception); + + msg = (MessageThrowTo *) allocatePrim(cap, sizeofW(MessageThrowTo)); + setGlobal((StgClosure*)msg); + // message starts locked; the caller has to unlock it when it is // ready. SET_HDR(msg, &stg_WHITEHOLE_info, CCS_SYSTEM); hunk ./rts/RaiseAsync.c 178 switch (throwToMsg(cap, msg)) { case THROWTO_SUCCESS: +#ifdef DEBUG + // so that sanity checking can understand this closure. + SET_INFO(msg, &stg_MSG_THROWTO_info); +#endif return NULL; case THROWTO_BLOCKED: default: hunk ./rts/RaiseAsync.c 365 blockedThrowTo(cap,target,msg); return THROWTO_BLOCKED; } else { - // Revoke the message by replacing it with IND. We're not - // locking anything here, so we might still get a TRY_WAKEUP - // message from the owner of the blackhole some time in the - // future, but that doesn't matter. + // Revoke the message by replacing it with + // STUB_MSG_BLACKHOLE. We're not locking anything here, so we + // might still get a TRY_WAKEUP message from the owner of the + // blackhole some time in the future, but that doesn't matter. ASSERT(target->block_info.bh->header.info == &stg_MSG_BLACKHOLE_info); hunk ./rts/RaiseAsync.c 370 - OVERWRITE_INFO(target->block_info.bh, &stg_IND_info); + OVERWRITE_PRIM_CLOSURE((StgClosure*)target->block_info.bh, + &stg_STUB_MSG_BLACKHOLE_info, + sizeofW(MessageBlackHole), + sizeofW(StgInd)); + raiseAsync(cap, target, msg->exception, rtsFalse, NULL); + return THROWTO_SUCCESS; + } + } + + case BlockedOnMsgGlobalise: + { + if (target->flags & TSO_BLOCKEX) { + // BlockedOnMsgGlobalise is not interruptible. + blockedThrowTo(cap,target,msg); + return THROWTO_BLOCKED; + } else { + // We could revoke the message, but it won't do any harm + // if we just leave it in flight. raiseAsync(cap, target, msg->exception, rtsFalse, NULL); return THROWTO_SUCCESS; } hunk ./rts/RaiseAsync.c 625 if (mvar->head == q) { mvar->head = q->link; - q->header.info = &stg_IND_info; + OVERWRITE_PRIM_CLOSURE((StgClosure*)q, &stg_IND_info, + sizeofW(StgMVarTSOQueue), sizeofW(StgInd)); if (mvar->tail == q) { mvar->tail = (StgMVarTSOQueue*)END_TSO_QUEUE; } hunk ./rts/RaiseAsync.c 636 // we lose the tail pointer when the GC shorts out the IND. // So we use MSG_NULL as a kind of non-dupable indirection; // these are ignored by takeMVar/putMVar. - q->header.info = &stg_MSG_NULL_info; + OVERWRITE_PRIM_CLOSURE((StgClosure*)q, &stg_MSG_NULL_info, + sizeofW(StgMVarTSOQueue), sizeofW(Message)); } else { hunk ./rts/RaiseAsync.c 640 - q->header.info = &stg_IND_info; + OVERWRITE_PRIM_CLOSURE((StgClosure*)q, &stg_IND_info, + sizeofW(StgMVarTSOQueue), sizeofW(StgInd)); } // revoke the MVar operation hunk ./rts/RaiseAsync.c 687 break; } + case BlockedOnMsgGlobalise: + // nothing to do + goto done; + #if !defined(THREADED_RTS) case BlockedOnRead: case BlockedOnWrite: hunk ./rts/RetainerProfile.c 461 return; // one child (fixed), no SRT - case MUT_VAR_CLEAN: - case MUT_VAR_DIRTY: + case MUT_VAR_LOCAL: + case MUT_VAR_GLOBAL: *first_child = ((StgMutVar *)c)->var; return; case THUNK_SELECTOR: hunk ./rts/RetainerProfile.c 469 *first_child = ((StgSelector *)c)->selectee; return; case IND_PERM: + case IND_LOCAL: case BLACKHOLE: *first_child = ((StgInd *)c)->indirectee; return; hunk ./rts/RetainerProfile.c 522 break; // StgMutArrPtr.ptrs, no SRT - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: case MUT_ARR_PTRS_FROZEN: case MUT_ARR_PTRS_FROZEN0: init_ptrs(&se.info, ((StgMutArrPtrs *)c)->ptrs, hunk ./rts/RetainerProfile.c 855 case BCO: case CONSTR_STATIC: // StgMutArrPtr.ptrs, no SRT - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: case MUT_ARR_PTRS_FROZEN: case MUT_ARR_PTRS_FROZEN0: *c = find_ptrs(&se->info); hunk ./rts/RetainerProfile.c 921 case CONSTR_0_2: case ARR_WORDS: // one child (fixed), no SRT - case MUT_VAR_CLEAN: - case MUT_VAR_DIRTY: + case MUT_VAR_LOCAL: + case MUT_VAR_GLOBAL: case THUNK_SELECTOR: case IND_PERM: case CONSTR_1_1: hunk ./rts/RetainerProfile.c 1015 case MUT_PRIM: case MVAR_CLEAN: case MVAR_DIRTY: - case MUT_VAR_CLEAN: - case MUT_VAR_DIRTY: - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_VAR_LOCAL: + case MUT_VAR_GLOBAL: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: case MUT_ARR_PTRS_FROZEN: case MUT_ARR_PTRS_FROZEN0: hunk ./rts/RetainerProfile.c 1067 // as isAlive doesn't look through IND_STATIC as it ignores static // closures. See trac #3956 for a program that hit this error. case IND_STATIC: + case IND_LOCAL: case BLACKHOLE: // static objects case CONSTR_STATIC: hunk ./rts/RetainerProfile.c 1792 // // The following code assumes that WEAK objects are considered to be roots // for retainer profilng. - for (weak = weak_ptr_list; weak != NULL; weak = weak->link) - // retainRoot((StgClosure *)weak); - retainRoot(NULL, (StgClosure **)&weak); + for (g = 0; g < total_generations; g++) { + for (weak = &all_generations[g].weak_ptrs; weak != NULL; + weak = weak->link) { + // retainRoot((StgClosure *)weak); + retainRoot(NULL, (StgClosure **)&weak); + } + } // Consider roots from the stable ptr table. markStablePtrTable(retainRoot, NULL); hunk ./rts/RetainerProfile.c 1806 // The following code resets the rs field of each unvisited mutable // object (computing sumOfNewCostExtra and updating costArray[] when // debugging retainer profiler). - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { + for (g = 0; g < total_generations; g++) { // NOT TRUE: even G0 has a block on its mutable list // ASSERT(g != 0 || (generations[g].mut_list == NULL)); hunk ./rts/RetainerProfile.c 1813 // Traversing through mut_list is necessary // because we can find MUT_VAR objects which have not been // visited during retainer profiling. - for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) { +#ifdef THREADED_RTS + if (RtsFlags.ParFlags.nNodes != 0) + barf("RetainerProfile.c: this code needs fixing for -N > 0"); +#endif + for (bd = capabilities[0].mut_lists[g]; bd != NULL; bd = bd->link) { for (ml = bd->start; ml < bd->free; ml++) { maybeInitRetainerSet((StgClosure *)*ml); hunk ./rts/RtsFlags.c 120 RtsFlags.DebugFlags.squeeze = rtsFalse; RtsFlags.DebugFlags.hpc = rtsFalse; RtsFlags.DebugFlags.sparks = rtsFalse; + RtsFlags.DebugFlags.mallocleaks = rtsFalse; #endif #if defined(PROFILING) hunk ./rts/RtsFlags.c 313 " -Dz DEBUG: stack squezing", " -Dc DEBUG: program coverage", " -Dr DEBUG: sparks", +" -Dk DEBUG: malloc leaks", "", " NOTE: DEBUG events are sent to stderr by default; add -l to create a", " binary event log file instead.", hunk ./rts/RtsFlags.c 701 case 'r': RtsFlags.DebugFlags.sparks = rtsTrue; break; + case 'k': + RtsFlags.DebugFlags.mallocleaks = rtsTrue; + break; default: bad_option( rts_argv[arg] ); } hunk ./rts/RtsFlags.c 1139 error = rtsTrue; break; - case 'b': /* heapBase in hex; undocumented */ + case 'b': /* undocumented: heapBase in hex; 0x prefix optional */ if (rts_argv[arg][3] != '\0') { hunk ./rts/RtsFlags.c 1141 - RtsFlags.GcFlags.heapBase - = strtol(rts_argv[arg]+3, (char **) NULL, 16); + if (rts_argv[arg][4] == '0' && + rts_argv[arg][5] == 'x') { + RtsFlags.GcFlags.heapBase + = strtol(rts_argv[arg]+6, (char **) NULL, 16); + } else { + RtsFlags.GcFlags.heapBase + = strtol(rts_argv[arg]+3, (char **) NULL, 16); + } } else { errorBelch("-xb: requires argument"); error = rtsTrue; hunk ./rts/RtsStartup.c 136 setProgArgv(*argc,*argv); } - /* Initialise the stats department, phase 1 */ - initStats1(); - #ifdef USE_PAPI papi_init(); #endif hunk ./rts/RtsStartup.c 156 /* initialize the storage manager */ initStorage(); + /* Initialise the stats department, phase 1 */ + initStats1(); + + /* after the storage manager has been initialised */ + startWorkers(); + /* initialise the stable pointer table */ initStablePtrTable(); hunk ./rts/RtsStartup.c 362 exitScheduler(wait_foreign); /* run C finalizers for all active weak pointers */ - runAllCFinalizers(weak_ptr_list); + runAllCFinalizers(); #if defined(RTS_USER_SIGNALS) if (RtsFlags.MiscFlags.install_signal_handlers) { hunk ./rts/STM.c 94 #include "STM.h" #include "Trace.h" #include "Threads.h" +#include "sm/Globalise.h" +#include "WritePolicy.h" #include hunk ./rts/STM.c 416 static StgInvariantCheckQueue *new_stg_invariant_check_queue(Capability *cap, StgAtomicInvariant *invariant) { StgInvariantCheckQueue *result; - result = (StgInvariantCheckQueue *)allocate(cap, sizeofW(StgInvariantCheckQueue)); + result = (StgInvariantCheckQueue *)allocatePrim(cap, sizeofW(StgInvariantCheckQueue)); SET_HDR (result, &stg_INVARIANT_CHECK_QUEUE_info, CCS_SYSTEM); result -> invariant = invariant; result -> my_execution = NO_TREC; hunk ./rts/STM.c 423 return result; } -static StgTVarWatchQueue *new_stg_tvar_watch_queue(Capability *cap, - StgClosure *closure) { - StgTVarWatchQueue *result; - result = (StgTVarWatchQueue *)allocate(cap, sizeofW(StgTVarWatchQueue)); - SET_HDR (result, &stg_TVAR_WATCH_QUEUE_info, CCS_SYSTEM); - result -> closure = closure; - return result; -} - static StgTRecChunk *new_stg_trec_chunk(Capability *cap) { StgTRecChunk *result; hunk ./rts/STM.c 425 - result = (StgTRecChunk *)allocate(cap, sizeofW(StgTRecChunk)); + result = (StgTRecChunk *)allocatePrim(cap, sizeofW(StgTRecChunk)); SET_HDR (result, &stg_TREC_CHUNK_info, CCS_SYSTEM); result -> prev_chunk = END_STM_CHUNK_LIST; result -> next_entry_idx = 0; hunk ./rts/STM.c 429 + result -> cap_no = cap -> no; return result; } hunk ./rts/STM.c 433 -static StgTRecHeader *new_stg_trec_header(Capability *cap, - StgTRecHeader *enclosing_trec) { +static StgTRecHeader *new_stg_trec_header(Capability *cap) { StgTRecHeader *result; hunk ./rts/STM.c 435 - result = (StgTRecHeader *) allocate(cap, sizeofW(StgTRecHeader)); - SET_HDR (result, &stg_TREC_HEADER_info, CCS_SYSTEM); hunk ./rts/STM.c 436 - result -> enclosing_trec = enclosing_trec; + result = (StgTRecHeader *) allocatePrim(cap, sizeofW(StgTRecHeader)); + setGlobal((StgClosure*)result); + + SET_HDR (result, &stg_TREC_HEADER_info, CCS_SYSTEM); result -> current_chunk = new_stg_trec_chunk(cap); hunk ./rts/STM.c 441 - result -> invariants_to_check = END_INVARIANT_CHECK_QUEUE; hunk ./rts/STM.c 442 - if (enclosing_trec == NO_TREC) { - result -> state = TREC_ACTIVE; - } else { - ASSERT(enclosing_trec -> state == TREC_ACTIVE || - enclosing_trec -> state == TREC_CONDEMNED); - result -> state = enclosing_trec -> state; - } + recordMutableCap (cap, (StgClosure*)result, global_gen_no); return result; } hunk ./rts/STM.c 467 } static StgTVarWatchQueue *alloc_stg_tvar_watch_queue(Capability *cap, - StgClosure *closure) { + StgClosure *closure, + rtsBool is_local) { StgTVarWatchQueue *result = NULL; hunk ./rts/STM.c 470 - if (cap -> free_tvar_watch_queues == END_STM_WATCH_QUEUE) { - result = new_stg_tvar_watch_queue(cap, closure); - } else { - result = cap -> free_tvar_watch_queues; - result -> closure = closure; - cap -> free_tvar_watch_queues = result -> next_queue_entry; + result = (StgTVarWatchQueue *)allocatePrim(cap, sizeofW(StgTVarWatchQueue)); + if (!is_local) { + setGlobal((StgClosure*)result); } hunk ./rts/STM.c 474 + SET_HDR (result, &stg_TVAR_WATCH_QUEUE_info, CCS_SYSTEM); + result -> closure = closure; return result; } hunk ./rts/STM.c 496 cap -> free_trec_chunks = result -> prev_chunk; result -> prev_chunk = END_STM_CHUNK_LIST; result -> next_entry_idx = 0; + ASSERT(result -> cap_no == cap -> no); } return result; } hunk ./rts/STM.c 513 StgTRecHeader *enclosing_trec) { StgTRecHeader *result = NULL; if (cap -> free_trec_headers == NO_TREC) { - result = new_stg_trec_header(cap, enclosing_trec); + result = new_stg_trec_header(cap); } else { result = cap -> free_trec_headers; cap -> free_trec_headers = result -> enclosing_trec; hunk ./rts/STM.c 517 - result -> enclosing_trec = enclosing_trec; result -> current_chunk -> next_entry_idx = 0; hunk ./rts/STM.c 518 - result -> invariants_to_check = END_INVARIANT_CHECK_QUEUE; - if (enclosing_trec == NO_TREC) { + } + + result -> enclosing_trec = enclosing_trec; + result -> invariants_to_check = END_INVARIANT_CHECK_QUEUE; + result -> cap_no = cap -> no; + + if (enclosing_trec == NO_TREC) { result -> state = TREC_ACTIVE; hunk ./rts/STM.c 526 - } else { + } else { ASSERT(enclosing_trec -> state == TREC_ACTIVE || enclosing_trec -> state == TREC_CONDEMNED); result -> state = enclosing_trec -> state; hunk ./rts/STM.c 530 - } } return result; } hunk ./rts/STM.c 571 ACQ_ASSERT(s -> current_value == (StgClosure *)trec); NACQ_ASSERT(s -> current_value == e -> expected_value); fq = s -> first_watch_queue_entry; - q = alloc_stg_tvar_watch_queue(cap, (StgClosure*) tso); + q = alloc_stg_tvar_watch_queue(cap, (StgClosure*) tso, + isLocal((StgClosure*)s)); q -> next_queue_entry = fq; q -> prev_queue_entry = END_STM_WATCH_QUEUE; if (fq != END_STM_WATCH_QUEUE) { hunk ./rts/STM.c 874 /************************************************************************/ -void stmPreGCHook() { - nat i; - +void stmPreGCHook (Capability *cap) { lock_stm(NO_TREC); TRACE("stmPreGCHook"); hunk ./rts/STM.c 877 - for (i = 0; i < n_capabilities; i ++) { - Capability *cap = &capabilities[i]; - cap -> free_tvar_watch_queues = END_STM_WATCH_QUEUE; - cap -> free_trec_chunks = END_STM_CHUNK_LIST; - cap -> free_trec_headers = NO_TREC; - } + cap->free_tvar_watch_queues = END_STM_WATCH_QUEUE; + cap->free_trec_chunks = END_STM_CHUNK_LIST; + cap->free_trec_headers = NO_TREC; unlock_stm(NO_TREC); } hunk ./rts/STM.c 1123 FOR_EACH_ENTRY(my_execution, e, { StgTVar *s = e -> tvar; - StgTVarWatchQueue *q = alloc_stg_tvar_watch_queue(cap, (StgClosure*)inv); + StgTVarWatchQueue *q = alloc_stg_tvar_watch_queue(cap, (StgClosure*)inv, + isLocal((StgClosure*)s)); StgTVarWatchQueue *fq = s -> first_watch_queue_entry; // We leave "last_execution" holding the values that will be hunk ./rts/STM.c 1165 // 1. Allocate an StgAtomicInvariant, set last_execution to NO_TREC // to signal that this is a new invariant in the current atomic block - invariant = (StgAtomicInvariant *) allocate(cap, sizeofW(StgAtomicInvariant)); + invariant = (StgAtomicInvariant *) allocatePrim(cap, sizeofW(StgAtomicInvariant)); TRACE("%p : stmAddInvariantToCheck allocated invariant=%p", trec, invariant); SET_HDR (invariant, &stg_ATOMIC_INVARIANT_info, CCS_SYSTEM); invariant -> code = code; hunk ./rts/STM.c 1393 IF_STM_FG_LOCKS({ s -> num_updates ++; }); + // we're about to write e->new_value into s, better ensure + // the global heap invariant is maintained: + if (isGlobal((StgClosure*)s)) { + e->new_value = TVAR_GLOBALISE(cap, e->new_value); + } unlock_tvar(trec, s, e -> new_value, TRUE); } ACQ_ASSERT(!tvar_is_locked(s, trec)); hunk ./rts/STM.c 1485 // (Otherwise the transaction was not valid and the thread will have to // retry it). + // Better globalise the TSO. We don't know for sure that this is + // necessary, but checking would be tedious: if any TVar is global + // then the TSO needs to be. + globalise(cap, (StgClosure**)&tso); + // Put ourselves to sleep. We retain locks on all the TVars involved // until we are sound asleep : (a) on the wait queues, (b) BlockedOnSTM // in the TSO, (c) TREC_WAITING in the Trec. hunk ./rts/STM.c 1658 StgTVar *stmNewTVar(Capability *cap, StgClosure *new_value) { StgTVar *result; - result = (StgTVar *)allocate(cap, sizeofW(StgTVar)); + result = (StgTVar *)allocatePrim(cap, sizeofW(StgTVar)); SET_HDR (result, &stg_TVAR_info, CCS_SYSTEM); result -> current_value = new_value; result -> first_watch_queue_entry = END_STM_WATCH_QUEUE; hunk ./rts/STM.h 51 -------------- */ -void stmPreGCHook(void); +void stmPreGCHook(Capability *cap); /*---------------------------------------------------------------------- hunk ./rts/Schedule.c 488 t->saved_winerror = GetLastError(); #endif - traceEventStopThread(cap, t, ret); + if (ret == ThreadBlocked) { + traceEventStopThread(cap, t, t->why_blocked + 6); + } else { + traceEventStopThread(cap, t, ret); + } ASSERT_FULL_CAPABILITY_INVARIANTS(cap,task); ASSERT(t->cap == cap); hunk ./rts/Schedule.c 641 sched_state >= SCHED_INTERRUPTING)) return; +#if 0 + if (cap->await_reply) + { + nat i; + + cap->await_reply = rtsFalse; + + for (;;) { + if (shouldYieldCapability(cap,task)) { + break; + } + if (!emptyRunQueue(cap) || + !emptyInbox(cap) || + sched_state >= SCHED_INTERRUPTING) { + return; + } + } + } +#endif + // otherwise yield (sleep), and keep yielding if necessary. do { yieldCapability(&cap,task); hunk ./rts/Schedule.c 765 setTSOPrev(cap, t, prev); prev = t; } else { + t = globalise_TSO(cap,t); + appendToRunQueue(free_caps[i],t); traceEventMigrateThread (cap, t, free_caps[i]->no); hunk ./rts/Schedule.c 971 scheduleProcessInbox (Capability *cap USED_IF_THREADS) { #if defined(THREADED_RTS) - Message *m; + Message *m, *next; + int r; while (!emptyInbox(cap)) { hunk ./rts/Schedule.c 975 - ACQUIRE_LOCK(&cap->lock); + if (cap->r.rCurrentNursery->link == NULL || + cap->r.rG0->n_new_large_words >= large_alloc_lim) { + scheduleDoGC(cap, cap->running_task, rtsFalse); + } + + // don't use a blocking acquire; if the lock is held by + // another thread then just carry on. This seems to avoid + // getting stuck in a message ping-pong situation with other + // processors. We'll check the inbox again later anyway. + // + // We should really use a more efficient queue data structure + // here. The trickiness is that we must ensure a Capability + // never goes idle if the inbox is non-empty, which is why we + // use cap->lock (cap->lock is released as the last thing + // before going idle; see Capability.c:releaseCapability()). + r = TRY_ACQUIRE_LOCK(&cap->lock); + if (r != 0) return; + m = cap->inbox; hunk ./rts/Schedule.c 994 - cap->inbox = m->link; + cap->inbox = (Message*)END_TSO_QUEUE; + cap->n_inbox = 0; + RELEASE_LOCK(&cap->lock); hunk ./rts/Schedule.c 998 - executeMessage(cap, (Message *)m); + + while (m != (Message*)END_TSO_QUEUE) { + next = m->link; + executeMessage(cap, m); + m = next; + } } #endif } hunk ./rts/Schedule.c 1089 bd = allocGroup_lock(blocks); cap->r.rNursery->n_blocks += blocks; - + // link the new group into the list bd->link = cap->r.rCurrentNursery; bd->u.back = cap->r.rCurrentNursery->u.back; hunk ./rts/Schedule.c 1118 // This assert can be a killer if the app is doing lots // of large block allocations. - IF_DEBUG(sanity, checkNurserySanity(cap->r.rNursery)); + IF_DEBUG(sanity, checkNurserySanity(cap->no)); // now update the nursery to point to the new block cap->r.rCurrentNursery = bd; hunk ./rts/Schedule.c 1262 #endif } - ASSERT(task->incall->tso == t); + ASSERT(task->incall->tso == t); if (t->what_next == ThreadComplete) { if (task->incall->ret) { hunk ./rts/Schedule.c 1330 scheduleDoGC (Capability *cap, Task *task USED_IF_THREADS, rtsBool force_major) { rtsBool heap_census; + nat gc_type, N; #ifdef THREADED_RTS /* extern static volatile StgWord waiting_for_gc; lives inside capability.c */ hunk ./rts/Schedule.c 1334 - rtsBool gc_type, prev_pending_gc; + rtsBool prev_pending_gc; nat i; #endif hunk ./rts/Schedule.c 1345 return cap; } + // The final shutdown GC is always single-threaded, because it's + // possible that some of the Capabilities have no worker threads. + + heap_census = scheduleNeedHeapProfile(rtsTrue); + + if (heap_census || force_major) { + N = RtsFlags.GcFlags.generations - 1; + } else { + N = next_gc_gen; + } + + if (heap_census || sched_state >= SCHED_INTERRUPTING) { + gc_type = GC_SEQ; #ifdef THREADED_RTS hunk ./rts/Schedule.c 1359 - if (sched_state < SCHED_INTERRUPTING - && RtsFlags.ParFlags.parGcEnabled - && N >= RtsFlags.ParFlags.parGcGen - && ! oldest_gen->mark) - { - gc_type = PENDING_GC_PAR; + } else if (next_gc_gen == 0 && !force_major) { + gc_type = GC_LOCAL; + } else if (RtsFlags.ParFlags.parGcEnabled + && n_capabilities > 1 + && next_gc_gen >= RtsFlags.ParFlags.parGcGen + && ! oldest_gen->mark) { + gc_type = GC_PAR; +#endif } else { hunk ./rts/Schedule.c 1368 - gc_type = PENDING_GC_SEQ; + gc_type = GC_SEQ; } // In order to GC, there must be no threads running Haskell code. hunk ./rts/Schedule.c 1386 threads if waiting_for_gc is set. Tested inside yieldCapability() and releaseCapability() in Capability.c */ - prev_pending_gc = cas(&waiting_for_gc, 0, gc_type); - if (prev_pending_gc) { - do { - debugTrace(DEBUG_sched, "someone else is trying to GC (%d)...", - prev_pending_gc); - ASSERT(cap); - yieldCapability(&cap,task); - } while (waiting_for_gc); - return cap; // NOTE: task->cap might have changed here +#ifdef THREADED_RTS + if (gc_type != GC_LOCAL) { + prev_pending_gc = cas(&waiting_for_gc, 0, gc_type); + if (prev_pending_gc) { + do { + debugTrace(DEBUG_sched, "someone else is trying to GC (%d)...", + prev_pending_gc); + ASSERT(cap); + yieldCapability(&cap,task); + } while (waiting_for_gc); + return cap; // NOTE: task->cap might have changed here + } } hunk ./rts/Schedule.c 1399 +#endif hunk ./rts/Schedule.c 1401 - setContextSwitches(); - - // The final shutdown GC is always single-threaded, because it's - // possible that some of the Capabilities have no worker threads. - - if (gc_type == PENDING_GC_SEQ) - { - traceEventRequestSeqGc(cap); - } - else - { + switch (gc_type) { + case GC_PAR: + setContextSwitches(); traceEventRequestParGc(cap); hunk ./rts/Schedule.c 1405 - debugTrace(DEBUG_sched, "ready_to_gc, grabbing GC threads"); + break; + case GC_SEQ: + setContextSwitches(); + traceEventRequestSeqGc(cap); + break; } hunk ./rts/Schedule.c 1412 - if (gc_type == PENDING_GC_SEQ) +#ifdef THREADED_RTS + if (gc_type == GC_SEQ) { // single-threaded GC: grab all the capabilities for (i=0; i < n_capabilities; i++) { hunk ./rts/Schedule.c 1417 - debugTrace(DEBUG_sched, "ready_to_gc, grabbing all the capabilies (%d/%d)", i, n_capabilities); + debugTrace(DEBUG_sched, "scheduleDoGC: grabbing all the capabilies (%d/%d)", i, n_capabilities); if (cap != &capabilities[i]) { Capability *pcap = &capabilities[i]; // we better hope this task doesn't get migrated to hunk ./rts/Schedule.c 1433 } } } - else + else if (gc_type == GC_PAR) { hunk ./rts/Schedule.c 1435 + debugTrace(DEBUG_sched, "scheduleDoGC: grabbing GC threads"); // multi-threaded GC: make sure all the Capabilities donate one // GC thread each. waitForGcThreads(cap); hunk ./rts/Schedule.c 1440 } - #endif IF_DEBUG(scheduler, printAllThreads()); hunk ./rts/Schedule.c 1455 sched_state = SCHED_SHUTTING_DOWN; } - heap_census = scheduleNeedHeapProfile(rtsTrue); - - traceEventGcStart(cap); #if defined(THREADED_RTS) // reset waiting_for_gc *before* GC, so that when the GC threads // emerge they don't immediately re-enter the GC. hunk ./rts/Schedule.c 1459 waiting_for_gc = 0; - GarbageCollect(force_major || heap_census, gc_type, cap); -#else - GarbageCollect(force_major || heap_census, 0, cap); #endif hunk ./rts/Schedule.c 1460 + + traceEventGcStart(cap); + GarbageCollect(N, gc_type, cap); traceEventGcEnd(cap); if (recent_activity == ACTIVITY_INACTIVE && force_major) hunk ./rts/Schedule.c 1483 } #if defined(THREADED_RTS) - if (gc_type == PENDING_GC_PAR) + if (gc_type == GC_PAR) { releaseGCThreads(cap); } hunk ./rts/Schedule.c 1521 #endif #if defined(THREADED_RTS) - if (gc_type == PENDING_GC_SEQ) { + if (gc_type == GC_SEQ) { // release our stash of capabilities. for (i = 0; i < n_capabilities; i++) { if (cap != &capabilities[i]) { hunk ./rts/Schedule.c 1585 pid = fork(); + if (pid) { // parent startTimer(); // #4074 hunk ./rts/Schedule.c 1617 // now gone. for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - for (t = generations[g].threads; t != END_TSO_QUEUE; t = next) { + for (t = all_generations[g].threads; t != END_TSO_QUEUE; t = next) { next = t->global_link; hunk ./rts/Schedule.c 1619 - // don't allow threads to catch the ThreadKilled + // don't allow threads to catch the ThreadKilled // exception, but we do want to raiseAsync() because these // threads may be evaluating thunks that we need later. deleteThread_(cap,t); hunk ./rts/Schedule.c 1646 // Empty the threads lists. Otherwise, the garbage // collector may attempt to resurrect some of these threads. - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - generations[g].threads = END_TSO_QUEUE; + for (g = 0; g < total_generations; g++) { + all_generations[g].threads = END_TSO_QUEUE; } discardTasksExcept(cap->running_task); hunk ./rts/Schedule.c 1680 #else /* !FORKPROCESS_PRIMOP_SUPPORTED */ barf("forkProcess#: primop not supported on this platform, sorry!\n"); #endif + } /* --------------------------------------------------------------------------- hunk ./rts/Schedule.c 1697 debugTrace(DEBUG_sched,"deleting all threads"); for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - for (t = generations[g].threads; t != END_TSO_QUEUE; t = next) { + for (t = all_generations[g].threads; t != END_TSO_QUEUE; t = next) { next = t->global_link; deleteThread(cap,t); } hunk ./rts/Schedule.c 1860 tso = incall->suspended_tso; incall->suspended_tso = NULL; incall->suspended_cap = NULL; + tso->_link = END_TSO_QUEUE; // no write barrier reqd traceEventRunThread(cap, tso); hunk ./rts/Schedule.c 2029 #endif RELEASE_LOCK(&sched_mutex); +} hunk ./rts/Schedule.c 2031 +void +startWorkers(void) +{ #if defined(THREADED_RTS) /* * Eagerly start one worker to run each Capability, except for hunk ./rts/Schedule.c 2111 #endif } +void markScheduler (evac_fn evac USED_IF_NOT_THREADS, + void *user USED_IF_NOT_THREADS) +{ +#if !defined(THREADED_RTS) + evac(user, (StgClosure **)(void *)&blocked_queue_hd); + evac(user, (StgClosure **)(void *)&blocked_queue_tl); + evac(user, (StgClosure **)(void *)&sleeping_queue); +#endif +} + /* ----------------------------------------------------------------------------- performGC hunk ./rts/Schedule.h 24 * Locks assumed : none */ void initScheduler (void); +void startWorkers (void); void exitScheduler (rtsBool wait_foreign); void freeScheduler (void); hunk ./rts/Schedule.h 27 +void markScheduler (evac_fn evac, void *user); // Place a new thread on the run queue of the current Capability void scheduleThread (Capability *cap, StgTSO *tso); hunk ./rts/Schedule.h 125 cap->run_queue_hd = tso; tso->block_info.prev = END_TSO_QUEUE; } else { + cap->run_queue_tl = cap->run_queue_tl; setTSOLink(cap, cap->run_queue_tl, tso); setTSOPrev(cap, tso, cap->run_queue_tl); } hunk ./rts/Schedule.h 145 setTSOLink(cap, tso, cap->run_queue_hd); tso->block_info.prev = END_TSO_QUEUE; if (cap->run_queue_hd != END_TSO_QUEUE) { + cap->run_queue_hd = cap->run_queue_hd; setTSOPrev(cap, cap->run_queue_hd, tso); } cap->run_queue_hd = tso; hunk ./rts/Schedule.h 160 popRunQueue (Capability *cap) { StgTSO *t = cap->run_queue_hd; + ASSERT(t != END_TSO_QUEUE); cap->run_queue_hd = t->_link; if (t->_link != END_TSO_QUEUE) { hunk ./rts/Sparks.c 13 #include "Rts.h" #include "Schedule.h" +#include "sm/Globalise.h" #include "RtsUtils.h" #include "Trace.h" #include "Prelude.h" hunk ./rts/Sparks.c 18 #include "Sparks.h" +#include "WritePolicy.h" #if defined(THREADED_RTS) hunk ./rts/Sparks.c 75 p = UNTAG_CLOSURE(p); if (closure_SHOULD_SPARK(p)) { - pushWSDeque(pool,p); + pushWSDeque(pool, SPARK_GLOBALISE(cap,p)); cap->sparks_created++; } else { cap->sparks_dud++; hunk ./rts/Stable.c 17 #include "RtsUtils.h" #include "Trace.h" #include "Stable.h" +#include "sm/Globalise.h" /* Comment from ADR's implementation in old RTS: hunk ./rts/Stable.c 192 while (get_itbl(q)->type == IND || get_itbl(q)->type == IND_STATIC || + get_itbl(q)->type == IND_LOCAL || get_itbl(q)->type == IND_PERM) { q = ((StgInd *)q)->indirectee; tag = GET_CLOSURE_TAG(q); hunk ./rts/Stable.c 207 { StgWord sn; void* sn_tmp; + Task *task; if (stable_ptr_free == NULL) { enlargeStablePtrTable(); hunk ./rts/Stable.c 218 */ p = (StgPtr)removeIndirections((StgClosure*)p); + // Globalise the pointer, since the stable pointer table is a global + // set of roots. + task = myTask(); // sometimes called for static closures without a Task + if (task != NULL) { + globalise(task->cap, (StgClosure**)&p); + } + // register the untagged pointer. This just makes things simpler. p = (StgPtr)UNTAG_CLOSURE((StgClosure*)p); hunk ./rts/Stats.c 19 #include "GetTime.h" #include "sm/Storage.h" #include "sm/GC.h" // gc_alloc_block_sync, whitehole_spin +#include "sm/GCThread.h" +#include "sm/BlockAlloc.h" #if USE_PAPI #include "Papi.h" hunk ./rts/Stats.c 31 #define TICK_TO_DBL(t) ((double)(t) / TICKS_PER_SECOND) -static Ticks ElapsedTimeStart = 0; +static Ticks + start_init_cpu, start_init_elapsed, + end_init_cpu, end_init_elapsed, + start_exit_cpu, start_exit_elapsed, + end_exit_cpu, end_exit_elapsed; hunk ./rts/Stats.c 37 -static Ticks InitUserTime = 0; -static Ticks InitElapsedTime = 0; -static Ticks InitElapsedStamp = 0; +static Ticks GC_tot_cpu = 0; hunk ./rts/Stats.c 39 -static Ticks MutUserTime = 0; -static Ticks MutElapsedTime = 0; -static Ticks MutElapsedStamp = 0; - -static Ticks ExitUserTime = 0; -static Ticks ExitElapsedTime = 0; - -static StgWord64 GC_tot_alloc = 0; -static StgWord64 GC_tot_copied = 0; +static StgWord64 GC_tot_alloc = 0; +static StgWord64 GC_tot_copied = 0; static StgWord64 GC_par_max_copied = 0; static StgWord64 GC_par_avg_copied = 0; hunk ./rts/Stats.c 45 -static Ticks GC_start_time = 0, GC_tot_time = 0; /* User GC Time */ -static Ticks GCe_start_time = 0, GCe_tot_time = 0; /* Elapsed GC time */ - #ifdef PROFILING hunk ./rts/Stats.c 46 -static Ticks RP_start_time = 0, RP_tot_time = 0; /* retainer prof user time */ -static Ticks RPe_start_time = 0, RPe_tot_time = 0; /* retainer prof elap time */ +static Ticks RP_start_time = 0, RP_tot_time = 0; // retainer prof user time +static Ticks RPe_start_time = 0, RPe_tot_time = 0; // retainer prof elap time static Ticks HC_start_time, HC_tot_time = 0; // heap census prof user time static Ticks HCe_start_time, HCe_tot_time = 0; // heap census prof elap time hunk ./rts/Stats.c 59 #define PROF_VAL(x) 0 #endif -static lnat MaxResidency = 0; // in words; for stats only -static lnat AvgResidency = 0; -static lnat ResidencySamples = 0; // for stats only -static lnat MaxSlop = 0; +static lnat max_residency = 0; // in words; for stats only +static lnat avg_residency = 0; +static lnat residency_samples = 0; // for stats only +static lnat max_slop = 0; hunk ./rts/Stats.c 64 -static lnat GC_start_faults = 0, GC_end_faults = 0; +static lnat GC_end_faults = 0; hunk ./rts/Stats.c 66 -static Ticks *GC_coll_times = NULL; -static Ticks *GC_coll_etimes = NULL; +static Ticks *GC_coll_cpu = NULL; +static Ticks *GC_coll_elapsed = NULL; +static Ticks *GC_coll_max_pause = NULL; static void statsFlush( void ); static void statsClose( void ); hunk ./rts/Stats.c 73 -Ticks stat_getElapsedGCTime(void) -{ - return GCe_tot_time; -} +/* ----------------------------------------------------------------------------- + Current elapsed time + ------------------------------------------------------------------------- */ Ticks stat_getElapsedTime(void) { hunk ./rts/Stats.c 79 - return getProcessElapsedTime() - ElapsedTimeStart; + return getProcessElapsedTime() - start_init_elapsed; } hunk ./rts/Stats.c 82 -/* mut_user_time_during_GC() and mut_user_time() - * - * The former function can be used to get the current mutator time - * *during* a GC, i.e. between stat_startGC and stat_endGC. This is - * used in the heap profiler for accurately time stamping the heap - * sample. - * - * ATTENTION: mut_user_time_during_GC() relies on GC_start_time being - * defined in stat_startGC() - to minimise system calls, - * GC_start_time is, however, only defined when really needed (check - * stat_startGC() for details) - */ -double -mut_user_time_during_GC( void ) -{ - return TICK_TO_DBL(GC_start_time - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time)); -} +/* --------------------------------------------------------------------------- + Measure the current MUT time, for profiling + ------------------------------------------------------------------------ */ double mut_user_time( void ) hunk ./rts/Stats.c 89 { - Ticks user; - user = getProcessCPUTime(); - return TICK_TO_DBL(user - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time)); + Ticks cpu; + cpu = getProcessCPUTime(); + return TICK_TO_DBL(cpu - GC_tot_cpu - PROF_VAL(RP_tot_time + HC_tot_time)); } #ifdef PROFILING hunk ./rts/Stats.c 96 /* - mut_user_time_during_RP() is similar to mut_user_time_during_GC(); - it returns the MUT time during retainer profiling. + mut_user_time_during_RP() returns the MUT time during retainer profiling. The same is for mut_user_time_during_HC(); */ double hunk ./rts/Stats.c 102 mut_user_time_during_RP( void ) { - return TICK_TO_DBL(RP_start_time - GC_tot_time - RP_tot_time - HC_tot_time); + return TICK_TO_DBL(RP_start_time - GC_tot_cpu - RP_tot_time - HC_tot_time); } double hunk ./rts/Stats.c 108 mut_user_time_during_heap_census( void ) { - return TICK_TO_DBL(HC_start_time - GC_tot_time - RP_tot_time - HC_tot_time); + return TICK_TO_DBL(HC_start_time - GC_tot_cpu - RP_tot_time - HC_tot_time); } #endif /* PROFILING */ hunk ./rts/Stats.c 112 -// initStats0() has no dependencies, it can be called right at the beginning +/* --------------------------------------------------------------------------- + initStats0() has no dependencies, it can be called right at the beginning + ------------------------------------------------------------------------ */ + void initStats0(void) { hunk ./rts/Stats.c 119 - ElapsedTimeStart = 0; + start_init_cpu = 0; + start_init_elapsed = 0; + end_init_cpu = 0; + end_init_elapsed = 0; hunk ./rts/Stats.c 124 - InitUserTime = 0; - InitElapsedTime = 0; - InitElapsedStamp = 0; - - MutUserTime = 0; - MutElapsedTime = 0; - MutElapsedStamp = 0; - - ExitUserTime = 0; - ExitElapsedTime = 0; + start_exit_cpu = 0; + start_exit_elapsed = 0; + end_exit_cpu = 0; + end_exit_elapsed = 0; GC_tot_alloc = 0; GC_tot_copied = 0; hunk ./rts/Stats.c 133 GC_par_max_copied = 0; GC_par_avg_copied = 0; - GC_start_time = 0; - GC_tot_time = 0; - GCe_start_time = 0; - GCe_tot_time = 0; + GC_tot_cpu = 0; #ifdef PROFILING RP_start_time = 0; hunk ./rts/Stats.c 147 HCe_tot_time = 0; #endif - MaxResidency = 0; - AvgResidency = 0; - ResidencySamples = 0; - MaxSlop = 0; + max_residency = 0; + avg_residency = 0; + residency_samples = 0; + max_slop = 0; hunk ./rts/Stats.c 152 - GC_start_faults = 0; GC_end_faults = 0; } hunk ./rts/Stats.c 155 -// initStats1() can be called after setupRtsFlags() +/* --------------------------------------------------------------------------- + initStats1() can be called after setupRtsFlags() + ------------------------------------------------------------------------ */ + void initStats1 (void) { hunk ./rts/Stats.c 168 statsPrintf(" Alloc Copied Live GC GC TOT TOT Page Flts\n"); statsPrintf(" bytes bytes bytes user elap user elap\n"); } - GC_coll_times = + GC_coll_cpu = (Ticks *)stgMallocBytes( hunk ./rts/Stats.c 170 - sizeof(Ticks)*RtsFlags.GcFlags.generations, + sizeof(Ticks)*total_generations, "initStats"); hunk ./rts/Stats.c 172 - GC_coll_etimes = + GC_coll_elapsed = (Ticks *)stgMallocBytes( hunk ./rts/Stats.c 174 - sizeof(Ticks)*RtsFlags.GcFlags.generations, + sizeof(Ticks)*total_generations, "initStats"); hunk ./rts/Stats.c 176 - for (i = 0; i < RtsFlags.GcFlags.generations; i++) { - GC_coll_times[i] = 0; - GC_coll_etimes[i] = 0; + GC_coll_max_pause = + (Ticks *)stgMallocBytes( + sizeof(Ticks)*total_generations, + "initStats"); + for (i = 0; i < total_generations; i++) { + GC_coll_cpu[i] = 0; + GC_coll_elapsed[i] = 0; + GC_coll_max_pause[i] = 0; } } hunk ./rts/Stats.c 194 void stat_startInit(void) { - Ticks elapsed; - - elapsed = getProcessElapsedTime(); - ElapsedTimeStart = elapsed; + getProcessTimes(&start_init_cpu, &start_init_elapsed); } void hunk ./rts/Stats.c 200 stat_endInit(void) { - Ticks user, elapsed; + getProcessTimes(&end_init_cpu, &end_init_elapsed); hunk ./rts/Stats.c 202 - getProcessTimes(&user, &elapsed); - - InitUserTime = user; - InitElapsedStamp = elapsed; - if (ElapsedTimeStart > elapsed) { - InitElapsedTime = 0; - } else { - InitElapsedTime = elapsed - ElapsedTimeStart; - } #if USE_PAPI /* We start counting events for the mutator * when garbage collection starts hunk ./rts/Stats.c 223 void stat_startExit(void) { - Ticks user, elapsed; - - getProcessTimes(&user, &elapsed); - - MutElapsedStamp = elapsed; - MutElapsedTime = elapsed - GCe_tot_time - - PROF_VAL(RPe_tot_time + HCe_tot_time) - InitElapsedStamp; - if (MutElapsedTime < 0) { MutElapsedTime = 0; } /* sometimes -0.00 */ - - MutUserTime = user - GC_tot_time - - PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime; - if (MutUserTime < 0) { MutUserTime = 0; } + getProcessTimes(&start_exit_cpu, &start_exit_elapsed); #if USE_PAPI /* We stop counting mutator events hunk ./rts/Stats.c 232 /* This flag is needed, because GC is run once more after this function */ papi_is_reporting = 0; - #endif } hunk ./rts/Stats.c 238 void stat_endExit(void) { - Ticks user, elapsed; - - getProcessTimes(&user, &elapsed); - - ExitUserTime = user - MutUserTime - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime; - ExitElapsedTime = elapsed - MutElapsedStamp; - if (ExitUserTime < 0) { - ExitUserTime = 0; - } - if (ExitElapsedTime < 0) { - ExitElapsedTime = 0; - } + getProcessTimes(&end_exit_cpu, &end_exit_elapsed); } /* ----------------------------------------------------------------------------- hunk ./rts/Stats.c 247 static nat rub_bell = 0; -/* initialise global variables needed during GC - * - * * GC_start_time is read in mut_user_time_during_GC(), which in turn is - * needed if either PROFILING or DEBUGing is enabled - */ void hunk ./rts/Stats.c 248 -stat_startGC(void) +stat_startGC (gc_thread *gct) { nat bell = RtsFlags.GcFlags.ringBell; hunk ./rts/Stats.c 261 } } - if (RtsFlags.GcFlags.giveStats != NO_GC_STATS - || RtsFlags.ProfFlags.doHeapProfile) - // heap profiling needs GC_tot_time - { - getProcessTimes(&GC_start_time, &GCe_start_time); - if (RtsFlags.GcFlags.giveStats) { - GC_start_faults = getPageFaults(); - } - } - #if USE_PAPI if(papi_is_reporting) { /* Switch to counting GC events */ hunk ./rts/Stats.c 269 } #endif + getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed); + gct->gc_start_thread_cpu = getThreadCPUTime(); + + if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) + { + gct->gc_start_faults = getPageFaults(); + } +} + +void +stat_gcWorkerThreadStart (gc_thread *gct) +{ + if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) + { + getProcessTimes(&gct->gc_start_cpu, &gct->gc_start_elapsed); + gct->gc_start_thread_cpu = getThreadCPUTime(); + } +} + +void +stat_gcWorkerThreadDone (gc_thread *gct) +{ + Ticks thread_cpu, elapsed, gc_cpu, gc_elapsed; + + if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) + { + elapsed = getProcessElapsedTime(); + thread_cpu = getThreadCPUTime(); + + gc_cpu = thread_cpu - gct->gc_start_thread_cpu; + gc_elapsed = elapsed - gct->gc_start_elapsed; + + taskDoneGC(gct->cap->running_task, gc_cpu, gc_elapsed); + } } /* ----------------------------------------------------------------------------- hunk ./rts/Stats.c 310 -------------------------------------------------------------------------- */ void -stat_endGC (lnat alloc, lnat live, lnat copied, lnat gen, +stat_endGC (gc_thread *gct, + lnat alloc, lnat live, lnat copied, nat gen, lnat max_copied, lnat avg_copied, lnat slop) { hunk ./rts/Stats.c 314 + nat gen_ix; + if (RtsFlags.GcFlags.giveStats != NO_GC_STATS || RtsFlags.ProfFlags.doHeapProfile) // heap profiling needs GC_tot_time hunk ./rts/Stats.c 320 { - Ticks time, etime, gc_time, gc_etime; - - getProcessTimes(&time, &etime); - gc_time = time - GC_start_time; - gc_etime = etime - GCe_start_time; + Ticks cpu, elapsed, thread_gc_cpu, gc_cpu, gc_elapsed; hunk ./rts/Stats.c 322 + getProcessTimes(&cpu, &elapsed); + gc_elapsed = elapsed - gct->gc_start_elapsed; + + thread_gc_cpu = getThreadCPUTime() - gct->gc_start_thread_cpu; + + if (gct->gc_type == GC_LOCAL) { + gc_cpu = thread_gc_cpu; + } else { + gc_cpu = cpu - gct->gc_start_cpu; + } + + taskDoneGC(gct->cap->running_task, thread_gc_cpu, gc_elapsed); + + if (gct->gc_type == GC_LOCAL) { + gen_ix = gct->index; + } else { + gen_ix = old_generations[gct->collect_gen].ix; + } + if (RtsFlags.GcFlags.giveStats == VERBOSE_GC_STATS) { nat faults = getPageFaults(); hunk ./rts/Stats.c 347 statsPrintf("%9ld %9ld %9ld", alloc*sizeof(W_), copied*sizeof(W_), live*sizeof(W_)); - statsPrintf(" %5.2f %5.2f %7.2f %7.2f %4ld %4ld (Gen: %2ld)\n", - TICK_TO_DBL(gc_time), - TICK_TO_DBL(gc_etime), - TICK_TO_DBL(time), - TICK_TO_DBL(etime - ElapsedTimeStart), - faults - GC_start_faults, - GC_start_faults - GC_end_faults, - gen); + statsPrintf(" %5.2f %5.2f %7.2f %7.2f %4ld %4ld ", + TICK_TO_DBL(gc_cpu), + TICK_TO_DBL(gc_elapsed), + TICK_TO_DBL(cpu), + TICK_TO_DBL(elapsed - start_init_elapsed), + faults - gct->gc_start_faults, + gct->gc_start_faults - GC_end_faults); hunk ./rts/Stats.c 355 + switch (gct->gc_type) { + case GC_LOCAL: + statsPrintf("(G0.%d, loc)", gct->index); + break; + case GC_PAR: + statsPrintf("(G%d, par)", gct->collect_gen); + break; + case GC_SEQ: + statsPrintf("(G%d)", gct->collect_gen); + break; + } + statsPrintf("\n"); GC_end_faults = faults; statsFlush(); } hunk ./rts/Stats.c 371 - GC_coll_times[gen] += gc_time; - GC_coll_etimes[gen] += gc_etime; + GC_coll_cpu[gen_ix] += gc_cpu; + GC_coll_elapsed[gen_ix] += gc_elapsed; + if (GC_coll_max_pause[gen_ix] < gc_elapsed) { + GC_coll_max_pause[gen_ix] = gc_elapsed; + } GC_tot_copied += (StgWord64) copied; GC_tot_alloc += (StgWord64) alloc; hunk ./rts/Stats.c 381 GC_par_max_copied += (StgWord64) max_copied; GC_par_avg_copied += (StgWord64) avg_copied; - GC_tot_time += gc_time; - GCe_tot_time += gc_etime; - -#if defined(THREADED_RTS) - { - Task *task; - if ((task = myTask()) != NULL) { - task->gc_time += gc_time; - task->gc_etime += gc_etime; - } - } -#endif + GC_tot_cpu += gc_cpu; if (gen == RtsFlags.GcFlags.generations-1) { /* major GC? */ hunk ./rts/Stats.c 384 - if (live > MaxResidency) { - MaxResidency = live; + if (live > max_residency) { + max_residency = live; } hunk ./rts/Stats.c 387 - ResidencySamples++; - AvgResidency += live; + residency_samples++; + avg_residency += live; } hunk ./rts/Stats.c 391 - if (slop > MaxSlop) MaxSlop = slop; + if (slop > max_slop) max_slop = slop; } if (rub_bell) { hunk ./rts/Stats.c 530 statsPrintf(" (SLOW_CALLS_" #arity ") %% of (TOTAL_CALLS) : %.1f%%\n", \ SLOW_CALLS_##arity * 100.0/TOTAL_CALLS) -extern lnat hw_alloc_blocks; - void stat_exit(int alloc) { hunk ./rts/Stats.c 533 + generation *gen; + Ticks gc_global_cpu = 0; + Ticks gc_global_elapsed = 0; + Ticks gc_local_cpu = 0; + Ticks gc_local_elapsed = 0; + Ticks init_cpu = 0; + Ticks init_elapsed = 0; + Ticks mut_cpu = 0; + Ticks mut_elapsed = 0; + Ticks gc_cpu = 0; + Ticks gc_elapsed = 0; + Ticks exit_cpu = 0; + Ticks exit_elapsed = 0; + StgWord64 globalised = 0; + if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) { char temp[BIG_STRING_LEN]; hunk ./rts/Stats.c 551 - Ticks time; - Ticks etime; - nat g, total_collections = 0; + Ticks tot_cpu; + Ticks tot_elapsed; + nat i, g, total_collections = 0; hunk ./rts/Stats.c 555 - getProcessTimes( &time, &etime ); - etime -= ElapsedTimeStart; + getProcessTimes( &tot_cpu, &tot_elapsed ); + tot_elapsed -= start_init_elapsed; GC_tot_alloc += alloc; hunk ./rts/Stats.c 561 /* Count total garbage collections */ - for (g = 0; g < RtsFlags.GcFlags.generations; g++) - total_collections += generations[g].collections; + for (g = 0; g < total_generations; g++) + total_collections += all_generations[g].collections; hunk ./rts/Stats.c 564 - /* avoid divide by zero if time is measured as 0.00 seconds -- SDM */ - if (time == 0.0) time = 1; - if (etime == 0.0) etime = 1; + /* avoid divide by zero if tot_cpu is measured as 0.00 seconds -- SDM */ + if (tot_cpu == 0.0) tot_cpu = 1; + if (tot_elapsed == 0.0) tot_elapsed = 1; if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) { statsPrintf("%9ld %9.9s %9.9s", (lnat)alloc*sizeof(W_), "", ""); hunk ./rts/Stats.c 573 statsPrintf(" %5.2f %5.2f\n\n", 0.0, 0.0); } + for (i = 0; i < total_generations; i++) { + if (all_generations[i].is_local) { + gc_local_cpu += GC_coll_cpu[i]; + gc_local_elapsed += GC_coll_elapsed[i]; + } else { + gc_global_cpu += GC_coll_cpu[i]; + gc_global_elapsed += GC_coll_elapsed[i]; + } + } + if (RtsFlags.GcFlags.giveStats >= SUMMARY_GC_STATS) { showStgWord64(GC_tot_alloc*sizeof(W_), temp, rtsTrue/*commas*/); hunk ./rts/Stats.c 592 temp, rtsTrue/*commas*/); statsPrintf("%16s bytes copied during GC\n", temp); - if ( ResidencySamples > 0 ) { - showStgWord64(MaxResidency*sizeof(W_), + for (i = 0; i < n_capabilities; i++) { + globalised += gc_threads[i]->globalised; + } + + showStgWord64(globalised * sizeof(W_), + temp, rtsTrue/*commas*/); + statsPrintf("%16s bytes globalised\n", temp); + + if ( residency_samples > 0 ) { + showStgWord64(max_residency*sizeof(W_), temp, rtsTrue/*commas*/); statsPrintf("%16s bytes maximum residency (%ld sample(s))\n", hunk ./rts/Stats.c 604 - temp, ResidencySamples); + temp, residency_samples); } hunk ./rts/Stats.c 607 - showStgWord64(MaxSlop*sizeof(W_), temp, rtsTrue/*commas*/); + showStgWord64(max_slop*sizeof(W_), temp, rtsTrue/*commas*/); statsPrintf("%16s bytes maximum slop\n", temp); statsPrintf("%16ld MB total memory in use (%ld MB lost due to fragmentation)\n\n", hunk ./rts/Stats.c 615 (peak_mblocks_allocated * BLOCKS_PER_MBLOCK * BLOCK_SIZE_W - hw_alloc_blocks * BLOCK_SIZE_W) / (1024 * 1024 / sizeof(W_))); /* Print garbage collections in each gen */ - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - statsPrintf(" Generation %d: %5d collections, %5d parallel, %5.2fs, %5.2fs elapsed\n", - g, generations[g].collections, - generations[g].par_collections, - TICK_TO_DBL(GC_coll_times[g]), - TICK_TO_DBL(GC_coll_etimes[g])); + statsPrintf(" Tot time (elapsed) Avg pause Max pause\n"); + for (g = 0; g < total_generations; g++) { + gen = &all_generations[g]; + if (gen->is_local) { + statsPrintf(" Gen %2d.%-2d %5d colls, local %5.2fs %5.2fs %3.4fs %3.4fs\n", + gen->no, gen->cap, + gen->collections, + TICK_TO_DBL(GC_coll_cpu[g]), + TICK_TO_DBL(GC_coll_elapsed[g]), + TICK_TO_DBL(GC_coll_elapsed[g] / gen->collections), + TICK_TO_DBL(GC_coll_max_pause[g])); + } else { + statsPrintf(" Gen %2d %5d colls, %5d par %5.2fs %5.2fs %3.4fs %3.4fs\n", + gen->no, + gen->collections, + gen->par_collections, + TICK_TO_DBL(GC_coll_cpu[g]), + TICK_TO_DBL(GC_coll_elapsed[g]), + TICK_TO_DBL(GC_coll_elapsed[g] / gen->collections), + TICK_TO_DBL(GC_coll_max_pause[g])); + } } #if defined(THREADED_RTS) hunk ./rts/Stats.c 647 ); } #endif - - statsPrintf("\n"); + statsPrintf("\n"); #if defined(THREADED_RTS) { hunk ./rts/Stats.c 689 } #endif - statsPrintf(" INIT time %6.2fs (%6.2fs elapsed)\n", - TICK_TO_DBL(InitUserTime), TICK_TO_DBL(InitElapsedTime)); - statsPrintf(" MUT time %6.2fs (%6.2fs elapsed)\n", - TICK_TO_DBL(MutUserTime), TICK_TO_DBL(MutElapsedTime)); - statsPrintf(" GC time %6.2fs (%6.2fs elapsed)\n", - TICK_TO_DBL(GC_tot_time), TICK_TO_DBL(GCe_tot_time)); + gc_elapsed = gc_local_elapsed + gc_global_elapsed; + + init_cpu = end_init_cpu - start_init_cpu; + init_elapsed = end_init_elapsed - start_init_elapsed; + + exit_cpu = end_exit_cpu - start_exit_cpu; + exit_elapsed = end_exit_elapsed - start_exit_elapsed; + + gc_cpu = gc_local_cpu + gc_global_cpu; + + statsPrintf(" INIT time %6.2fs (%6.2fs elapsed)\n", + TICK_TO_DBL(init_cpu), TICK_TO_DBL(init_elapsed)); + +#ifdef THREADED_RTS + if (RtsFlags.ParFlags.nNodes == 1) +#endif + { + // In single-threaded mode, we can separate out the + // local GC time from the MUT time, and report the + // total GC time separately. + + mut_elapsed = start_exit_elapsed - end_init_elapsed + - gc_elapsed; + + mut_cpu = start_exit_cpu - end_init_cpu + - gc_local_cpu - gc_global_cpu + - PROF_VAL(RP_tot_time + HC_tot_time); + if (mut_cpu < 0) { mut_cpu = 0; } + + statsPrintf(" MUT time %6.2fs (%6.2fs elapsed)\n", + TICK_TO_DBL(mut_cpu), TICK_TO_DBL(mut_elapsed)); + statsPrintf(" GC time %6.2fs (%6.2fs elapsed)\n", + TICK_TO_DBL(gc_cpu), TICK_TO_DBL(gc_elapsed)); + } +#ifdef THREADED_RTS + else + { + // In multi-threaded mode, we have to include the + // local GC time in the MUT time, because each thread + // has its own independent interleaving of MUT and + // local GC. + + mut_elapsed = start_exit_elapsed - end_init_elapsed + - gc_global_elapsed; + + mut_cpu = start_exit_cpu - end_init_cpu + - gc_global_cpu + - PROF_VAL(RP_tot_time + HC_tot_time); + if (mut_cpu < 0) { mut_cpu = 0; } + + statsPrintf(" MUT+GC0 time %6.2fs (%6.2fs elapsed) (%.2fs MUT + %.2fs GC0)\n", + TICK_TO_DBL(mut_cpu), + TICK_TO_DBL(mut_elapsed), + TICK_TO_DBL(mut_cpu - gc_local_cpu), + TICK_TO_DBL(gc_local_cpu)); + statsPrintf(" GC1 time %6.2fs (%6.2fs elapsed)\n", + TICK_TO_DBL(gc_global_cpu), TICK_TO_DBL(gc_global_elapsed)); + } +#endif + #ifdef PROFILING hunk ./rts/Stats.c 750 - statsPrintf(" RP time %6.2fs (%6.2fs elapsed)\n", + statsPrintf(" RP time %6.2fs (%6.2fs elapsed)\n", TICK_TO_DBL(RP_tot_time), TICK_TO_DBL(RPe_tot_time)); hunk ./rts/Stats.c 752 - statsPrintf(" PROF time %6.2fs (%6.2fs elapsed)\n", + statsPrintf(" PROF time %6.2fs (%6.2fs elapsed)\n", TICK_TO_DBL(HC_tot_time), TICK_TO_DBL(HCe_tot_time)); #endif hunk ./rts/Stats.c 755 - statsPrintf(" EXIT time %6.2fs (%6.2fs elapsed)\n", - TICK_TO_DBL(ExitUserTime), TICK_TO_DBL(ExitElapsedTime)); - statsPrintf(" Total time %6.2fs (%6.2fs elapsed)\n\n", - TICK_TO_DBL(time), TICK_TO_DBL(etime)); - statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n", - TICK_TO_DBL(GC_tot_time)*100/TICK_TO_DBL(time), - TICK_TO_DBL(GCe_tot_time)*100/TICK_TO_DBL(etime)); + statsPrintf(" EXIT time %6.2fs (%6.2fs elapsed)\n", + TICK_TO_DBL(exit_cpu), TICK_TO_DBL(exit_elapsed)); + statsPrintf(" Total time %6.2fs (%6.2fs elapsed)\n\n", + TICK_TO_DBL(tot_cpu), TICK_TO_DBL(tot_elapsed)); +#ifndef THREADED_RTS + statsPrintf(" %%GC time %5.1f%% (%.1f%% elapsed)\n\n", + TICK_TO_DBL(gc_cpu)*100/TICK_TO_DBL(tot_cpu), + TICK_TO_DBL(gc_elapsed)*100/TICK_TO_DBL(tot_elapsed)); +#endif hunk ./rts/Stats.c 765 - if (time - GC_tot_time - PROF_VAL(RP_tot_time + HC_tot_time) == 0) + if (tot_cpu - GC_tot_cpu - PROF_VAL(RP_tot_time + HC_tot_time) == 0) showStgWord64(0, temp, rtsTrue/*commas*/); else showStgWord64( hunk ./rts/Stats.c 770 (StgWord64)((GC_tot_alloc*sizeof(W_))/ - TICK_TO_DBL(time - GC_tot_time - + TICK_TO_DBL(tot_cpu - GC_tot_cpu - PROF_VAL(RP_tot_time + HC_tot_time))), temp, rtsTrue/*commas*/); hunk ./rts/Stats.c 777 statsPrintf(" Alloc rate %s bytes per MUT second\n\n", temp); statsPrintf(" Productivity %5.1f%% of total user, %.1f%% of total elapsed\n\n", - TICK_TO_DBL(time - GC_tot_time - - PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime) * 100 - / TICK_TO_DBL(time), - TICK_TO_DBL(time - GC_tot_time - - PROF_VAL(RP_tot_time + HC_tot_time) - InitUserTime) * 100 - / TICK_TO_DBL(etime)); + TICK_TO_DBL(tot_cpu - GC_tot_cpu - + PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100 + / TICK_TO_DBL(tot_cpu), + TICK_TO_DBL(tot_cpu - GC_tot_cpu - + PROF_VAL(RP_tot_time + HC_tot_time) - init_cpu) * 100 + / TICK_TO_DBL(tot_elapsed)); /* TICK_PRINT(1); hunk ./rts/Stats.c 801 statsPrintf("gc_alloc_block_sync: %"FMT_Word64"\n", gc_alloc_block_sync.spin); statsPrintf("whitehole_spin: %"FMT_Word64"\n", whitehole_spin); - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - statsPrintf("gen[%d].sync_large_objects: %"FMT_Word64"\n", g, generations[g].sync_large_objects.spin); + for (g = 0; g < total_generations; g++) { + statsPrintf("gen[%d].sync: %"FMT_Word64"\n", g, all_generations[g].sync.spin); } } #endif hunk ./rts/Stats.c 833 statsPrintf(fmt1, GC_tot_alloc*(StgWord64)sizeof(W_)); statsPrintf(fmt2, total_collections, - ResidencySamples == 0 ? 0 : - AvgResidency*sizeof(W_)/ResidencySamples, - MaxResidency*sizeof(W_), - ResidencySamples, + residency_samples == 0 ? 0 : + avg_residency*sizeof(W_)/residency_samples, + max_residency*sizeof(W_), + residency_samples, (unsigned long)(peak_mblocks_allocated * MBLOCK_SIZE / (1024L * 1024L)), hunk ./rts/Stats.c 838 - TICK_TO_DBL(InitUserTime), TICK_TO_DBL(InitElapsedTime), - TICK_TO_DBL(MutUserTime), TICK_TO_DBL(MutElapsedTime), - TICK_TO_DBL(GC_tot_time), TICK_TO_DBL(GCe_tot_time)); + TICK_TO_DBL(init_cpu), TICK_TO_DBL(init_elapsed), + TICK_TO_DBL(mut_cpu), TICK_TO_DBL(mut_elapsed), + TICK_TO_DBL(gc_cpu), TICK_TO_DBL(gc_elapsed)); } statsFlush(); hunk ./rts/Stats.c 847 statsClose(); } - if (GC_coll_times) - stgFree(GC_coll_times); - GC_coll_times = NULL; - if (GC_coll_etimes) - stgFree(GC_coll_etimes); - GC_coll_etimes = NULL; + if (GC_coll_cpu) + stgFree(GC_coll_cpu); + GC_coll_cpu = NULL; + if (GC_coll_elapsed) + stgFree(GC_coll_elapsed); + GC_coll_elapsed = NULL; } /* ----------------------------------------------------------------------------- hunk ./rts/Stats.c 863 void statDescribeGens(void) { - nat g, mut, lge; - lnat live, slop; - lnat tot_live, tot_slop; + nat g, n, i, lge; + nat cap_blocks, gen_blocks; + nat cap_mut, gen_mut; + lnat cap_live, gen_live; + lnat slop, tot_live, tot_slop; bdescr *bd; generation *gen; hunk ./rts/Stats.c 872 debugBelch( -"----------------------------------------------------------\n" -" Gen Max Mut-list Blocks Large Live Slop\n" -" Blocks Bytes Objects \n" -"----------------------------------------------------------\n"); +"------------------------------------------------------------------------\n" +" Gen Max Large Prim Cap Mut-list Total Live Slop\n" +" Blocks Objects Blocks Bytes Blocks \n" +"------------------------------------------------------------------------\n"); tot_live = 0; tot_slop = 0; hunk ./rts/Stats.c 879 - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - mut = 0; - for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) { - mut += (bd->free - bd->start) * sizeof(W_); - } - - gen = &generations[g]; - - debugBelch("%5d %7d %9d", g, gen->max_blocks, mut); + for (n = 0; n < total_generations; n++) { + gen = &all_generations[n]; + g = gen->no; for (bd = gen->large_objects, lge = 0; bd; bd = bd->link) { lge++; hunk ./rts/Stats.c 886 } - live = gen->n_words + countOccupied(gen->large_objects); - slop = (gen->n_blocks + gen->n_large_blocks) * BLOCK_SIZE_W - live; - debugBelch("%8d %8d %8ld %8ld\n", gen->n_blocks, lge, - live*sizeof(W_), slop*sizeof(W_)); - tot_live += live; + gen_live = gen->n_words + gen->n_prim_words + countOccupied(gen->large_objects); + gen_blocks = gen->n_blocks + gen->n_large_blocks + gen->n_prim_blocks; + + slop = gen_blocks * BLOCK_SIZE_W - gen_live; + + debugBelch("%5d %7d %8d %8d %4s %8s %8d %8ld %8ld\n", + g, gen->max_blocks, lge, gen->n_prim_blocks, "", "", gen_blocks, + gen_live*sizeof(W_), slop*sizeof(W_)); + + for (i = 0; i < n_capabilities; i++) { + cap_mut = countOccupied(capabilities[i].mut_lists[g]); + + cap_live = countOccupied(gc_threads[i]->gens[n].todo_bd); + cap_live += countOccupied(gc_threads[i]->gens[n].part_list); + cap_live += countOccupied(gc_threads[i]->gens[n].scavd_list); + + cap_blocks = countBlocks(gc_threads[i]->gens[n].todo_bd); + cap_blocks += gc_threads[i]->gens[n].n_part_blocks; + cap_blocks += gc_threads[i]->gens[n].n_scavd_blocks; + + slop = cap_blocks * BLOCK_SIZE_W - cap_live; + + debugBelch("%5s %7s %8s %8s %4d %8ld %8d %8ld %8ld\n", + "", "", "", "", i, cap_mut*sizeof(W_), cap_blocks, + cap_live*sizeof(W_), slop*sizeof(W_)); + + gen_mut += cap_mut; + gen_live += cap_live; + gen_blocks += cap_blocks; + } + + debugBelch("%55s-----------------\n",""); + debugBelch("%5s %7s %8s %4s %8s %8s %8s %8ld %8ld\n\n", + "", "", "", "", "", "", "", + gen_live*sizeof(W_), slop*sizeof(W_)); + + slop = gen_blocks * BLOCK_SIZE_W - gen_live; + + tot_live += gen_live; tot_slop += slop; } hunk ./rts/Stats.c 927 - debugBelch("----------------------------------------------------------\n"); - debugBelch("%41s%8ld %8ld\n","",tot_live*sizeof(W_),tot_slop*sizeof(W_)); - debugBelch("----------------------------------------------------------\n"); + debugBelch("------------------------------------------------------------------------\n"); + debugBelch("%55s%8ld %8ld\n","",tot_live*sizeof(W_),tot_slop*sizeof(W_)); + debugBelch("------------------------------------------------------------------------\n"); debugBelch("\n"); } hunk ./rts/Stats.h 16 #include "BeginPrivate.h" +struct gc_thread_; + void stat_startInit(void); void stat_endInit(void); hunk ./rts/Stats.h 21 -void stat_startGC(void); -void stat_endGC (lnat alloc, lnat live, - lnat copied, lnat gen, - lnat max_copied, lnat avg_copied, lnat slop); +void stat_startGC(struct gc_thread_ *gct); +void stat_endGC (struct gc_thread_ *gct, lnat alloc, lnat live, + lnat copied, nat gen, + lnat max_copied, lnat avg_copied, lnat slop); + +void stat_gcWorkerThreadStart (struct gc_thread_ *gct); +void stat_gcWorkerThreadDone (struct gc_thread_ *gct); #ifdef PROFILING void stat_startRP(void); hunk ./rts/Stats.h 52 void initStats0(void); void initStats1(void); -double mut_user_time_during_GC(void); double mut_user_time(void); #ifdef PROFILING hunk ./rts/StgMiscClosures.cmm 193 INFO_TABLE(stg_IND,1,0,IND,"IND","IND") { TICK_ENT_DYN_IND(); /* tick */ - R1 = UNTAG(StgInd_indirectee(R1)); + R1 = StgInd_indirectee(R1); TICK_ENT_VIA_NODE(); hunk ./rts/StgMiscClosures.cmm 195 - jump %GET_ENTRY(R1); + ENTER(); } INFO_TABLE(stg_IND_direct,1,0,IND,"IND","IND") hunk ./rts/StgMiscClosures.cmm 209 INFO_TABLE(stg_IND_STATIC,1,0,IND_STATIC,"IND_STATIC","IND_STATIC") { TICK_ENT_STATIC_IND(); /* tick */ - R1 = UNTAG(StgInd_indirectee(R1)); + R1 = StgInd_indirectee(R1); TICK_ENT_VIA_NODE(); hunk ./rts/StgMiscClosures.cmm 211 - jump %GET_ENTRY(R1); + ENTER(); } INFO_TABLE(stg_IND_PERM,1,0,IND_PERM,"IND_PERM","IND_PERM") hunk ./rts/StgMiscClosures.cmm 255 jump %GET_ENTRY(R1); } +STRING(ind_local_msg,"IND_LOCAL: not mine") + +#define DEF_IND_LOCAL(n) \ + INFO_TABLE_CONSTR(stg_IND_LOCAL##n,1,0,n,IND_LOCAL,"IND_LOCAL","IND_LOCAL") \ + { \ + W_ r; \ + if (TO_W_(Capability_no(MyCapability())) != n) { \ + MAYBE_GC(R1_PTR, stg_IND_LOCAL##n##_info); \ + (r) = foreign "C" messageGlobalise(MyCapability(), CurrentTSO, R1, n) [R1]; \ + if (r != 0) { \ + jump stg_block_enter; \ + } \ + } \ + \ + R1 = StgInd_indirectee(R1); \ + ENTER(); \ + } + +DEF_IND_LOCAL(0) +DEF_IND_LOCAL(1) +DEF_IND_LOCAL(2) +DEF_IND_LOCAL(3) +DEF_IND_LOCAL(4) +DEF_IND_LOCAL(5) +DEF_IND_LOCAL(6) +DEF_IND_LOCAL(7) +DEF_IND_LOCAL(8) +DEF_IND_LOCAL(9) +DEF_IND_LOCAL(10) +DEF_IND_LOCAL(11) +DEF_IND_LOCAL(12) +DEF_IND_LOCAL(13) +DEF_IND_LOCAL(14) +DEF_IND_LOCAL(15) +DEF_IND_LOCAL(16) +DEF_IND_LOCAL(17) +DEF_IND_LOCAL(18) +DEF_IND_LOCAL(19) +DEF_IND_LOCAL(20) +DEF_IND_LOCAL(21) +DEF_IND_LOCAL(22) +DEF_IND_LOCAL(23) +DEF_IND_LOCAL(24) +DEF_IND_LOCAL(25) +DEF_IND_LOCAL(26) +DEF_IND_LOCAL(27) +DEF_IND_LOCAL(28) +DEF_IND_LOCAL(29) +DEF_IND_LOCAL(30) +DEF_IND_LOCAL(31) +DEF_IND_LOCAL(32) +DEF_IND_LOCAL(33) +DEF_IND_LOCAL(34) +DEF_IND_LOCAL(35) +DEF_IND_LOCAL(36) +DEF_IND_LOCAL(37) +DEF_IND_LOCAL(38) +DEF_IND_LOCAL(39) +DEF_IND_LOCAL(40) +DEF_IND_LOCAL(41) +DEF_IND_LOCAL(42) +DEF_IND_LOCAL(43) +DEF_IND_LOCAL(44) +DEF_IND_LOCAL(45) +DEF_IND_LOCAL(46) +DEF_IND_LOCAL(47) +DEF_IND_LOCAL(48) +DEF_IND_LOCAL(49) +DEF_IND_LOCAL(50) +DEF_IND_LOCAL(51) +DEF_IND_LOCAL(52) +DEF_IND_LOCAL(53) +DEF_IND_LOCAL(54) +DEF_IND_LOCAL(55) +DEF_IND_LOCAL(56) +DEF_IND_LOCAL(57) +DEF_IND_LOCAL(58) +DEF_IND_LOCAL(59) +DEF_IND_LOCAL(60) +DEF_IND_LOCAL(61) +DEF_IND_LOCAL(62) +DEF_IND_LOCAL(63) +DEF_IND_LOCAL(64) + +section "rodata" { + stg_IND_LOCAL_tbl: + W_ stg_IND_LOCAL0_info; + W_ stg_IND_LOCAL1_info; + W_ stg_IND_LOCAL2_info; + W_ stg_IND_LOCAL3_info; + W_ stg_IND_LOCAL4_info; + W_ stg_IND_LOCAL5_info; + W_ stg_IND_LOCAL6_info; + W_ stg_IND_LOCAL7_info; + W_ stg_IND_LOCAL8_info; + W_ stg_IND_LOCAL9_info; + W_ stg_IND_LOCAL10_info; + W_ stg_IND_LOCAL11_info; + W_ stg_IND_LOCAL12_info; + W_ stg_IND_LOCAL13_info; + W_ stg_IND_LOCAL14_info; + W_ stg_IND_LOCAL15_info; + W_ stg_IND_LOCAL16_info; + W_ stg_IND_LOCAL17_info; + W_ stg_IND_LOCAL18_info; + W_ stg_IND_LOCAL19_info; + W_ stg_IND_LOCAL20_info; + W_ stg_IND_LOCAL21_info; + W_ stg_IND_LOCAL22_info; + W_ stg_IND_LOCAL23_info; + W_ stg_IND_LOCAL24_info; + W_ stg_IND_LOCAL25_info; + W_ stg_IND_LOCAL26_info; + W_ stg_IND_LOCAL27_info; + W_ stg_IND_LOCAL28_info; + W_ stg_IND_LOCAL29_info; + W_ stg_IND_LOCAL30_info; + W_ stg_IND_LOCAL31_info; + W_ stg_IND_LOCAL32_info; + W_ stg_IND_LOCAL33_info; + W_ stg_IND_LOCAL34_info; + W_ stg_IND_LOCAL35_info; + W_ stg_IND_LOCAL36_info; + W_ stg_IND_LOCAL37_info; + W_ stg_IND_LOCAL38_info; + W_ stg_IND_LOCAL39_info; + W_ stg_IND_LOCAL40_info; + W_ stg_IND_LOCAL41_info; + W_ stg_IND_LOCAL42_info; + W_ stg_IND_LOCAL43_info; + W_ stg_IND_LOCAL44_info; + W_ stg_IND_LOCAL45_info; + W_ stg_IND_LOCAL46_info; + W_ stg_IND_LOCAL47_info; + W_ stg_IND_LOCAL48_info; + W_ stg_IND_LOCAL49_info; + W_ stg_IND_LOCAL50_info; + W_ stg_IND_LOCAL51_info; + W_ stg_IND_LOCAL52_info; + W_ stg_IND_LOCAL53_info; + W_ stg_IND_LOCAL54_info; + W_ stg_IND_LOCAL55_info; + W_ stg_IND_LOCAL56_info; + W_ stg_IND_LOCAL57_info; + W_ stg_IND_LOCAL58_info; + W_ stg_IND_LOCAL59_info; + W_ stg_IND_LOCAL60_info; + W_ stg_IND_LOCAL61_info; + W_ stg_IND_LOCAL62_info; + W_ stg_IND_LOCAL63_info; + W_ stg_IND_LOCAL64_info; +} + /* ---------------------------------------------------------------------------- Black holes. hunk ./rts/StgMiscClosures.cmm 419 INFO_TABLE(stg_BLACKHOLE,1,0,BLACKHOLE,"BLACKHOLE","BLACKHOLE") { - W_ r, p, info, bq, msg, owner, bd; + W_ r, p, info, bq, msg, owner, bd, new_tso; TICK_ENT_DYN_IND(); /* tick */ hunk ./rts/StgMiscClosures.cmm 431 } info = StgHeader_info(p); - if (info == stg_IND_info) { + if (info == stg_STUB_BLOCKING_QUEUE_info) { // This could happen, if e.g. we got a BLOCKING_QUEUE that has // just been replaced with an IND by another thread in // wakeBlockingQueue(). hunk ./rts/StgMiscClosures.cmm 442 info == stg_BLOCKING_QUEUE_CLEAN_info || info == stg_BLOCKING_QUEUE_DIRTY_info) { - ("ptr" msg) = foreign "C" allocate(MyCapability() "ptr", - BYTES_TO_WDS(SIZEOF_MessageBlackHole)) [R1]; - + ("ptr" msg) = foreign "C" allocatePrim(MyCapability() "ptr", + BYTES_TO_WDS(SIZEOF_MessageBlackHole)) [R1]; SET_HDR(msg, stg_MSG_BLACKHOLE_info, CCS_SYSTEM); MessageBlackHole_tso(msg) = CurrentTSO; MessageBlackHole_bh(msg) = R1; hunk ./rts/StgMiscClosures.cmm 455 } else { StgTSO_why_blocked(CurrentTSO) = BlockedOnBlackHole::I16; StgTSO_block_info(CurrentTSO) = msg; - jump stg_block_blackhole; + jump stg_block_enter; } } else hunk ./rts/StgMiscClosures.cmm 488 INFO_TABLE(stg_BLOCKING_QUEUE_DIRTY,4,0,BLOCKING_QUEUE,"BLOCKING_QUEUE","BLOCKING_QUEUE") { foreign "C" barf("BLOCKING_QUEUE_DIRTY object entered!") never returns; } +/* + * A kind of indirection that is never intended to be entered. This + * is used for stubbing out BLOCKING_QUEUEs in wakeBlockingQueue(). + */ +INFO_TABLE(stg_STUB_BLOCKING_QUEUE,1,0,IND,"STUB_BLOCKING_QUEUE","STUB_BLOCKING_QUEUE") +{ foreign "C" barf("STUB_BLOCKING_QUEUE object entered!") never returns; } /* ---------------------------------------------------------------------------- Whiteholes are used for the "locked" state of a closure (see lockClosure()) hunk ./rts/StgMiscClosures.cmm 643 INFO_TABLE_CONSTR(stg_MSG_BLACKHOLE,3,0,0,PRIM,"MSG_BLACKHOLE","MSG_BLACKHOLE") { foreign "C" barf("MSG_BLACKHOLE object entered!") never returns; } +// used to overwrite a MSG_BLACKHOLE when the message has been used/revoked +// Closure type is IND so these get automatically eliminated by the GC. +INFO_TABLE(stg_STUB_MSG_BLACKHOLE,1,0,IND,"STUB_MSG_BLACKHOLE","STUB_MSG_BLACKHOLE") +{ foreign "C" barf("STUB_MSG_BLACKHOLE object entered!") never returns; } + // used to overwrite a MSG_THROWTO when the message has been used/revoked INFO_TABLE_CONSTR(stg_MSG_NULL,1,0,0,PRIM,"MSG_NULL","MSG_NULL") { foreign "C" barf("MSG_NULL object entered!") never returns; } hunk ./rts/StgMiscClosures.cmm 652 +INFO_TABLE_CONSTR(stg_MSG_GLOBALISE,3,0,0,PRIM,"MSG_GLOBALISE","MSG_GLOBALISE") +{ foreign "C" barf("MSG_GLOBALISE object entered!") never returns; } + /* ---------------------------------------------------------------------------- END_TSO_QUEUE hunk ./rts/StgMiscClosures.cmm 685 INFO_TABLE(stg_ARR_WORDS, 0, 0, ARR_WORDS, "ARR_WORDS", "ARR_WORDS") { foreign "C" barf("ARR_WORDS object entered!") never returns; } -INFO_TABLE(stg_MUT_ARR_PTRS_CLEAN, 0, 0, MUT_ARR_PTRS_CLEAN, "MUT_ARR_PTRS_CLEAN", "MUT_ARR_PTRS_CLEAN") -{ foreign "C" barf("MUT_ARR_PTRS_CLEAN object entered!") never returns; } +INFO_TABLE(stg_MUT_ARR_PTRS_LOCAL, 0, 0, MUT_ARR_PTRS_LOCAL, "MUT_ARR_PTRS_LOCAL", "MUT_ARR_PTRS_LOCAL") +{ foreign "C" barf("MUT_ARR_PTRS_LOCAL object entered!") never returns; } hunk ./rts/StgMiscClosures.cmm 688 -INFO_TABLE(stg_MUT_ARR_PTRS_DIRTY, 0, 0, MUT_ARR_PTRS_DIRTY, "MUT_ARR_PTRS_DIRTY", "MUT_ARR_PTRS_DIRTY") -{ foreign "C" barf("MUT_ARR_PTRS_DIRTY object entered!") never returns; } +INFO_TABLE(stg_MUT_ARR_PTRS_GLOBAL, 0, 0, MUT_ARR_PTRS_GLOBAL, "MUT_ARR_PTRS_GLOBAL", "MUT_ARR_PTRS_GLOBAL") +{ foreign "C" barf("MUT_ARR_PTRS_GLOBAL object entered!") never returns; } INFO_TABLE(stg_MUT_ARR_PTRS_FROZEN, 0, 0, MUT_ARR_PTRS_FROZEN, "MUT_ARR_PTRS_FROZEN", "MUT_ARR_PTRS_FROZEN") { foreign "C" barf("MUT_ARR_PTRS_FROZEN object entered!") never returns; } hunk ./rts/StgMiscClosures.cmm 701 Mutable Variables ------------------------------------------------------------------------- */ -INFO_TABLE(stg_MUT_VAR_CLEAN, 1, 0, MUT_VAR_CLEAN, "MUT_VAR_CLEAN", "MUT_VAR_CLEAN") -{ foreign "C" barf("MUT_VAR_CLEAN object entered!") never returns; } -INFO_TABLE(stg_MUT_VAR_DIRTY, 1, 0, MUT_VAR_DIRTY, "MUT_VAR_DIRTY", "MUT_VAR_DIRTY") -{ foreign "C" barf("MUT_VAR_DIRTY object entered!") never returns; } +INFO_TABLE(stg_MUT_VAR_LOCAL, 1, 0, MUT_VAR_LOCAL, "MUT_VAR_LOCAL", "MUT_VAR_LOCAL") +{ foreign "C" barf("MUT_VAR_LOCAL object entered!") never returns; } +INFO_TABLE(stg_MUT_VAR_GLOBAL, 1, 0, MUT_VAR_GLOBAL, "MUT_VAR_GLOBAL", "MUT_VAR_GLOBAL") +{ foreign "C" barf("MUT_VAR_GLOBAL object entered!") never returns; } /* ---------------------------------------------------------------------------- Dummy return closure hunk ./rts/StgStartup.cmm 99 stg_returnToSched { + W_ new_tso; SAVE_THREAD_STATE(); hunk ./rts/StgStartup.cmm 101 - foreign "C" threadPaused(MyCapability() "ptr", CurrentTSO); + ("ptr" new_tso) = foreign "C" threadPaused(MyCapability() "ptr", CurrentTSO); + CurrentTSO = new_tso; jump StgReturn; } hunk ./rts/StgStartup.cmm 122 // the MVar on which we are about to block in SMP mode. stg_returnToSchedButFirst { + W_ new_tso; SAVE_THREAD_STATE(); hunk ./rts/StgStartup.cmm 124 - foreign "C" threadPaused(MyCapability() "ptr", CurrentTSO); + ("ptr" new_tso) = foreign "C" threadPaused(MyCapability() "ptr", CurrentTSO); + CurrentTSO = new_tso; jump R2; } hunk ./rts/Task.h 210 // void taskTimeStamp (Task *task); +// The current Task has finished a GC, record the amount of time spent. +void taskDoneGC (Task *task, Ticks cpu_time, Ticks elapsed_time); + // Put the task back on the free list, mark it stopped. Used by // forkProcess(). // hunk ./rts/ThreadPaused.c 178 * here. We also take the opportunity to do stack squeezing if it's * turned on. * -------------------------------------------------------------------------- */ -void +StgTSO * // returns the new TSO, since it might have moved threadPaused(Capability *cap, StgTSO *tso) { StgClosure *frame; hunk ./rts/ThreadPaused.c 191 nat weight = 0; nat weight_pending = 0; rtsBool prev_was_update_frame = rtsFalse; + rtsBool tso_global; // Check to see whether we have threads waiting to raise // exceptions, and we're not blocking exceptions, or are blocked hunk ./rts/ThreadPaused.c 199 // TSO_BLOCKEX and becomes blocked interruptibly, this is the only // place we ensure that the blocked_exceptions get a chance. maybePerformBlockedException (cap, tso); - if (tso->what_next == ThreadKilled) { return; } + if (tso->what_next == ThreadKilled) { return tso; } // NB. Blackholing is *compulsory*, we must either do lazy // blackholing, or eager blackholing consistently. See Note hunk ./rts/ThreadPaused.c 209 frame = (StgClosure *)tso->stackobj->sp; + // used to decide whether we need to globalise the TSO + tso_global = isGlobal((StgClosure*)tso); + while ((P_)frame < stack_end) { info = get_ret_itbl(frame); hunk ./rts/ThreadPaused.c 214 - + switch (info->i.type) { case UPDATE_FRAME: hunk ./rts/ThreadPaused.c 285 } #endif + // .. we're about to write a TSO pointer into the BLACKHOLE, + // so we better not violate the global heap invariant: + if (Bdescr((P_)bh)->gen_no > 0 && !tso_global) { + globalise(cap,(StgClosure**)&tso); + tso_global = rtsTrue; + } + // The payload of the BLACKHOLE points to the TSO ((StgInd *)bh)->indirectee = (StgClosure *)tso; write_barrier(); hunk ./rts/ThreadPaused.c 297 SET_INFO(bh,&stg_BLACKHOLE_info); - // .. and we need a write barrier, since we just mutated the closure: - recordClosureMutated(cap,bh); - // We pretend that bh has just been created. LDV_RECORD_CREATE(bh); hunk ./rts/ThreadPaused.c 343 } else { tso->flags &= ~TSO_SQUEEZED; } + + return tso; } hunk ./rts/ThreadPaused.h 12 #ifndef THREADPAUSED_H #define THREADPAUSED_H -RTS_PRIVATE void threadPaused ( Capability *cap, StgTSO * ); +RTS_PRIVATE StgTSO * threadPaused ( Capability *cap, StgTSO * ); #endif /* THREADPAUSED_H */ hunk ./rts/Threads.c 73 size = MIN_STACK_WORDS + sizeofW(StgStack); } + tso = (StgTSO *)allocatePrim(cap, sizeofW(StgTSO)); + TICK_ALLOC_TSO(); + SET_HDR(tso, &stg_TSO_info, CCS_SYSTEM); + /* The size argument we are given includes all the per-thread * overheads: * hunk ./rts/Threads.c 89 * of a benchmark hack, but it doesn't do any harm. */ stack_size = round_to_mblocks(size - sizeofW(StgTSO)); - stack = (StgStack *)allocate(cap, stack_size); + stack = (StgStack *)allocatePrim(cap, stack_size); TICK_ALLOC_STACK(stack_size); SET_HDR(stack, &stg_STACK_info, CCS_SYSTEM); hunk ./rts/Threads.c 92 + stack->tso = tso; stack->stack_size = stack_size - sizeofW(StgStack); stack->sp = stack->stack + stack->stack_size; stack->dirty = 1; hunk ./rts/Threads.c 97 - tso = (StgTSO *)allocate(cap, sizeofW(StgTSO)); - TICK_ALLOC_TSO(); - SET_HDR(tso, &stg_TSO_info, CCS_SYSTEM); - // Always start with the compiled code evaluator tso->what_next = ThreadRunGHC; tso->why_blocked = NotBlocked; hunk ./rts/Threads.c 127 /* Link the new thread on the global thread list. */ - ACQUIRE_LOCK(&sched_mutex); - tso->id = next_thread_id++; // while we have the mutex - tso->global_link = g0->threads; - g0->threads = tso; - RELEASE_LOCK(&sched_mutex); + tso->id = atomic_inc((StgVolatilePtr)&next_thread_id) - 1; + tso->global_link = cap->r.rG0->threads; + cap->r.rG0->threads = tso; // ToDo: report the stack size in the event? traceEventCreateThread(cap, tso); hunk ./rts/Threads.c 178 prev = NULL; for (t = *queue; t != END_TSO_QUEUE; prev = t, t = t->_link) { - if (t == tso) { + if (t == tso) { if (prev) { setTSOLink(cap,prev,t->_link); t->_link = END_TSO_QUEUE; hunk ./rts/Threads.c 244 if (tso->cap != cap) { MessageWakeup *msg; - msg = (MessageWakeup *)allocate(cap,sizeofW(MessageWakeup)); + msg = (MessageWakeup *)allocateInGen(cap,global_gen_ix, + sizeofW(MessageWakeup)); SET_HDR(msg, &stg_MSG_TRY_WAKEUP_info, CCS_SYSTEM); msg->tso = tso; sendMessage(cap, tso->cap, (Message*)msg); hunk ./rts/Threads.c 249 - debugTraceCap(DEBUG_sched, cap, "message: try wakeup thread %ld on cap %d", - (lnat)tso->id, tso->cap->no); return; } #endif hunk ./rts/Threads.c 286 case BlockedOnBlackHole: case BlockedOnSTM: case ThreadMigrating: + case BlockedOnMsgGlobalise: goto unblock; default: hunk ./rts/Threads.c 325 // ThreadMigrating tells the target cap that it needs to be added to // the run queue when it receives the MSG_TRY_WAKEUP. tso->why_blocked = ThreadMigrating; + tso = globalise_TSO(from, tso); // after setting tso->cap tso->cap = to; tryWakeupThread(from, tso); } hunk ./rts/Threads.c 348 for (msg = bq->queue; msg != (MessageBlackHole*)END_TSO_QUEUE; msg = msg->link) { i = msg->header.info; - if (i != &stg_IND_info) { - ASSERT(i == &stg_MSG_BLACKHOLE_info); - tryWakeupThread(cap,msg->tso); - } +#ifdef THREADED_RTS + if (i == &stg_MSG_GLOBALISE_info) { + executeMessage(cap,(Message *)msg); + } else +#endif + if (i != &stg_STUB_MSG_BLACKHOLE_info) { + ASSERT(i == &stg_MSG_BLACKHOLE_info); + tryWakeupThread(cap,msg->tso); + } } // overwrite the BQ with an indirection so it will be hunk ./rts/Threads.c 367 // checking the owner field at the same time. bq->bh = 0; bq->queue = 0; bq->owner = 0; #endif - OVERWRITE_INFO(bq, &stg_IND_info); + OVERWRITE_PRIM_CLOSURE((StgClosure*)bq, &stg_STUB_BLOCKING_QUEUE_info, + sizeofW(StgBlockingQueue), sizeofW(StgInd)); } // If we update a closure that we know we BLACKHOLE'd, and the closure hunk ./rts/Threads.c 389 for (bq = tso->bq; bq != (StgBlockingQueue*)END_TSO_QUEUE; bq = next) { next = bq->link; - if (bq->header.info == &stg_IND_info) { + if (bq->header.info == &stg_STUB_BLOCKING_QUEUE_info) { // ToDo: could short it out right here, to avoid hunk ./rts/Threads.c 391 - // traversing this IND multiple times. + // traversing this STUB_BLOCKING_QUEUE multiple times. continue; } hunk ./rts/Threads.c 578 "allocating new stack chunk of size %d bytes", chunk_size * sizeof(W_)); - new_stack = (StgStack*) allocate(cap, chunk_size); + new_stack = (StgStack*) allocatePrim(cap, chunk_size); SET_HDR(new_stack, &stg_STACK_info, CCS_SYSTEM); TICK_ALLOC_STACK(chunk_size); hunk ./rts/Threads.c 582 + // we don't have to globalise the STACK, because it is only + // pointed to by TSO and STACK objects which are both private. + + new_stack->tso = tso; new_stack->dirty = 0; // begin clean, we'll mark it dirty below new_stack->stack_size = chunk_size - sizeofW(StgStack); new_stack->sp = new_stack->stack + new_stack->stack_size; hunk ./rts/Threads.c 641 // will be discarded after the first overflow, being replaced by a // non-moving 32k chunk. if (old_stack->sp == old_stack->stack + old_stack->stack_size) { - frame->next_chunk = new_stack; + frame->next_chunk = (StgStack*)END_TSO_QUEUE; // dummy } tso->stackobj = new_stack; hunk ./rts/Threads.c 753 case BlockedOnSTM: debugBelch("is blocked on an STM operation"); break; + case BlockedOnMsgGlobalise: + debugBelch("is blocked on globalisation of %p", + ((MessageGlobalise*)tso->block_info.closure)->req); + break; default: barf("printThreadBlockage: strange tso->why_blocked: %d for TSO %d (%d)", tso->why_blocked, tso->id, tso); hunk ./rts/Threads.c 791 void printAllThreads(void) { - StgTSO *t, *next; + StgTSO *t; nat i, g; Capability *cap; hunk ./rts/Threads.c 806 } debugBelch("other threads:\n"); - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - for (t = generations[g].threads; t != END_TSO_QUEUE; t = next) { + for (g = 0; g < total_generations; g++) { + for (t = all_generations[g].threads; t != END_TSO_QUEUE; t = t->global_link) { if (t->why_blocked != NotBlocked) { printThreadStatus(t); } hunk ./rts/Threads.c 811 - next = t->global_link; } } } hunk ./rts/Updates.cmm 51 /* ToDo: it might be a PAP, so we should check... */ TICK_UPD_CON_IN_NEW(sizeW_fromITBL(%GET_STD_INFO(updatee))); - updateWithIndirection(updatee, - R1, - jump %ENTRY_CODE(Sp(0))); + updateWithIndirectionToValue(updatee, + R1, + jump %ENTRY_CODE(Sp(0))); } hunk ./rts/Updates.cmm 80 // common case: it is still our BLACKHOLE if (v == CurrentTSO) { - updateWithIndirection(updatee, - R1, - jump %ENTRY_CODE(Sp(0))); + updateWithIndirectionToValue(updatee, + R1, + jump %ENTRY_CODE(Sp(0))); } // The other cases are all handled by the generic code hunk ./rts/Updates.h 12 #ifndef UPDATES_H #define UPDATES_H +#ifndef CMINUSMINUS +#include "sm/Globalise.h" +#endif + #ifndef CMINUSMINUS #include "BeginPrivate.h" #endif hunk ./rts/Updates.h 20 +#include "WritePolicy.h" + /* ----------------------------------------------------------------------------- Updates -------------------------------------------------------------------------- */ hunk ./rts/Updates.h 28 /* LDV profiling: * After all, we do *NOT* need to call LDV_RECORD_CREATE() for IND - * closures because they are inherently used. But, it corrupts - * the invariants that every closure keeps its creation time in the profiling - * field. So, we call LDV_RECORD_CREATE(). + * closures because they are inherently used. But, it corrupts the + * invariants that every closure keeps its creation time in the + * profiling field. So, we call LDV_RECORD_CREATE(). */ /* We have two versions of this macro (sadly), one for use in C-- code, hunk ./rts/Updates.h 45 */ #ifdef CMINUSMINUS -#define updateWithIndirection(p1, p2, and_then) \ - W_ bd; \ +// NB. for young-generation updates we use stg_IND_direct_info as an +// optimisation - this kind of indirection assumes that the indirectee +// is a value, so can just return its payload rather than entering it. +// Hence the argument p2 here is required to be a value, not another +// indirection, BLACKHOLE, or thunk. +#define updateWithIndirectionToValue(p1, p2, and_then) \ + W_ bd, p3; \ \ OVERWRITING_CLOSURE(p1); \ hunk ./rts/Updates.h 54 - StgInd_indirectee(p1) = p2; \ - prim %write_barrier() []; \ - SET_INFO(p1, stg_BLACKHOLE_info); \ - LDV_RECORD_CREATE(p1); \ - bd = Bdescr(p1); \ + bd = Bdescr(p1); \ if (bdescr_gen_no(bd) != 0 :: bits16) { \ hunk ./rts/Updates.h 56 - recordMutableCap(p1, TO_W_(bdescr_gen_no(bd)), R1); \ + ("ptr" p3) = foreign "C" UPDATE_GLOBALISE(MyCapability(), p2 "ptr"); \ + StgInd_indirectee(p1) = p3; \ + prim %write_barrier() []; \ + SET_INFO(p1, stg_BLACKHOLE_info); \ + LDV_RECORD_CREATE(p1); \ TICK_UPD_OLD_IND(); \ and_then; \ } else { \ hunk ./rts/Updates.h 64 + StgInd_indirectee(p1) = p2; \ + prim %write_barrier() []; \ + SET_INFO(p1, stg_IND_direct_info); \ + LDV_RECORD_CREATE(p1); \ TICK_UPD_NEW_IND(); \ and_then; \ } hunk ./rts/Updates.h 74 #else /* !CMINUSMINUS */ +// NB. unlike updateWithIndirectionToValue above, p2 is not required +// to be a value, and we use the more generic stg_IND_info. INLINE_HEADER void updateWithIndirection (Capability *cap, StgClosure *p1, StgClosure *p2) hunk ./rts/Updates.h 81 { bdescr *bd; + StgClosure *p3; ASSERT( (P_)p1 != (P_)p2 ); /* not necessarily true: ASSERT( !closure_IND(p1) ); */ hunk ./rts/Updates.h 87 /* occurs in RaiseAsync.c:raiseAsync() */ OVERWRITING_CLOSURE(p1); - ((StgInd *)p1)->indirectee = p2; - write_barrier(); - SET_INFO(p1, &stg_BLACKHOLE_info); - LDV_RECORD_CREATE(p1); bd = Bdescr((StgPtr)p1); if (bd->gen_no != 0) { hunk ./rts/Updates.h 89 - recordMutableCap(p1, cap, bd->gen_no); + p3 = UPDATE_GLOBALISE(cap, p2); + ((StgInd *)p1)->indirectee = p3; + write_barrier(); + SET_INFO(p1, &stg_BLACKHOLE_info); + LDV_RECORD_CREATE(p1); TICK_UPD_OLD_IND(); } else { hunk ./rts/Updates.h 96 + ((StgInd *)p1)->indirectee = p2; + write_barrier(); + SET_INFO(p1, &stg_IND_info); + LDV_RECORD_CREATE(p1); TICK_UPD_NEW_IND(); } } hunk ./rts/Weak.c 34 } void -runAllCFinalizers(StgWeak *list) +runAllCFinalizers(void) { StgWeak *w; Task *task; hunk ./rts/Weak.c 38 + nat g; + generation *gen; task = myTask(); if (task != NULL) { hunk ./rts/Weak.c 46 task->running_finalizers = rtsTrue; } - for (w = list; w; w = w->link) { - StgArrWords *farr; - - farr = (StgArrWords *)UNTAG_CLOSURE(w->cfinalizer); - - if ((StgClosure *)farr != &stg_NO_FINALIZER_closure) - runCFinalizer((void *)farr->payload[0], - (void *)farr->payload[1], - (void *)farr->payload[2], - farr->payload[3]); + for (g = 0; g < total_generations; g++) { + gen = &all_generations[g]; + + for (w = gen->weak_ptrs; w; w = w->link) { + StgArrWords *farr; + + farr = (StgArrWords *)UNTAG_CLOSURE(w->cfinalizer); + + if ((StgClosure *)farr != &stg_NO_FINALIZER_closure) + runCFinalizer((void *)farr->payload[0], + (void *)farr->payload[1], + (void *)farr->payload[2], + farr->payload[3]); + } } if (task != NULL) { hunk ./rts/Weak.c 102 for (w = list; w; w = w->link) { StgArrWords *farr; - // Better not be a DEAD_WEAK at this stage; the garbage - // collector removes DEAD_WEAKs from the weak pointer list. - ASSERT(w->header.info != &stg_DEAD_WEAK_info); + // Better be a DEAD_WEAK + ASSERT(w->header.info == &stg_DEAD_WEAK_info); if (w->finalizer != &stg_NO_FINALIZER_closure) { n++; hunk ./rts/Weak.c 117 (void *)farr->payload[2], farr->payload[3]); -#ifdef PROFILING - // A weak pointer is inherently used, so we do not need to call - // LDV_recordDead(). - // - // Furthermore, when PROFILING is turned on, dead weak - // pointers are exactly as large as weak pointers, so there is - // no need to fill the slop, either. See stg_DEAD_WEAK_info - // in StgMiscClosures.hc. -#endif - SET_HDR(w, &stg_DEAD_WEAK_info, w->header.prof.ccs); } if (task != NULL) { hunk ./rts/Weak.h 20 extern StgWeak * weak_ptr_list; void runCFinalizer(void *fn, void *ptr, void *env, StgWord flag); -void runAllCFinalizers(StgWeak *w); +void runAllCFinalizers(void); void scheduleFinalizers(Capability *cap, StgWeak *w); void markWeakList(void); addfile ./rts/WritePolicy.h hunk ./rts/WritePolicy.h 1 +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team, 2010 + * + * Write-barrier globalisation policy + * + * ---------------------------------------------------------------------------*/ + +#ifndef WRITE_POLICY_H +#define WRITE_POLICY_H + +#define UPDATE_GLOBALISE globalise_ +#define MUT_VAR_GLOBALISE globaliseFull_ +#define MUT_ARR_GLOBALISE globaliseFull_ +#define MVAR_GLOBALISE globaliseFull_ +#define TVAR_GLOBALISE globaliseFull_ +#define SPARK_GLOBALISE globaliseFull_ +#define MSG_GLOB_GLOBALISE globaliseFull_ + +#endif /* WRITE_POLICY_H */ hunk ./rts/ghc.mk 69 cp $< $@ rts/dist/build/sm/Scav_thr.c : rts/sm/Scav.c | $$(dir $$@)/. cp $< $@ +rts/dist/build/sm/Evac_loc.c : rts/sm/Evac.c | $$(dir $$@)/. + cp $< $@ +rts/dist/build/sm/Scav_loc.c : rts/sm/Scav.c | $$(dir $$@)/. + cp $< $@ rts_H_FILES = $(wildcard includes/*.h) $(wildcard rts/*.h) hunk ./rts/ghc.mk 146 endif ifneq "$$(findstring thr, $1)" "" -rts_$1_EXTRA_C_SRCS = rts/dist/build/sm/Evac_thr.c rts/dist/build/sm/Scav_thr.c +rts_$1_EXTRA_C_SRCS = rts/dist/build/sm/Evac_thr.c \ + rts/dist/build/sm/Scav_thr.c \ + rts/dist/build/sm/Evac_loc.c \ + rts/dist/build/sm/Scav_loc.c endif $(call distdir-way-opts,rts,dist,$1) hunk ./rts/ghc.mk 394 # -O3 helps unroll some loops (especially in copy() with a constant argument). rts/sm/Evac_CC_OPTS += -funroll-loops rts/dist/build/sm/Evac_thr_HC_OPTS += -optc-funroll-loops +rts/dist/build/sm/Evac_loc_HC_OPTS += -optc-funroll-loops + +rts/sm/Globalise_CC_OPTS += -funroll-loops # These files are just copies of sm/Evac.c and sm/Scav.c respectively, # but compiled with -DPARALLEL_GC. hunk ./rts/ghc.mk 402 rts/dist/build/sm/Evac_thr_CC_OPTS += -DPARALLEL_GC -Irts/sm rts/dist/build/sm/Scav_thr_CC_OPTS += -DPARALLEL_GC -Irts/sm +rts/dist/build/sm/Evac_loc_CC_OPTS += -DLOCAL_GC -Irts/sm +rts/dist/build/sm/Scav_loc_CC_OPTS += -DLOCAL_GC -Irts/sm #----------------------------------------------------------------------------- # Add PAPI library if needed hunk ./rts/sm/BlockAlloc.c 360 bd = alloc_mega_group(1); bd->blocks = n; initGroup(bd); // we know the group will fit + IF_DEBUG(sanity,memset(bd->start, 0xaa, bd->blocks * BLOCK_SIZE)); rem = bd + n; rem->blocks = BLOCKS_PER_MBLOCK-n; initGroup(rem); // init the slop hunk ./rts/sm/BlockAlloc.c 483 // Todo: not true in multithreaded GC // ASSERT_SM_LOCK(); + IF_DEBUG(sanity, checkFreeListSanity()); + ASSERT(p->free != (P_)-1); p->free = (void *)-1; /* indicates that this block is free */ hunk ./rts/sm/Compact.c 21 #include "RtsUtils.h" #include "BlockAlloc.h" #include "GC.h" +#include "GCThread.h" #include "Compact.h" #include "Schedule.h" #include "Apply.h" hunk ./rts/sm/Compact.c 511 // nothing to follow continue; - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: case MUT_ARR_PTRS_FROZEN: case MUT_ARR_PTRS_FROZEN0: // follow everything hunk ./rts/sm/Compact.c 529 case STACK: { StgStack *stack = (StgStack*)p; + thread_(&stack->tso); thread_stack(stack->sp, stack->stack + stack->stack_size); continue; } hunk ./rts/sm/Compact.c 634 case CONSTR: case PRIM: case MUT_PRIM: - case MUT_VAR_CLEAN: - case MUT_VAR_DIRTY: + case MUT_VAR_LOCAL: + case MUT_VAR_GLOBAL: case BLACKHOLE: case BLOCKING_QUEUE: { hunk ./rts/sm/Compact.c 674 case IND: case IND_PERM: + case IND_LOCAL: thread(&((StgInd *)p)->indirectee); return p + sizeofW(StgInd); hunk ./rts/sm/Compact.c 697 case ARR_WORDS: return p + arr_words_sizeW((StgArrWords *)p); - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: case MUT_ARR_PTRS_FROZEN: case MUT_ARR_PTRS_FROZEN0: // follow everything hunk ./rts/sm/Compact.c 719 case STACK: { StgStack *stack = (StgStack*)p; + thread_(&stack->tso); thread_stack(stack->sp, stack->stack + stack->stack_size); return p + stack_sizeW(stack); } hunk ./rts/sm/Compact.c 940 } void -compact(StgClosure *static_objects) +compact(gc_thread *gct) { nat g, blocks; generation *gen; hunk ./rts/sm/Compact.c 948 // 1. thread the roots markCapabilities((evac_fn)thread_root, NULL); + markScheduler((evac_fn)thread_root, NULL); + // the weak pointer lists... hunk ./rts/sm/Compact.c 951 - if (weak_ptr_list != NULL) { - thread((void *)&weak_ptr_list); + for (g = 0; g < total_generations; g++) { + gen = &all_generations[g]; + if (gen->weak_ptrs != NULL) { + thread((void *)&gen->weak_ptrs); + } } hunk ./rts/sm/Compact.c 957 - if (old_weak_ptr_list != NULL) { - thread((void *)&old_weak_ptr_list); // tmp + if (gct->old_weak_ptrs != NULL) { + thread((void *)&gct->old_weak_ptrs); } // mutable lists hunk ./rts/sm/Compact.c 966 bdescr *bd; StgPtr p; nat n; - for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) { - for (p = bd->start; p < bd->free; p++) { - thread((StgClosure **)p); - } - } for (n = 0; n < n_capabilities; n++) { for (bd = capabilities[n].mut_lists[g]; bd != NULL; bd = bd->link) { hunk ./rts/sm/Compact.c 977 } // the global thread list - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - thread((void *)&generations[g].threads); + for (g = 0; g < total_generations; g++) { + thread((void *)&all_generations[g].threads); } // any threads resurrected during this GC hunk ./rts/sm/Compact.c 982 - thread((void *)&resurrected_threads); + thread((void *)&gct->resurrected_threads); + + // any threads resurrected during this GC + thread((void *)&gct->exception_threads); // the task list { hunk ./rts/sm/Compact.c 1002 } // the static objects - thread_static(static_objects /* ToDo: ok? */); + thread_static(gct->scavenged_static_objects); // the stable pointer table threadStablePtrTable((evac_fn)thread_root, NULL); hunk ./rts/sm/Compact.c 1011 markCAFs((evac_fn)thread_root, NULL); // 2. update forward ptrs - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - gen = &generations[g]; + for (g = 0; g < total_generations; g++) { + gen = &all_generations[g]; debugTrace(DEBUG_gc, "update_fwd: %d", g); update_fwd(gen->blocks); hunk ./rts/sm/Compact.c 1017 update_fwd_large(gen->scavenged_large_objects); - if (g == RtsFlags.GcFlags.generations-1 && gen->old_blocks != NULL) { + if (g == total_generations-1 && gen->old_blocks != NULL) { debugTrace(DEBUG_gc, "update_fwd: %d (compact)", g); update_fwd_compact(gen->old_blocks); } hunk ./rts/sm/Compact.h 18 #define SM_COMPACT_H #include "BeginPrivate.h" +#include "GCThread.h" INLINE_HEADER void mark(StgPtr p, bdescr *bd) hunk ./rts/sm/Compact.h 23 { - nat offset_within_block = p - bd->start; // in words + StgWord offset_within_block = ((W_)p & BLOCK_MASK) / sizeof(W_); // in words StgPtr bitmap_word = (StgPtr)bd->u.bitmap + (offset_within_block / (sizeof(W_)*BITS_PER_BYTE)); StgWord bit_mask = (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1)); hunk ./rts/sm/Compact.h 27 + ASSERT(((W_)bitmap_word & MBLOCK_MASK) >= FIRST_BLOCK_OFF); *bitmap_word |= bit_mask; } hunk ./rts/sm/Compact.h 34 INLINE_HEADER void unmark(StgPtr p, bdescr *bd) { - nat offset_within_block = p - bd->start; // in words + StgWord offset_within_block = ((W_)p & BLOCK_MASK) / sizeof(W_); // in words StgPtr bitmap_word = (StgPtr)bd->u.bitmap + (offset_within_block / (sizeof(W_)*BITS_PER_BYTE)); StgWord bit_mask = (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1)); hunk ./rts/sm/Compact.h 44 INLINE_HEADER StgWord is_marked(StgPtr p, bdescr *bd) { - nat offset_within_block = p - bd->start; // in words + StgWord offset_within_block = ((W_)p & BLOCK_MASK) / sizeof(W_); // in words StgPtr bitmap_word = (StgPtr)bd->u.bitmap + (offset_within_block / (sizeof(W_)*BITS_PER_BYTE)); StgWord bit_mask = (StgWord)1 << (offset_within_block & (sizeof(W_)*BITS_PER_BYTE - 1)); hunk ./rts/sm/Compact.h 51 return (*bitmap_word & bit_mask); } -void compact (StgClosure *static_objects); +void compact (gc_thread *); #include "EndPrivate.h" hunk ./rts/sm/Evac.c 21 #include "Storage.h" #include "GC.h" #include "GCThread.h" +#include "GCTDecl.h" #include "GCUtils.h" #include "Compact.h" #include "MarkStack.h" hunk ./rts/sm/Evac.c 33 StgWord64 whitehole_spin = 0; #endif -#if defined(THREADED_RTS) && !defined(PARALLEL_GC) -#define evacuate(p) evacuate1(p) -#define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p) +#if defined(THREADED_RTS) +#if defined(LOCAL_GC) +#define evacuate(p) evacuate_local(p) +#elif !defined(PARALLEL_GC) +#define evacuate(p) evacuate_seq(p) +// evacuate() is the safe one; evacuate_seq() can only be used for +// sequential GC. +#endif #endif #if !defined(PARALLEL_GC) hunk ./rts/sm/Evac.c 44 +#define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p) + #define copy_tag_nolock(p, info, src, size, stp, tag) \ copy_tag(p, info, src, size, stp, tag) #endif hunk ./rts/sm/Evac.c 62 -------------------------------------------------------------------------- */ STATIC_INLINE StgPtr -alloc_for_copy (nat size, generation *gen) +alloc_for_copy (nat size, nat gen_ix) { StgPtr to; gen_workspace *ws; hunk ./rts/sm/Evac.c 72 * evacuate to an older generation, adjust it here (see comment * by evacuate()). */ - if (gen < gct->evac_gen) { + if (gen_ix < gct->evac_gen_ix) { if (gct->eager_promotion) { hunk ./rts/sm/Evac.c 74 - gen = gct->evac_gen; + gen_ix = gct->evac_gen_ix; } else { gct->failed_to_evac = rtsTrue; } hunk ./rts/sm/Evac.c 80 } - ws = &gct->gens[gen->no]; - // this compiles to a single mem access to gen->abs_no only + ws = &gct->gens[gen_ix]; /* chain a new block onto the to-space for the destination gen if * necessary. hunk ./rts/sm/Evac.c 100 -------------------------------------------------------------------------- */ STATIC_INLINE GNUC_ATTR_HOT void -copy_tag(StgClosure **p, const StgInfoTable *info, - StgClosure *src, nat size, generation *gen, StgWord tag) +copy_closure(StgPtr from, StgPtr to, const StgInfoTable *info, nat size) { hunk ./rts/sm/Evac.c 102 - StgPtr to, from; nat i; hunk ./rts/sm/Evac.c 104 - to = alloc_for_copy(size,gen); - - from = (StgPtr)src; to[0] = (W_)info; hunk ./rts/sm/Evac.c 105 - for (i = 1; i < size; i++) { // unroll for small i + for (i = 1; i < size; i++) { // unroll for small size to[i] = from[i]; } hunk ./rts/sm/Evac.c 108 +} hunk ./rts/sm/Evac.c 110 -// if (to+size+2 < bd->start + BLOCK_SIZE_W) { -// __builtin_prefetch(to + size + 2, 1); -// } +STATIC_INLINE GNUC_ATTR_HOT void +copy_tag(StgClosure **p, const StgInfoTable *info, + StgClosure *src, nat size, nat gen_ix, StgWord tag) +{ + StgPtr to; + + to = alloc_for_copy(size,gen_ix); + copy_closure((StgPtr)src,to,info,size); #if defined(PARALLEL_GC) { hunk ./rts/sm/Evac.c 137 #ifdef PROFILING // We store the size of the just evacuated object in the LDV word so that // the profiler can guess the position of the next object later. - SET_EVACUAEE_FOR_LDV(from, size); + SET_EVACUAEE_FOR_LDV(src, size); #endif } hunk ./rts/sm/Evac.c 144 #if defined(PARALLEL_GC) STATIC_INLINE void copy_tag_nolock(StgClosure **p, const StgInfoTable *info, - StgClosure *src, nat size, generation *gen, StgWord tag) + StgClosure *src, nat size, nat gen_ix, StgWord tag) { hunk ./rts/sm/Evac.c 146 - StgPtr to, from; - nat i; - - to = alloc_for_copy(size,gen); + StgPtr to; hunk ./rts/sm/Evac.c 148 - from = (StgPtr)src; - to[0] = (W_)info; - for (i = 1; i < size; i++) { // unroll for small i - to[i] = from[i]; - } + to = alloc_for_copy(size,gen_ix); + copy_closure((StgPtr)src,to,info,size); // if somebody else reads the forwarding pointer, we better make // sure there's a closure at the end of it. hunk ./rts/sm/Evac.c 164 #ifdef PROFILING // We store the size of the just evacuated object in the LDV word so that // the profiler can guess the position of the next object later. - SET_EVACUAEE_FOR_LDV(from, size); + SET_EVACUAEE_FOR_LDV(src, size); #endif } #endif hunk ./rts/sm/Evac.c 175 */ static rtsBool copyPart(StgClosure **p, StgClosure *src, nat size_to_reserve, - nat size_to_copy, generation *gen) + nat size_to_copy, nat gen_ix) { hunk ./rts/sm/Evac.c 177 - StgPtr to, from; - nat i; + StgPtr to; StgWord info; #if defined(PARALLEL_GC) hunk ./rts/sm/Evac.c 198 info = (W_)src->header.info; #endif - to = alloc_for_copy(size_to_reserve, gen); + to = alloc_for_copy(size_to_reserve, gen_ix); hunk ./rts/sm/Evac.c 200 - from = (StgPtr)src; - to[0] = info; - for (i = 1; i < size_to_copy; i++) { // unroll for small i - to[i] = from[i]; - } + copy_closure((StgPtr)src,to,(const StgInfoTable*)info,size_to_copy); write_barrier(); src->header.info = (const StgInfoTable*)MK_FORWARDING_PTR(to); hunk ./rts/sm/Evac.c 209 #ifdef PROFILING // We store the size of the just evacuated object in the LDV word so that // the profiler can guess the position of the next object later. - SET_EVACUAEE_FOR_LDV(from, size_to_reserve); + SET_EVACUAEE_FOR_LDV(src, size_to_reserve); // fill the slop if (size_to_reserve - size_to_copy > 0) LDV_FILL_SLOP(to + size_to_copy, (int)(size_to_reserve - size_to_copy)); hunk ./rts/sm/Evac.c 222 /* Copy wrappers that don't tag the closure after copying */ STATIC_INLINE GNUC_ATTR_HOT void copy(StgClosure **p, const StgInfoTable *info, - StgClosure *src, nat size, generation *gen) + StgClosure *src, nat size, nat gen_ix) { hunk ./rts/sm/Evac.c 224 - copy_tag(p,info,src,size,gen,0); + copy_tag(p,info,src,size,gen_ix,0); } /* ----------------------------------------------------------------------------- hunk ./rts/sm/Evac.c 243 { bdescr *bd; generation *gen, *new_gen; + nat gen_ix, new_gen_ix; gen_workspace *ws; bd = Bdescr(p); hunk ./rts/sm/Evac.c 247 + gen_ix = bd->gen_ix; gen = bd->gen; hunk ./rts/sm/Evac.c 249 - ACQUIRE_SPIN_LOCK(&gen->sync_large_objects); + ACQUIRE_SPIN_LOCK(&gen->sync); // already evacuated? if (bd->flags & BF_EVACUATED) { hunk ./rts/sm/Evac.c 256 /* Don't forget to set the gct->failed_to_evac flag if we didn't get * the desired destination (see comments in evacuate()). */ - if (gen < gct->evac_gen) { + if (gen_ix < gct->evac_gen_ix) { gct->failed_to_evac = rtsTrue; TICK_GC_FAILED_PROMOTION(); } hunk ./rts/sm/Evac.c 260 - RELEASE_SPIN_LOCK(&gen->sync_large_objects); + RELEASE_SPIN_LOCK(&gen->sync); return; } hunk ./rts/sm/Evac.c 276 /* link it on to the evacuated large object list of the destination gen */ - new_gen = bd->dest; - if (new_gen < gct->evac_gen) { + new_gen_ix = bd->dest_ix; + + if (new_gen_ix < gct->evac_gen_ix) { if (gct->eager_promotion) { hunk ./rts/sm/Evac.c 280 - new_gen = gct->evac_gen; + new_gen_ix = gct->evac_gen_ix; } else { gct->failed_to_evac = rtsTrue; } hunk ./rts/sm/Evac.c 286 } - ws = &gct->gens[new_gen->no]; + ws = &gct->gens[new_gen_ix]; + new_gen = &all_generations[new_gen_ix]; bd->flags |= BF_EVACUATED; initBdescr(bd, new_gen, new_gen->to); hunk ./rts/sm/Evac.c 298 // them straight on the scavenged_large_objects list. if (bd->flags & BF_PINNED) { ASSERT(get_itbl((StgClosure *)p)->type == ARR_WORDS); - if (new_gen != gen) { ACQUIRE_SPIN_LOCK(&new_gen->sync_large_objects); } + if (new_gen != gen) { ACQUIRE_SPIN_LOCK(&new_gen->sync); } dbl_link_onto(bd, &new_gen->scavenged_large_objects); new_gen->n_scavenged_large_blocks += bd->blocks; hunk ./rts/sm/Evac.c 301 - if (new_gen != gen) { RELEASE_SPIN_LOCK(&new_gen->sync_large_objects); } + if (new_gen != gen) { RELEASE_SPIN_LOCK(&new_gen->sync); } } else { bd->link = ws->todo_large_objects; ws->todo_large_objects = bd; hunk ./rts/sm/Evac.c 307 } - RELEASE_SPIN_LOCK(&gen->sync_large_objects); + RELEASE_SPIN_LOCK(&gen->sync); } /* ---------------------------------------------------------------------------- hunk ./rts/sm/Evac.c 342 as good as it's going to get. We pass the argument to evacuate() in a register using the 'regparm' attribute (see the prototype for evacuate() near the top of this file). - - Changing evacuate() to take an (StgClosure **) rather than - returning the new pointer seems attractive, because we can avoid - writing back the pointer when it hasn't changed (eg. for a static - object, or an object in a generation > N). However, I tried it and - it doesn't help. One reason is that the (StgClosure **) pointer - gets spilled to the stack inside evacuate(), resulting in far more - extra reads/writes than we save. ------------------------------------------------------------------------- */ REGPARM1 GNUC_ATTR_HOT void hunk ./rts/sm/Evac.c 348 evacuate(StgClosure **p) { bdescr *bd = NULL; - generation *gen; + nat gen_ix; StgClosure *q; const StgInfoTable *info; StgWord tag; hunk ./rts/sm/Evac.c 360 tag = GET_CLOSURE_TAG(q); q = UNTAG_CLOSURE(q); +loop2: + ASSERTM(LOOKS_LIKE_CLOSURE_PTR(q), "invalid closure, info=%p", q->header.info); if (!HEAP_ALLOCED_GC(q)) { hunk ./rts/sm/Evac.c 366 +#ifdef LOCAL_GC + return; +#else if (!major_gc) return; info = get_itbl(q); hunk ./rts/sm/Evac.c 460 default: barf("evacuate(static): strange closure type %d", (int)(info->type)); } +#endif /* !LOCAL_GC */ } bd = Bdescr((P_)q); hunk ./rts/sm/Evac.c 465 - if ((bd->flags & (BF_LARGE | BF_MARKED | BF_EVACUATED)) != 0) { +#if defined(LOCAL_GC) + // make sure this isn't a pointer into the wrong local heap + ASSERT(bd->gen_no > gct->collect_gen || bd->gen_ix == gct->index || + ((bd->flags & BF_PRIM) && isGlobalPrim(q))); +#endif + + // in local GC, we mark/sweep BF_PRIM blocks, but in global GC + // we copy them. + if ((bd->flags & (BF_LARGE | BF_MARKED | BF_EVACUATED | BF_PRIM)) != 0) { // pointer into to-space: just return it. It might be a pointer // into a generation that we aren't collecting (> N), or it hunk ./rts/sm/Evac.c 485 // We aren't copying this object, so we have to check // whether it is already in the target generation. (this is // the write barrier). - if (bd->gen < gct->evac_gen) { + if (bd->gen_ix < gct->evac_gen_ix) { gct->failed_to_evac = rtsTrue; TICK_GC_FAILED_PROMOTION(); } hunk ./rts/sm/Evac.c 499 return; } + if ((bd->flags & BF_PRIM)) { + // we may have to record a failed evac here, so that the + // parent object is recorded in the remembered set. + // However, at the end of GC globalise_mut_list will remove + // the mut_list entry and globalise the transitive closure + // of objects pointed to by it. + if (bd->gen_ix < gct->evac_gen_ix) { + gct->failed_to_evac = rtsTrue; + TICK_GC_FAILED_PROMOTION(); + } + // beware, this object may be in another Capability's local + // heap. If so, it ought to be global. + if (!isGlobalPrim((StgClosure*)q)) { + ASSERT(gct->gc_type == GC_LOCAL ? bd->gen_ix == gct->index : 1); + if (!is_marked((P_)q,bd)) { + mark((P_)q,bd); + push_mark_stack(q); + } + } + return; + } + /* If the object is in a gen that we're compacting, then we * need to use an alternative evacuate procedure. */ hunk ./rts/sm/Evac.c 526 if (!is_marked((P_)q,bd)) { mark((P_)q,bd); - push_mark_stack((P_)q); + push_mark_stack(q); } return; } hunk ./rts/sm/Evac.c 530 - - gen = bd->dest; info = q->header.info; if (IS_FORWARDING_PTR(info)) hunk ./rts/sm/Evac.c 534 { - /* Already evacuated, just return the forwarding address. - * HOWEVER: if the requested destination generation (gct->evac_gen) is - * older than the actual generation (because the object was - * already evacuated to a younger generation) then we have to - * set the gct->failed_to_evac flag to indicate that we couldn't - * manage to promote the object to the desired generation. - */ - /* - * Optimisation: the check is fairly expensive, but we can often - * shortcut it if either the required generation is 0, or the - * current object (the EVACUATED) is in a high enough generation. - * We know that an EVACUATED always points to an object in the - * same or an older generation. gen is the lowest generation that the - * current object would be evacuated to, so we only do the full - * check if gen is too low. - */ - StgClosure *e = (StgClosure*)UN_FORWARDING_PTR(info); - *p = TAG_CLOSURE(tag,e); - if (gen < gct->evac_gen) { // optimisation - if (Bdescr((P_)e)->gen < gct->evac_gen) { - gct->failed_to_evac = rtsTrue; - TICK_GC_FAILED_PROMOTION(); - } - } - return; + q = (StgClosure*)UN_FORWARDING_PTR(info); + *p = TAG_CLOSURE(tag,q); + // we must loop here: the forwarding pointer might be into + // from-space if this is a globalised closure. + goto loop2; // loop2 not loop: keep the previous tag } hunk ./rts/sm/Evac.c 541 + gen_ix = bd->dest_ix; + switch (INFO_PTR_TO_STRUCT(info)->type) { case WHITEHOLE: hunk ./rts/sm/Evac.c 548 goto loop; - case MUT_VAR_CLEAN: - case MUT_VAR_DIRTY: - case MVAR_CLEAN: - case MVAR_DIRTY: - copy(p,info,q,sizeW_fromITBL(INFO_PTR_TO_STRUCT(info)),gen); - return; - // For ints and chars of low value, save space by replacing references to // these with closures with references to common, shared ones in the RTS. // hunk ./rts/sm/Evac.c 557 case CONSTR_0_1: { #if defined(__PIC__) && defined(mingw32_HOST_OS) - copy_tag_nolock(p,info,q,sizeofW(StgHeader)+1,gen,tag); + copy_tag_nolock(p,info,q,sizeofW(StgHeader)+1,gen_ix,tag); #else StgWord w = (StgWord)q->payload[0]; if (info == Czh_con_info && hunk ./rts/sm/Evac.c 574 ); } else { - copy_tag_nolock(p,info,q,sizeofW(StgHeader)+1,gen,tag); + copy_tag_nolock(p,info,q,sizeofW(StgHeader)+1,gen_ix,tag); } #endif return; hunk ./rts/sm/Evac.c 583 case FUN_0_1: case FUN_1_0: case CONSTR_1_0: - copy_tag_nolock(p,info,q,sizeofW(StgHeader)+1,gen,tag); + copy_tag_nolock(p,info,q,sizeofW(StgHeader)+1,gen_ix,tag); return; case THUNK_1_0: hunk ./rts/sm/Evac.c 588 case THUNK_0_1: - copy(p,info,q,sizeofW(StgThunk)+1,gen); + copy(p,info,q,sizeofW(StgThunk)+1,gen_ix); return; case THUNK_1_1: hunk ./rts/sm/Evac.c 594 case THUNK_2_0: case THUNK_0_2: -#ifdef NO_PROMOTE_THUNKS -#error bitrotted -#endif - copy(p,info,q,sizeofW(StgThunk)+2,gen); - return; + copy(p,info,q,sizeofW(StgThunk)+2,gen_ix); + return; case FUN_1_1: case FUN_2_0: hunk ./rts/sm/Evac.c 602 case FUN_0_2: case CONSTR_1_1: case CONSTR_2_0: - copy_tag_nolock(p,info,q,sizeofW(StgHeader)+2,gen,tag); - return; - case CONSTR_0_2: hunk ./rts/sm/Evac.c 603 - copy_tag_nolock(p,info,q,sizeofW(StgHeader)+2,gen,tag); + copy_tag_nolock(p,info,q,sizeofW(StgHeader)+2,gen_ix,tag); return; case THUNK: hunk ./rts/sm/Evac.c 607 - copy(p,info,q,thunk_sizeW_fromITBL(INFO_PTR_TO_STRUCT(info)),gen); + copy(p,info,q,thunk_sizeW_fromITBL(INFO_PTR_TO_STRUCT(info)),gen_ix); return; case FUN: hunk ./rts/sm/Evac.c 612 case IND_PERM: + case IND_LOCAL: case CONSTR: hunk ./rts/sm/Evac.c 614 - copy_tag_nolock(p,info,q,sizeW_fromITBL(INFO_PTR_TO_STRUCT(info)),gen,tag); + copy_tag_nolock(p,info,q,sizeW_fromITBL(INFO_PTR_TO_STRUCT(info)),gen_ix,tag); return; case BLACKHOLE: hunk ./rts/sm/Evac.c 632 || i == &stg_WHITEHOLE_info || i == &stg_BLOCKING_QUEUE_CLEAN_info || i == &stg_BLOCKING_QUEUE_DIRTY_info) { - copy(p,info,q,sizeofW(StgInd),gen); + copy(p,info,q,sizeofW(StgInd),gen_ix); return; } hunk ./rts/sm/Evac.c 635 - ASSERT(i != &stg_IND_info); + ASSERT(i != &stg_STUB_BLOCKING_QUEUE_info); } q = r; *p = r; hunk ./rts/sm/Evac.c 646 case WEAK: case PRIM: case MUT_PRIM: - copy(p,info,q,sizeW_fromITBL(INFO_PTR_TO_STRUCT(info)),gen); + copy(p,info,q,sizeW_fromITBL(INFO_PTR_TO_STRUCT(info)),gen_ix); return; case BCO: hunk ./rts/sm/Evac.c 650 - copy(p,info,q,bco_sizeW((StgBCO *)q),gen); + copy(p,info,q,bco_sizeW((StgBCO *)q),gen_ix); return; case THUNK_SELECTOR: hunk ./rts/sm/Evac.c 658 return; case IND: - // follow chains of indirections, don't evacuate them - q = ((StgInd*)q)->indirectee; - *p = q; - goto loop; - - case RET_BCO: - case RET_SMALL: - case RET_BIG: - case RET_DYN: - case UPDATE_FRAME: - case UNDERFLOW_FRAME: - case STOP_FRAME: - case CATCH_FRAME: - case CATCH_STM_FRAME: - case CATCH_RETRY_FRAME: - case ATOMICALLY_FRAME: - // shouldn't see these - barf("evacuate: stack frame at %p\n", q); + // follow chains of indirections, don't evacuate them + q = ((StgInd*)q)->indirectee; + *p = q; + goto loop; case PAP: hunk ./rts/sm/Evac.c 664 - copy(p,info,q,pap_sizeW((StgPAP*)q),gen); + copy(p,info,q,pap_sizeW((StgPAP*)q),gen_ix); return; case AP: hunk ./rts/sm/Evac.c 668 - copy(p,info,q,ap_sizeW((StgAP*)q),gen); + copy(p,info,q,ap_sizeW((StgAP*)q),gen_ix); return; case AP_STACK: hunk ./rts/sm/Evac.c 672 - copy(p,info,q,ap_stack_sizeW((StgAP_STACK*)q),gen); + copy(p,info,q,ap_stack_sizeW((StgAP_STACK*)q),gen_ix); return; case ARR_WORDS: hunk ./rts/sm/Evac.c 677 // just copy the block - copy(p,info,q,arr_words_sizeW((StgArrWords *)q),gen); + copy(p,info,q,arr_words_sizeW((StgArrWords *)q),gen_ix); + return; + + case MUT_VAR_LOCAL: + case MUT_VAR_GLOBAL: + case MVAR_CLEAN: + case MVAR_DIRTY: + copy(p,info,q,sizeW_fromITBL(INFO_PTR_TO_STRUCT(info)),gen_ix); return; hunk ./rts/sm/Evac.c 687 - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: case MUT_ARR_PTRS_FROZEN: case MUT_ARR_PTRS_FROZEN0: // just copy the block hunk ./rts/sm/Evac.c 692 - copy(p,info,q,mut_arr_ptrs_sizeW((StgMutArrPtrs *)q),gen); + copy(p,info,q,mut_arr_ptrs_sizeW((StgMutArrPtrs *)q),gen_ix); return; case TSO: hunk ./rts/sm/Evac.c 696 - copy(p,info,q,sizeofW(StgTSO),gen); + copy(p,info,q,sizeofW(StgTSO),gen_ix); return; case STACK: hunk ./rts/sm/Evac.c 711 rtsBool mine; mine = copyPart(p,(StgClosure *)stack, stack_sizeW(stack), - sizeofW(StgStack), gen); + sizeofW(StgStack), gen_ix); if (mine) { new_stack = (StgStack *)*p; move_STACK(stack, new_stack); hunk ./rts/sm/Evac.c 725 } case TREC_CHUNK: - copy(p,info,q,sizeofW(StgTRecChunk),gen); + copy(p,info,q,sizeofW(StgTRecChunk),gen_ix); return; default: hunk ./rts/sm/Evac.c 829 unchain_thunk_selectors(prev_thunk_selector, (StgClosure *)p); *q = (StgClosure *)p; // shortcut, behave as for: if (evac) evacuate(q); - if (evac && bd->gen < gct->evac_gen) { + if (evac && bd->gen_ix < gct->evac_gen_ix) { gct->failed_to_evac = rtsTrue; TICK_GC_FAILED_PROMOTION(); } hunk ./rts/sm/Evac.c 937 SET_INFO(p, (StgInfoTable *)info_ptr); OVERWRITING_CLOSURE((StgClosure*)p); SET_INFO(p, &stg_WHITEHOLE_info); + // LDV_RECORD_CREATE is done in unchain_thunk_selectors #endif // the closure in val is now the "value" of the hunk ./rts/sm/Evac.c 952 info = INFO_PTR_TO_STRUCT(info_ptr); switch (info->type) { case IND: + case IND_LOCAL: case IND_PERM: case IND_STATIC: val = ((StgInd *)val)->indirectee; hunk ./rts/sm/Evac.c 986 } case IND: + case IND_LOCAL: case IND_PERM: case IND_STATIC: // Again, we might need to untag a constructor. hunk ./rts/sm/Evac.c 1013 || i == &stg_BLOCKING_QUEUE_DIRTY_info) { goto bale_out; } - ASSERT(i != &stg_IND_info); + ASSERT(i != &stg_STUB_BLOCKING_QUEUE_info); } selectee = UNTAG_CLOSURE( ((StgInd *)selectee)->indirectee ); hunk ./rts/sm/Evac.c 1071 // check whether it was updated in the meantime. *q = (StgClosure *)p; if (evac) { - copy(q,(const StgInfoTable *)info_ptr,(StgClosure *)p,THUNK_SELECTOR_sizeW(),bd->dest); + copy(q,(const StgInfoTable *)info_ptr,(StgClosure *)p,THUNK_SELECTOR_sizeW(),bd->dest_ix); } unchain_thunk_selectors(prev_thunk_selector, *q); return; hunk ./rts/sm/Evac.h 35 #define REGPARM1 #endif -REGPARM1 void evacuate (StgClosure **p); -REGPARM1 void evacuate1 (StgClosure **p); +REGPARM1 void evacuate (StgClosure **p); +REGPARM1 void evacuate_seq (StgClosure **p); +REGPARM1 void evacuate_local (StgClosure **p); + +REGPARM1 void evacuate_local_only (StgClosure **p); extern lnat thunk_selector_depth; hunk ./rts/sm/GC.c 43 #include "GC.h" #include "GCThread.h" +#include "GCTDecl.h" #include "Compact.h" #include "Evac.h" #include "Scav.h" hunk ./rts/sm/GC.c 52 #include "MarkWeak.h" #include "Sparks.h" #include "Sweep.h" +#include "Globalise.h" #include // for memset() #include hunk ./rts/sm/GC.c 101 * flag) is when we're collecting all generations. We only attempt to * deal with static objects and GC CAFs when doing a major GC. */ -nat N; rtsBool major_gc; hunk ./rts/sm/GC.c 103 +rtsBool work_stealing; + +nat next_gc_gen; + +// Number of threads running in *this* GC +nat n_gc_threads; + /* Data used for allocation area sizing. */ static lnat g0_pcnt_kept = 30; // percentage of g0 live at last minor GC hunk ./rts/sm/GC.c 130 StgWord8 the_gc_thread[sizeof(gc_thread) + 64 * sizeof(gen_workspace)]; #endif -// Number of threads running in *this* GC. Affects how many -// step->todos[] lists we have to look in to find work. -nat n_gc_threads; - -// For stats: -long copied; // *words* copied & scavenged during this GC +// The number of currently active GC threads; use inc_running()/dec_running() +static volatile StgWord gc_running_threads; hunk ./rts/sm/GC.c 133 -rtsBool work_stealing; +#if defined(THREADED_RTS) +static Mutex gc_local_mutex; +#endif DECLARE_GCT hunk ./rts/sm/GC.c 143 Static function declarations -------------------------------------------------------------------------- */ -static void mark_root (void *user, StgClosure **root); -static void zero_static_object_list (StgClosure* first_static); -static nat initialise_N (rtsBool force_major_gc); -static void init_collected_gen (nat g, nat threads); -static void init_uncollected_gen (nat g, nat threads); -static void init_gc_thread (gc_thread *t); -static void resize_generations (void); -static void resize_nursery (void); -static void start_gc_threads (void); -static void scavenge_until_all_done (void); -static StgWord inc_running (void); -static StgWord dec_running (void); -static void wakeup_gc_threads (nat n_threads, nat me); -static void shutdown_gc_threads (nat n_threads, nat me); +static void mark_root (void *user, StgClosure **root); +static void zero_static_object_list (StgClosure* first_static); +static nat determine_collect_gen (void); +static void stash_mut_list (Capability *cap, nat gen_no); +static void prepare_collected_gen (generation *gen); +static void prepare_uncollected_gen (nat gc_type, generation *gen); +static void init_gc_thread (gc_thread *); +static void prepare_gc_thread (void); +static void resize_generations (void); +static void resize_nursery (lnat copied, nat N); +static void scavenge_until_all_done (void); +static StgWord inc_running (void); +static StgWord dec_running (void); +static void wakeup_gc_threads (nat me, nat N); +static void shutdown_gc_threads (nat me); +static void collect_gct_blocks (void); +static lnat count_part_lists (nat start_gen); +static void freeMarkStack (void); #if 0 && defined(DEBUG) static void gcCAFs (void); hunk ./rts/sm/GC.c 166 #endif -/* ----------------------------------------------------------------------------- - The mark stack. - -------------------------------------------------------------------------- */ - -bdescr *mark_stack_top_bd; // topmost block in the mark stack -bdescr *mark_stack_bd; // current block in the mark stack -StgPtr mark_sp; // pointer to the next unallocated mark stack entry - /* ----------------------------------------------------------------------------- GarbageCollect: the main entry point to the garbage collector. hunk ./rts/sm/GC.c 173 -------------------------------------------------------------------------- */ void -GarbageCollect (rtsBool force_major_gc, +GarbageCollect (nat N, // generation to collect nat gc_type USED_IF_THREADS, Capability *cap) { hunk ./rts/sm/GC.c 179 bdescr *bd; generation *gen; - lnat live, allocated, max_copied, avg_copied, slop; + lnat live, allocated, copied, max_copied, avg_copied, slop; gc_thread *saved_gct; hunk ./rts/sm/GC.c 181 - nat g, t, n; + nat g, n; // necessary if we stole a callee-saves register for gct: saved_gct = gct; hunk ./rts/sm/GC.c 190 CostCentreStack *prev_CCS; #endif - ACQUIRE_SM_LOCK; + if (gc_type != GC_LOCAL) { ACQUIRE_SM_LOCK; } #if defined(RTS_USER_SIGNALS) hunk ./rts/sm/GC.c 193 - if (RtsFlags.MiscFlags.install_signal_handlers) { + if (RtsFlags.MiscFlags.install_signal_handlers && + gc_type != GC_LOCAL) { // block signals blockUserSignals(); } hunk ./rts/sm/GC.c 203 ASSERT(sizeof(gen_workspace) == 16 * sizeof(StgWord)); // otherwise adjust the padding in gen_workspace. - // tell the stats department that we've started a GC - stat_startGC(); + SET_GCT(gc_threads[cap->no]); + + gct->gc_type = gc_type; + major_gc = (N == RtsFlags.GcFlags.generations-1); hunk ./rts/sm/GC.c 208 - // tell the STM to discard any cached closures it's hoping to re-use - stmPreGCHook(); + // tell the stats department that we've started a GC + stat_startGC(gct); // lock the StablePtr table hunk ./rts/sm/GC.c 212 - stablePtrPreGC(); + if (gc_type != GC_LOCAL) stablePtrPreGC(); + +// if (gc_type == GC_LOCAL) { ACQUIRE_LOCK(&gct->local_gc_lock); } #ifdef DEBUG hunk ./rts/sm/GC.c 217 + // these are stats only; we could make them thread-local but it + // doesn't matter if they aren't accurate. mutlist_MUTVARS = 0; mutlist_MUTARRS = 0; hunk ./rts/sm/GC.c 221 - mutlist_OTHERS = 0; + mutlist_OTHERS = 0; #endif // attribute any costs to CCS_GC hunk ./rts/sm/GC.c 230 CCCS = CCS_GC; #endif - /* Approximate how much we allocated. - * Todo: only when generating stats? - */ - allocated = calcAllocated(rtsFalse/* don't count the nursery yet */); + // Approximate how much we allocated since the last GC. + if (gc_type == GC_LOCAL) { + allocated = calcAllocatedCap(cap, rtsFalse); + } else { + allocated = calcAllocated(rtsFalse/* don't count the nursery yet */); + } /* Figure out which generation to collect */ hunk ./rts/sm/GC.c 239 - n = initialise_N(force_major_gc); + if (gc_type == GC_LOCAL) { + ASSERT(N == 0); + } + gct->collect_gen = N; #if defined(THREADED_RTS) hunk ./rts/sm/GC.c 245 - work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled && - N >= RtsFlags.ParFlags.parGcLoadBalancingGen; + if (gc_type == GC_LOCAL) { + work_stealing = rtsFalse; + } else { + work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled && + N >= RtsFlags.ParFlags.parGcLoadBalancingGen; + } // It's not always a good idea to do load balancing in parallel // GC. In particular, for a parallel program we don't want to // lose locality by moving cached data into another CPU's cache hunk ./rts/sm/GC.c 262 // a flag. #endif - /* Start threads, so they can be spinning up while we finish initialisation. - */ - start_gc_threads(); + gc_running_threads = 0; #if defined(THREADED_RTS) hunk ./rts/sm/GC.c 265 - /* How many threads will be participating in this GC? - * We don't try to parallelise minor GCs (unless the user asks for - * it with +RTS -gn0), or mark/compact/sweep GC. - */ - if (gc_type == PENDING_GC_PAR) { + // How many threads will be participating in this GC? + if (gc_type == GC_PAR) { n_gc_threads = RtsFlags.ParFlags.nNodes; } else { n_gc_threads = 1; hunk ./rts/sm/GC.c 275 n_gc_threads = 1; #endif - debugTrace(DEBUG_gc, "GC (gen %d): %d KB to collect, %ld MB in use, using %d thread(s)", - N, n * (BLOCK_SIZE / 1024), mblocks_allocated, n_gc_threads); + debugTrace(DEBUG_gc, "GC (%s, gen %d): %ld MB in use, using %d thread(s)", + gc_type==GC_LOCAL ? "GC_LOCAL" : + gc_type==GC_PAR ? "GC_PAR" : "GC_SEQ", + N, mblocks_allocated, n_gc_threads); #ifdef RTS_GTK_FRONTPANEL if (RtsFlags.GcFlags.frontpanel) { hunk ./rts/sm/GC.c 288 #ifdef DEBUG // check for memory leaks if DEBUG is on - memInventory(DEBUG_gc); + if (gc_type != GC_LOCAL || n_capabilities == 1) { + memInventory(DEBUG_gc); + } #endif // check sanity *before* GC hunk ./rts/sm/GC.c 294 - IF_DEBUG(sanity, checkSanity(rtsTrue)); + IF_DEBUG(sanity, checkSanity (gc_type == GC_LOCAL && n_capabilities != 1, + rtsFalse /* before GC */, + major_gc, + cap->no)); hunk ./rts/sm/GC.c 299 - // Initialise all our gc_thread structures - for (t = 0; t < n_gc_threads; t++) { - init_gc_thread(gc_threads[t]); + // Prepare the young generations: + if (gc_type == GC_LOCAL) { + prepare_collected_gen(&all_generations[gct->index]); + } else { + for (n = 0; n < n_capabilities; n++) { + prepare_collected_gen(&all_generations[n]); + } } hunk ./rts/sm/GC.c 307 - - // Initialise all the generations/steps that we're collecting. - for (g = 0; g <= N; g++) { - init_collected_gen(g,n_gc_threads); + // Prepare the generatinos we're collecting, 1..N + for (g = 1; g <= N; g++) { + prepare_collected_gen(&old_generations[g]); } hunk ./rts/sm/GC.c 311 - - // Initialise all the generations/steps that we're *not* collecting. + // Prepare the generations/steps that we're *not* collecting. for (g = N+1; g < RtsFlags.GcFlags.generations; g++) { hunk ./rts/sm/GC.c 313 - init_uncollected_gen(g,n_gc_threads); - } - - /* Allocate a mark stack if we're doing a major collection. - */ - if (major_gc && oldest_gen->mark) { - mark_stack_bd = allocBlock(); - mark_stack_top_bd = mark_stack_bd; - mark_stack_bd->link = NULL; - mark_stack_bd->u.back = NULL; - mark_sp = mark_stack_bd->start; - } else { - mark_stack_bd = NULL; - mark_stack_top_bd = NULL; - mark_sp = NULL; + prepare_uncollected_gen(gc_type, &old_generations[g]); } hunk ./rts/sm/GC.c 316 - // this is the main thread -#ifdef THREADED_RTS - if (n_gc_threads == 1) { - SET_GCT(gc_threads[0]); - } else { - SET_GCT(gc_threads[cap->no]); - } -#else -SET_GCT(gc_threads[0]); -#endif + // Prepare the workspaces attached to this gc_thread + prepare_gc_thread(); /* ----------------------------------------------------------------------- * follow all the roots that we know about: hunk ./rts/sm/GC.c 328 // NB. do this after the mutable lists have been saved above, otherwise // the other GC threads will be writing into the old mutable lists. inc_running(); - wakeup_gc_threads(n_gc_threads, gct->thread_index); - - // Mutable lists from each generation > N - // we want to *scavenge* these roots, not evacuate them: they're not - // going to move in this GC. - // Also do them in reverse generation order, for the usual reason: - // namely to reduce the likelihood of spurious old->new pointers. - // - for (g = RtsFlags.GcFlags.generations-1; g > N; g--) { -#if defined(THREADED_RTS) - if (n_gc_threads > 1) { - scavenge_mutable_list(generations[g].saved_mut_list, &generations[g]); - } else { - scavenge_mutable_list1(generations[g].saved_mut_list, &generations[g]); - } -#else - scavenge_mutable_list(generations[g].saved_mut_list, &generations[g]); -#endif - freeChain_sync(generations[g].saved_mut_list); - generations[g].saved_mut_list = NULL; - - } - + wakeup_gc_threads(gct->index, N); + // scavenge the capability-private mutable lists. This isn't part // of markSomeCapabilities() because markSomeCapabilities() can only // call back into the GC via mark_root() (due to the gct register hunk ./rts/sm/GC.c 334 // variable). - if (n_gc_threads == 1) { + traceEventGcWork(gct->cap); + + switch (gc_type) { + case GC_SEQ: for (n = 0; n < n_capabilities; n++) { hunk ./rts/sm/GC.c 339 -#if defined(THREADED_RTS) - scavenge_capability_mut_Lists1(&capabilities[n]); -#else scavenge_capability_mut_lists(&capabilities[n]); hunk ./rts/sm/GC.c 340 -#endif } hunk ./rts/sm/GC.c 341 - } else { - scavenge_capability_mut_lists(&capabilities[gct->thread_index]); + break; +#ifdef THREADED_RTS + case GC_LOCAL: + scavenge_capability_mut_lists_local(gct->cap); + break; + case GC_PAR: + scavenge_capability_mut_lists_par(gct->cap); + break; +#endif } // follow roots from the CAF list (used by GHCi) hunk ./rts/sm/GC.c 353 - gct->evac_gen = 0; + gct->evac_gen_ix = 0; markCAFs(mark_root, gct); hunk ./rts/sm/GC.c 356 - // follow all the roots that the application knows about. - gct->evac_gen = 0; - markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads, - rtsTrue/*prune sparks*/); - #if defined(RTS_USER_SIGNALS) // mark the signal handlers (signals should be already blocked) markSignalHandlers(mark_root, gct); hunk ./rts/sm/GC.c 361 #endif + // follow all the roots that the application knows about. + gct->evac_gen_ix = 0; + if (gc_type == GC_SEQ) { + for (n = 0; n < n_capabilities; n++) { + markCapability(mark_root, gct, &capabilities[n], + rtsTrue/*don't mark sparks*/); + } + } else { + markCapability(mark_root, gct, cap, rtsTrue/*don't mark sparks*/); + } + + markScheduler(mark_root, gct); + // Mark the weak pointer list, and prepare to detect dead weak pointers. hunk ./rts/sm/GC.c 375 - markWeakPtrList(); initWeakForGC(); hunk ./rts/sm/GC.c 376 + markWeakPtrList(); // Mark the stable pointer table. hunk ./rts/sm/GC.c 379 - markStablePtrTable(mark_root, gct); + if (gc_type != GC_LOCAL) markStablePtrTable(mark_root, gct); /* ------------------------------------------------------------------------- * Repeatedly scavenge all the areas we know about until there's no hunk ./rts/sm/GC.c 393 // must be last... invariant is that everything is fully // scavenged at this point. + if (gc_type == GC_LOCAL) { ACQUIRE_LOCK(&gc_local_mutex); } if (traverseWeakPtrList()) { // returns rtsTrue if evaced something inc_running(); hunk ./rts/sm/GC.c 396 + if (gc_type == GC_LOCAL) { RELEASE_LOCK(&gc_local_mutex); } continue; } hunk ./rts/sm/GC.c 399 + if (gc_type == GC_LOCAL) { RELEASE_LOCK(&gc_local_mutex); } // If we get to here, there's really nothing left to do. break; hunk ./rts/sm/GC.c 405 } - shutdown_gc_threads(n_gc_threads, gct->thread_index); + if (gc_type == GC_PAR) { + shutdown_gc_threads(gct->index); + } // Now see which stable names are still alive. hunk ./rts/sm/GC.c 410 - gcStablePtrTable(); + if (gc_type != GC_LOCAL) gcStablePtrTable(); #ifdef THREADED_RTS hunk ./rts/sm/GC.c 413 - if (n_gc_threads == 1) { + switch (gc_type) + { + case GC_LOCAL: + // don't touch the spark pool for GC_LOCAL: other Capabilities may + // be stealing from it. It only contains global pointers, so we + // are safe to ignore it. + break; + + case GC_SEQ: for (n = 0; n < n_capabilities; n++) { pruneSparkQueue(&capabilities[n]); } hunk ./rts/sm/GC.c 425 - } else { - pruneSparkQueue(&capabilities[gct->thread_index]); + break; + + case GC_PAR: + pruneSparkQueue(gct->cap); + break; } #endif hunk ./rts/sm/GC.c 448 // g0->blocks is to-space from the previous GC if (RtsFlags.GcFlags.generations == 1) { if (g0->blocks != NULL) { - freeChain(g0->blocks); + freeChain_sync(g0->blocks); g0->blocks = NULL; } } hunk ./rts/sm/GC.c 453 - // For each workspace, in each thread, move the copied blocks to the step - { - gc_thread *thr; - gen_workspace *ws; - bdescr *prev, *next; - - for (t = 0; t < n_gc_threads; t++) { - thr = gc_threads[t]; - - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - ws = &thr->gens[g]; - - // Push the final block - if (ws->todo_bd) { - push_scanned_block(ws->todo_bd, ws); - } - - ASSERT(gct->scan_bd == NULL); - ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks); - - prev = NULL; - for (bd = ws->scavd_list; bd != NULL; bd = bd->link) { - ws->gen->n_words += bd->free - bd->start; - prev = bd; - } - if (prev != NULL) { - prev->link = ws->gen->blocks; - ws->gen->blocks = ws->scavd_list; - } - ws->gen->n_blocks += ws->n_scavd_blocks; - } - } - - // Add all the partial blocks *after* we've added all the full - // blocks. This is so that we can grab the partial blocks back - // again and try to fill them up in the next GC. - for (t = 0; t < n_gc_threads; t++) { - thr = gc_threads[t]; - - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - ws = &thr->gens[g]; - - prev = NULL; - for (bd = ws->part_list; bd != NULL; bd = next) { - next = bd->link; - if (bd->free == bd->start) { - if (prev == NULL) { - ws->part_list = next; - } else { - prev->link = next; - } - freeGroup(bd); - ws->n_part_blocks--; - } else { - ws->gen->n_words += bd->free - bd->start; - prev = bd; - } - } - if (prev != NULL) { - prev->link = ws->gen->blocks; - ws->gen->blocks = ws->part_list; - } - ws->gen->n_blocks += ws->n_part_blocks; - - ASSERT(countBlocks(ws->gen->blocks) == ws->gen->n_blocks); - ASSERT(countOccupied(ws->gen->blocks) == ws->gen->n_words); - } - } - } - // Finally: compact or sweep the oldest generation. if (major_gc && oldest_gen->mark) { if (oldest_gen->compact) hunk ./rts/sm/GC.c 456 - compact(gct->scavenged_static_objects); + compact(gct); else sweep(oldest_gen); } hunk ./rts/sm/GC.c 461 - /* run through all the generations/steps and tidy up - */ copied = 0; max_copied = 0; avg_copied = 0; hunk ./rts/sm/GC.c 466 { nat i; - for (i=0; i < n_gc_threads; i++) { - if (n_gc_threads > 1) { + if (n_gc_threads > 1) { + for (i=0; i < n_gc_threads; i++) { debugTrace(DEBUG_gc,"thread %d:", i); debugTrace(DEBUG_gc," copied %ld", gc_threads[i]->copied * sizeof(W_)); debugTrace(DEBUG_gc," scanned %ld", gc_threads[i]->scanned * sizeof(W_)); hunk ./rts/sm/GC.c 474 debugTrace(DEBUG_gc," any_work %ld", gc_threads[i]->any_work); debugTrace(DEBUG_gc," no_work %ld", gc_threads[i]->no_work); debugTrace(DEBUG_gc," scav_find_work %ld", gc_threads[i]->scav_find_work); + copied += gc_threads[i]->copied; + max_copied = stg_max(gc_threads[i]->copied, max_copied); } hunk ./rts/sm/GC.c 477 - copied += gc_threads[i]->copied; - max_copied = stg_max(gc_threads[i]->copied, max_copied); - } - if (n_gc_threads == 1) { + avg_copied = copied; + } else { + copied = gct->copied; max_copied = 0; avg_copied = 0; hunk ./rts/sm/GC.c 482 - } else { - avg_copied = copied; } } hunk ./rts/sm/GC.c 485 - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { + // Run through all the generations/steps and tidy up. + // We're going to: + // - count the amount of "live" data (live) + // - count the amount of "copied" data in this GC (copied) + // - free from-space + // - make to-space the new from-space (set BF_EVACUATED on all blocks) + // - sweep the prim area + // + live = 0; + for (n = 0; n < total_generations; n++) { + + gen = &all_generations[n]; + g = gen->no; + + // someone else's local generation? + if (gc_type == GC_LOCAL && isNonLocalGen(gen)) + continue; + + ACQUIRE_SPIN_LOCK(&gen->sync); if (g == N) { hunk ./rts/sm/GC.c 506 - generations[g].collections++; // for stats - if (n_gc_threads > 1) generations[g].par_collections++; + gen->collections++; // for stats + if (n_gc_threads > 1) gen->par_collections++; } hunk ./rts/sm/GC.c 510 + // Count "live" data. Do it here rather than in calcLiveWords + // because we're inside the gen->sync lock. + live += gen->n_words + gen->n_prim_words + + countOccupied(gen->large_objects); + // Count the mutable list as bytes "copied" for the purposes of // stats. Every mutable list is copied during every GC. if (g > 0) { hunk ./rts/sm/GC.c 519 nat mut_list_size = 0; - for (bd = generations[g].mut_list; bd != NULL; bd = bd->link) { - mut_list_size += bd->free - bd->start; - } - for (n = 0; n < n_capabilities; n++) { - for (bd = capabilities[n].mut_lists[g]; - bd != NULL; bd = bd->link) { - mut_list_size += bd->free - bd->start; + if (gc_type == GC_LOCAL) { + mut_list_size = countOccupied(cap->mut_lists[g]); + } else { + for (n = 0; n < n_capabilities; n++) { + mut_list_size += countOccupied(capabilities[n].mut_lists[g]); } } copied += mut_list_size; hunk ./rts/sm/GC.c 535 } bdescr *next, *prev; - gen = &generations[g]; // for generations we collected... if (g <= N) { hunk ./rts/sm/GC.c 560 } else { prev->link = next; } - freeGroup(bd); + freeGroup_sync(bd); gen->n_old_blocks--; } else hunk ./rts/sm/GC.c 591 ASSERT(countBlocks(gen->blocks) == gen->n_blocks); ASSERT(countOccupied(gen->blocks) == gen->n_words); } - else // not copacted + else // not compacted { hunk ./rts/sm/GC.c 593 - freeChain(gen->old_blocks); + freeChain_sync(gen->old_blocks); + + if (N >= global_gen_no) { + freeChain_sync(gen->prim_blocks); + gen->prim_blocks = NULL; + gen->n_prim_blocks = 0; + gen->n_prim_words = 0; + } else { + sweepPrimArea(gen); + } + gen->n_prim_words = countOccupied(gen->prim_blocks); } gen->old_blocks = NULL; hunk ./rts/sm/GC.c 614 * collection from large_objects. Any objects left on the * large_objects list are therefore dead, so we free them here. */ - freeChain(gen->large_objects); + freeChain_sync(gen->large_objects); gen->large_objects = gen->scavenged_large_objects; gen->n_large_blocks = gen->n_scavenged_large_blocks; gen->n_new_large_words = 0; hunk ./rts/sm/GC.c 633 // add the new blocks we promoted during this GC gen->n_large_blocks += gen->n_scavenged_large_blocks; - ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks); } hunk ./rts/sm/GC.c 634 + + ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks); + + gen->scavenged_large_objects = NULL; + gen->n_scavenged_large_blocks = 0; + + RELEASE_SPIN_LOCK(&gen->sync); } // for all generations // update the max size of older generations after a major GC hunk ./rts/sm/GC.c 646 resize_generations(); - // Calculate the amount of live data for stats. - live = calcLiveWords(); + // add in the partial blocks in the gen_workspaces, but ignore gen 0 + // if this is a local GC (we can't count another capability's part_list) + live += count_part_lists( (gc_type == GC_LOCAL) ? n_capabilities : 0 ); // Start a new pinned_object_block hunk ./rts/sm/GC.c 651 - for (n = 0; n < n_capabilities; n++) { - capabilities[n].pinned_object_block = NULL; - } - - // Free the mark stack. - if (mark_stack_top_bd != NULL) { - debugTrace(DEBUG_gc, "mark stack: %d blocks", - countBlocks(mark_stack_top_bd)); - freeChain(mark_stack_top_bd); + if (gc_type == GC_LOCAL) { + cap->pinned_object_block = NULL; + } else { + for (n = 0; n < n_capabilities; n++) { + capabilities[n].pinned_object_block = NULL; + } } hunk ./rts/sm/GC.c 659 + // Free the mark stack, leaving one block. + freeMarkStack(); + // Free any bitmaps. hunk ./rts/sm/GC.c 663 - for (g = 0; g <= N; g++) { - gen = &generations[g]; + for (g = 0; g < total_generations; g++) { + gen = &all_generations[g]; + if (gct->gc_type == GC_LOCAL && isNonLocalGen(gen)) + continue; if (gen->bitmap != NULL) { hunk ./rts/sm/GC.c 668 - freeGroup(gen->bitmap); + freeGroup_sync(gen->bitmap); gen->bitmap = NULL; } } hunk ./rts/sm/GC.c 674 // Reset the nursery: make the blocks empty - allocated += clearNurseries(); + if (gc_type == GC_LOCAL) { + allocated += clearNursery(cap->no); + } else { + allocated += clearNurseries(); + } hunk ./rts/sm/GC.c 680 - resize_nursery(); + resize_nursery(copied,N); hunk ./rts/sm/GC.c 682 - resetNurseries(); - - // mark the garbage collected CAFs as dead -#if 0 && defined(DEBUG) // doesn't work at the moment - if (major_gc) { gcCAFs(); } -#endif - #ifdef PROFILING // resetStaticObjectForRetainerProfiling() must be called before // zeroing below. hunk ./rts/sm/GC.c 704 } } - // send exceptions to any threads which were about to die - RELEASE_SM_LOCK; - resurrectThreads(resurrected_threads); - ACQUIRE_SM_LOCK; + // Reset the nursery + if (gc_type == GC_LOCAL) { + resetNursery(cap->no); + } else { + resetNurseries(); + } // Update the stable pointer hash table. hunk ./rts/sm/GC.c 712 - updateStablePtrTable(major_gc); + if (gc_type != GC_LOCAL) updateStablePtrTable(major_gc); + + // paranoia until I figure out how much of the following code can be + // run concurrently... + if (gc_type == GC_LOCAL) { ACQUIRE_LOCK(&gc_local_mutex); } + + // ok, GC over: tell the stats department what happened. + slop = calcLiveBlocks() * BLOCK_SIZE_W - live; + stat_endGC(gct, allocated, live, copied, N, max_copied, avg_copied, slop); + + // make sure our mut_lists don't point to anything local. This step + // also moves mut_list entries to the right mut_list if they ended + // up on the wrong one during a GC_PAR. + for (g = RtsFlags.GcFlags.generations-1; g > 0; g--) { + if (gc_type == GC_LOCAL) { + stash_mut_list (cap, g); + } else { + for (n = 0; n < n_capabilities; n++) { + stash_mut_list (&capabilities[n], g); + } + } + } + if (gc_type == GC_LOCAL) { + globalise_capability_mut_lists (cap); + } else { + RELEASE_SM_LOCK; // globalise acquires sm_mutex itself as needed + for (n = 0; n < n_capabilities; n++) { + globalise_capability_mut_lists (&capabilities[n]); + } + ACQUIRE_SM_LOCK; + } // unlock the StablePtr table. Must be before scheduleFinalizers(), // because a finalizer may call hs_free_fun_ptr() or hunk ./rts/sm/GC.c 747 // hs_free_stable_ptr(), both of which access the StablePtr table. - stablePtrPostGC(); + if (gc_type != GC_LOCAL) stablePtrPostGC(); // Start any pending finalizers. Must be after // updateStablePtrTable() and stablePtrPostGC() (see #4221). hunk ./rts/sm/GC.c 751 - RELEASE_SM_LOCK; - scheduleFinalizers(cap, old_weak_ptr_list); - ACQUIRE_SM_LOCK; + if (gc_type != GC_LOCAL) { RELEASE_SM_LOCK; } + scheduleFinalizers(cap, gct->old_weak_ptrs); + if (gc_type != GC_LOCAL) { ACQUIRE_SM_LOCK; } + + // send exceptions to any threads which were about to die + if (gc_type != GC_LOCAL) { + RELEASE_SM_LOCK; + resurrectThreads(gct->resurrected_threads); + ACQUIRE_SM_LOCK; + } else { + // we don't do this in local GC (yet) + ASSERT(gct->resurrected_threads == END_TSO_QUEUE); + } if (major_gc) { nat need, got; hunk ./rts/sm/GC.c 779 } // check sanity after GC - IF_DEBUG(sanity, checkSanity(rtsTrue)); + IF_DEBUG(sanity, checkSanity (gc_type == GC_LOCAL && n_capabilities != 1, + rtsTrue /* after GC */, + major_gc, + cap->no)); // extra GC trace info IF_DEBUG(gc, statDescribeGens()); hunk ./rts/sm/GC.c 799 #ifdef DEBUG // check for memory leaks if DEBUG is on - memInventory(DEBUG_gc); + if (gc_type != GC_LOCAL || n_capabilities == 1) { + // we can't account for blocks in another local heap, so only do this + // when doing a global GC. + memInventory(DEBUG_gc); + } #endif #ifdef RTS_GTK_FRONTPANEL hunk ./rts/sm/GC.c 812 } #endif - // ok, GC over: tell the stats department what happened. - slop = calcLiveBlocks() * BLOCK_SIZE_W - live; - stat_endGC(allocated, live, copied, N, max_copied, avg_copied, slop); - // Guess which generation we'll collect *next* time hunk ./rts/sm/GC.c 813 - initialise_N(force_major_gc); + next_gc_gen = determine_collect_gen(); #if defined(RTS_USER_SIGNALS) if (RtsFlags.MiscFlags.install_signal_handlers) { hunk ./rts/sm/GC.c 822 } #endif - RELEASE_SM_LOCK; + if (gc_type == GC_LOCAL) { + RELEASE_LOCK(&gc_local_mutex); +// RELEASE_LOCK(&gct->local_gc_lock); + } else { + RELEASE_SM_LOCK; + } + + gct->gc_type = GC_LOCAL; // always GC_LOCAL between collections SET_GCT(saved_gct); } hunk ./rts/sm/GC.c 834 +/* ----------------------------------------------------------------------------- + allocate memory in a generation using the GC's data structures + -------------------------------------------------------------------------- */ + +StgPtr allocateInGen (Capability *cap USED_IF_THREADS, nat gen_ix, nat size) +{ + StgPtr p; + gen_workspace *ws; + gc_thread *saved_gct; + bdescr *bd; + + saved_gct = gct; + SET_GCT(gc_threads[cap->no]); + ASSERT(gct->gc_type == GC_LOCAL); // alloc_todo_block needs to know + + ws = &gct->gens[gen_ix]; + + ASSERT(ws->todo_bd != NULL); + + p = ws->todo_free; + ws->todo_free += size; + + if (ws->todo_free > ws->todo_lim) { + bd = ws->todo_bd; + if (ws->todo_lim + size <= bd->start + bd->blocks * BLOCK_SIZE_W) { + ws->todo_lim = bd->start + bd->blocks * BLOCK_SIZE_W; + } else { + bd->link = ws->scavd_list; + ws->scavd_list = bd; + ws->n_scavd_blocks += bd->blocks; + IF_DEBUG(sanity, + ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks)); + alloc_todo_block (ws, size); + p = ws->todo_free; + ws->todo_free += size; + } + } + + ASSERT(ws->todo_free >= ws->todo_bd->free && ws->todo_free <= ws->todo_lim); + + SET_GCT(saved_gct); + + return p; +} + /* ----------------------------------------------------------------------------- Figure out which generation to collect, initialise N and major_gc. hunk ./rts/sm/GC.c 887 -------------------------------------------------------------------------- */ static nat -initialise_N (rtsBool force_major_gc) +determine_collect_gen (void) { hunk ./rts/sm/GC.c 889 - int g; - nat blocks, blocks_total; + nat g, i, blocks, prim_blocks; hunk ./rts/sm/GC.c 891 - blocks = 0; - blocks_total = 0; + // count the number of prim blocks, and count this with the total + // number of old-gen blocks for the purposes of deciding whether + // to do an old-gen GC. This avoids the prim area growing too + // large, and seems to be a slight win in gc_bench. hunk ./rts/sm/GC.c 896 - if (force_major_gc) { - N = RtsFlags.GcFlags.generations - 1; + prim_blocks = 0; + if (gct->gc_type == GC_LOCAL) { + prim_blocks = all_generations[gct->index].n_prim_blocks; + // XXX shouldn't we add n_prim_blocks from all the local heaps here? } else { hunk ./rts/sm/GC.c 901 - N = 0; + for (i = 0; i < n_capabilities; i++) { + prim_blocks += all_generations[i].n_prim_blocks; + } } hunk ./rts/sm/GC.c 906 - for (g = RtsFlags.GcFlags.generations - 1; g >= 0; g--) { + // we always collect at least gen 0 + for (g = RtsFlags.GcFlags.generations - 1; g > 0; g--) { hunk ./rts/sm/GC.c 909 - blocks = generations[g].n_words / BLOCK_SIZE_W - + generations[g].n_large_blocks; + blocks = old_generations[g].n_words / BLOCK_SIZE_W + + old_generations[g].n_large_blocks + + prim_blocks; hunk ./rts/sm/GC.c 913 - if (blocks >= generations[g].max_blocks) { - N = stg_max(N,g); - } - if ((nat)g <= N) { - blocks_total += blocks; - } + if (blocks >= old_generations[g].max_blocks) + break; } hunk ./rts/sm/GC.c 917 - blocks_total += countNurseryBlocks(); - - major_gc = (N == RtsFlags.GcFlags.generations-1); - return blocks_total; + return g; } hunk ./rts/sm/GC.c 920 + /* ----------------------------------------------------------------------------- Initialise the gc_thread structures. -------------------------------------------------------------------------- */ hunk ./rts/sm/GC.c 930 #define GC_THREAD_RUNNING 2 #define GC_THREAD_WAITING_TO_CONTINUE 3 +static void +init_gc_thread (gc_thread *t) +{ + t->static_objects = END_OF_STATIC_LIST; + t->scavenged_static_objects = END_OF_STATIC_LIST; + t->scan_bd = NULL; + t->evac_gen_ix = 0; + t->failed_to_evac = rtsFalse; + t->eager_promotion = rtsTrue; + t->thunk_selector_depth = 0; + t->copied = 0; + t->scanned = 0; + t->any_work = 0; + t->no_work = 0; + t->scav_find_work = 0; + t->resurrected_threads = NULL; +} + static void new_gc_thread (nat n, gc_thread *t) { hunk ./rts/sm/GC.c 954 nat g; gen_workspace *ws; + t->cap = &capabilities[n]; + #ifdef THREADED_RTS hunk ./rts/sm/GC.c 957 - t->id = 0; initSpinLock(&t->gc_spin); initSpinLock(&t->mut_spin); ACQUIRE_SPIN_LOCK(&t->gc_spin); hunk ./rts/sm/GC.c 962 t->wakeup = GC_THREAD_INACTIVE; // starts true, so we can wait for the // thread to start up, see wakeup_gc_threads + initMutex(&t->local_gc_lock); #endif hunk ./rts/sm/GC.c 965 - t->thread_index = n; + t->index = n; + t->localg0 = &all_generations[n]; t->free_blocks = NULL; t->gc_count = 0; hunk ./rts/sm/GC.c 969 + t->mut_lists = capabilities[t->index].mut_lists; + + // this counter is aggregated over the whole run + t->globalised = 0; init_gc_thread(t); hunk ./rts/sm/GC.c 975 - + + // The mark stack always has one block in it. + t->mark_stack_bd = allocBlock(); + t->mark_stack_top_bd = t->mark_stack_bd; + t->mark_stack_bd->link = NULL; + t->mark_stack_bd->u.back = NULL; + t->mark_sp = t->mark_stack_bd->start; + #ifdef USE_PAPI t->papi_events = -1; #endif hunk ./rts/sm/GC.c 987 - for (g = 0; g < RtsFlags.GcFlags.generations; g++) + // always GC_LOCAL outside of a GC_SEQ or GC_PAR collection. + t->gc_type = GC_LOCAL; + + // we need a workspace for every generation, even the local + // generations of other Capabilities, in case we do a + // single-threaded GC. + for (g = 0; g < total_generations; g++) { ws = &t->gens[g]; hunk ./rts/sm/GC.c 996 - ws->gen = &generations[g]; - ASSERT(g == ws->gen->no); + ws->gen = &all_generations[g]; + ASSERT(g == ws->gen->ix); ws->my_gct = t; hunk ./rts/sm/GC.c 1000 - ws->todo_bd = NULL; + // We want to call + // alloc_todo_block(ws,0); + // but can't, because it uses gct which isn't set up at this point. + // Hence, allocate a block for todo_bd manually: + { + bdescr *bd = allocBlock(); // no lock, locks aren't initialised yet + initBdescr(bd, ws->gen, ws->gen->to); + bd->flags = BF_EVACUATED; + bd->u.scan = bd->free = bd->start; + + ws->todo_bd = bd; + ws->todo_free = bd->free; + ws->todo_lim = bd->start + BLOCK_SIZE_W; + } + ws->todo_q = newWSDeque(128); ws->todo_overflow = NULL; ws->n_todo_overflow = 0; hunk ./rts/sm/GC.c 1018 + ws->todo_large_objects = NULL; ws->part_list = NULL; ws->n_part_blocks = 0; hunk ./rts/sm/GC.c 1042 for (i = 0; i < RtsFlags.ParFlags.nNodes; i++) { gc_threads[i] = stgMallocBytes(sizeof(gc_thread) + - RtsFlags.GcFlags.generations * sizeof(gen_workspace), + total_generations * sizeof(gen_workspace), "alloc_gc_threads"); new_gc_thread(i, gc_threads[i]); hunk ./rts/sm/GC.c 1047 } + + initMutex(&gc_local_mutex); #else gc_threads = stgMallocBytes (sizeof(gc_thread*),"alloc_gc_threads"); gc_threads[0] = gct; hunk ./rts/sm/GC.c 1083 } } +static lnat +count_part_lists (nat start_gen) +{ + nat n, g; + lnat words; + + words = 0; + for (n = 0; n < n_capabilities; n++) { + for (g = start_gen; g < total_generations; g++) { + words += countOccupied(gc_threads[n]->gens[g].part_list); + } + } + return words; +} + /* ---------------------------------------------------------------------------- Start GC threads ------------------------------------------------------------------------- */ hunk ./rts/sm/GC.c 1102 -static volatile StgWord gc_running_threads; - static StgWord inc_running (void) { hunk ./rts/sm/GC.c 1106 StgWord new; + if (gct->gc_type != GC_PAR) return 1; new = atomic_inc(&gc_running_threads); ASSERT(new <= n_gc_threads); return new; hunk ./rts/sm/GC.c 1115 static StgWord dec_running (void) { + if (gct->gc_type != GC_PAR) return 0; ASSERT(gc_running_threads != 0); return atomic_dec(&gc_running_threads); } hunk ./rts/sm/GC.c 1123 static rtsBool any_work (void) { - int g; + nat g; gen_workspace *ws; gct->any_work++; hunk ./rts/sm/GC.c 1131 write_barrier(); // scavenge objects in compacted generation - if (mark_stack_bd != NULL && !mark_stack_empty()) { + if (!mark_stack_empty()) { return rtsTrue; } hunk ./rts/sm/GC.c 1135 - // Check for global work in any step. We don't need to check for - // local work, because we have already exited scavenge_loop(), - // which means there is no local work for this thread. - for (g = 0; g < (int)RtsFlags.GcFlags.generations; g++) { + // Check for global work in any generation. We don't need to + // check for local work, because we have already exited + // scavenge_loop(), which means there is no local work for this + // thread. + for (g = 0; g < total_generations; g++) { ws = &gct->gens[g]; hunk ./rts/sm/GC.c 1141 + if (gct->gc_type == GC_LOCAL && isNonLocalGen(ws->gen)) + continue; if (ws->todo_large_objects) return rtsTrue; if (!looksEmptyWSDeque(ws->todo_q)) return rtsTrue; if (ws->todo_overflow) return rtsTrue; hunk ./rts/sm/GC.c 1151 #if defined(THREADED_RTS) if (work_stealing) { nat n; + int i; // look for work to steal for (n = 0; n < n_gc_threads; n++) { hunk ./rts/sm/GC.c 1154 - if (n == gct->thread_index) continue; - for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) { - ws = &gc_threads[n]->gens[g]; + if (n == gct->index) continue; + for (i = total_generations-1; i >= 0; i--) { + // never steal from local heap: + ws = &gc_threads[n]->gens[i]; + if (all_generations[i].is_local) continue; if (!looksEmptyWSDeque(ws->todo_q)) return rtsTrue; } } hunk ./rts/sm/GC.c 1178 { nat r; - loop: hunk ./rts/sm/GC.c 1179 - traceEventGcWork(&capabilities[gct->thread_index]); - #if defined(THREADED_RTS) hunk ./rts/sm/GC.c 1180 - if (n_gc_threads > 1) { - scavenge_loop(); + if (gct->gc_type == GC_LOCAL) { + scavenge_loop_local(); + } + else if (n_gc_threads > 1) { + scavenge_loop_par(); } else { hunk ./rts/sm/GC.c 1186 - scavenge_loop1(); + scavenge_loop(); } #else scavenge_loop(); hunk ./rts/sm/GC.c 1192 #endif + collect_gct_blocks(); + // scavenge_loop() only exits when there's no work to do r = dec_running(); hunk ./rts/sm/GC.c 1197 - traceEventGcIdle(&capabilities[gct->thread_index]); + traceEventGcIdle(gct->cap); debugTrace(DEBUG_gc, "%d GC threads still running", r); hunk ./rts/sm/GC.c 1205 // usleep(1); if (any_work()) { inc_running(); + traceEventGcWork(gct->cap); goto loop; } // any_work() does not remove the work from the queue, it hunk ./rts/sm/GC.c 1214 // scavenge_loop() to perform any pending work. } - traceEventGcDone(&capabilities[gct->thread_index]); + traceEventGcDone(gct->cap); } #if defined(THREADED_RTS) hunk ./rts/sm/GC.c 1228 saved_gct = gct; gct = gc_threads[cap->no]; - gct->id = osThreadId(); + gct->gc_type = GC_PAR; // Wait until we're told to wake up RELEASE_SPIN_LOCK(&gct->mut_spin); hunk ./rts/sm/GC.c 1233 gct->wakeup = GC_THREAD_STANDING_BY; - debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->thread_index); + debugTrace(DEBUG_gc, "GC thread %d standing by...", gct->index); ACQUIRE_SPIN_LOCK(&gct->gc_spin); #ifdef USE_PAPI hunk ./rts/sm/GC.c 1243 } papi_thread_start_gc1_count(gct->papi_events); #endif + stat_gcWorkerThreadStart(gct); hunk ./rts/sm/GC.c 1245 + prepare_gc_thread(); + + traceEventGcWork(gct->cap); + // Every thread evacuates some roots. hunk ./rts/sm/GC.c 1250 - gct->evac_gen = 0; - markSomeCapabilities(mark_root, gct, gct->thread_index, n_gc_threads, - rtsTrue/*prune sparks*/); - scavenge_capability_mut_lists(&capabilities[gct->thread_index]); + gct->evac_gen_ix = 0; + markCapability(mark_root, gct, cap, rtsTrue/*prune sparks*/); + scavenge_capability_mut_lists(cap); scavenge_until_all_done(); hunk ./rts/sm/GC.c 1256 + // can't do this here, because we might have to shuffle entries + // betwee mut_lists after parallel GC, so the main thread does it all. + // globalise_mut_lists (cap); + #ifdef THREADED_RTS // Now that the whole heap is marked, we discard any sparks that // were found to be unreachable. The main GC thread is currently hunk ./rts/sm/GC.c 1270 pruneSparkQueue(cap); #endif + // free this thread's mark stack. + freeMarkStack(); + + // record the time spent doing GC in the Task structure + stat_gcWorkerThreadDone(gct); + #ifdef USE_PAPI // count events in this thread towards the GC totals papi_thread_stop_gc1_count(gct->papi_events); hunk ./rts/sm/GC.c 1285 RELEASE_SPIN_LOCK(&gct->gc_spin); gct->wakeup = GC_THREAD_WAITING_TO_CONTINUE; debugTrace(DEBUG_gc, "GC thread %d waiting to continue...", - gct->thread_index); + gct->index); ACQUIRE_SPIN_LOCK(&gct->mut_spin); hunk ./rts/sm/GC.c 1287 - debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->thread_index); + debugTrace(DEBUG_gc, "GC thread %d on my way...", gct->index); + + gct->gc_type = GC_LOCAL; // always GC_LOCAL between collections SET_GCT(saved_gct); } hunk ./rts/sm/GC.c 1332 #endif // THREADED_RTS static void -start_gc_threads (void) -{ -#if defined(THREADED_RTS) - gc_running_threads = 0; -#endif -} - -static void -wakeup_gc_threads (nat n_threads USED_IF_THREADS, nat me USED_IF_THREADS) +wakeup_gc_threads (nat me USED_IF_THREADS, nat N USED_IF_THREADS) { #if defined(THREADED_RTS) nat i; hunk ./rts/sm/GC.c 1336 - for (i=0; i < n_threads; i++) { + + if (n_gc_threads == 1) return; + + for (i=0; i < n_gc_threads; i++) { if (i == me) continue; inc_running(); debugTrace(DEBUG_gc, "waking up gc thread %d", i); hunk ./rts/sm/GC.c 1343 + gc_threads[i]->collect_gen = N; if (gc_threads[i]->wakeup != GC_THREAD_STANDING_BY) barf("wakeup_gc_threads"); gc_threads[i]->wakeup = GC_THREAD_RUNNING; hunk ./rts/sm/GC.c 1357 // standby state, otherwise they may still be executing inside // any_work(), and may even remain awake until the next GC starts. static void -shutdown_gc_threads (nat n_threads USED_IF_THREADS, nat me USED_IF_THREADS) +shutdown_gc_threads (nat me USED_IF_THREADS) { #if defined(THREADED_RTS) nat i; hunk ./rts/sm/GC.c 1361 - for (i=0; i < n_threads; i++) { + for (i=0; i < n_gc_threads; i++) { if (i == me) continue; while (gc_threads[i]->wakeup != GC_THREAD_WAITING_TO_CONTINUE) { write_barrier(); } } hunk ./rts/sm/GC.c 1392 ------------------------------------------------------------------------- */ static void -init_collected_gen (nat g, nat n_threads) +prepare_collected_gen (generation *gen) { hunk ./rts/sm/GC.c 1394 - nat t, i; + nat i, g, n; gen_workspace *ws; hunk ./rts/sm/GC.c 1396 - generation *gen; - bdescr *bd; + bdescr *bd, *next; // Throw away the current mutable list. Invariant: the mutable // list always has at least one block; this means we can avoid a hunk ./rts/sm/GC.c 1401 // check for NULL in recordMutable(). + g = gen->no; if (g != 0) { hunk ./rts/sm/GC.c 1403 - freeChain(generations[g].mut_list); - generations[g].mut_list = allocBlock(); for (i = 0; i < n_capabilities; i++) { hunk ./rts/sm/GC.c 1404 - freeChain(capabilities[i].mut_lists[g]); - capabilities[i].mut_lists[g] = allocBlock(); + freeChain_sync(capabilities[i].mut_lists[g]); + capabilities[i].mut_lists[g] = allocBlock_sync(); } } hunk ./rts/sm/GC.c 1409 - gen = &generations[g]; - ASSERT(gen->no == g); - // we'll construct a new list of threads in this step // during GC, throw away the current list. gen->old_threads = gen->threads; hunk ./rts/sm/GC.c 1423 gen->live_estimate = 0; // initialise the large object queues. - gen->scavenged_large_objects = NULL; - gen->n_scavenged_large_blocks = 0; + ASSERT(gen->scavenged_large_objects == NULL); + ASSERT(gen->n_scavenged_large_blocks == 0); hunk ./rts/sm/GC.c 1426 + // grab all the partial blocks stashed in the gc_thread workspaces and + // move them to the old_blocks list of this gen. + for (n = 0; n < n_capabilities; n++) { + ws = &gc_threads[n]->gens[gen->ix]; + + for (bd = ws->part_list; bd != NULL; bd = next) { + next = bd->link; + bd->link = gen->old_blocks; + gen->old_blocks = bd; + gen->n_old_blocks += bd->blocks; + } + ws->part_list = NULL; + ws->n_part_blocks = 0; + + for (bd = ws->scavd_list; bd != NULL; bd = next) { + next = bd->link; + bd->link = gen->old_blocks; + gen->old_blocks = bd; + gen->n_old_blocks += bd->blocks; + } + ws->scavd_list = NULL; + ws->n_scavd_blocks = 0; + + if (ws->todo_free != ws->todo_bd->start) { + ws->todo_bd->free = ws->todo_free; + ws->todo_bd->link = gen->old_blocks; + gen->old_blocks = ws->todo_bd; + gen->n_old_blocks += ws->todo_bd->blocks; + alloc_todo_block(ws,0); // always has one block. + } + } + // mark the small objects as from-space for (bd = gen->old_blocks; bd; bd = bd->link) { bd->flags &= ~BF_EVACUATED; hunk ./rts/sm/GC.c 1468 bd->flags &= ~BF_EVACUATED; } - // for a compacted generation, we need to allocate the bitmap - if (gen->mark) { + // if we're doing global GC, mark all the prim blocks as ordinary + // blocks, so we'll copy them rather than marking. + if (gct->collect_gen >= global_gen_no) { + for (bd = gen->prim_blocks; bd; bd = bd->link) { + bd->flags &= ~(BF_PRIM | BF_MARKED | BF_EVACUATED); + } + } + + // allocate the mark bitmap for any blocks that will be marked, as + // opposed to copied, during this collection. + { nat bitmap_size; // in bytes bdescr *bitmap_bdescr; StgWord *bitmap; hunk ./rts/sm/GC.c 1482 - - bitmap_size = gen->n_old_blocks * BLOCK_SIZE / (sizeof(W_)*BITS_PER_BYTE); - + bdescr *marked_blocks; + nat n_marked_blocks; + + if (gen->mark) { + ASSERT(gen->prim_blocks == NULL); + marked_blocks = gen->old_blocks; + n_marked_blocks = gen->n_old_blocks; + } else if (gct->collect_gen < global_gen_no) { + ASSERT(countBlocks(gen->prim_blocks) == gen->n_prim_blocks); + marked_blocks = gen->prim_blocks; + n_marked_blocks = gen->n_prim_blocks; + } else { + marked_blocks = NULL; + n_marked_blocks = 0; + } + + bitmap_size = n_marked_blocks * BLOCK_SIZE / (sizeof(W_)*BITS_PER_BYTE); + if (bitmap_size > 0) { hunk ./rts/sm/GC.c 1501 - bitmap_bdescr = allocGroup((lnat)BLOCK_ROUND_UP(bitmap_size) - / BLOCK_SIZE); + bitmap_bdescr = allocGroup_sync((lnat)BLOCK_ROUND_UP(bitmap_size) + / BLOCK_SIZE); gen->bitmap = bitmap_bdescr; bitmap = bitmap_bdescr->start; hunk ./rts/sm/GC.c 1514 // For each block in this step, point to its bitmap from the // block descriptor. - for (bd=gen->old_blocks; bd != NULL; bd = bd->link) { + for (bd = marked_blocks; bd != NULL; bd = bd->link) { bd->u.bitmap = bitmap; bitmap += BLOCK_SIZE_W / (sizeof(W_)*BITS_PER_BYTE); hunk ./rts/sm/GC.c 1533 } } } - - // For each GC thread, for each step, allocate a "todo" block to - // store evacuated objects to be scavenged, and a block to store - // evacuated objects that do not need to be scavenged. - for (t = 0; t < n_threads; t++) { - ws = &gc_threads[t]->gens[g]; - - ws->todo_large_objects = NULL; - - ws->part_list = NULL; - ws->n_part_blocks = 0; - - // allocate the first to-space block; extra blocks will be - // chained on as necessary. - ws->todo_bd = NULL; - ASSERT(looksEmptyWSDeque(ws->todo_q)); - alloc_todo_block(ws,0); - - ws->todo_overflow = NULL; - ws->n_todo_overflow = 0; - - ws->scavd_list = NULL; - ws->n_scavd_blocks = 0; - } } hunk ./rts/sm/GC.c 1541 ------------------------------------------------------------------------- */ static void -init_uncollected_gen (nat g, nat threads) +stash_mut_list (Capability *cap, nat gen_no) { hunk ./rts/sm/GC.c 1543 - nat t, n; - gen_workspace *ws; - generation *gen; - bdescr *bd; + cap->saved_mut_lists[gen_no] = cap->mut_lists[gen_no]; + cap->mut_lists[gen_no] = allocBlock_sync(); +} + +/* ---------------------------------------------------------------------------- + Initialise a generation that is *not* to be collected + ------------------------------------------------------------------------- */ + +static void +prepare_uncollected_gen (nat gc_type, generation *gen) +{ + nat i; + + ASSERT(gen->no > 0); // save the current mutable lists for this generation, and // allocate a fresh block for each one. We'll traverse these hunk ./rts/sm/GC.c 1561 // mutable lists as roots early on in the GC. - generations[g].saved_mut_list = generations[g].mut_list; - generations[g].mut_list = allocBlock(); - for (n = 0; n < n_capabilities; n++) { - capabilities[n].saved_mut_lists[g] = capabilities[n].mut_lists[g]; - capabilities[n].mut_lists[g] = allocBlock(); + if (gc_type == GC_LOCAL) { + // for a local GC, we use the private mutable lists for this + // capability only. + stash_mut_list(gct->cap, gen->no); + } else { + // for a global GC, we use the private mutable lists of every + // capability. + for (i = 0; i < n_capabilities; i++) { + stash_mut_list(&capabilities[i], gen->no); + } + + // not necessarily true in GC_LOCAL: + ASSERT(gen->scavenged_large_objects == NULL); + ASSERT(gen->n_scavenged_large_blocks == 0); } hunk ./rts/sm/GC.c 1577 - gen = &generations[g]; +} hunk ./rts/sm/GC.c 1579 - gen->scavenged_large_objects = NULL; - gen->n_scavenged_large_blocks = 0; +/* ----------------------------------------------------------------------------- + Initialise a gc_thread before GC + -------------------------------------------------------------------------- */ + +void +prepare_gen_workspace (nat g) +{ + gen_workspace *ws; hunk ./rts/sm/GC.c 1588 - for (t = 0; t < threads; t++) { - ws = &gc_threads[t]->gens[g]; + ws = &gct->gens[g]; hunk ./rts/sm/GC.c 1590 - ASSERT(looksEmptyWSDeque(ws->todo_q)); - ws->todo_large_objects = NULL; - - ws->part_list = NULL; - ws->n_part_blocks = 0; - - ws->scavd_list = NULL; - ws->n_scavd_blocks = 0; - - // If the block at the head of the list in this generation - // is less than 3/4 full, then use it as a todo block. - if (gen->blocks && isPartiallyFull(gen->blocks)) - { - ws->todo_bd = gen->blocks; - ws->todo_free = ws->todo_bd->free; - ws->todo_lim = ws->todo_bd->start + BLOCK_SIZE_W; - gen->blocks = gen->blocks->link; - gen->n_blocks -= 1; - gen->n_words -= ws->todo_bd->free - ws->todo_bd->start; - ws->todo_bd->link = NULL; - // we must scan from the current end point. - ws->todo_bd->u.scan = ws->todo_bd->free; - } - else - { - ws->todo_bd = NULL; - alloc_todo_block(ws,0); + ASSERT(looksEmptyWSDeque(ws->todo_q)); + ASSERT(ws->todo_large_objects == NULL); + + ASSERT(countBlocks(ws->part_list) == ws->n_part_blocks); + ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks); + // scavd_list can be populated by globalise() during mutation. + + ASSERT(ws->todo_bd != NULL); + + // don't scavenge anything we allocated using allocateInGen + // XXX there should be a better place for this + ws->todo_bd->u.scan = ws->todo_free; + + ASSERT(ws->todo_overflow == NULL); + ASSERT(ws->n_todo_overflow == 0); +} + +static void +prepare_gc_thread (void) +{ + nat g; + + init_gc_thread(gct); + + // Initialise workspaces for all generations + for (g = 0; g < total_generations; g++) { + // If this gen is the local G0 for another Capability and + // we're doing a local GC, then we don't bother allocating a + // todo block. + if (!(gct->gc_type == GC_LOCAL && isNonLocalGenIx(g))) { + prepare_gen_workspace(g); } } hunk ./rts/sm/GC.c 1623 +} hunk ./rts/sm/GC.c 1625 - // deal out any more partial blocks to the threads' part_lists - t = 0; - while (gen->blocks && isPartiallyFull(gen->blocks)) - { - bd = gen->blocks; - gen->blocks = bd->link; - ws = &gc_threads[t]->gens[g]; - bd->link = ws->part_list; - ws->part_list = bd; - ws->n_part_blocks += 1; - bd->u.scan = bd->free; - gen->n_blocks -= 1; - gen->n_words -= bd->free - bd->start; - t++; - if (t == n_gc_threads) t = 0; +static void +collect_gct_blocks (void) +{ + nat g; + gen_workspace *ws; + bdescr *bd, *prev; + + for (g = 0; g < total_generations; g++) { + ws = &gct->gens[g]; + + // in GC_PAR and GC_SEQ we might evacuate objects in any part + // of the heap, but in GC_LOCAL we can only touch our local + // heap and the global heap. + if (gct->gc_type == GC_LOCAL && isNonLocalGen(ws->gen)) + continue; + + // there may still be a block attached to ws->todo_bd; + // leave it there to use next time. + + if (ws->scavd_list != NULL) { + ACQUIRE_SPIN_LOCK(&ws->gen->sync); + + ASSERT(gct->scan_bd == NULL); + ASSERT(countBlocks(ws->scavd_list) == ws->n_scavd_blocks); + + prev = NULL; + for (bd = ws->scavd_list; bd != NULL; bd = bd->link) { + ws->gen->n_words += bd->free - bd->start; + prev = bd; + } + if (prev != NULL) { + prev->link = ws->gen->blocks; + ws->gen->blocks = ws->scavd_list; + } + ws->gen->n_blocks += ws->n_scavd_blocks; + + ws->scavd_list = NULL; + ws->n_scavd_blocks = 0; + + RELEASE_SPIN_LOCK(&ws->gen->sync); + } } } hunk ./rts/sm/GC.c 1670 /* ----------------------------------------------------------------------------- - Initialise a gc_thread before GC + Free excess blocks on the mark stack -------------------------------------------------------------------------- */ static void hunk ./rts/sm/GC.c 1674 -init_gc_thread (gc_thread *t) +freeMarkStack (void) { hunk ./rts/sm/GC.c 1676 - t->static_objects = END_OF_STATIC_LIST; - t->scavenged_static_objects = END_OF_STATIC_LIST; - t->scan_bd = NULL; - t->mut_lists = capabilities[t->thread_index].mut_lists; - t->evac_gen = 0; - t->failed_to_evac = rtsFalse; - t->eager_promotion = rtsTrue; - t->thunk_selector_depth = 0; - t->copied = 0; - t->scanned = 0; - t->any_work = 0; - t->no_work = 0; - t->scav_find_work = 0; + if (gct->mark_stack_top_bd->link != NULL) { + debugTrace(DEBUG_gc, "mark stack: %d blocks", + countBlocks(gct->mark_stack_top_bd)); + freeChain_sync(gct->mark_stack_top_bd->link); + gct->mark_stack_top_bd->link = NULL; + ASSERT(gct->mark_stack_top_bd == gct->mark_stack_bd); + } } /* ----------------------------------------------------------------------------- hunk ./rts/sm/GC.c 1701 saved_gct = gct; SET_GCT(user); - evacuate(root); +#ifdef THREADED_RTS + if (gct->gc_type == GC_LOCAL) { + evacuate_local(root); + } else +#endif + { + evacuate(root); + } SET_GCT(saved_gct); } hunk ./rts/sm/GC.c 1720 static void zero_static_object_list(StgClosure* first_static) { - StgClosure* p; + StgClosure* p, *prev; StgClosure* link; const StgInfoTable *info; hunk ./rts/sm/GC.c 1724 + for (p = first_static; p != END_OF_STATIC_LIST; p = link) { + info = get_itbl(p); + link = *STATIC_LINK(info, p); + prev = p; + } + for (p = first_static; p != END_OF_STATIC_LIST; p = link) { info = get_itbl(p); link = *STATIC_LINK(info, p); hunk ./rts/sm/GC.c 1829 min_alloc, size, max); #endif - for (g = 0; g < gens; g++) { - generations[g].max_blocks = size; + for (g = 1; g < gens; g++) { + old_generations[g].max_blocks = size; } } } hunk ./rts/sm/GC.c 1840 -------------------------------------------------------------------------- */ static void -resize_nursery (void) +resize_nursery (lnat copied, nat N) { const lnat min_nursery = RtsFlags.GcFlags.minAllocAreaSize * n_capabilities; hunk ./rts/sm/GC.c 1844 - if (RtsFlags.GcFlags.generations == 1) + if (gct->gc_type == GC_LOCAL) + { + if (RtsFlags.GcFlags.heapSizeSuggestion == 0) + { + ACQUIRE_SM_LOCK; // needed due to use of allocGroup/freeGroup + resizeNursery(gct->cap, RtsFlags.GcFlags.minAllocAreaSize); + RELEASE_SM_LOCK; + } + else + { + // we don't know how big the nursery was supposed + // to be, so just leave it as is. It might be a bit bigger than + // before due to adding new large blocks and/or new blocks in + // allocate(), but we'll resize at the next major GC. + if (nurseries[gct->index].n_blocks < + RtsFlags.GcFlags.minAllocAreaSize) { + ACQUIRE_SM_LOCK; // needed due to use of allocGroup/freeGroup + resizeNursery(gct->cap, RtsFlags.GcFlags.minAllocAreaSize); + RELEASE_SM_LOCK; + } + } + } + else if (RtsFlags.GcFlags.generations == 1) { // Two-space collector: nat blocks; hunk ./rts/sm/GC.c 1884 * performance we get from 3L bytes, reducing to the same * performance at 2L bytes. */ - blocks = generations[0].n_blocks; + blocks = all_generations[0].n_blocks; if ( RtsFlags.GcFlags.maxHeapSize != 0 && blocks * RtsFlags.GcFlags.oldGenFactor * 2 > hunk ./rts/sm/GC.h 26 StgClosure * isAlive ( StgClosure *p ); void markCAFs ( evac_fn evac, void *user ); -extern nat N; -extern rtsBool major_gc; +StgPtr allocateInGen (Capability *cap, nat gen_ix, nat size); hunk ./rts/sm/GC.h 28 -extern bdescr *mark_stack_bd; -extern bdescr *mark_stack_top_bd; -extern StgPtr mark_sp; - -extern long copied; - -extern rtsBool work_stealing; +extern rtsBool major_gc; // collecting the oldest gen? +extern rtsBool work_stealing; // work stealing is enabled? +extern nat next_gc_gen; // generation to collect next time #ifdef DEBUG extern nat mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_OTHERS; hunk ./rts/sm/GC.h 49 void releaseGCThreads (Capability *cap); #endif +// used in globalise() +void prepare_gen_workspace (nat g); + #define WORK_UNIT_WORDS 128 #include "EndPrivate.h" hunk ./rts/sm/GCAux.c 20 #include "Capability.h" #include "Trace.h" #include "Schedule.h" -// DO NOT include "GCThread.h", we don't want the register variable +// DO NOT include "GCTDecl.h", we don't want the register variable /* ----------------------------------------------------------------------------- isAlive determines whether the given closure is still alive (after hunk ./rts/sm/GCAux.c 68 return p; } + // BF_PRIM things can be either copied or marked, depending on + // kind of GC we're doing, so we check for both here. + if (bd->flags & BF_PRIM) { + info = q->header.info; + if (is_marked((P_)q,bd) || isGlobalPrim(q)) { + return p; + } else { + return NULL; + } + } + // large objects use the evacuated flag if (bd->flags & BF_LARGE) { return NULL; hunk ./rts/sm/GCAux.c 152 evac(user, &c->indirectee); } } + addfile ./rts/sm/GCTDecl.h hunk ./rts/sm/GCTDecl.h 1 +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team 1998-2009 + * + * Documentation on the architecture of the Garbage Collector can be + * found in the online commentary: + * + * http://hackage.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC + * + * ---------------------------------------------------------------------------*/ + +#ifndef SM_GCTDECL_H +#define SM_GCTDECL_H + +#include "BeginPrivate.h" + +/* ----------------------------------------------------------------------------- + The gct variable is thread-local and points to the current thread's + gc_thread structure. It is heavily accessed, so we try to put gct + into a global register variable if possible; if we don't have a + register then use gcc's __thread extension to create a thread-local + variable. + -------------------------------------------------------------------------- */ + +#if defined(THREADED_RTS) + +#define GLOBAL_REG_DECL(type,name,reg) register type name REG(reg); + +#define SET_GCT(to) gct = (to) + + + +#if (defined(i386_HOST_ARCH) && defined(linux_HOST_OS)) +// Using __thread is better than stealing a register on x86/Linux, because +// we have too few registers available. In my tests it was worth +// about 5% in GC performance, but of course that might change as gcc +// improves. -- SDM 2009/04/03 +// +// We ought to do the same on MacOS X, but __thread is not +// supported there yet (gcc 4.0.1). + +extern __thread gc_thread* gct; +#define DECLARE_GCT __thread gc_thread* gct; + + +#elif defined(sparc_HOST_ARCH) +// On SPARC we can't pin gct to a register. Names like %l1 are just offsets +// into the register window, which change on each function call. +// +// There are eight global (non-window) registers, but they're used for other purposes. +// %g0 -- always zero +// %g1 -- volatile over function calls, used by the linker +// %g2-%g3 -- used as scratch regs by the C compiler (caller saves) +// %g4 -- volatile over function calls, used by the linker +// %g5-%g7 -- reserved by the OS + +extern __thread gc_thread* gct; +#define DECLARE_GCT __thread gc_thread* gct; + + +#elif defined(REG_Base) && !defined(i386_HOST_ARCH) +// on i386, REG_Base is %ebx which is also used for PIC, so we don't +// want to steal it + +GLOBAL_REG_DECL(gc_thread*, gct, REG_Base) +#define DECLARE_GCT /* nothing */ + + +#elif defined(REG_R1) + +GLOBAL_REG_DECL(gc_thread*, gct, REG_R1) +#define DECLARE_GCT /* nothing */ + + +#elif defined(__GNUC__) + +extern __thread gc_thread* gct; +#define DECLARE_GCT __thread gc_thread* gct; + +#else + +#error Cannot find a way to declare the thread-local gct + +#endif + +#else // not the threaded RTS + +extern StgWord8 the_gc_thread[]; + +#define gct ((gc_thread*)&the_gc_thread) +#define SET_GCT(to) /*nothing*/ +#define DECLARE_GCT /*nothing*/ + +#endif // THREADED_RTS + +#include "EndPrivate.h" + +#endif // SM_GCTDECL_H hunk ./rts/sm/GCThread.h 18 #define SM_GCTHREAD_H #include "WSDeque.h" +#include "GetTime.h" // for Ticks #include "BeginPrivate.h" hunk ./rts/sm/GCThread.h 119 ------------------------------------------------------------------------- */ typedef struct gc_thread_ { + Capability *cap; + #ifdef THREADED_RTS hunk ./rts/sm/GCThread.h 122 - OSThreadId id; // The OS thread that this struct belongs to SpinLock gc_spin; SpinLock mut_spin; volatile rtsBool wakeup; hunk ./rts/sm/GCThread.h 125 + + Mutex local_gc_lock; #endif hunk ./rts/sm/GCThread.h 128 - nat thread_index; // a zero based index identifying the thread + nat index; // a zero based index identifying the thread + // equal to cap->no + + generation *localg0; // The local G0 for this Capability bdescr * free_blocks; // a buffer of free blocks for this thread // during GC without accessing the block hunk ./rts/sm/GCThread.h 156 // -------------------- // evacuate flags - generation *evac_gen; // Youngest generation that objects + nat evac_gen_ix; // Youngest generation that objects // should be evacuated to in // evacuate(). (Logically an // argument to evacuate, but it's hunk ./rts/sm/GCThread.h 172 // instead of the to-space // corresponding to the object - lnat thunk_selector_depth; // ummm.... not used as of now + lnat thunk_selector_depth; // used to avoid unbounded recursion in + // evacuate() for THUNK_SELECTOR + + nat collect_gen; // maximum generation (no) to collect + + nat gc_type; // The gc type (GC_SEQ, GC_PAR, GC_LOCAL) + + rtsBool globalise_thunks; // whether to globalise THUNK objects + + StgTSO *resurrected_threads; // threads found to be unreachable, + // linked by ->global_link field. + + StgTSO *exception_threads; // List of blocked threads found to + // have pending throwTos + + /* Which stage of processing various kinds of weak pointer are we at? + * (see traverse_weak_ptr_list() below for discussion). + */ + enum { WeakPtrs, WeakThreads, WeakDone } weak_stage; + + StgWeak *old_weak_ptrs; + + // -------------------- + // The mark stack + + bdescr *mark_stack_top_bd; // topmost block in the mark stack + bdescr *mark_stack_bd; // current block in the mark stack + StgPtr mark_sp; // pointer to the next unallocated mark + // stack entry #ifdef USE_PAPI int papi_events; hunk ./rts/sm/GCThread.h 211 lnat copied; lnat scanned; + lnat globalised; lnat any_work; lnat no_work; lnat scav_find_work; hunk ./rts/sm/GCThread.h 216 + Ticks gc_start_cpu; // process CPU time + Ticks gc_start_elapsed; // process elapsed time + Ticks gc_start_thread_cpu; // thread CPU time + lnat gc_start_faults; + // ------------------- // workspaces hunk ./rts/sm/GCThread.h 224 - // array of workspaces, indexed by stp->abs_no. This is placed + // array of workspaces, indexed by gen->abs_no. This is placed // directly at the end of the gc_thread structure so that we can get from // the gc_thread pointer to a workspace using only pointer // arithmetic, no memory access. This happens in the inner loop hunk ./rts/sm/GCThread.h 235 extern nat n_gc_threads; -/* ----------------------------------------------------------------------------- - The gct variable is thread-local and points to the current thread's - gc_thread structure. It is heavily accessed, so we try to put gct - into a global register variable if possible; if we don't have a - register then use gcc's __thread extension to create a thread-local - variable. - - Even on x86 where registers are scarce, it is worthwhile using a - register variable here: I measured about a 2-5% slowdown with the - __thread version. - -------------------------------------------------------------------------- */ - extern gc_thread **gc_threads; hunk ./rts/sm/GCThread.h 237 -#if defined(THREADED_RTS) - -#define GLOBAL_REG_DECL(type,name,reg) register type name REG(reg); - -#define SET_GCT(to) gct = (to) - - - -#if (defined(i386_HOST_ARCH) && defined(linux_HOST_OS)) -// Using __thread is better than stealing a register on x86/Linux, because -// we have too few registers available. In my tests it was worth -// about 5% in GC performance, but of course that might change as gcc -// improves. -- SDM 2009/04/03 -// -// We ought to do the same on MacOS X, but __thread is not -// supported there yet (gcc 4.0.1). - -extern __thread gc_thread* gct; -#define DECLARE_GCT __thread gc_thread* gct; - - -#elif defined(sparc_HOST_ARCH) -// On SPARC we can't pin gct to a register. Names like %l1 are just offsets -// into the register window, which change on each function call. -// -// There are eight global (non-window) registers, but they're used for other purposes. -// %g0 -- always zero -// %g1 -- volatile over function calls, used by the linker -// %g2-%g3 -- used as scratch regs by the C compiler (caller saves) -// %g4 -- volatile over function calls, used by the linker -// %g5-%g7 -- reserved by the OS - -extern __thread gc_thread* gct; -#define DECLARE_GCT __thread gc_thread* gct; - - -#elif defined(REG_Base) && !defined(i386_HOST_ARCH) -// on i386, REG_Base is %ebx which is also used for PIC, so we don't -// want to steal it - -GLOBAL_REG_DECL(gc_thread*, gct, REG_Base) -#define DECLARE_GCT /* nothing */ - - -#elif defined(REG_R1) - -GLOBAL_REG_DECL(gc_thread*, gct, REG_R1) -#define DECLARE_GCT /* nothing */ - - -#elif defined(__GNUC__) - -extern __thread gc_thread* gct; -#define DECLARE_GCT __thread gc_thread* gct; - -#else - -#error Cannot find a way to declare the thread-local gct - -#endif - -#else // not the threaded RTS - -extern StgWord8 the_gc_thread[]; - -#define gct ((gc_thread*)&the_gc_thread) -#define SET_GCT(to) /*nothing*/ -#define DECLARE_GCT /*nothing*/ - -#endif - #include "EndPrivate.h" #endif // SM_GCTHREAD_H hunk ./rts/sm/GCUtils.c 21 #include "Storage.h" #include "GC.h" #include "GCThread.h" +#include "GCTDecl.h" #include "GCUtils.h" #include "Printer.h" #include "Trace.h" hunk ./rts/sm/GCUtils.c 36 bdescr * allocBlock_sync(void) { - bdescr *bd; - ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bd = allocBlock(); - RELEASE_SPIN_LOCK(&gc_alloc_block_sync); - return bd; + // GC_LOCAL uses the ordinary locking protocol for the block + // allocator because it runs concurrently with the mutator. + if (gct->gc_type == GC_LOCAL) { + return allocBlock_lock(); + } else { + bdescr *bd; + ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); + bd = allocBlock(); + RELEASE_SPIN_LOCK(&gc_alloc_block_sync); + return bd; + } } hunk ./rts/sm/GCUtils.c 49 -static bdescr * +bdescr * allocGroup_sync(nat n) { hunk ./rts/sm/GCUtils.c 52 - bdescr *bd; - ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - bd = allocGroup(n); - RELEASE_SPIN_LOCK(&gc_alloc_block_sync); - return bd; + // GC_LOCAL uses the ordinary locking protocol for the block + // allocator because it runs concurrently with the mutator. + if (gct->gc_type == GC_LOCAL) { + return allocGroup_lock(n); + } else { + bdescr *bd; + ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); + bd = allocGroup(n); + RELEASE_SPIN_LOCK(&gc_alloc_block_sync); + return bd; + } } hunk ./rts/sm/GCUtils.c 66 +#define ALLOCBLOCKS 0 +#if ALLOCBLOCKS static void allocBlocks_sync(nat n, bdescr **hd, bdescr **tl, generation *gen, StgWord32 flags) hunk ./rts/sm/GCUtils.c 93 *hd = bd; *tl = &bd[n-1]; } +#endif void freeChain_sync(bdescr *bd) hunk ./rts/sm/GCUtils.c 98 { - ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); - freeChain(bd); - RELEASE_SPIN_LOCK(&gc_alloc_block_sync); + // GC_LOCAL uses the ordinary locking protocol for the block + // allocator because it runs concurrently with the mutator. + if (gct->gc_type == GC_LOCAL) { + return freeChain_lock(bd); + } else { + ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); + freeChain(bd); + RELEASE_SPIN_LOCK(&gc_alloc_block_sync); + } +} + +void +freeGroup_sync(bdescr *bd) +{ + // GC_LOCAL uses the ordinary locking protocol for the block + // allocator because it runs concurrently with the mutator. + if (gct->gc_type == GC_LOCAL) { + return freeGroup_lock(bd); + } else { + ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); + freeGroup(bd); + RELEASE_SPIN_LOCK(&gc_alloc_block_sync); + } } /* ----------------------------------------------------------------------------- hunk ./rts/sm/GCUtils.c 163 // look for work to steal for (n = 0; n < n_gc_threads; n++) { - if (n == gct->thread_index) continue; + if (n == gct->index) continue; bd = stealWSDeque(gc_threads[n]->gens[g].todo_q); if (bd) { return bd; hunk ./rts/sm/GCUtils.c 233 } } - gct->copied += ws->todo_free - bd->free; bd->free = ws->todo_free; ASSERT(bd->u.scan >= bd->start && bd->u.scan <= bd->free); hunk ./rts/sm/GCUtils.c 299 bd = allocGroup_sync((lnat)BLOCK_ROUND_UP(size*sizeof(W_)) / BLOCK_SIZE); } else { -#if 1 - bd = allocBlock_sync(); -#else + +#if ALLOCBLOCKS bdescr *hd, *tl; allocBlocks_sync(16, &hd, &tl, ws->gen, BF_EVACUATED); hunk ./rts/sm/GCUtils.c 309 ws->n_part_blocks += 15; bd = hd; +#else + bd = allocBlock_sync(); #endif } initBdescr(bd, ws->gen, ws->gen->to); hunk ./rts/sm/GCUtils.c 322 ws->todo_bd = bd; ws->todo_free = bd->free; - ws->todo_lim = stg_min(bd->start + bd->blocks * BLOCK_SIZE_W, - bd->free + stg_max(WORK_UNIT_WORDS,size)); + + if (gct->gc_type == GC_PAR && work_stealing) { + // use a smaller limit if we're doing load-balancing, so we + // can share available work more quickly. + ws->todo_lim = stg_min(bd->start + bd->blocks * BLOCK_SIZE_W, + bd->free + stg_max(WORK_UNIT_WORDS,size)); + } else { + ws->todo_lim = bd->start + bd->blocks * BLOCK_SIZE_W; + } debugTrace(DEBUG_gc, "alloc new todo block %p for gen %d", bd->free, ws->gen->no); hunk ./rts/sm/GCUtils.c 344 #if DEBUG void -printMutableList(generation *gen) +printMutableList(bdescr *bd) { hunk ./rts/sm/GCUtils.c 346 - bdescr *bd; StgPtr p; hunk ./rts/sm/GCUtils.c 348 - debugBelch("mutable list %p: ", gen->mut_list); + debugBelch("mutable list %p: ", bd); hunk ./rts/sm/GCUtils.c 350 - for (bd = gen->mut_list; bd != NULL; bd = bd->link) { + for (; bd != NULL; bd = bd->link) { for (p = bd->start; p < bd->free; p++) { debugBelch("%p (%s), ", (void *)*p, info_type((StgClosure *)*p)); } hunk ./rts/sm/GCUtils.h 19 #include "BeginPrivate.h" -bdescr *allocBlock_sync(void); -void freeChain_sync(bdescr *bd); +#include "GCTDecl.h" + +bdescr *allocBlock_sync (void); +bdescr *allocGroup_sync (nat n); +void freeChain_sync (bdescr *bd); +void freeGroup_sync (bdescr *bd); void push_scanned_block (bdescr *bd, gen_workspace *ws); StgPtr todo_block_full (nat size, gen_workspace *ws); hunk ./rts/sm/GCUtils.h 46 #if DEBUG -void printMutableList (generation *gen); +void printMutableList (bdescr *bd); #endif hunk ./rts/sm/GCUtils.h 49 +// returns True if the given generation index belongs to +// another GC thread. +INLINE_HEADER rtsBool isNonLocalGen (generation *gen) +{ + return (gen->is_local && gen->cap != gct->index); +} + +INLINE_HEADER rtsBool isNonLocalGenIx (nat ix) +{ + return isNonLocalGen(&all_generations[ix]); +} + // Version of recordMutableGen for use during GC. This uses the // mutable lists attached to the current gc_thread structure, which // are the same as the mutable lists on the Capability. hunk ./rts/sm/GCUtils.h 72 bd = gct->mut_lists[gen_no]; if (bd->free >= bd->start + BLOCK_SIZE_W) { bdescr *new_bd; - new_bd = allocBlock_sync(); + new_bd = allocBlock_sync(); new_bd->link = bd; bd = new_bd; gct->mut_lists[gen_no] = bd; addfile ./rts/sm/Globalise.c hunk ./rts/sm/Globalise.c 1 +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team 1998-2010 + * + * Globalise data from the local heap to the global heap. + * + * Documentation on the architecture of the Garbage Collector can be + * found in the online commentary: + * + * http://hackage.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC + * + * ---------------------------------------------------------------------------*/ + +#include "PosixSource.h" +#include "Rts.h" + +#include "Evac.h" +#include "GC.h" +#include "GCThread.h" +#include "GCTDecl.h" +#include "GCUtils.h" +#include "Capability.h" +#include "Globalise.h" +#include "Prelude.h" +#include "Storage.h" +#include "Trace.h" +#include "Apply.h" +#include "Printer.h" +#include "Updates.h" +#include "MarkStack.h" + +static REGPARM1 GNUC_ATTR_HOT void globalise_evac (StgClosure **p); +static void globalise_scavenge (void); +static void globalise_scavenge_TSO (StgTSO *tso); +STATIC_INLINE void globalise_large (StgPtr p); +static void globalise_large_bitmap (StgPtr p, StgLargeBitmap *large_bitmap, + nat size); +static void globalise_maybe_record_mutable (StgClosure *q); +static void globalise_stack (StgPtr p, StgPtr stack_end); + +void +globaliseWRT (Capability *cap USED_IF_THREADS, + StgClosure *parent, StgClosure **root) +{ + if (isGlobal(parent)) { + globalise(cap, root); + } +} + +void +globaliseFullWRT (Capability *cap USED_IF_THREADS, + StgClosure *parent, StgClosure **root) +{ + if (isGlobal(parent)) { + *root = globaliseFull_(cap, *root); + } +} + +void +globalise (Capability *cap, StgClosure **root) +{ + // very careful: we can't write the new pointer until the + // structure is fully globalised, otherwise another processor + // might follow the pointer and find local heap. + *root = globalise_(cap, *root); +} + +StgClosure * +globalise_ (Capability *cap USED_IF_THREADS, StgClosure *p) +{ + gc_thread *saved_gct; + + // necessary if we stole a callee-saves register for gct: + saved_gct = gct; + SET_GCT(gc_threads[cap->no]); + + // No: we can get here via GarbageCollect(N=1)->resurrectThreads + // ASSERT(gct->gc_type == GC_LOCAL); + + // prepare_gen_workspace(gen->ix), inlined: + { + gen_workspace *ws; + ws = &gct->gens[global_gen_ix]; + ws->todo_bd->u.scan = ws->todo_free; + } + + gct->globalise_thunks = rtsTrue; + + globalise_evac(&p); + gct->globalise_thunks = rtsFalse; + globalise_scavenge(); + + ASSERT(!gct->failed_to_evac); + + SET_GCT(saved_gct); + + return p; +} + +StgClosure * +globaliseFull_ (Capability *cap USED_IF_THREADS, StgClosure *p) +{ + gc_thread *saved_gct; + + // necessary if we stole a callee-saves register for gct: + saved_gct = gct; + SET_GCT(gc_threads[cap->no]); + + // No: we can get here via GarbageCollect(N=1)->resurrectThreads + // ASSERT(gct->gc_type == GC_LOCAL); + + gct->globalise_thunks = rtsTrue; + + // prepare_gen_workspace(gen->ix), inlined: + { + gen_workspace *ws; + ws = &gct->gens[global_gen_ix]; + ws->todo_bd->u.scan = ws->todo_free; + } + + globalise_evac(&p); + globalise_scavenge(); + + ASSERT(!gct->failed_to_evac); + + SET_GCT(saved_gct); + + return p; +} + +// Use globalise_TSO() if you want to migrate a thread from one +// Capability to another. It ensures that all the fields of the TSO +// are published, whereas the ordinary globalise() only ensures that +// the StgTSO structure itself is published. + +StgTSO * +globalise_TSO (Capability *cap USED_IF_THREADS, StgTSO *tso) +{ + gc_thread *saved_gct; + StgStack *stack; + StgUnderflowFrame *frame; + + // necessary if we stole a callee-saves register for gct: + saved_gct = gct; + SET_GCT(gc_threads[cap->no]); + + // No: we can get here via GarbageCollect(N=1)->resurrectThreads + // ASSERT(gct->gc_type == GC_LOCAL); + + gct->globalise_thunks = rtsTrue; // probably a good idea, but not mandatory + + // prepare_gen_workspace(gen->ix), inlined: + { + gen_workspace *ws; + ws = &gct->gens[global_gen_ix]; + ws->todo_bd->u.scan = ws->todo_free; + } + + // Normally a TSO is considered to be a private object, and we + // don't have to globalise its contents. However, + // in this case we want to globalise the complete contents because + // we are about to transfer ownership of the TSO to another + // Capability. + + // First, evacuate and scavenge the TSO object + globalise_evac((StgClosure**)&tso); + globalise_scavenge_TSO(tso); + + // globalise and scavenge all the stack chunks + stack = tso->stackobj; + while (1) { + globalise_stack(stack->sp, stack->stack + stack->stack_size); + frame = (StgUnderflowFrame*) (stack->stack + stack->stack_size + - sizeofW(StgUnderflowFrame)); + if (frame->info != &stg_stack_underflow_frame_info + || frame->next_chunk == (StgStack*)END_TSO_QUEUE) break; + stack = frame->next_chunk; + } + + // finish up globalising + globalise_scavenge(); + + // now we need to mark the TSO and all the STACKs as clean, + // because if they migrate to another Capability then they need to + // be added to that Capability's mutable list. + tso->dirty = 0; + stack = tso->stackobj; + while (1) { + stack->dirty = 0; + frame = (StgUnderflowFrame*) (stack->stack + stack->stack_size + - sizeofW(StgUnderflowFrame)); + if (frame->info != &stg_stack_underflow_frame_info + || frame->next_chunk == (StgStack*)END_TSO_QUEUE) break; + stack = frame->next_chunk; + } + + if (gct->failed_to_evac) { + barf("globalise: could not globalise the whole structure"); + } + + SET_GCT(saved_gct); + return tso; +} + + +STATIC_INLINE void +globalise_large (StgPtr p) +{ + bdescr *bd; + generation *gen, *new_gen; + nat gen_ix; + gen_workspace *ws; + + bd = Bdescr(p); + gen_ix = bd->gen_ix; + gen = bd->gen; + + ACQUIRE_SPIN_LOCK(&gen->sync); + + // remove from large_object list + if (bd->u.back) { + bd->u.back->link = bd->link; + } else { // first object in the list + gen->large_objects = bd->link; + } + if (bd->link) { + bd->link->u.back = bd->u.back; + } + gen->n_large_blocks -= bd->blocks; + + // link it on to the evacuated large object list of the destination gen + new_gen = &all_generations[global_gen_ix]; + ws = &gct->gens[global_gen_ix]; + + bd->flags |= BF_EVACUATED; + initBdescr(bd, new_gen, new_gen->to); + + // If this is a block of pinned objects, we don't have to scan + // these objects, because they aren't allowed to contain any + // pointers. For these blocks, we skip the scavenge stage and put + // them straight on the large_objects list. + if (bd->flags & BF_PINNED) { + ASSERT(get_itbl((StgClosure *)p)->type == ARR_WORDS); + if (new_gen != gen) { ACQUIRE_SPIN_LOCK(&new_gen->sync); } + dbl_link_onto(bd, &new_gen->large_objects); + new_gen->n_large_blocks += bd->blocks; + if (new_gen != gen) { RELEASE_SPIN_LOCK(&new_gen->sync); } + } else { + bd->link = ws->todo_large_objects; + ws->todo_large_objects = bd; + } + + RELEASE_SPIN_LOCK(&gen->sync); +} + + +STATIC_INLINE StgPtr +alloc_for_copy (nat size, nat gen_ix) +{ + StgPtr to; + gen_workspace *ws; + + ws = &gct->gens[gen_ix]; + + /* chain a new block onto the to-space for the destination gen if + * necessary. + */ + to = ws->todo_free; + ws->todo_free += size; + if (ws->todo_free > ws->todo_lim) { + to = todo_block_full(size, ws); + } + ASSERT(ws->todo_free >= ws->todo_bd->free && ws->todo_free <= ws->todo_lim); + + return to; +} + +STATIC_INLINE StgPtr +alloc_for_copy_global (nat size) +{ + return alloc_for_copy(size, global_gen_ix); +} + +STATIC_INLINE GNUC_ATTR_HOT void +copy_closure(StgPtr from, StgPtr to, const StgInfoTable *info, nat size) +{ + nat i; + + to[0] = (W_)info; + for (i = 1; i < size; i++) { // unroll for small i + to[i] = from[i]; + } +} + +STATIC_INLINE GNUC_ATTR_HOT void +copy_tag(StgClosure **p, const StgInfoTable *info, + StgClosure *src, nat size, StgWord tag) +{ + StgPtr to; + + // debugTrace(DEBUG_gc, "globalising value at *%p = %p", p, src); + to = alloc_for_copy_global(size); + copy_closure((StgPtr)src, to, info, size); + src->header.info = (const StgInfoTable *)MK_FORWARDING_PTR(to); + *p = TAG_CLOSURE(tag,(StgClosure*)to); +} + +STATIC_INLINE GNUC_ATTR_HOT void +copy_IND(StgClosure **p, const StgInfoTable *info, + StgClosure *src, nat size) +{ + StgPtr to; + + to = alloc_for_copy_global(size); + copy_closure((StgPtr)src, to, info, size); +#ifdef DEBUG + zeroSlop((StgPtr)src + sizeofW(StgInd), size - sizeofW(StgInd)); +#endif + src->header.info = &stg_IND_info; + ((StgInd *)src)->indirectee = (StgClosure *)to; + *p = (StgClosure*)to; +} + +STATIC_INLINE GNUC_ATTR_HOT void +nocopy_IND_LOCAL(bdescr *bd, StgClosure **p, StgClosure *src) +{ + StgPtr to = alloc_for_copy_global(sizeofW(StgInd)); + ((StgInd *)to)->header.info = stg_IND_LOCAL_tbl[bd->gen_ix]; + ((StgInd *)to)->indirectee = src; + *p = (StgClosure*)to; +} + + +REGPARM1 GNUC_ATTR_HOT void +globalise_evac (StgClosure **p) +{ + bdescr *bd = NULL; + StgClosure *q; + const StgInfoTable *info; + StgWord tag; + + q = *p; + +loop: + /* The tag and the pointer are split, to be merged after evacing */ + tag = GET_CLOSURE_TAG(q); + q = UNTAG_CLOSURE(q); + + ASSERT(LOOKS_LIKE_CLOSURE_PTR(q)); + + if (!HEAP_ALLOCED(q)) return; // already global + + bd = Bdescr((P_)q); + + if (bd->gen_no > 0) return; + + // make sure this isn't a pointer into the wrong local heap + // NB. we might be doing a global GC, and got here via + // globalise_mut_lists, in which case it won't necessarily be our + // local heap. +// ASSERT(gct->gc_type != GC_LOCAL || isGlobal(q) || bd->gen_ix == gct->index); + + if (bd->flags & BF_LARGE) { + globalise_large((P_)q); + return; + } + + if (bd->flags & BF_PRIM) { + if (!isGlobalPrim(q)) { + setGlobal(q); + push_mark_stack(q); + } + return; + } + + info = q->header.info; + if (IS_FORWARDING_PTR(info)) + { + StgClosure *e = (StgClosure*)UN_FORWARDING_PTR(info); + *p = TAG_CLOSURE(tag,e); + return; + } + + switch (INFO_PTR_TO_STRUCT(info)->type) { + + case WHITEHOLE: + goto loop; + + // For ints and chars of low value, save space by replacing references to + // these with closures with references to common, shared ones in the RTS. + // + // * Except when compiling into Windows DLLs which don't support cross-package + // data references very well. + // + case CONSTR_0_1: + { +#if defined(__PIC__) && defined(mingw32_HOST_OS) + copy_tag(p,info,q,sizeofW(StgHeader)+1,tag); +#else + StgWord w = (StgWord)q->payload[0]; + if (info == Czh_con_info && + // unsigned, so always true: (StgChar)w >= MIN_CHARLIKE && + (StgChar)w <= MAX_CHARLIKE) { + *p = TAG_CLOSURE(tag, + (StgClosure *)CHARLIKE_CLOSURE((StgChar)w) + ); + } + else if (info == Izh_con_info && + (StgInt)w >= MIN_INTLIKE && (StgInt)w <= MAX_INTLIKE) { + *p = TAG_CLOSURE(tag, + (StgClosure *)INTLIKE_CLOSURE((StgInt)w) + ); + } + else { + copy_tag(p,info,q,sizeofW(StgHeader)+1,tag); + } +#endif + return; + } + + case FUN_0_1: + case FUN_1_0: + case CONSTR_1_0: + copy_tag(p,info,q,sizeofW(StgHeader)+1,tag); + return; + + case THUNK_1_0: + case THUNK_0_1: + if (gct->globalise_thunks) { + copy_IND(p,info,q,sizeofW(StgThunk)+1); + } else { + nocopy_IND_LOCAL(bd, p, q); + } + return; + + case IND_PERM: + copy_IND(p,info,q,sizeofW(StgThunk)+1); + return; + + case THUNK_1_1: + case THUNK_2_0: + case THUNK_0_2: + if (gct->globalise_thunks) { + copy_IND(p,info,q,sizeofW(StgThunk)+2); + } else { + nocopy_IND_LOCAL(bd, p, q); + } + return; + + case FUN_1_1: + case FUN_2_0: + case FUN_0_2: + case CONSTR_1_1: + case CONSTR_2_0: + case CONSTR_0_2: + copy_tag(p,info,q,sizeofW(StgHeader)+2,tag); + return; + + case THUNK: + if (gct->globalise_thunks) { + copy_IND(p,info,q,thunk_sizeW_fromITBL(INFO_PTR_TO_STRUCT(info))); + } else { + nocopy_IND_LOCAL(bd, p, q); + } + return; + + case FUN: + case CONSTR: + copy_tag(p,info,q,sizeW_fromITBL(INFO_PTR_TO_STRUCT(info)),tag); + return; + + case THUNK_SELECTOR: + if (gct->globalise_thunks) { + copy_IND(p,info,q,THUNK_SELECTOR_sizeW()); + } else { + nocopy_IND_LOCAL(bd, p, q); + } + return; + + case IND: + // follow chains of indirections, don't evacuate them + q = ((StgInd*)q)->indirectee; + *p = q; + goto loop; + + case PAP: + copy_IND(p,info,q,pap_sizeW((StgPAP*)q)); + return; + + case AP: + if (gct->globalise_thunks) { + copy_IND(p,info,q,ap_sizeW((StgAP*)q)); + } else { + nocopy_IND_LOCAL(bd, p, q); + } + return; + + case AP_STACK: + if (gct->globalise_thunks) { + copy_IND(p,info,q,ap_stack_sizeW((StgAP_STACK*)q)); + } else { + nocopy_IND_LOCAL(bd, p, q); + } + return; + + case BLACKHOLE: + // don't promote: we can't move BLACKHOLEs, because the update + // frame points to them, so leave an IND_LOCAL instead + { + StgClosure *r; + const StgInfoTable *i; + + r = ((StgInd*)q)->indirectee; + if (GET_CLOSURE_TAG(r) == 0) { + i = r->header.info; + ASSERT(!IS_FORWARDING_PTR(i)); // these things can't be + // forwarding ptrs, outside of GC. + if (i == &stg_TSO_info + || i == &stg_WHITEHOLE_info + || i == &stg_BLOCKING_QUEUE_CLEAN_info + || i == &stg_BLOCKING_QUEUE_DIRTY_info) + { + nocopy_IND_LOCAL(bd, p, q); + return; + } + ASSERT(i != &stg_STUB_BLOCKING_QUEUE_info); + } + q = r; + *p = r; + goto loop; + } + + case ARR_WORDS: + // just copy the block + copy_tag(p,info,q,arr_words_sizeW((StgArrWords *)q),tag); + return; + + default: + barf("globalise_evac: strange closure type %d", (int)(INFO_PTR_TO_STRUCT(info)->type)); + } + + barf("globalise_evac"); +} + +/* ----------------------------------------------------------------------------- + Blocks of function args occur on the stack (at the top) and + in PAPs. + -------------------------------------------------------------------------- */ + +STATIC_INLINE StgPtr +globalise_arg_block (StgFunInfoTable *fun_info, StgClosure **args) +{ + StgPtr p; + StgWord bitmap; + nat size; + + p = (StgPtr)args; + switch (fun_info->f.fun_type) { + case ARG_GEN: + bitmap = BITMAP_BITS(fun_info->f.b.bitmap); + size = BITMAP_SIZE(fun_info->f.b.bitmap); + goto small_bitmap; + case ARG_GEN_BIG: + size = GET_FUN_LARGE_BITMAP(fun_info)->size; + globalise_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info), size); + p += size; + break; + default: + bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]); + size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]); + small_bitmap: + while (size > 0) { + if ((bitmap & 1) == 0) { + globalise_evac((StgClosure **)p); + } + p++; + bitmap = bitmap >> 1; + size--; + } + break; + } + return p; +} + +STATIC_INLINE GNUC_ATTR_HOT StgPtr +globalise_PAP_payload (StgClosure *fun, StgClosure **payload, StgWord size) +{ + StgPtr p; + StgWord bitmap; + StgFunInfoTable *fun_info; + + fun_info = get_fun_itbl(UNTAG_CLOSURE(fun)); + ASSERT(fun_info->i.type != PAP); + p = (StgPtr)payload; + + switch (fun_info->f.fun_type) { + case ARG_GEN: + bitmap = BITMAP_BITS(fun_info->f.b.bitmap); + goto small_bitmap; + case ARG_GEN_BIG: + globalise_large_bitmap(p, GET_FUN_LARGE_BITMAP(fun_info), size); + p += size; + break; + case ARG_BCO: + globalise_large_bitmap((StgPtr)payload, BCO_BITMAP(fun), size); + p += size; + break; + default: + bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]); + small_bitmap: + while (size > 0) { + if ((bitmap & 1) == 0) { + globalise_evac((StgClosure **)p); + } + p++; + bitmap = bitmap >> 1; + size--; + } + break; + } + return p; +} + +STATIC_INLINE GNUC_ATTR_HOT StgPtr +globalise_PAP (StgPAP *pap) +{ + globalise_evac(&pap->fun); + return globalise_PAP_payload (pap->fun, pap->payload, pap->n_args); +} + +STATIC_INLINE StgPtr +globalise_AP (StgAP *ap) +{ + globalise_evac(&ap->fun); + return globalise_PAP_payload (ap->fun, ap->payload, ap->n_args); +} + +/* ----------------------------------------------------------------------------- + scavenge a chunk of memory described by a bitmap + -------------------------------------------------------------------------- */ + +static void +globalise_large_bitmap( StgPtr p, StgLargeBitmap *large_bitmap, nat size ) +{ + nat i, b; + StgWord bitmap; + + b = 0; + bitmap = large_bitmap->bitmap[b]; + for (i = 0; i < size; ) { + if ((bitmap & 1) == 0) { + globalise_evac((StgClosure **)p); + } + i++; + p++; + if (i % BITS_IN(W_) == 0) { + b++; + bitmap = large_bitmap->bitmap[b]; + } else { + bitmap = bitmap >> 1; + } + } +} + +STATIC_INLINE StgPtr +globalise_small_bitmap (StgPtr p, nat size, StgWord bitmap) +{ + while (size > 0) { + if ((bitmap & 1) == 0) { + globalise_evac((StgClosure **)p); + } + p++; + bitmap = bitmap >> 1; + size--; + } + return p; +} + +/* ----------------------------------------------------------------------------- + globalise_stack walks over a section of stack and evacuates all the + objects pointed to by it. We can use the same code for walking + AP_STACK_UPDs, since these are just sections of copied stack. + -------------------------------------------------------------------------- */ + +static void +globalise_stack(StgPtr p, StgPtr stack_end) +{ + const StgRetInfoTable* info; + StgWord bitmap; + nat size; + + /* + * Each time around this loop, we are looking at a chunk of stack + * that starts with an activation record. + */ + while (p < stack_end) { + info = get_ret_itbl((StgClosure *)p); + + switch (info->i.type) { + + case UPDATE_FRAME: + { + nat type; + const StgInfoTable *i; + StgClosure *updatee; + + // see comment about UPDATE_FRAME in scavenge_stack(), which + // also applies here. + updatee = ((StgUpdateFrame *)p)->updatee; + i = updatee->header.info; + ASSERT(!IS_FORWARDING_PTR(i)); + type = INFO_PTR_TO_STRUCT(i)->type; + switch (type) { + // globalise_evac() won't move a BLACKHOLE, but we really + // want to move it here since we're globalising the thread + // that points to it. + case BLACKHOLE: + if (Bdescr((P_)updatee)->gen_ix < global_gen_ix) { + copy_IND(&((StgUpdateFrame *)p)->updatee,i,updatee, + BLACKHOLE_sizeW()); + } + break; + + default: + barf("globalise_stack: %d", type); + } + ASSERT(GET_CLOSURE_TAG(((StgUpdateFrame *)p)->updatee) == 0); + p += sizeofW(StgUpdateFrame); + continue; + } + + // small bitmap (< 32 entries, or 64 on a 64-bit machine) + case CATCH_STM_FRAME: + case CATCH_RETRY_FRAME: + case ATOMICALLY_FRAME: + case STOP_FRAME: + case UNDERFLOW_FRAME: + case CATCH_FRAME: + case RET_SMALL: + bitmap = BITMAP_BITS(info->i.layout.bitmap); + size = BITMAP_SIZE(info->i.layout.bitmap); + // NOTE: the payload starts immediately after the info-ptr, we + // don't have an StgHeader in the same sense as a heap closure. + p++; + p = globalise_small_bitmap(p, size, bitmap); + continue; + + case RET_BCO: { + StgBCO *bco; + nat size; + + p++; + globalise_evac((StgClosure **)p); + bco = (StgBCO *)*p; + p++; + size = BCO_BITMAP_SIZE(bco); + globalise_large_bitmap(p, BCO_BITMAP(bco), size); + p += size; + continue; + } + + // large bitmap (> 32 entries, or > 64 on a 64-bit machine) + case RET_BIG: + { + nat size; + + size = GET_LARGE_BITMAP(&info->i)->size; + p++; + globalise_large_bitmap(p, GET_LARGE_BITMAP(&info->i), size); + p += size; + continue; + } + + // Dynamic bitmap: the mask is stored on the stack, and + // there are a number of non-pointers followed by a number + // of pointers above the bitmapped area. (see StgMacros.h, + // HEAP_CHK_GEN). + case RET_DYN: + { + StgWord dyn; + dyn = ((StgRetDyn *)p)->liveness; + + // traverse the bitmap first + bitmap = RET_DYN_LIVENESS(dyn); + p = (P_)&((StgRetDyn *)p)->payload[0]; + size = RET_DYN_BITMAP_SIZE; + p = globalise_small_bitmap(p, size, bitmap); + + // skip over the non-ptr words + p += RET_DYN_NONPTRS(dyn) + RET_DYN_NONPTR_REGS_SIZE; + + // follow the ptr words + for (size = RET_DYN_PTRS(dyn); size > 0; size--) { + globalise_evac((StgClosure **)p); + p++; + } + continue; + } + + case RET_FUN: + { + StgRetFun *ret_fun = (StgRetFun *)p; + StgFunInfoTable *fun_info; + + globalise_evac(&ret_fun->fun); + fun_info = get_fun_itbl(UNTAG_CLOSURE(ret_fun->fun)); + p = globalise_arg_block(fun_info, ret_fun->payload); + continue; + } + + default: + barf("globalise_stack: weird activation record found on stack: %d", (int)(info->i.type)); + } + } +} + +static void +globalise_scavenge_TSO (StgTSO *tso) +{ + debugTrace(DEBUG_gc,"globalising thread %d",(int)tso->id); + + // update the pointer from the Task. + if (tso->bound != NULL) { + tso->bound->tso = tso; + } + + if ( tso->why_blocked == BlockedOnMVar + || tso->why_blocked == BlockedOnBlackHole + || tso->why_blocked == BlockedOnMsgThrowTo + || tso->why_blocked == NotBlocked + ) { + globalise_evac(&tso->block_info.closure); + } + globalise_evac((StgClosure **)&tso->blocked_exceptions); + globalise_evac((StgClosure **)&tso->bq); + + // scavange current transaction record + globalise_evac((StgClosure **)&tso->trec); + + globalise_evac((StgClosure **)&tso->stackobj); + + globalise_evac((StgClosure **)&tso->_link); + if ( tso->why_blocked == BlockedOnMVar + || tso->why_blocked == BlockedOnBlackHole + || tso->why_blocked == BlockedOnMsgThrowTo + || tso->why_blocked == NotBlocked + ) { + globalise_evac(&tso->block_info.closure); + } + + ASSERT(!gct->failed_to_evac); + tso->dirty = 0; +} + + +static StgPtr globalise_scavenge_mut_arr_ptrs (StgMutArrPtrs *a) +{ + StgPtr p; + + for (p = (P_)&a->payload[0]; p < (P_)&a->payload[a->ptrs]; p++) { + globalise_evac((StgClosure**)p); + } + + return (StgPtr)a + mut_arr_ptrs_sizeW(a); +} + + +static StgPtr +globalise_scavenge_prim (StgPtr p, nat type) +{ + switch (type) { + + case ARR_WORDS: + // nothing to do; these can happen if a large ARR_WORDS gets + // globalised. + return p += arr_words_sizeW((StgArrWords *)p); + + case MUT_ARR_PTRS_LOCAL: + SET_INFO((StgMutArrPtrs *)p, &stg_MUT_ARR_PTRS_GLOBAL_info); + // fall through + case MUT_ARR_PTRS_GLOBAL: + return globalise_scavenge_mut_arr_ptrs((StgMutArrPtrs *)p); + + case MUT_ARR_PTRS_FROZEN: + case MUT_ARR_PTRS_FROZEN0: + return globalise_scavenge_mut_arr_ptrs((StgMutArrPtrs *)p); + + default: + barf("globalise_scavenge_one: unimplemented/strange closure type %d @ %p", + type, p); + } +} + +static GNUC_ATTR_HOT REGPARM1 StgPtr +globalise_scavenge_one (StgPtr p) +{ + const StgInfoTable *info; + + ASSERT(LOOKS_LIKE_CLOSURE_PTR(p)); + info = get_itbl((StgClosure *)p); + + switch (info->type) { + + case FUN_2_0: + globalise_evac(&((StgClosure *)p)->payload[1]); + globalise_evac(&((StgClosure *)p)->payload[0]); + return p + sizeofW(StgHeader) + 2; + + case THUNK_2_0: + globalise_evac(&((StgThunk *)p)->payload[1]); + globalise_evac(&((StgThunk *)p)->payload[0]); + return p + sizeofW(StgThunk) + 2; + + case CONSTR_2_0: + globalise_evac(&((StgClosure *)p)->payload[1]); + globalise_evac(&((StgClosure *)p)->payload[0]); + return p + sizeofW(StgHeader) + 2; + + case THUNK_1_0: + globalise_evac(&((StgThunk *)p)->payload[0]); + return p + sizeofW(StgThunk) + 1; + + case FUN_1_0: + case CONSTR_1_0: + globalise_evac(&((StgClosure *)p)->payload[0]); + return p + sizeofW(StgHeader) + 1; + + case THUNK_0_1: + return p + sizeofW(StgThunk) + 1; + + case FUN_0_1: + case CONSTR_0_1: + return p + sizeofW(StgHeader) + 1; + + case THUNK_0_2: + return p + sizeofW(StgThunk) + 2; + + case FUN_0_2: + case CONSTR_0_2: + return p + sizeofW(StgHeader) + 2; + + case THUNK_1_1: + globalise_evac(&((StgThunk *)p)->payload[0]); + return p + sizeofW(StgThunk) + 2; + + case FUN_1_1: + case CONSTR_1_1: + globalise_evac(&((StgClosure *)p)->payload[0]); + return p + sizeofW(StgHeader) + 2; + + case THUNK: + { + StgPtr end; + + end = (P_)((StgThunk *)p)->payload + info->layout.payload.ptrs; + for (p = (P_)((StgThunk *)p)->payload; p < end; p++) { + globalise_evac((StgClosure **)p); + } + return p + info->layout.payload.nptrs; + } + + case MUT_VAR_LOCAL: + SET_INFO((StgMutVar*)p, &stg_MUT_VAR_GLOBAL_info); + // fall through + case FUN: + case CONSTR: + case BLACKHOLE: + case PRIM: + case MUT_PRIM: + case MVAR_CLEAN: + case MVAR_DIRTY: + case MUT_VAR_GLOBAL: + case TREC_CHUNK: + case BLOCKING_QUEUE: + { + StgPtr end; + + end = (P_)((StgClosure *)p)->payload + info->layout.payload.ptrs; + for (p = (P_)((StgClosure *)p)->payload; p < end; p++) { + globalise_evac((StgClosure **)p); + } + return p + info->layout.payload.nptrs; + } + + case WEAK: + { + StgWeak *w = (StgWeak *)p; + globalise_evac(&w->cfinalizer); + globalise_evac(&w->key); + globalise_evac(&w->value); + globalise_evac(&w->finalizer); + // link field already points into this gen + return p + sizeofW(StgWeak); + } + + case THUNK_SELECTOR: + { + StgSelector *s = (StgSelector *)p; + globalise_evac(&s->selectee); + return p + THUNK_SELECTOR_sizeW(); + } + + // A chunk of stack saved in a heap object + case AP_STACK: + { + StgAP_STACK *ap = (StgAP_STACK *)p; + + globalise_evac(&ap->fun); + globalise_stack((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size); + return (StgPtr)ap->payload + ap->size; + } + + case PAP: + return globalise_PAP((StgPAP *)p); + + case AP: + return globalise_AP((StgAP *)p); + + case TSO: + { + StgTSO *tso = (StgTSO*)p; + // we don't touch the insides of a TSO, it is private + tso->dirty = 1; + gct->failed_to_evac = rtsTrue; + return p + sizeofW(StgTSO); + } + + case STACK: + { + StgStack *stack = (StgStack*)p; + // we don't touch the insides of a STACK, it is private + globalise_evac((StgClosure**)&stack->tso); + stack->dirty = 1; + gct->failed_to_evac = rtsTrue; + return p + stack_sizeW(stack); + } + + // an IND_LOCAL could be here as a result of a BLACKHOLE being + // globalised, so we don't want to try to globalise it. + case IND_LOCAL: + gct->failed_to_evac = rtsTrue; + return p + sizeofW(StgInd); + + case IND: + case IND_STATIC: + globalise_evac(&((StgInd*)p)->indirectee); + return p + sizeofW(StgInd); + + default: + return globalise_scavenge_prim(p, info->type); + } +} + + +static void +globalise_scavenge_large (gen_workspace *ws) +{ + bdescr *bd; + StgPtr p; + + bd = ws->todo_large_objects; + + for (; bd != NULL; bd = ws->todo_large_objects) { + + // take this object *off* the large objects list and put it on + // the scavenged large objects list. This is so that we can + // treat new_large_objects as a stack and push new objects on + // the front when evacuating. + ws->todo_large_objects = bd->link; + + ACQUIRE_SPIN_LOCK(&ws->gen->sync); + dbl_link_onto(bd, &ws->gen->large_objects); + ws->gen->n_large_blocks += bd->blocks; + RELEASE_SPIN_LOCK(&ws->gen->sync); + + p = bd->start; + globalise_scavenge_one(p); + if (gct->failed_to_evac) { + gct->failed_to_evac = rtsFalse; + globalise_maybe_record_mutable((StgClosure*)p); + } + + // stats + gct->scanned += closure_sizeW((StgClosure*)p); + } +} + +static GNUC_ATTR_HOT void +globalise_scavenge_block (bdescr *bd) +{ + StgPtr p, q; + gen_workspace *ws; + + debugTrace(DEBUG_gc, "globalise: scavenging block %p (gen %d) @ %p", + bd->start, bd->gen_no, bd->u.scan); + + gct->scan_bd = bd; + ws = &gct->gens[bd->gen_ix]; + + p = bd->u.scan; + + // we might be evacuating into the very block that we're + // scavenging, so we have to check the real bd->free pointer each + // time around the loop. + while (p < bd->free || (bd == ws->todo_bd && p < ws->todo_free)) { + ASSERT(bd->link == NULL); + q = globalise_scavenge_one(p); + + if (gct->failed_to_evac) { + gct->failed_to_evac = rtsFalse; + globalise_maybe_record_mutable((StgClosure*)p); + } + + p = q; + } + + if (p > bd->free) { + bd->free = p; + } + + debugTrace(DEBUG_gc, " scavenged %ld bytes", + (unsigned long)((bd->free - bd->u.scan) * sizeof(W_))); + + // update stats: this is a block that has been scavenged + gct->globalised += bd->free - bd->u.scan; + bd->u.scan = bd->free; + + if (bd != ws->todo_bd) { + // we're not going to evac any more objects into + // this block, so push it now. + push_scanned_block(bd, ws); + } + + gct->scan_bd = NULL; +} + + +static void +globalise_scavenge (void) +{ + gen_workspace *ws; + bdescr *bd; + StgPtr p; + + do { + ws = &gct->gens[global_gen_ix]; // XXX assumes structure of generations + + gct->scan_bd = NULL; + + // scavenge objects in compacted generation + while ((p = pop_mark_stack())) { + globalise_scavenge_one(p); + if (gct->failed_to_evac) { + gct->failed_to_evac = rtsFalse; + globalise_maybe_record_mutable((StgClosure*)p); + } + } + + // If we have a scan block with some work to do, + // scavenge everything up to the free pointer. + if (ws->todo_bd->u.scan < ws->todo_free) + { + globalise_scavenge_block(ws->todo_bd); + continue; + } + + // If we have any large objects to scavenge, do them now. + if (ws->todo_large_objects) { + globalise_scavenge_large(ws); + continue; + } + + if ((bd = grab_local_todo_block(ws)) != NULL) { + globalise_scavenge_block(bd); + continue; + } + + return; + + } while (1); +} + +/* ----------------------------------------------------------------------------- + Mutable lists + -------------------------------------------------------------------------- */ + +static void +globalise_maybe_record_mutable (StgClosure *q) +{ + nat g = global_gen_no; + + switch (get_itbl((StgClosure*)q)->type) { + + case TSO: + // put it on the correct mutable list; during parallel + // GC it may have ended up on the wrong one. + ASSERT(gct->gc_type == GC_LOCAL ? + ((StgTSO*)q)->cap == gct->cap : 1); + recordMutableCap(((StgTSO*)q)->cap, q, g); + break; + + case STACK: + // c.f. TSO + // this is the entire reason for the tso field of STACK + ASSERT(gct->gc_type == GC_LOCAL ? + ((StgStack*)q)->tso->cap == gct->cap : 1); + recordMutableCap(((StgStack*)q)->tso->cap, q, g); + break; + + case IND_LOCAL: + // put it on the correct mutable list; during parallel + // GC it may have ended up on the wrong one. + recordMutableCap(&capabilities[get_itbl(q)->srt_bitmap], + q, g); + break; + + default: + barf("globalise_maybe_record_mutable: %d", get_itbl(q)->type); + } +} + +static void +globalise_mut_list (Capability *cap STG_UNUSED, bdescr *mut_list, + nat g STG_UNUSED) +{ + bdescr *bd; + StgPtr p; + StgClosure *q; + + // if g != 1, then we'll need to do something more clever below + ASSERT(g == 1); + + gct->failed_to_evac = rtsFalse; + + for (bd = mut_list; bd != NULL; bd = bd->link) { + + for (p = bd->start; p < bd->free; p++) { + + q = (StgClosure *)*p; + + // globalise_scavenge_one() will ignore an IND_LOCAL, + // because they are created by globalise_evac() on a + // BLACKHOLE. But we can scavenge any IND_LOCALs on the + // mut_list here without going into a loop. + switch (get_itbl((StgClosure*)q)->type) { +// case IND_LOCAL: +// globalise_evac(&((StgInd*)q)->indirectee); +// // note: we can't change it to an IND at this stage, +// // because we haven't finished scavenging the +// // transitive closure yet. A later stage will have to +// // do that. +// break; + default: + globalise_scavenge_one((StgPtr)q); + break; + } + + if (gct->failed_to_evac) { + gct->failed_to_evac = rtsFalse; + globalise_maybe_record_mutable(q); + } + + } + } +} + + +// called after GC to clean up any global->local pointers that arose. +// c.f. scavenge_capability_mut_lists() +void +globalise_capability_mut_lists (Capability *cap) +{ + nat g; + + gct->globalise_thunks = rtsFalse; + + for (g = RtsFlags.GcFlags.generations-1; g > 0; g--) { + globalise_mut_list(cap, cap->saved_mut_lists[g], g); + freeChain_sync(cap->saved_mut_lists[g]); + cap->saved_mut_lists[g] = NULL; + } + + globalise_scavenge(); +} + +/* ----------------------------------------------------------------------------- + Publishing + -------------------------------------------------------------------------- */ + +StgClosure * +publish (Capability *cap, StgClosure *p) +{ + StgInd *i; + + if (!HEAP_ALLOCED(p) || isGlobal(p)) return p; + + // ToDo: perhaps want to check what kind of object p is; for a + // simple value with no pointers we could just globalise + // it. + + // ToDo: inline part of allocateInGen, like alloc_for_copy + i = (StgInd*) allocateInGen(cap, global_gen_ix, sizeofW(StgInd)); + SET_HDR(i, stg_IND_LOCAL_tbl[cap->no], CCS_SYSTEM); + i->indirectee = p; + recordMutableCap(cap,(StgClosure*)i,global_gen_no); + return (StgClosure *)i; +} + +StgClosure * +publish_gen (Capability *cap, StgClosure *p, generation *gen) +{ + StgInd *i; + + // ToDo: inline part of allocateInGen, like alloc_for_copy + i = (StgInd*) allocateInGen(cap, gen->ix, sizeofW(StgInd)); + SET_HDR(i, stg_IND_LOCAL_tbl[cap->no], CCS_SYSTEM); + i->indirectee = p; + recordMutableCap(cap,(StgClosure*)i,gen->no); + return (StgClosure *)i; +} addfile ./rts/sm/Globalise.h hunk ./rts/sm/Globalise.h 1 +/* ----------------------------------------------------------------------------- + * + * (c) The GHC Team 1998-2010 + * + * Globalise data from the local heap to the global heap. + * + * Documentation on the architecture of the Garbage Collector can be + * found in the online commentary: + * + * http://hackage.haskell.org/trac/ghc/wiki/Commentary/Rts/Storage/GC + * + * ---------------------------------------------------------------------------*/ + +#ifndef SM_GLOBALISE_H +#define SM_GLOBALISE_H + +#include "BeginPrivate.h" + +StgClosure * publish (Capability *cap, StgClosure *p); +StgClosure * publish_gen (Capability *cap, StgClosure *p, generation *gen); + +void globalise (Capability *cap, StgClosure **root); +StgClosure * globalise_ (Capability *cap, StgClosure *root); +StgClosure * globaliseFull_(Capability *cap, StgClosure *root); +void globaliseWRT (Capability *cap, StgClosure *parent, + StgClosure **root); +void globaliseFullWRT (Capability *cap USED_IF_THREADS, + StgClosure *parent, StgClosure **root); + +void globalise_capability_mut_lists (Capability *cap); + +// for use when migrating a TSO +StgTSO * globalise_TSO (Capability *cap, StgTSO *tso); + +#include "EndPrivate.h" + +#endif /* SM_GLOBALISE_H */ hunk ./rts/sm/MBlock.c 262 markHeapUnalloced( (StgWord8*)addr + i * MBLOCK_SIZE ); } +#ifdef DEBUG + if (RtsFlags.DebugFlags.sanity) { + memset(addr, 0xdd, n * MBLOCK_SIZE); + } else +#endif osFreeMBlocks(addr, n); } hunk ./rts/sm/MarkStack.h 20 #include "BeginPrivate.h" INLINE_HEADER void -push_mark_stack(StgPtr p) +push_mark_stack(StgClosure *p) { bdescr *bd; hunk ./rts/sm/MarkStack.h 24 - *mark_sp++ = (StgWord)p; + *gct->mark_sp++ = (StgWord)p; hunk ./rts/sm/MarkStack.h 26 - if (((W_)mark_sp & BLOCK_MASK) == 0) + if (((W_)gct->mark_sp & BLOCK_MASK) == 0) { hunk ./rts/sm/MarkStack.h 28 - if (mark_stack_bd->u.back != NULL) + if (gct->mark_stack_bd->link != NULL) { hunk ./rts/sm/MarkStack.h 30 - mark_stack_bd = mark_stack_bd->u.back; + gct->mark_stack_bd = gct->mark_stack_bd->link; } else { hunk ./rts/sm/MarkStack.h 35 bd = allocBlock_sync(); - bd->link = mark_stack_bd; - bd->u.back = NULL; - mark_stack_bd->u.back = bd; // double-link the new block on - mark_stack_top_bd = bd; - mark_stack_bd = bd; + bd->u.back = gct->mark_stack_bd; + bd->link = NULL; + gct->mark_stack_bd->link = bd; // double-link the new block on + gct->mark_stack_bd = bd; } hunk ./rts/sm/MarkStack.h 40 - mark_sp = mark_stack_bd->start; + gct->mark_sp = gct->mark_stack_bd->start; } } hunk ./rts/sm/MarkStack.h 47 INLINE_HEADER StgPtr pop_mark_stack(void) { - if (((W_)mark_sp & BLOCK_MASK) == 0) + if (((W_)gct->mark_sp & BLOCK_MASK) == 0) { hunk ./rts/sm/MarkStack.h 49 - if (mark_stack_bd->link == NULL) + if (gct->mark_stack_bd->u.back == NULL) { return NULL; } hunk ./rts/sm/MarkStack.h 55 else { - mark_stack_bd = mark_stack_bd->link; - mark_sp = mark_stack_bd->start + BLOCK_SIZE_W; + gct->mark_stack_bd = gct->mark_stack_bd->u.back; + gct->mark_sp = gct->mark_stack_bd->start + BLOCK_SIZE_W; } } hunk ./rts/sm/MarkStack.h 59 - return (StgPtr)*--mark_sp; + return (StgPtr)*--gct->mark_sp; } INLINE_HEADER rtsBool hunk ./rts/sm/MarkStack.h 65 mark_stack_empty(void) { - return (((W_)mark_sp & BLOCK_MASK) == 0 && mark_stack_bd->link == NULL); + return (((W_)gct->mark_sp & BLOCK_MASK) == 0 && gct->mark_stack_bd->u.back == NULL); } #include "EndPrivate.h" hunk ./rts/sm/MarkWeak.c 3 /* ----------------------------------------------------------------------------- * - * (c) The GHC Team 1998-2008 + * (c) The GHC Team 1998-2010 * * Weak pointers and weak-like things in the GC * hunk ./rts/sm/MarkWeak.c 20 #include "MarkWeak.h" #include "GC.h" #include "GCThread.h" +#include "GCTDecl.h" +#include "GCUtils.h" #include "Evac.h" #include "Trace.h" #include "Schedule.h" hunk ./rts/sm/MarkWeak.c 73 -------------------------------------------------------------------------- */ -/* Which stage of processing various kinds of weak pointer are we at? - * (see traverse_weak_ptr_list() below for discussion). - */ -typedef enum { WeakPtrs, WeakThreads, WeakDone } WeakStage; -static WeakStage weak_stage; - -/* Weak pointers - */ -StgWeak *old_weak_ptr_list; // also pending finaliser list - -// List of threads found to be unreachable -StgTSO *resurrected_threads; - static void resurrectUnreachableThreads (generation *gen); static rtsBool tidyThreadList (generation *gen); hunk ./rts/sm/MarkWeak.c 79 void initWeakForGC(void) { - old_weak_ptr_list = weak_ptr_list; - weak_ptr_list = NULL; - weak_stage = WeakPtrs; - resurrected_threads = END_TSO_QUEUE; + gct->old_weak_ptrs = NULL; + gct->weak_stage = WeakPtrs; + gct->resurrected_threads = END_TSO_QUEUE; } rtsBool hunk ./rts/sm/MarkWeak.c 88 traverseWeakPtrList(void) { StgWeak *w, **last_w, *next_w; - StgClosure *new; rtsBool flag = rtsFalse; hunk ./rts/sm/MarkWeak.c 89 + generation *gen; + bdescr *bd; const StgInfoTable *info; hunk ./rts/sm/MarkWeak.c 92 + StgClosure *new; + nat g; hunk ./rts/sm/MarkWeak.c 95 - switch (weak_stage) { + switch (gct->weak_stage) { case WeakDone: return rtsFalse; hunk ./rts/sm/MarkWeak.c 101 case WeakPtrs: - /* doesn't matter where we evacuate values/finalizers to, since - * these pointers are treated as roots (iff the keys are alive). - */ - gct->evac_gen = 0; - - last_w = &old_weak_ptr_list; - for (w = old_weak_ptr_list; w != NULL; w = next_w) { - - /* There might be a DEAD_WEAK on the list if finalizeWeak# was - * called on a live weak pointer object. Just remove it. - */ - if (w->header.info == &stg_DEAD_WEAK_info) { - next_w = ((StgDeadWeak *)w)->link; - *last_w = next_w; - continue; - } - - info = get_itbl(w); - switch (info->type) { - - case WEAK: - /* Now, check whether the key is reachable. - */ - new = isAlive(w->key); - if (new != NULL) { - w->key = new; - // evacuate the value and finalizer - evacuate(&w->value); - evacuate(&w->finalizer); - // remove this weak ptr from the old_weak_ptr list - *last_w = w->link; - // and put it on the new weak ptr list - next_w = w->link; - w->link = weak_ptr_list; - weak_ptr_list = w; - flag = rtsTrue; hunk ./rts/sm/MarkWeak.c 102 - debugTrace(DEBUG_weak, - "weak pointer still alive at %p -> %p", - w, w->key); - continue; - } - else { - last_w = &(w->link); - next_w = w->link; - continue; - } + last_w = &gct->old_weak_ptrs; + for (w = gct->old_weak_ptrs; w != NULL; w = next_w) { + + /* There might be a DEAD_WEAK on the list if finalizeWeak# was + * called on a live weak pointer object. Just remove it. + */ + if (w->header.info == &stg_DEAD_WEAK_info) { + next_w = ((StgDeadWeak *)w)->link; + *last_w = next_w; + continue; + } + + info = get_itbl(w); + switch (info->type) { + + case WEAK: + /* Now, check whether the key is reachable. + */ + new = isAlive(w->key); + if (new != NULL) { + w->key = new; hunk ./rts/sm/MarkWeak.c 124 - default: - barf("traverseWeakPtrList: not WEAK"); - } + bd = Bdescr((P_)w); + gct->evac_gen_ix = bd->gen_ix; + + // evacuate the value and finalizer + evacuate(&w->value); + evacuate(&w->finalizer); + // remove this weak ptr from the old_weak_ptrs list + *last_w = w->link; + // and put it on the new weak ptr list + next_w = w->link; + w->link = bd->gen->weak_ptrs; + bd->gen->weak_ptrs = w; + flag = rtsTrue; + + debugTrace(DEBUG_weak, + "weak pointer still alive at %p -> %p", + w, w->key); + + if (gct->failed_to_evac || + Bdescr((P_)w->key)->gen_ix < bd->gen_ix) { + gct->failed_to_evac = rtsFalse; + recordMutableGen_GC((StgClosure *)w,bd->gen_no); + } + + continue; + } + else { + last_w = &(w->link); + next_w = w->link; + continue; + } + + default: + barf("traverseWeakPtrListGen: not WEAK"); + } } hunk ./rts/sm/MarkWeak.c 160 - + /* If we didn't make any changes, then we can go round and kill all * the dead weak pointers. The old_weak_ptr list is used as a list * of pending finalizers later on. hunk ./rts/sm/MarkWeak.c 166 */ if (flag == rtsFalse) { - for (w = old_weak_ptr_list; w; w = w->link) { - evacuate(&w->finalizer); - } + for (w = gct->old_weak_ptrs; w; w = w->link) { + bd = Bdescr((P_)w); + gct->evac_gen_ix = bd->gen_ix; + evacuate(&w->finalizer); + if (gct->failed_to_evac) { + gct->failed_to_evac = rtsFalse; + recordMutableGen_GC((StgClosure *)w,Bdescr((P_)w)->gen_no); + } hunk ./rts/sm/MarkWeak.c 175 - // Next, move to the WeakThreads stage after fully - // scavenging the finalizers we've just evacuated. - weak_stage = WeakThreads; +#ifdef PROFILING + // A weak pointer is inherently used, so we do not need to call + // LDV_recordDead(). + // + // Furthermore, when PROFILING is turned on, dead weak + // pointers are exactly as large as weak pointers, so there is + // no need to fill the slop, either. See stg_DEAD_WEAK_info + // in StgMiscClosures.hc. +#endif + SET_INFO(w, &stg_DEAD_WEAK_info); + } + + // Next, move to the WeakThreads stage after fully + // scavenging the finalizers we've just evacuated. + gct->weak_stage = WeakThreads; } return rtsTrue; hunk ./rts/sm/MarkWeak.c 195 case WeakThreads: - /* Now deal with the step->threads lists, which behave somewhat like + /* Now deal with the gen->threads lists, which behave somewhat like * the weak ptr list. If we discover any threads that are about to * become garbage, we wake them up and administer an exception. hunk ./rts/sm/MarkWeak.c 198 + * + * We can't do in the same phase as WeakPtrs, because we want + * the property that a reachable finalizer can keep a thread + * alive (see test conc031). */ { hunk ./rts/sm/MarkWeak.c 204 - nat g; // Traverse thread lists for generations we collected... hunk ./rts/sm/MarkWeak.c 206 -// ToDo when we have one gen per capability: -// for (n = 0; n < n_capabilities; n++) { -// if (tidyThreadList(&nurseries[n])) { -// flag = rtsTrue; -// } -// } - for (g = 0; g <= N; g++) { - if (tidyThreadList(&generations[g])) { + for (g = 0; g < total_generations; g++) { + gen = &all_generations[g]; + if (gen->no > gct->collect_gen) break; + if (gct->gc_type == GC_LOCAL && isNonLocalGen(gen)) continue; + if (tidyThreadList(gen)) { flag = rtsTrue; } } hunk ./rts/sm/MarkWeak.c 214 - + /* If we evacuated any threads, we need to go back to the scavenger. */ if (flag) return rtsTrue; hunk ./rts/sm/MarkWeak.c 221 /* And resurrect any threads which were about to become garbage. */ - { - nat g; - for (g = 0; g <= N; g++) { - resurrectUnreachableThreads(&generations[g]); - } + for (g = 0; g < total_generations; g++) { + gen = &all_generations[g]; + if (gen->no > gct->collect_gen) break; + if (gct->gc_type == GC_LOCAL && isNonLocalGen(gen)) continue; + resurrectUnreachableThreads(gen); } hunk ./rts/sm/MarkWeak.c 227 - - weak_stage = WeakDone; // *now* we're done, + + gct->weak_stage = WeakDone; // *now* we're done, return rtsTrue; // but one more round of scavenging, please } hunk ./rts/sm/MarkWeak.c 238 } } - static void resurrectUnreachableThreads (generation *gen) +static void resurrectUnreachableThreads (generation *gen) { StgTSO *t, *tmp, *next; hunk ./rts/sm/MarkWeak.c 250 // become garbage, because they might get // pending exceptions. switch (t->what_next) { + case ThreadKilled: case ThreadComplete: continue; hunk ./rts/sm/MarkWeak.c 254 + default: tmp = t; evacuate((StgClosure **)&tmp); hunk ./rts/sm/MarkWeak.c 258 - tmp->global_link = resurrected_threads; - resurrected_threads = tmp; + + if (gct->gc_type == GC_LOCAL) { + // We don't currently resurrect threads with + // exceptions in GC_LOCAL. It's hard to get right, + // becuase we would have to do real throwTo rather + // than throwToSingleThhreaded. + generation *gen = Bdescr((P_)tmp)->gen; + tmp->global_link = gen->threads; + gen->threads = tmp; + } else { + tmp->global_link = gct->resurrected_threads; + gct->resurrected_threads = tmp; + } } } } hunk ./rts/sm/MarkWeak.c 275 + static rtsBool tidyThreadList (generation *gen) { StgTSO *t, *tmp, *next, **prev; hunk ./rts/sm/MarkWeak.c 285 for (t = gen->old_threads; t != END_TSO_QUEUE; t = next) { - tmp = (StgTSO *)isAlive((StgClosure *)t); - - if (tmp != NULL) { - t = tmp; + if (gct->gc_type == GC_LOCAL) { + tmp = t; + evacuate((StgClosure **)&tmp); + } else { + tmp = (StgTSO *)isAlive((StgClosure *)t); } hunk ./rts/sm/MarkWeak.c 292 - ASSERT(get_itbl(t)->type == TSO); + if (tmp != NULL) { + t = tmp; + } + next = t->global_link; hunk ./rts/sm/MarkWeak.c 298 + ASSERT(get_itbl(t)->type == TSO); + // if the thread is not masking exceptions but there are // pending exceptions on its queue, then something has gone // wrong. However, pending exceptions are OK if there is an hunk ./rts/sm/MarkWeak.c 330 } /* ----------------------------------------------------------------------------- - Evacuate every weak pointer object on the weak_ptr_list, and update - the link fields. + Every weak pointer object (WEAK) is treated as implicitly alive to + begin with. Only when w->key is found to be unreachable does the + WEAK turn into a DEAD_WEAK which may be subsequently GC'd. hunk ./rts/sm/MarkWeak.c 334 - ToDo: with a lot of weak pointers, this will be expensive. We - should have a per-GC weak pointer list, just like threads. - -------------------------------------------------------------------------- */ + All WEAK objects are chained together by their w->link fields. + Since we're doing generational GC, it makes sense to divide this + list per-generation, so that we don't have to traverse the weak + pointers of old generations when collecting young generations, so + each generation has its own gen->weak_ptrs list. + + In markWeakPtr list we traverse the gen->weak_ptrs lists of + generations we are collecting, and evacuate each one. The + resulting weak pointer objects are chained together on a temporary + list gct->old_weak_ptrs, which we will traverse again (possibly + multiple times) in traverseWeakPtrList(). Eventually WEAK objects + will be placed on the appropriate gen->weak_ptrs list of the + generation that they now belong to. + -------------------------------------------------------------------------- */ void markWeakPtrList ( void ) hunk ./rts/sm/MarkWeak.c 352 { - StgWeak *w, **last_w; - - last_w = &weak_ptr_list; - for (w = weak_ptr_list; w; w = w->link) { - // w might be WEAK, EVACUATED, or DEAD_WEAK (actually CON_STATIC) here + StgWeak *w, *next; + generation *gen; + nat g; hunk ./rts/sm/MarkWeak.c 356 + for (g = 0; g < total_generations; g++) { + gen = &all_generations[g]; + if (gen->no > gct->collect_gen) break; + if (gct->gc_type == GC_LOCAL && isNonLocalGen(gen)) continue; + + for (w = gen->weak_ptrs; w; w = next) { + // w might be WEAK, a forwarding ptr, or DEAD_WEAK (actually + // CONSTR_STATIC) here + #ifdef DEBUG hunk ./rts/sm/MarkWeak.c 366 - { // careful to do this assertion only reading the info ptr - // once, because during parallel GC it might change under our feet. - const StgInfoTable *info; - info = w->header.info; - ASSERT(IS_FORWARDING_PTR(info) - || info == &stg_DEAD_WEAK_info - || INFO_PTR_TO_STRUCT(info)->type == WEAK); - } + { // careful to do this assertion only reading the info ptr + // once, because during parallel GC it might change under our feet. + const StgInfoTable *info; + info = w->header.info; + ASSERT(IS_FORWARDING_PTR(info) + || info == &stg_DEAD_WEAK_info + || INFO_PTR_TO_STRUCT(info)->type == WEAK); + } #endif hunk ./rts/sm/MarkWeak.c 375 + + evacuate((StgClosure**)&w); hunk ./rts/sm/MarkWeak.c 378 - evacuate((StgClosure **)last_w); - w = *last_w; - if (w->header.info == &stg_DEAD_WEAK_info) { - last_w = &(((StgDeadWeak*)w)->link); - } else { - last_w = &(w->link); - } - } -} + next = w->link; + // We drop DEAD_WEAK objects from the list here. + if (w->header.info != &stg_DEAD_WEAK_info) { + w->link = gct->old_weak_ptrs; + gct->old_weak_ptrs = w; + } + } hunk ./rts/sm/MarkWeak.c 386 + gen->weak_ptrs = NULL; + // we'll re-populate these lists later in traverseWeakPtrList() + } +} hunk ./rts/sm/MarkWeak.h 19 #include "BeginPrivate.h" -extern StgWeak *old_weak_ptr_list; -extern StgTSO *resurrected_threads; -extern StgTSO *exception_threads; - void initWeakForGC ( void ); rtsBool traverseWeakPtrList ( void ); void markWeakPtrList ( void ); hunk ./rts/sm/Sanity.c 7 * * Sanity checking code for the heap and stack. * - * Used when debugging: check that everything reasonable. + * Used when debugging: check that everything satisfies the invariants. * * - All things that are supposed to be pointers look like pointers. * hunk ./rts/sm/Sanity.c 21 #ifdef DEBUG /* whole file */ +#include "Storage.h" +#include "BlockAlloc.h" +#include "GCThread.h" + #include "RtsUtils.h" hunk ./rts/sm/Sanity.c 26 -#include "sm/Storage.h" -#include "sm/BlockAlloc.h" #include "Sanity.h" #include "Schedule.h" #include "Apply.h" hunk ./rts/sm/Sanity.c 88 checkClosureShallow( StgClosure* p ) { StgClosure *q; + const StgInfoTable *info; q = UNTAG_CLOSURE(p); hunk ./rts/sm/Sanity.c 91 + + // forwarding pointers are allowed in the local heap + info = q->header.info; + if (IS_FORWARDING_PTR(info)) { + q = (StgClosure*)UN_FORWARDING_PTR(info); + } + ASSERT(LOOKS_LIKE_CLOSURE_PTR(q)); /* Is it a static closure? */ hunk ./rts/sm/Sanity.c 179 { StgFunInfoTable *fun_info; StgRetFun *ret_fun; + StgClosure *fun; ret_fun = (StgRetFun *)c; hunk ./rts/sm/Sanity.c 182 - fun_info = get_fun_itbl(UNTAG_CLOSURE(ret_fun->fun)); + fun = UNTAG_CLOSURE(ret_fun->fun); + if (IS_FORWARDING_PTR(fun->header.info)) { + fun = (StgClosure*)UN_FORWARDING_PTR(fun->header.info); + } + fun_info = get_fun_itbl(UNTAG_CLOSURE(fun)); size = ret_fun->size; switch (fun_info->f.fun_type) { case ARG_GEN: hunk ./rts/sm/Sanity.c 275 { const StgInfoTable *info; - ASSERT(LOOKS_LIKE_CLOSURE_PTR(p)); - p = UNTAG_CLOSURE(p); /* Is it a static closure (i.e. in the data segment)? */ hunk ./rts/sm/Sanity.c 277 - if (!HEAP_ALLOCED(p)) { - ASSERT(closure_STATIC(p)); - } else { - ASSERT(!closure_STATIC(p)); - } - info = p->header.info; if (IS_FORWARDING_PTR(info)) { hunk ./rts/sm/Sanity.c 280 - barf("checkClosure: found EVACUATED closure %d", info->type); + return checkClosure((StgClosure *)UN_FORWARDING_PTR(info)); } info = INFO_PTR_TO_STRUCT(info); hunk ./rts/sm/Sanity.c 284 + ASSERT(HEAP_ALLOCED(p) || closure_STATIC(p)); + ASSERT(LOOKS_LIKE_CLOSURE_PTR(p)); + switch (info->type) { case MVAR_CLEAN: hunk ./rts/sm/Sanity.c 326 case CONSTR_0_2: case CONSTR_2_0: case IND_PERM: + case IND_LOCAL: case BLACKHOLE: case PRIM: case MUT_PRIM: hunk ./rts/sm/Sanity.c 330 - case MUT_VAR_CLEAN: - case MUT_VAR_DIRTY: + case MUT_VAR_LOCAL: case CONSTR_STATIC: case CONSTR_NOCAF_STATIC: case THUNK_STATIC: hunk ./rts/sm/Sanity.c 354 ASSERT(get_itbl(bq->owner)->type == TSO); ASSERT(bq->queue == (MessageBlackHole*)END_TSO_QUEUE || bq->queue->header.info == &stg_MSG_BLACKHOLE_info); - ASSERT(bq->link == (StgBlockingQueue*)END_TSO_QUEUE || + ASSERT(bq->link == (StgBlockingQueue*)END_TSO_QUEUE || get_itbl(bq->link)->type == IND || get_itbl(bq->link)->type == BLOCKING_QUEUE); hunk ./rts/sm/Sanity.c 388 } case THUNK_SELECTOR: - ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgSelector *)p)->selectee)); - return THUNK_SELECTOR_sizeW(); + ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgSelector *)p)->selectee)); + return THUNK_SELECTOR_sizeW(); case IND: { hunk ./rts/sm/Sanity.c 401 return sizeofW(StgInd); } - case RET_BCO: - case RET_SMALL: - case RET_BIG: - case RET_DYN: - case UPDATE_FRAME: - case UNDERFLOW_FRAME: - case STOP_FRAME: - case CATCH_FRAME: - case ATOMICALLY_FRAME: - case CATCH_RETRY_FRAME: - case CATCH_STM_FRAME: - barf("checkClosure: stack frame"); - case AP: { StgAP* ap = (StgAP *)p; hunk ./rts/sm/Sanity.c 424 } case ARR_WORDS: - return arr_words_sizeW((StgArrWords *)p); + return arr_words_sizeW((StgArrWords *)p); hunk ./rts/sm/Sanity.c 426 - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: case MUT_ARR_PTRS_FROZEN: case MUT_ARR_PTRS_FROZEN0: { hunk ./rts/sm/Sanity.c 460 } default: - barf("checkClosure (closure type %d)", info->type); + barf("checkClosure (closure type %d)", info->type); + } +} + +static void +checkGlobalPtr (StgClosure *p) +{ + if (HEAP_ALLOCED(p)) { + ASSERT(isGlobal(p)); + ASSERT(LOOKS_LIKE_CLOSURE_PTR(p)); } } hunk ./rts/sm/Sanity.c 473 +// Check that an object only points to global heap, unless it is +// IND_LOCAL or TSO. We use this to test that the global heap +// invariant is satisfied. +StgOffset +checkGlobalClosure (StgClosure* p) +{ + const StgInfoTable *info; + + ASSERT(LOOKS_LIKE_CLOSURE_PTR(p)); + + p = UNTAG_CLOSURE(p); + + info = p->header.info; + + if (IS_FORWARDING_PTR(info)) { + barf("checkGlobalClosure: found EVACUATED closure %d", info->type); + } + info = INFO_PTR_TO_STRUCT(info); + + switch (info->type) { + + case MVAR_CLEAN: + case MVAR_DIRTY: + { + StgMVar *mvar = (StgMVar *)p; + checkGlobalPtr((StgClosure *)mvar->head); + checkGlobalPtr((StgClosure *)mvar->tail); + checkGlobalPtr((StgClosure *)mvar->value); + return sizeofW(StgMVar); + } + + case THUNK: + case THUNK_1_0: + case THUNK_0_1: + case THUNK_1_1: + case THUNK_0_2: + case THUNK_2_0: + { + nat i; + for (i = 0; i < info->layout.payload.ptrs; i++) { + checkGlobalPtr(((StgThunk *)p)->payload[i]); + } + return thunk_sizeW_fromITBL(info); + } + + case MUT_PRIM: + if (p->header.info == &stg_TREC_HEADER_info) { + // TREC_HEADERs are private and are allowed to point to + // the local heap. + return checkClosure(p); + } // else fall through... + case FUN: + case FUN_1_0: + case FUN_0_1: + case FUN_1_1: + case FUN_0_2: + case FUN_2_0: + case CONSTR: + case CONSTR_1_0: + case CONSTR_0_1: + case CONSTR_1_1: + case CONSTR_0_2: + case CONSTR_2_0: + case IND_PERM: + case IND_STATIC: + case BLACKHOLE: + case MUT_VAR_GLOBAL: + case CONSTR_STATIC: + case CONSTR_NOCAF_STATIC: + case THUNK_STATIC: + case FUN_STATIC: + case PRIM: + { + nat i; + for (i = 0; i < info->layout.payload.ptrs; i++) { + checkGlobalPtr(p->payload[i]); + } + return sizeW_fromITBL(info); + } + + case IND_LOCAL: + { + StgClosure *q = ((StgInd*)p)->indirectee; + ASSERT(LOOKS_LIKE_CLOSURE_PTR(q)); + // the indirectee should live in the correct local heap + ASSERT(Bdescr((P_)q)->gen_ix == get_itbl(p)->srt_bitmap); + return sizeofW(StgInd); + } + + case BLOCKING_QUEUE: + { + StgBlockingQueue *bq = (StgBlockingQueue *)p; + + // NO: the BH might have been updated now + // ASSERT(get_itbl(bq->bh)->type == BLACKHOLE); + + // NO: we allow global->local BH pointers + // checkGlobalPtr((StgClosure*)bq->bh); + ASSERT(LOOKS_LIKE_CLOSURE_PTR(((StgClosure*)bq->bh))); + + ASSERT(get_itbl(bq->owner)->type == TSO); + ASSERT(bq->queue == (MessageBlackHole*)END_TSO_QUEUE + || bq->queue->header.info == &stg_MSG_BLACKHOLE_info + || bq->queue->header.info == &stg_MSG_GLOBALISE_info); + ASSERT(bq->link == (StgBlockingQueue*)END_TSO_QUEUE || + get_itbl(bq->link)->type == IND || + get_itbl(bq->link)->type == BLOCKING_QUEUE); + + return sizeofW(StgBlockingQueue); + } + + case BCO: { + StgBCO *bco = (StgBCO *)p; + checkGlobalPtr((StgClosure *)bco->instrs); + checkGlobalPtr((StgClosure *)bco->literals); + checkGlobalPtr((StgClosure *)bco->ptrs); + return bco_sizeW(bco); + } + + case WEAK: + /* deal with these specially - the info table isn't + * representative of the actual layout. + */ + { StgWeak *w = (StgWeak *)p; + checkGlobalPtr(w->key); + checkGlobalPtr(w->value); + checkGlobalPtr(w->finalizer); + if (w->link) { + checkGlobalPtr((StgClosure *)w->link); + } + return sizeW_fromITBL(info); + } + + case THUNK_SELECTOR: + checkGlobalPtr(((StgSelector *)p)->selectee); + return THUNK_SELECTOR_sizeW(); + + case IND: + { + /* we don't expect to see any of these after GC + * but they might appear during execution + */ + StgInd *ind = (StgInd *)p; + checkGlobalPtr(ind->indirectee); + return sizeofW(StgInd); + } + + case AP: + { + StgAP* ap = (StgAP *)p; + checkPAP (ap->fun, ap->payload, ap->n_args); + return ap_sizeW(ap); + } + + case PAP: + { + StgPAP* pap = (StgPAP *)p; + checkPAP (pap->fun, pap->payload, pap->n_args); + return pap_sizeW(pap); + } + + case AP_STACK: + { + StgAP_STACK *ap = (StgAP_STACK *)p; + checkGlobalPtr(ap->fun); + checkStackChunk((StgPtr)ap->payload, (StgPtr)ap->payload + ap->size); + return ap_stack_sizeW(ap); + } + + case ARR_WORDS: + return arr_words_sizeW((StgArrWords *)p); + + case MUT_ARR_PTRS_GLOBAL: + case MUT_ARR_PTRS_FROZEN: + case MUT_ARR_PTRS_FROZEN0: + { + StgMutArrPtrs* a = (StgMutArrPtrs *)p; + nat i; + for (i = 0; i < a->ptrs; i++) { + // checkGlobalPtr(a->payload[i]); + // XXX: mutable arrays can have local ptrs at the + // moment, see globalise_scavenge_large(). + ASSERT(LOOKS_LIKE_CLOSURE_PTR(a->payload[i])); + } + return mut_arr_ptrs_sizeW(a); + } + + case TSO: + checkTSO((StgTSO *)p); + return sizeofW(StgTSO); + + case STACK: + checkSTACK((StgStack*)p); + return stack_sizeW((StgStack*)p); + + case TREC_CHUNK: + // these are allowed to point to the local heap, so we don't + // use checkGlobalPtr(). + { + nat i; + StgTRecChunk *tc = (StgTRecChunk *)p; + ASSERT(LOOKS_LIKE_CLOSURE_PTR((StgClosure *)tc->prev_chunk)); + for (i = 0; i < tc -> next_entry_idx; i ++) { + ASSERT(LOOKS_LIKE_CLOSURE_PTR((StgClosure *)tc->entries[i].tvar)); + ASSERT(LOOKS_LIKE_CLOSURE_PTR(tc->entries[i].expected_value)); + ASSERT(LOOKS_LIKE_CLOSURE_PTR(tc->entries[i].new_value)); + } + return sizeofW(StgTRecChunk); + } + + default: + barf("checkGlobalClosure (closure type %d)", info->type); + } +} /* ----------------------------------------------------------------------------- Check Heap Sanity hunk ./rts/sm/Sanity.c 697 all the objects in the remainder of the chain. -------------------------------------------------------------------------- */ -void -checkHeap(bdescr *bd) +void checkHeapChain (bdescr *bd) { StgPtr p; hunk ./rts/sm/Sanity.c 701 -#if defined(THREADED_RTS) - // heap sanity checking doesn't work with SMP, because we can't - // zero the slop (see Updates.h). - return; -#endif - for (; bd != NULL; bd = bd->link) { if(!(bd->flags & BF_SWEPT)) { p = bd->start; hunk ./rts/sm/Sanity.c 718 } } +void checkGlobalHeapChain (bdescr *bd) +{ + StgPtr p; + + for (; bd != NULL; bd = bd->link) { + p = bd->start; + while (p < bd->free) { + nat size = checkGlobalClosure((StgClosure *)p); + /* This is the smallest size of closure that can live in the heap */ + ASSERT( size >= MIN_PAYLOAD_SIZE + sizeofW(StgHeader) ); + p += size; + + /* skip over slop */ + while (p < bd->free && + (*p < 0x1000 || !LOOKS_LIKE_INFO_PTR(*p))) { p++; } + } + } +} + +void checkPrimHeapChain (bdescr *bd) +{ + StgPtr p; + StgWord flag; + + for (; bd != NULL; bd = bd->link) { + p = bd->start; + while (p < bd->free) { + flag = *p; + p += 1; + switch (flag) { + case 0: // not global + p += checkClosure((StgClosure*)p); + break; + case 1: // global + p += checkGlobalClosure((StgClosure*)p); + break; + default: // reclaimed free space of size 'flag - 2' + p += flag - 2; + break; + } + } + } +} + void checkHeapChunk(StgPtr start, StgPtr end) { hunk ./rts/sm/Sanity.c 803 checkTSO(StgTSO *tso) { if (tso->what_next == ThreadKilled) { - /* The garbage collector doesn't bother following any pointers - * from dead threads, so don't check sanity here. - */ - return; + /* The garbage collector doesn't bother following any pointers + * from dead threads, so don't check sanity here. + */ + return; } ASSERT(tso->_link == END_TSO_QUEUE || hunk ./rts/sm/Sanity.c 813 tso->_link->header.info == &stg_MVAR_TSO_QUEUE_info || tso->_link->header.info == &stg_TSO_info); - if ( tso->why_blocked == BlockedOnMVar - || tso->why_blocked == BlockedOnBlackHole - || tso->why_blocked == BlockedOnMsgThrowTo - || tso->why_blocked == NotBlocked - ) { + switch (tso->why_blocked) { + case NotBlocked: + case BlockedOnDelay: + case BlockedOnWrite: + case BlockedOnRead: + break; + default: ASSERT(LOOKS_LIKE_CLOSURE_PTR(tso->block_info.closure)); } hunk ./rts/sm/Sanity.c 841 StgTSO *tso; nat g; - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - for (tso=generations[g].threads; tso != END_TSO_QUEUE; + for (g = 0; g < total_generations; g++) { + for (tso = all_generations[g].threads; tso != END_TSO_QUEUE; tso = tso->global_link) { ASSERT(LOOKS_LIKE_CLOSURE_PTR(tso)); ASSERT(get_itbl(tso)->type == TSO); hunk ./rts/sm/Sanity.c 853 // be on the mutable list. if (tso->dirty) { ASSERT(Bdescr((P_)tso)->gen_no == 0 || (tso->flags & TSO_MARKED)); + tso->flags &= ~TSO_MARKED; + } + + { + StgStack *stack; + StgUnderflowFrame *frame; + + stack = tso->stackobj; + while (1) { + if (stack->dirty & 1) { + ASSERT(Bdescr((P_)stack)->gen_no == 0 || (stack->dirty & TSO_MARKED)); + stack->dirty &= ~TSO_MARKED; + } + frame = (StgUnderflowFrame*) (stack->stack + stack->stack_size + - sizeofW(StgUnderflowFrame)); + if (frame->info != &stg_stack_underflow_frame_info + || frame->next_chunk == (StgStack*)END_TSO_QUEUE) break; + stack = frame->next_chunk; + } } hunk ./rts/sm/Sanity.c 873 - tso->flags &= ~TSO_MARKED; } } } hunk ./rts/sm/Sanity.c 881 Check mutable list sanity. -------------------------------------------------------------------------- */ -void -checkMutableList( bdescr *mut_bd, nat gen ) +static void +checkMutableList (bdescr *mut_bd, nat gen, nat cap_no) { hunk ./rts/sm/Sanity.c 884 - bdescr *bd; + bdescr *bd, *pbd; StgPtr q; StgClosure *p; hunk ./rts/sm/Sanity.c 891 for (bd = mut_bd; bd != NULL; bd = bd->link) { for (q = bd->start; q < bd->free; q++) { p = (StgClosure *)*q; - ASSERT(!HEAP_ALLOCED(p) || Bdescr((P_)p)->gen_no == gen); - if (get_itbl(p)->type == TSO) { + pbd = Bdescr((P_)p); + ASSERT(!HEAP_ALLOCED(p) || pbd->gen_no == gen || + ((pbd->flags & BF_PRIM) && isGlobalPrim(p))); + checkGlobalClosure(p); + + switch (get_itbl(p)->type) { + case TSO: + // TSOs on the mutable list must belong to this capability + ASSERT(((StgTSO*)p)->cap->no == cap_no); ((StgTSO *)p)->flags |= TSO_MARKED; hunk ./rts/sm/Sanity.c 901 + break; + case STACK: + ASSERT(((StgStack*)p)->tso->cap->no == cap_no); + ((StgStack *)p)->dirty |= TSO_MARKED; + break; } hunk ./rts/sm/Sanity.c 907 - } + } } } hunk ./rts/sm/Sanity.c 911 -void -checkMutableLists (rtsBool checkTSOs) +static void +checkLocalMutableLists (nat cap_no) { hunk ./rts/sm/Sanity.c 914 - nat g, i; + nat g; + for (g = 1; g < RtsFlags.GcFlags.generations; g++) { + checkMutableList(capabilities[cap_no].mut_lists[g], g, cap_no); + } +} hunk ./rts/sm/Sanity.c 920 - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - checkMutableList(generations[g].mut_list, g); - for (i = 0; i < n_capabilities; i++) { - checkMutableList(capabilities[i].mut_lists[g], g); - } +static void +checkMutableLists (void) +{ + nat i; + for (i = 0; i < n_capabilities; i++) { + checkLocalMutableLists(i); } hunk ./rts/sm/Sanity.c 927 - checkGlobalTSOList(checkTSOs); } /* hunk ./rts/sm/Sanity.c 973 /* Nursery sanity check */ void -checkNurserySanity (nursery *nursery) +checkNurserySanity (nat cap_no) { bdescr *bd, *prev; nat blocks = 0; hunk ./rts/sm/Sanity.c 977 + nursery *nursery; + + nursery = &nurseries[cap_no]; prev = NULL; for (bd = nursery->blocks; bd != NULL; bd = bd->link) { hunk ./rts/sm/Sanity.c 983 + ASSERT(bd->gen == &all_generations[cap_no]); + ASSERT(bd->gen_ix == cap_no); ASSERT(bd->u.back == prev); prev = bd; blocks += bd->blocks; hunk ./rts/sm/Sanity.c 994 } +static void checkGeneration (generation *gen, + rtsBool after_major_gc USED_IF_THREADS) +{ + nat n; + gen_workspace *ws; + + ASSERT(countBlocks(gen->blocks) == gen->n_blocks); + ASSERT(countBlocks(gen->prim_blocks) == gen->n_prim_blocks); + ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks); + +#if defined(THREADED_RTS) + // heap sanity checking doesn't work with SMP, because we can't + // zero the slop (see Updates.h). However, we can sanity-check + // the heap after a major gc, because there is no slop. + if (!after_major_gc) return; +#endif + + if (gen->is_local) { + checkHeapChain(gen->blocks); + checkPrimHeapChain(gen->prim_blocks); + } else { + checkGlobalHeapChain(gen->blocks); + ASSERT(gen->prim_blocks == NULL); + } + + for (n = 0; n < n_capabilities; n++) { + ws = &gc_threads[n]->gens[gen->ix]; + if (gen->is_local) { + checkHeapChain(ws->todo_bd); + checkHeapChain(ws->part_list); + checkHeapChain(ws->scavd_list); + } else { + checkGlobalHeapChain(ws->todo_bd); + checkGlobalHeapChain(ws->part_list); + checkGlobalHeapChain(ws->scavd_list); + } + } + + checkLargeObjects(gen->large_objects); +} + +/* Local heap sanity check. */ +static void checkLocalHeap (nat cap_no) +{ + nat g; + generation *gen; + + for (g = 0; g < total_generations; g++) { + gen = &all_generations[g]; + if (gen->is_local && gen->cap == cap_no) { + checkGeneration(gen, rtsFalse); + } + } + checkNurserySanity(cap_no); +} + /* Full heap sanity check. */ hunk ./rts/sm/Sanity.c 1051 -void -checkSanity( rtsBool check_heap ) +static void checkFullHeap (rtsBool after_major_gc) { nat g, n; hunk ./rts/sm/Sanity.c 1055 - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - ASSERT(countBlocks(generations[g].blocks) - == generations[g].n_blocks); - ASSERT(countBlocks(generations[g].large_objects) - == generations[g].n_large_blocks); - if (check_heap) { - checkHeap(generations[g].blocks); - } - checkLargeObjects(generations[g].large_objects); + for (g = 0; g < total_generations; g++) { + checkGeneration(&all_generations[g], after_major_gc); } hunk ./rts/sm/Sanity.c 1058 - for (n = 0; n < n_capabilities; n++) { hunk ./rts/sm/Sanity.c 1059 - checkNurserySanity(&nurseries[n]); + checkNurserySanity(n); } hunk ./rts/sm/Sanity.c 1061 - - checkFreeListSanity(); +} + +void checkSanity(rtsBool local_only, rtsBool after_gc, rtsBool major_gc, + nat cap_no) +{ + if (local_only) { + checkLocalHeap(cap_no); + } else { + checkFullHeap(after_gc && major_gc); + } + + if (!local_only) checkFreeListSanity(); hunk ./rts/sm/Sanity.c 1074 -#if defined(THREADED_RTS) // always check the stacks in threaded mode, because checkHeap() // does nothing in this case. hunk ./rts/sm/Sanity.c 1076 - checkMutableLists(rtsTrue); -#else - if (check_heap) { - checkMutableLists(rtsFalse); - } else { - checkMutableLists(rtsTrue); + if (after_gc) { + if (local_only) { + checkLocalMutableLists(cap_no); + } else { + checkMutableLists(); + checkGlobalTSOList(rtsTrue); + } } hunk ./rts/sm/Sanity.c 1084 -#endif } // If memInventory() calculates that we have a memory leak, this hunk ./rts/sm/Sanity.c 1096 static void findMemoryLeak (void) { - nat g, i; - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - for (i = 0; i < n_capabilities; i++) { - markBlocks(capabilities[i].mut_lists[g]); - } - markBlocks(generations[g].mut_list); - markBlocks(generations[g].blocks); - markBlocks(generations[g].large_objects); + nat g, i, n; + for (n = 0; n < total_generations; n++) { + g = all_generations[n].no; + for (i = 0; i < n_capabilities; i++) { + if (g > 0) { markBlocks(capabilities[i].mut_lists[g]); } + markBlocks(gc_threads[i]->gens[n].part_list); + markBlocks(gc_threads[i]->gens[n].scavd_list); + markBlocks(gc_threads[i]->gens[n].todo_bd); + } + markBlocks(all_generations[n].blocks); + markBlocks(all_generations[n].prim_blocks); + markBlocks(all_generations[n].large_objects); } hunk ./rts/sm/Sanity.c 1110 - for (i = 0; i < n_capabilities; i++) { - markBlocks(nurseries[i].blocks); - } + for (i = 0; i < n_capabilities; i++) { + markBlocks(nurseries[i].blocks); + markBlocks(gc_threads[i]->mark_stack_top_bd); + } #ifdef PROFILING // TODO: hunk ./rts/sm/Sanity.c 1172 genBlocks (generation *gen) { ASSERT(countBlocks(gen->blocks) == gen->n_blocks); + ASSERT(countBlocks(gen->prim_blocks) == gen->n_prim_blocks); ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks); hunk ./rts/sm/Sanity.c 1174 - return gen->n_blocks + gen->n_old_blocks + - countAllocdBlocks(gen->large_objects); + return gen->n_blocks + gen->n_prim_blocks + gen->n_old_blocks + + countAllocdBlocks(gen->large_objects); } void hunk ./rts/sm/Sanity.c 1181 memInventory (rtsBool show) { - nat g, i; - lnat gen_blocks[RtsFlags.GcFlags.generations]; + nat g, i, n; + lnat gen_blocks[total_generations]; lnat nursery_blocks, retainer_blocks, hunk ./rts/sm/Sanity.c 1184 - arena_blocks, exec_blocks; + arena_blocks, exec_blocks, mark_stack_blocks; lnat live_blocks = 0, free_blocks = 0; rtsBool leak; hunk ./rts/sm/Sanity.c 1188 - // count the blocks we current have + // count the blocks we currently have hunk ./rts/sm/Sanity.c 1190 - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - gen_blocks[g] = 0; + for (n = 0; n < total_generations; n++) { + g = all_generations[n].no; + gen_blocks[n] = 0; for (i = 0; i < n_capabilities; i++) { hunk ./rts/sm/Sanity.c 1194 - gen_blocks[g] += countBlocks(capabilities[i].mut_lists[g]); + if (g > 0) { + gen_blocks[n] += countBlocks(capabilities[i].mut_lists[g]); + } + gen_blocks[n] += countBlocks(gc_threads[i]->gens[n].part_list); + gen_blocks[n] += countBlocks(gc_threads[i]->gens[n].scavd_list); + gen_blocks[n] += countBlocks(gc_threads[i]->gens[n].todo_bd); } hunk ./rts/sm/Sanity.c 1201 - gen_blocks[g] += countAllocdBlocks(generations[g].mut_list); - gen_blocks[g] += genBlocks(&generations[g]); + gen_blocks[n] += genBlocks(&all_generations[n]); } nursery_blocks = 0; hunk ./rts/sm/Sanity.c 1210 nursery_blocks += nurseries[i].n_blocks; } + mark_stack_blocks = 0; + for (i = 0; i < n_capabilities; i++) { + mark_stack_blocks += countBlocks(gc_threads[i]->mark_stack_top_bd); + } + retainer_blocks = 0; #ifdef PROFILING if (RtsFlags.ProfFlags.doHeapProfile == HEAP_BY_RETAINER) { hunk ./rts/sm/Sanity.c 1232 free_blocks = countFreeList(); live_blocks = 0; - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { + for (g = 0; g < total_generations; g++) { live_blocks += gen_blocks[g]; } hunk ./rts/sm/Sanity.c 1235 - live_blocks += nursery_blocks + + live_blocks += nursery_blocks + mark_stack_blocks + retainer_blocks + arena_blocks + exec_blocks; #define MB(n) (((n) * BLOCK_SIZE_W) / ((1024*1024)/sizeof(W_))) hunk ./rts/sm/Sanity.c 1249 } else { debugBelch("Memory inventory:\n"); } - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { + for (g = 0; g < total_generations; g++) { debugBelch(" gen %d blocks : %5lu blocks (%lu MB)\n", g, gen_blocks[g], MB(gen_blocks[g])); } hunk ./rts/sm/Sanity.c 1255 debugBelch(" nursery : %5lu blocks (%lu MB)\n", nursery_blocks, MB(nursery_blocks)); + debugBelch(" mark stacks : %5lu blocks (%lu MB)\n", + mark_stack_blocks, MB(mark_stack_blocks)); debugBelch(" retainer : %5lu blocks (%lu MB)\n", retainer_blocks, MB(retainer_blocks)); debugBelch(" arena blocks : %5lu blocks (%lu MB)\n", hunk ./rts/sm/Sanity.c 1282 } +/* ----------------------------------------------------------------------------- + * + * Tools for use in gdb. + * + * ---------------------------------------------------------------------------*/ + +void +findBlockInList(bdescr *bd, bdescr *list) +{ + bdescr *p; + + for (p = list; p != NULL; p = p->link) { + ASSERT(bd != p); + } +} + +// useful for locating a block from within gdb. +void +findBlock (bdescr *bd) +{ + nat n, g, i; + generation *gen; + + for (n = 0; n < total_generations; n++) { + g = all_generations[n].no; + for (i = 0; i < n_capabilities; i++) { + if (g > 0) { + findBlockInList(bd, capabilities[i].mut_lists[g]); + } + findBlockInList(bd, gc_threads[i]->gens[n].part_list); + findBlockInList(bd, gc_threads[i]->gens[n].scavd_list); + findBlockInList(bd, gc_threads[i]->gens[n].todo_bd); + } + + gen = &all_generations[n]; + findBlockInList(bd, gen->blocks); + findBlockInList(bd, gen->old_blocks); + findBlockInList(bd, gen->prim_blocks); + findBlockInList(bd, gen->large_objects); + findBlockInList(bd, gen->scavenged_large_objects); + } + + for (i = 0; i < n_capabilities; i++) { + findBlockInList(bd, nurseries[i].blocks); + findBlockInList(bd, gc_threads[i]->mark_stack_top_bd); + } +} + +int searched = 0; + +static int +findPtrBlocks (char *msg, StgPtr p, bdescr *bd, StgPtr arr[], int arr_size, int i) +{ + StgPtr q, r, end; + nat block_no = 0; + + for (; bd; bd = bd->link, block_no++) { + searched++; + for (q = bd->start; q < bd->free; q++) { + if (UNTAG_CLOSURE((StgClosure*)*q) == (StgClosure *)p) { + debugBelch("found at %p in block %d of %s\n", + q, block_no, msg); + if (i < arr_size) { + for (r = bd->start; r < bd->free; r = end) { + // skip over zeroed-out slop + while (*r == 0) r++; + if (!LOOKS_LIKE_CLOSURE_PTR(r)) { + debugBelch("no closure at %p\n", r); + break; + } + end = r + closure_sizeW((StgClosure*)r); + if (q < end) { + debugBelch("%p = ", r); + printClosure((StgClosure *)r); + arr[i++] = r; + break; + } + } + if (r >= bd->free) { + debugBelch("closure?\n"); + } + } else { + return i; + } + } + } + } + return i; +} + +void +findPtr(P_ p, int follow) +{ + nat g; + bdescr *bd; + const int arr_size = 1024; + StgPtr arr[arr_size]; + int i = 0; + nat j; + searched = 0; + char msg[512]; + + for (g = 0; g < total_generations; g++) { + bd = all_generations[g].blocks; + snprintf(msg, 512, "all_generations[%d].blocks", g); + i = findPtrBlocks(msg, p,bd,arr,arr_size,i); + + bd = all_generations[g].prim_blocks; + snprintf(msg, 512, "all_generations[%d].prim_blocks", g); + i = findPtrBlocks(msg, p,bd,arr,arr_size,i); + + bd = all_generations[g].large_objects; + snprintf(msg, 512, "all_generations[%d].large_objects", g); + i = findPtrBlocks(msg, p,bd,arr,arr_size,i); + + for (j = 0; j < n_capabilities; j++) { + bd = gc_threads[j]->gens[g].part_list; + snprintf(msg, 512, "gc_threads[%d]->gens[%d].part_list", j,g); + i = findPtrBlocks(msg, p,bd,arr,arr_size,i); + + bd = gc_threads[j]->gens[g].todo_bd; + snprintf(msg, 512, "gc_threads[%d]->gens[%d].todo_bd", j,g); + i = findPtrBlocks(msg, p,bd,arr,arr_size,i); + } + if (i >= arr_size) return; + } + if (follow && i == 1) { + debugBelch("-->\n"); + findPtr(arr[0], 1); + } +} + +void +findPtrAnywhere (StgPtr p) +{ + StgPtr mblock, q; + + mblock = getFirstMBlock(); + do { + for (q = mblock; q < mblock + MBLOCK_SIZE_W; q++) { + if (UNTAG_CLOSURE((StgClosure*)*q) == + UNTAG_CLOSURE((StgClosure*)p)) { + debugBelch("found at %p\n", q); + } + } + } while ((mblock = getNextMBlock(mblock))); +} + #endif /* DEBUG */ hunk ./rts/sm/Sanity.h 16 #include "BeginPrivate.h" -# if defined(PAR) -# define PVM_PE_MASK 0xfffc0000 -# define MAX_PVM_PES MAX_PES -# define MAX_PVM_TIDS MAX_PES -# define MAX_SLOTS 100000 -# endif - /* debugging routines */ hunk ./rts/sm/Sanity.h 17 -void checkSanity ( rtsBool check_heap ); -void checkNurserySanity ( nursery *nursery ); -void checkHeap ( bdescr *bd ); +void checkSanity (rtsBool local_only, rtsBool after_gc, rtsBool major_gc, + nat cap_no); +void checkNurserySanity (nat cap_no); + +void checkHeapChain (bdescr *bd); +void checkGlobalHeapChain (bdescr *bd); +void checkPrimHeapChain (bdescr *bd); + void checkHeapChunk ( StgPtr start, StgPtr end ); void checkLargeObjects ( bdescr *bd ); void checkTSO ( StgTSO* tso ); hunk ./rts/sm/Sanity.h 31 void checkGlobalTSOList ( rtsBool checkTSOs ); void checkStaticObjects ( StgClosure* static_objects ); void checkStackChunk ( StgPtr sp, StgPtr stack_end ); -StgOffset checkStackFrame ( StgPtr sp ); -StgOffset checkClosure ( StgClosure* p ); +StgOffset checkStackFrame (StgPtr sp); hunk ./rts/sm/Sanity.h 33 -void checkMutableList ( bdescr *bd, nat gen ); -void checkMutableLists ( rtsBool checkTSOs ); +StgOffset checkClosure (StgClosure* p); +StgOffset checkGlobalClosure (StgClosure* p); void checkRunQueue (Capability *cap); hunk ./rts/sm/Sanity.h 42 void checkBQ (StgTSO *bqe, StgClosure *closure); +/* tools for use in gdb */ + +void findPtr (StgPtr p, int follow); +void findPtrAnywhere (StgPtr p); + +void findBlock (bdescr *bd); +void findBlockInList (bdescr *bd, bdescr *list); + #include "EndPrivate.h" #endif /* DEBUG */ hunk ./rts/sm/Scav.c 37 StgLargeBitmap *large_bitmap, nat size ); -#if defined(THREADED_RTS) && !defined(PARALLEL_GC) -# define evacuate(a) evacuate1(a) -# define scavenge_loop(a) scavenge_loop1(a) -# define scavenge_block(a) scavenge_block1(a) -# define scavenge_mutable_list(bd,g) scavenge_mutable_list1(bd,g) -# define scavenge_capability_mut_lists(cap) scavenge_capability_mut_Lists1(cap) +#if defined(THREADED_RTS) +#if defined(LOCAL_GC) +# define evacuate(a) evacuate_local(a) +# define scavenge_loop(a) scavenge_loop_local(a) +# define scavenge_capability_mut_lists(cap) scavenge_capability_mut_lists_local(cap) +# define scavengeTSO(tso) scavengeTSO_local(tso) +# define scavenge_large(ws) scavenge_large_local(ws) +#elif defined(PARALLEL_GC) +// use the thread-safe evacuate() for evacuation in parallel GC +# define scavenge_loop(a) scavenge_loop_par(a) +# define scavenge_capability_mut_lists(cap) scavenge_capability_mut_lists_par(cap) +# define scavengeTSO(tso) scavengeTSO_par(tso) +# define scavenge_large(ws) scavenge_large_par(ws) +#else +// sequential GC in the threaded RTS can call evacuate_seq() +# define evacuate(a) evacuate_seq(a) +#endif #endif /* ----------------------------------------------------------------------------- hunk ./rts/sm/Scav.c 103 tso->dirty = gct->failed_to_evac; + debugTrace(DEBUG_gc,"dirty: %d",(int)tso->dirty); + gct->eager_promotion = saved_eager; } hunk ./rts/sm/Scav.c 276 return scavenge_PAP_payload (ap->fun, ap->payload, ap->n_args); } +#ifndef LOCAL_GC /* ----------------------------------------------------------------------------- Scavenge SRTs -------------------------------------------------------------------------- */ hunk ./rts/sm/Scav.c 352 bitmap = bitmap >> 1; } } +#endif // not LOCAL_GC hunk ./rts/sm/Scav.c 354 +#ifdef LOCAL_GC +#define USED_IF_NOT_LOCAL STG_UNUSED +#else +#define USED_IF_NOT_LOCAL STG_UNUSED +#endif STATIC_INLINE GNUC_ATTR_HOT void hunk ./rts/sm/Scav.c 361 -scavenge_thunk_srt(const StgInfoTable *info) +scavenge_thunk_srt(const StgInfoTable *info USED_IF_NOT_LOCAL) { hunk ./rts/sm/Scav.c 363 +#ifdef LOCAL_GC + return; +#else StgThunkInfoTable *thunk_info; if (!major_gc) return; hunk ./rts/sm/Scav.c 372 thunk_info = itbl_to_thunk_itbl(info); scavenge_srt((StgClosure **)GET_SRT(thunk_info), thunk_info->i.srt_bitmap); +#endif } STATIC_INLINE GNUC_ATTR_HOT void hunk ./rts/sm/Scav.c 376 -scavenge_fun_srt(const StgInfoTable *info) +scavenge_fun_srt(const StgInfoTable *info USED_IF_NOT_LOCAL) { hunk ./rts/sm/Scav.c 378 +#ifdef LOCAL_GC + return; +#else StgFunInfoTable *fun_info; if (!major_gc) return; hunk ./rts/sm/Scav.c 387 fun_info = itbl_to_fun_itbl(info); scavenge_srt((StgClosure **)GET_FUN_SRT(fun_info), fun_info->i.srt_bitmap); +#endif } /* ----------------------------------------------------------------------------- hunk ./rts/sm/Scav.c 407 { StgPtr p, q; StgInfoTable *info; + nat saved_evac_gen_ix; rtsBool saved_eager_promotion; gen_workspace *ws; hunk ./rts/sm/Scav.c 415 bd->start, bd->gen_no, bd->u.scan); gct->scan_bd = bd; - gct->evac_gen = bd->gen; + gct->evac_gen_ix = bd->gen_ix; + saved_evac_gen_ix = gct->evac_gen_ix; saved_eager_promotion = gct->eager_promotion; gct->failed_to_evac = rtsFalse; hunk ./rts/sm/Scav.c 420 - ws = &gct->gens[bd->gen->no]; + ws = &gct->gens[bd->gen_ix]; p = bd->u.scan; hunk ./rts/sm/Scav.c 424 - // we might be evacuating into the very object that we're + // we might be evacuating into the very block that we're // scavenging, so we have to check the real bd->free pointer each // time around the loop. while (p < bd->free || (bd == ws->todo_bd && p < ws->todo_free)) { hunk ./rts/sm/Scav.c 429 - ASSERT(bd->link == NULL); + ASSERT(bd->link == NULL); ASSERT(LOOKS_LIKE_CLOSURE_PTR(p)); hunk ./rts/sm/Scav.c 431 - info = get_itbl((StgClosure *)p); - ASSERT(gct->thunk_selector_depth == 0); q = p; hunk ./rts/sm/Scav.c 434 + info = get_itbl((StgClosure *)p); switch (info->type) { case MVAR_CLEAN: hunk ./rts/sm/Scav.c 565 break; } + case IND_LOCAL: + evacuate(&((StgInd *)p)->indirectee); +#ifndef LOCAL_GC + // evacuated to the current gen: no need for IND_LOCAL any more. + // note: in LOCAL_GC we can't change it to an IND here, + // because we haven't finished scavenging the transitive + // closure yet + if (!gct->failed_to_evac) { + ((StgClosure *)p)->header.info = &stg_IND_info; + } +#endif + p += sizeofW(StgInd); + break; + case IND_PERM: case BLACKHOLE: evacuate(&((StgInd *)p)->indirectee); hunk ./rts/sm/Scav.c 585 p += sizeofW(StgInd); break; - case MUT_VAR_CLEAN: - case MUT_VAR_DIRTY: + case MUT_VAR_LOCAL: + case MUT_VAR_GLOBAL: gct->eager_promotion = rtsFalse; evacuate(&((StgMutVar *)p)->var); gct->eager_promotion = saved_eager_promotion; hunk ./rts/sm/Scav.c 591 - if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info; + if (ws->gen->no > 0) { + ((StgClosure *)q)->header.info = &stg_MUT_VAR_GLOBAL_info; } else { hunk ./rts/sm/Scav.c 594 - ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info; + ((StgClosure *)q)->header.info = &stg_MUT_VAR_LOCAL_info; } p += sizeofW(StgMutVar); break; hunk ./rts/sm/Scav.c 651 p += arr_words_sizeW((StgArrWords *)p); break; - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: { // We don't eagerly promote objects pointed to by a mutable // array, but if we find the array only points to objects in hunk ./rts/sm/Scav.c 662 p = scavenge_mut_arr_ptrs((StgMutArrPtrs*)p); - if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info; + if (ws->gen->no > 0) { + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_GLOBAL_info; } else { hunk ./rts/sm/Scav.c 665 - ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info; + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_LOCAL_info; } gct->eager_promotion = saved_eager_promotion; hunk ./rts/sm/Scav.c 669 - gct->failed_to_evac = rtsTrue; // always put it on the mutable list. break; } hunk ./rts/sm/Scav.c 701 gct->eager_promotion = rtsFalse; + evacuate((StgClosure**)&stack->tso); scavenge_stack(stack->sp, stack->stack + stack->stack_size); stack->dirty = gct->failed_to_evac; p += stack_sizeW(stack); hunk ./rts/sm/Scav.c 732 StgWord i; StgTRecChunk *tc = ((StgTRecChunk *) p); TRecEntry *e = &(tc -> entries[0]); - gct->eager_promotion = rtsFalse; + gct->evac_gen_ix = 0; evacuate((StgClosure **)&tc->prev_chunk); for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) { evacuate((StgClosure **)&e->tvar); hunk ./rts/sm/Scav.c 739 evacuate((StgClosure **)&e->expected_value); evacuate((StgClosure **)&e->new_value); } - gct->eager_promotion = saved_eager_promotion; + gct->evac_gen_ix = saved_evac_gen_ix; gct->failed_to_evac = rtsTrue; // mutable p += sizeofW(StgTRecChunk); break; hunk ./rts/sm/Scav.c 766 } if (p > bd->free) { - gct->copied += ws->todo_free - bd->free; bd->free = p; } hunk ./rts/sm/Scav.c 773 (unsigned long)((bd->free - bd->u.scan) * sizeof(W_))); // update stats: this is a block that has been scavenged + gct->copied += bd->free - bd->u.scan; gct->scanned += bd->free - bd->u.scan; bd->u.scan = bd->free; hunk ./rts/sm/Scav.c 785 gct->scan_bd = NULL; } + /* ----------------------------------------------------------------------------- Scavenge everything on the mark stack. hunk ./rts/sm/Scav.c 795 -------------------------------------------------------------------------- */ static void -scavenge_mark_stack(void) +scavenge_mark_stack (generation *gen) { StgPtr p, q; StgInfoTable *info; hunk ./rts/sm/Scav.c 799 + nat saved_evac_gen_ix; rtsBool saved_eager_promotion; hunk ./rts/sm/Scav.c 802 - gct->evac_gen = oldest_gen; + saved_evac_gen_ix = gct->evac_gen_ix; saved_eager_promotion = gct->eager_promotion; hunk ./rts/sm/Scav.c 804 + gct->evac_gen_ix = gen->ix; while ((p = pop_mark_stack())) { hunk ./rts/sm/Scav.c 924 // no "old" generation. break; + case IND_LOCAL: + evacuate(&((StgInd *)p)->indirectee); +#ifndef LOCAL_GC + // evacuated to the current gen: no need for IND_LOCAL any more + // note: in LOCAL_GC we can't change it to an IND here, + // because we haven't finished scavenging the transitive + // closure yet + if (!gct->failed_to_evac) { + ((StgClosure *)p)->header.info = &stg_IND_info; + } +#endif + break; + case IND: case BLACKHOLE: evacuate(&((StgInd *)p)->indirectee); hunk ./rts/sm/Scav.c 942 break; - case MUT_VAR_CLEAN: - case MUT_VAR_DIRTY: { + case MUT_VAR_LOCAL: + case MUT_VAR_GLOBAL: { gct->eager_promotion = rtsFalse; evacuate(&((StgMutVar *)p)->var); gct->eager_promotion = saved_eager_promotion; hunk ./rts/sm/Scav.c 948 - if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info; + if (gen->no > 0) { + ((StgClosure *)q)->header.info = &stg_MUT_VAR_GLOBAL_info; } else { hunk ./rts/sm/Scav.c 951 - ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info; + ((StgClosure *)q)->header.info = &stg_MUT_VAR_LOCAL_info; } break; } hunk ./rts/sm/Scav.c 1003 scavenge_AP((StgAP *)p); break; - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: // follow everything { // We don't eagerly promote objects pointed to by a mutable hunk ./rts/sm/Scav.c 1015 scavenge_mut_arr_ptrs((StgMutArrPtrs *)p); - if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info; + if (gen->no > 0) { + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_GLOBAL_info; } else { hunk ./rts/sm/Scav.c 1018 - ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info; + ((StgClosure *)q)->header.info = &stg_MUT_ARR_PTRS_LOCAL_info; } gct->eager_promotion = saved_eager_promotion; hunk ./rts/sm/Scav.c 1022 - gct->failed_to_evac = rtsTrue; // mutable anyhow. break; } hunk ./rts/sm/Scav.c 1055 gct->eager_promotion = rtsFalse; + evacuate((StgClosure**)&stack->tso); scavenge_stack(stack->sp, stack->stack + stack->stack_size); stack->dirty = gct->failed_to_evac; hunk ./rts/sm/Scav.c 1084 StgWord i; StgTRecChunk *tc = ((StgTRecChunk *) p); TRecEntry *e = &(tc -> entries[0]); - gct->eager_promotion = rtsFalse; + gct->evac_gen_ix = 0; evacuate((StgClosure **)&tc->prev_chunk); for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) { evacuate((StgClosure **)&e->tvar); hunk ./rts/sm/Scav.c 1091 evacuate((StgClosure **)&e->expected_value); evacuate((StgClosure **)&e->new_value); } - gct->eager_promotion = saved_eager_promotion; + gct->evac_gen_ix = saved_evac_gen_ix; gct->failed_to_evac = rtsTrue; // mutable break; } hunk ./rts/sm/Scav.c 1103 if (gct->failed_to_evac) { gct->failed_to_evac = rtsFalse; - if (gct->evac_gen) { - recordMutableGen_GC((StgClosure *)q, gct->evac_gen->no); - } + if (gen->no > 0) { + recordMutableGen_GC((StgClosure *)q, oldest_gen->no); + } else { + ASSERT(Bdescr(q)->flags & BF_PRIM); + if (isGlobalPrim((StgClosure*)q)) { + recordMutableGen_GC((StgClosure *)q, oldest_gen->no); + } + } } } // while (p = pop_mark_stack()) } hunk ./rts/sm/Scav.c 1127 scavenge_one(StgPtr p) { const StgInfoTable *info; + nat saved_evac_gen_ix = gct->evac_gen_ix; rtsBool no_luck; rtsBool saved_eager_promotion; hunk ./rts/sm/Scav.c 1197 break; } - case MUT_VAR_CLEAN: - case MUT_VAR_DIRTY: { + case MUT_VAR_LOCAL: + case MUT_VAR_GLOBAL: { StgPtr q = p; gct->eager_promotion = rtsFalse; hunk ./rts/sm/Scav.c 1205 evacuate(&((StgMutVar *)p)->var); gct->eager_promotion = saved_eager_promotion; - if (gct->failed_to_evac) { - ((StgClosure *)q)->header.info = &stg_MUT_VAR_DIRTY_info; + if (Bdescr(p)->gen_no > 0) { // XXX too expensive + ((StgClosure *)q)->header.info = &stg_MUT_VAR_GLOBAL_info; } else { hunk ./rts/sm/Scav.c 1208 - ((StgClosure *)q)->header.info = &stg_MUT_VAR_CLEAN_info; + ((StgClosure *)q)->header.info = &stg_MUT_VAR_LOCAL_info; } break; } hunk ./rts/sm/Scav.c 1261 // nothing to follow break; - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: { // We don't eagerly promote objects pointed to by a mutable // array, but if we find the array only points to objects in hunk ./rts/sm/Scav.c 1272 scavenge_mut_arr_ptrs((StgMutArrPtrs *)p); - if (gct->failed_to_evac) { - ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info; + if (Bdescr(p)->gen_no > 0) { // XXX too expensive + ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_GLOBAL_info; } else { hunk ./rts/sm/Scav.c 1275 - ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info; + ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_LOCAL_info; } gct->eager_promotion = saved_eager_promotion; hunk ./rts/sm/Scav.c 1279 - gct->failed_to_evac = rtsTrue; break; } hunk ./rts/sm/Scav.c 1310 gct->eager_promotion = rtsFalse; + evacuate((StgClosure**)&stack->tso); scavenge_stack(stack->sp, stack->stack + stack->stack_size); stack->dirty = gct->failed_to_evac; hunk ./rts/sm/Scav.c 1332 gct->eager_promotion = saved_eager_promotion; gct->failed_to_evac = rtsTrue; // mutable break; - } case TREC_CHUNK: hunk ./rts/sm/Scav.c 1339 StgWord i; StgTRecChunk *tc = ((StgTRecChunk *) p); TRecEntry *e = &(tc -> entries[0]); - gct->eager_promotion = rtsFalse; + gct->evac_gen_ix = 0; evacuate((StgClosure **)&tc->prev_chunk); for (i = 0; i < tc -> next_entry_idx; i ++, e++ ) { evacuate((StgClosure **)&e->tvar); hunk ./rts/sm/Scav.c 1346 evacuate((StgClosure **)&e->expected_value); evacuate((StgClosure **)&e->new_value); } - gct->eager_promotion = saved_eager_promotion; + gct->evac_gen_ix = saved_evac_gen_ix; gct->failed_to_evac = rtsTrue; // mutable break; } hunk ./rts/sm/Scav.c 1358 case BLACKHOLE: case IND_STATIC: evacuate(&((StgInd *)p)->indirectee); + break; hunk ./rts/sm/Scav.c 1360 -#if 0 && defined(DEBUG) - if (RtsFlags.DebugFlags.gc) - /* Debugging code to print out the size of the thing we just - * promoted - */ - { - StgPtr start = gen->scan; - bdescr *start_bd = gen->scan_bd; - nat size = 0; - scavenge(&gen); - if (start_bd != gen->scan_bd) { - size += (P_)BLOCK_ROUND_UP(start) - start; - start_bd = start_bd->link; - while (start_bd != gen->scan_bd) { - size += BLOCK_SIZE_W; - start_bd = start_bd->link; - } - size += gen->scan - - (P_)BLOCK_ROUND_DOWN(gen->scan); - } else { - size = gen->scan - start; - } - debugBelch("evac IND_OLDGEN: %ld bytes", size * sizeof(W_)); - } + case IND_LOCAL: + evacuate(&((StgInd *)p)->indirectee); +#ifndef LOCAL_GC + // if we succeeded in evacuating; turn it into an ordinary IND + // note: in LOCAL_GC we can't change it to an IND here, + // because we haven't finished scavenging the transitive + // closure yet + if (!gct->failed_to_evac) { + ((StgInd *)p)->header.info = &stg_IND_info; + } #endif hunk ./rts/sm/Scav.c 1371 - break; + break; default: barf("scavenge_one: strange object %d", (int)(info->type)); hunk ./rts/sm/Scav.c 1390 remove non-mutable objects from the mutable list at this point. -------------------------------------------------------------------------- */ -void +static void scavenge_mutable_list(bdescr *bd, generation *gen) { StgPtr p, q; hunk ./rts/sm/Scav.c 1395 - gct->evac_gen = gen; + gct->evac_gen_ix = gen->ix; for (; bd != NULL; bd = bd->link) { hunk ./rts/sm/Scav.c 1397 + + debugTrace(DEBUG_gc, "scavenging mut_list block %p", bd); + for (q = bd->start; q < bd->free; q++) { p = (StgPtr)*q; ASSERT(LOOKS_LIKE_CLOSURE_PTR(p)); hunk ./rts/sm/Scav.c 1406 #ifdef DEBUG switch (get_itbl((StgClosure *)p)->type) { - case MUT_VAR_CLEAN: + case MUT_VAR_LOCAL: // can happen due to concurrent writeMutVars hunk ./rts/sm/Scav.c 1408 - case MUT_VAR_DIRTY: + case MUT_VAR_GLOBAL: mutlist_MUTVARS++; break; hunk ./rts/sm/Scav.c 1410 - case MUT_ARR_PTRS_CLEAN: - case MUT_ARR_PTRS_DIRTY: + case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: case MUT_ARR_PTRS_FROZEN: case MUT_ARR_PTRS_FROZEN0: mutlist_MUTARRS++; break; hunk ./rts/sm/Scav.c 1424 } #endif - // Check whether this object is "clean", that is it - // definitely doesn't point into a young generation. - // Clean objects don't need to be scavenged. Some clean - // objects (MUT_VAR_CLEAN) are not kept on the mutable - // list at all; others, such as TSO - // are always on the mutable list. - // - switch (get_itbl((StgClosure *)p)->type) { - case MUT_ARR_PTRS_CLEAN: - recordMutableGen_GC((StgClosure *)p,gen->no); - continue; - case MUT_ARR_PTRS_DIRTY: + // Check whether this object is "clean", that is it + // definitely doesn't point into a young generation. + // Clean objects don't need to be scavenged. Some clean + // objects (MUT_VAR_CLEAN) are not kept on the mutable + // list at all; others, such as TSO + // are always on the mutable list. + // + switch (get_itbl((StgClosure *)p)->type) { + + case MUT_ARR_PTRS_LOCAL: + barf("scavenge_mutable_list: MUT_ARR_PTRS_LOCAL"); + case MUT_ARR_PTRS_GLOBAL: { rtsBool saved_eager_promotion; saved_eager_promotion = gct->eager_promotion; hunk ./rts/sm/Scav.c 1440 gct->eager_promotion = rtsFalse; - + scavenge_mut_arr_ptrs_marked((StgMutArrPtrs *)p); hunk ./rts/sm/Scav.c 1442 - + if (gct->failed_to_evac) { hunk ./rts/sm/Scav.c 1444 - ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_DIRTY_info; - } else { - ((StgClosure *)p)->header.info = &stg_MUT_ARR_PTRS_CLEAN_info; + recordMutableGen_GC((StgClosure *)p,gen->no); + gct->failed_to_evac = rtsFalse; } hunk ./rts/sm/Scav.c 1447 - + gct->eager_promotion = saved_eager_promotion; hunk ./rts/sm/Scav.c 1449 - gct->failed_to_evac = rtsFalse; - recordMutableGen_GC((StgClosure *)p,gen->no); - continue; + continue; } hunk ./rts/sm/Scav.c 1451 + +#ifdef LOCAL_GC + case IND_LOCAL: + { + StgInd *ind = (StgInd *)p; + + if (isGlobal(ind->indirectee)) { + SET_INFO(ind, &stg_IND_info); + continue; + } + break; + } +#endif + + case TSO: { +#if defined(THREADED_RTS) || defined(DEBUG) + StgTSO *tso = (StgTSO *)p; +#endif + + debugTrace(DEBUG_gc, "mut_list TSO %p", tso); + +#ifdef THREADED_RTS + if (gct->gc_type == GC_LOCAL && tso->cap->no != gct->index) { + // this TSO must have been migrated from here to + // another Capability, so we must ignore its entry + // on the local mut list. + debugTrace(DEBUG_gc, + "not scavenging TSO %u, belongs to cap %u", + (nat)tso->id, (nat)tso->cap->no); + continue; + } +#endif + + break; + } + + case STACK: { +#ifdef THREADED_RTS + StgStack *stack = (StgStack*)p; + + if (gct->gc_type == GC_LOCAL && + stack->tso->cap->no != gct->index) { + // this TSO must have been migrated from here to + // another Capability, so we must ignore its entry + // on the local mut list. + debugTrace(DEBUG_gc, + "not scavenging STACK %p for TSO %u, belongs to cap %u", + (void*)stack, (nat)stack->tso->id, (nat)stack->tso->cap->no); + continue; + } +#endif + break; + } + default: hunk ./rts/sm/Scav.c 1506 - ; - } + ; + } hunk ./rts/sm/Scav.c 1509 - if (scavenge_one(p)) { - // didn't manage to promote everything, so put the - // object back on the list. - recordMutableGen_GC((StgClosure *)p,gen->no); - } - } + if (scavenge_one(p)) { + // didn't manage to promote everything, so put the + // object back on the list. + recordMutableGen_GC((StgClosure *)p,gen->no); + } + } } } hunk ./rts/sm/Scav.c 1529 * Also do them in reverse generation order, for the usual reason: * namely to reduce the likelihood of spurious old->new pointers. */ - for (g = RtsFlags.GcFlags.generations-1; g > N; g--) { - scavenge_mutable_list(cap->saved_mut_lists[g], &generations[g]); + for (g = RtsFlags.GcFlags.generations-1; g > gct->collect_gen; g--) { + scavenge_mutable_list(cap->saved_mut_lists[g], &old_generations[g]); freeChain_sync(cap->saved_mut_lists[g]); cap->saved_mut_lists[g] = NULL; } hunk ./rts/sm/Scav.c 1554 /* Always evacuate straight to the oldest generation for static * objects */ - gct->evac_gen = oldest_gen; + gct->evac_gen_ix = oldest_gen->ix; /* keep going until we've scavenged all the objects on the linked list... */ hunk ./rts/sm/Scav.c 1581 /* Take this object *off* the static_objects list, * and put it on the scavenged_static_objects list. */ + ASSERT(gct->scavenged_static_objects != NULL); gct->static_objects = *STATIC_LINK(info,p); *STATIC_LINK(info,p) = gct->scavenged_static_objects; gct->scavenged_static_objects = p; hunk ./rts/sm/Scav.c 1761 p = scavenge_small_bitmap(p, size, bitmap); follow_srt: +#ifndef LOCAL_GC if (major_gc) scavenge_srt((StgClosure **)GET_SRT(info), info->i.srt_bitmap); hunk ./rts/sm/Scav.c 1764 +#endif continue; case RET_BCO: { hunk ./rts/sm/Scav.c 1840 /*----------------------------------------------------------------------------- scavenge the large object list. - evac_gen set by caller; similar games played with evac_gen as with + evac_gen_ix set by caller; similar games played with evac_gen_ix as with scavenge() - see comment at the top of scavenge(). Most large hunk ./rts/sm/Scav.c 1842 - objects are (repeatedly) mutable, so most of the time evac_gen will + objects are (repeatedly) mutable, so most of the time evac_gen_ix will be zero. --------------------------------------------------------------------------- */ hunk ./rts/sm/Scav.c 1852 bdescr *bd; StgPtr p; - gct->evac_gen = ws->gen; + gct->evac_gen_ix = ws->gen->ix; bd = ws->todo_large_objects; hunk ./rts/sm/Scav.c 1864 // the front when evacuating. ws->todo_large_objects = bd->link; - ACQUIRE_SPIN_LOCK(&ws->gen->sync_large_objects); + ACQUIRE_SPIN_LOCK(&ws->gen->sync); dbl_link_onto(bd, &ws->gen->scavenged_large_objects); ws->gen->n_scavenged_large_blocks += bd->blocks; hunk ./rts/sm/Scav.c 1867 - RELEASE_SPIN_LOCK(&ws->gen->sync_large_objects); + RELEASE_SPIN_LOCK(&ws->gen->sync); p = bd->start; if (scavenge_one(p)) { hunk ./rts/sm/Scav.c 1914 loop: did_something = rtsFalse; - for (g = RtsFlags.GcFlags.generations-1; g >= 0; g--) { + for (g = total_generations-1; g >= 0; g--) { ws = &gct->gens[g]; hunk ./rts/sm/Scav.c 1917 + // in GC_PAR and GC_SEQ we might evacuate objects in any part + // of the heap, but in GC_LOCAL we can only touch our local + // heap and the global heap. + if (gct->gc_type == GC_LOCAL && isNonLocalGen(ws->gen)) + continue; + gct->scan_bd = NULL; // If we have a scan block with some work to do, hunk ./rts/sm/Scav.c 1997 } // scavenge objects in compacted generation - if (mark_stack_bd != NULL && !mark_stack_empty()) { - scavenge_mark_stack(); + if (!mark_stack_empty()) { + if (gct->collect_gen >= global_gen_no) { + scavenge_mark_stack(global_gen); + } else { + scavenge_mark_stack(gct->cap->r.rG0); + } work_to_do = rtsTrue; } hunk ./rts/sm/Scav.h 20 #include "BeginPrivate.h" void scavenge_loop (void); -void scavenge_mutable_list (bdescr *bd, generation *gen); void scavenge_capability_mut_lists (Capability *cap); #ifdef THREADED_RTS hunk ./rts/sm/Scav.h 23 -void scavenge_loop1 (void); -void scavenge_mutable_list1 (bdescr *bd, generation *gen); -void scavenge_capability_mut_Lists1 (Capability *cap); + +void scavenge_loop_par (void); +void scavenge_capability_mut_lists_par (Capability *cap); + +void scavenge_loop_local (void); +void scavenge_capability_mut_lists_local (Capability *cap); + +#else + +#define scavenge_loop1 scavenge_loop +#define scavenge_capability_mut_lists1 scavenge_capability_mut_lists + #endif #include "EndPrivate.h" hunk ./rts/sm/Storage.c 31 #include "Trace.h" #include "GC.h" #include "Evac.h" +#include "Globalise.h" +#include "WritePolicy.h" #include hunk ./rts/sm/Storage.c 62 * - local generations * - nurseries */ -StgClosure *caf_list = NULL; -StgClosure *revertible_caf_list = NULL; + +StgClosure *caf_list; +StgClosure *revertible_caf_list; rtsBool keepCAFs; hunk ./rts/sm/Storage.c 67 -nat large_alloc_lim; /* GC if n_large_blocks in any nursery +nat large_alloc_lim; /* GC if n_new_large_blocks in any nursery * reaches this. */ bdescr *exec_block; hunk ./rts/sm/Storage.c 72 -generation *generations = NULL; /* all the generations */ -generation *g0 = NULL; /* generation 0, for convenience */ -generation *oldest_gen = NULL; /* oldest generation, for convenience */ +nat total_generations; // size of all_generations +generation * all_generations; // indexed by gen->ix +generation * old_generations; // indexed by gen->no +generation * g0; // == &old_generations[0] +generation * oldest_gen; // == &old_generations[G-1] +generation * global_gen; // == &all_generations[global_gen_ix] +nat global_gen_ix; // ix of first global gen +nat global_gen_no; // no of first global gen nursery *nurseries = NULL; /* array of nurseries, size == n_capabilities */ hunk ./rts/sm/Storage.c 84 #ifdef THREADED_RTS -/* - * Storage manager mutex: protects all the above state from - * simultaneous access by two STG threads. - */ Mutex sm_mutex; #endif hunk ./rts/sm/Storage.c 87 +/* ----------------------------------------------------------------------------- + Initialising the storage manager state + ------------------------------------------------------------------------- */ + static void allocNurseries ( void ); static void hunk ./rts/sm/Storage.c 94 -initGeneration (generation *gen, int g) +initGeneration (generation *gen, int g, int ix) { gen->no = g; hunk ./rts/sm/Storage.c 97 + gen->ix = ix; gen->collections = 0; gen->par_collections = 0; gen->failed_promotions = 0; hunk ./rts/sm/Storage.c 111 gen->large_objects = NULL; gen->n_large_blocks = 0; gen->n_new_large_words = 0; - gen->mut_list = allocBlock(); + gen->prim_blocks = NULL; + gen->n_prim_blocks = 0; + gen->n_prim_words = 0; gen->scavenged_large_objects = NULL; gen->n_scavenged_large_blocks = 0; gen->mark = 0; hunk ./rts/sm/Storage.c 120 gen->compact = 0; gen->bitmap = NULL; #ifdef THREADED_RTS - initSpinLock(&gen->sync_large_objects); + initSpinLock(&gen->sync); #endif gen->threads = END_TSO_QUEUE; gen->old_threads = END_TSO_QUEUE; hunk ./rts/sm/Storage.c 124 + gen->weak_ptrs = NULL; } void hunk ./rts/sm/Storage.c 130 initStorage( void ) { - nat g, n; + nat g, n; hunk ./rts/sm/Storage.c 132 - if (generations != NULL) { + if (all_generations != NULL) { // multi-init protection return; } hunk ./rts/sm/Storage.c 168 ACQUIRE_SM_LOCK; - /* allocate generation info array */ - generations = (generation *)stgMallocBytes(RtsFlags.GcFlags.generations - * sizeof(struct generation_), - "initStorage: gens"); + // There are N young generations, one for each Capability. + total_generations = n_capabilities + RtsFlags.GcFlags.generations - 1; + + // allocate generation array. all_generations is indexed by + // gen->ix. + all_generations = (generation *)stgMallocBytes(total_generations + * sizeof(struct generation_), + "initStorage: gens"); + + // generations[] is indexed by the generation number, and is only + // valid for generation[0] when n_capabilities==1. + old_generations = &all_generations[n_capabilities - 1]; + + // Initialise all generations + for(n = 0; n < n_capabilities; n++) { + initGeneration(&all_generations[n], 0, n); + all_generations[n].is_local = rtsTrue; + all_generations[n].cap = n; + } + for(g = 1; g < RtsFlags.GcFlags.generations; g++) { + initGeneration(&old_generations[g], g, g + n_capabilities - 1); + old_generations[g].is_local = rtsFalse; + old_generations[g].cap = 0; + } hunk ./rts/sm/Storage.c 193 - /* Initialise all generations */ - for(g = 0; g < RtsFlags.GcFlags.generations; g++) { - initGeneration(&generations[g], g); + // A couple of convenience pointers + g0 = &old_generations[0]; + oldest_gen = &old_generations[RtsFlags.GcFlags.generations-1]; + if (RtsFlags.GcFlags.generations == 1) { + global_gen_ix = 0; + global_gen_no = 0; + } else { + global_gen_ix = n_capabilities; + global_gen_no = 1; } hunk ./rts/sm/Storage.c 203 + global_gen = &all_generations[global_gen_ix]; hunk ./rts/sm/Storage.c 205 - /* A couple of convenience pointers */ - g0 = &generations[0]; - oldest_gen = &generations[RtsFlags.GcFlags.generations-1]; + // Set up the destination pointers in each younger gen. step + { + generation *g0_dest; hunk ./rts/sm/Storage.c 209 - nurseries = stgMallocBytes(n_capabilities * sizeof(struct nursery_), - "initStorage: nurseries"); - - /* Set up the destination pointers in each younger gen. step */ - for (g = 0; g < RtsFlags.GcFlags.generations-1; g++) { - generations[g].to = &generations[g+1]; + if (RtsFlags.GcFlags.generations == 1) { + g0_dest = &old_generations[0]; + } else { + g0_dest = &old_generations[1]; + } + for(n = 0; n < n_capabilities; n++) { + all_generations[n].to = g0_dest; + } + for (g = 1; g < RtsFlags.GcFlags.generations-1; g++) { + old_generations[g].to = &old_generations[g+1]; + } + oldest_gen->to = oldest_gen; } hunk ./rts/sm/Storage.c 222 - oldest_gen->to = oldest_gen; /* The oldest generation has one step. */ if (RtsFlags.GcFlags.compact || RtsFlags.GcFlags.sweep) { hunk ./rts/sm/Storage.c 234 } } - generations[0].max_blocks = 0; - + nurseries = stgMallocBytes(n_capabilities * sizeof(struct nursery_), + "initStorage: nurseries"); + /* The allocation area. Policy: keep the allocation area * small to begin with, even if we have a large suggested heap * size. Reason: we're going to do a major collection first, and we hunk ./rts/sm/Storage.c 245 */ allocNurseries(); - weak_ptr_list = NULL; caf_list = END_OF_STATIC_LIST; revertible_caf_list = END_OF_STATIC_LIST; hunk ./rts/sm/Storage.c 258 whitehole_spin = 0; #endif - N = 0; + next_gc_gen = 0; // allocate a block for each mut list for (n = 0; n < n_capabilities; n++) { hunk ./rts/sm/Storage.c 283 void freeStorage (rtsBool free_heap) { - stgFree(generations); + stgFree(all_generations); if (free_heap) freeAllMBlocks(); #if defined(THREADED_RTS) closeMutex(&sm_mutex); hunk ./rts/sm/Storage.c 299 - builds a BLACKHOLE in the heap - pushes an update frame pointing to the BLACKHOLE - - calls newCaf, below - - updates the CAF with a static indirection to the BLACKHOLE + - calls newCaf, below, which + - creates an IND_LOCAL in the global heap pointing to the BLACKHOLE + - updates the CAF with a static indirection to the IND_LOCAL Why do we build an BLACKHOLE in the heap rather than just updating the thunk directly? It's so that we only need one kind of update hunk ./rts/sm/Storage.c 307 frame - otherwise we'd need a static version of the update frame too. - newCaf() does the following: - - - it puts the CAF on the oldest generation's mutable list. - This is so that we treat the CAF as a root when collecting - younger generations. + Why create the IND_LOCAL, rather than just allcoating the BLACKHOLE + directly in the global heap? If we did that, then we would have to + globalise the current TSO, and it seems unfair to forcefully + promote a TSO just because it entered a CAF. For GHCI, we have additional requirements when dealing with CAFs: hunk ./rts/sm/Storage.c 333 -------------------------------------------------------------------------- */ void -newCAF(StgRegTable *reg, StgClosure* caf) +newCAF(StgRegTable *reg, StgClosure* caf, StgClosure *bh) { hunk ./rts/sm/Storage.c 335 - if(keepCAFs) - { - // HACK: - // If we are in GHCi _and_ we are using dynamic libraries, - // then we can't redirect newCAF calls to newDynCAF (see below), - // so we make newCAF behave almost like newDynCAF. - // The dynamic libraries might be used by both the interpreted - // program and GHCi itself, so they must not be reverted. - // This also means that in GHCi with dynamic libraries, CAFs are not - // garbage collected. If this turns out to be a problem, we could - // do another hack here and do an address range test on caf to figure - // out whether it is from a dynamic library. - ((StgIndStatic *)caf)->saved_info = (StgInfoTable *)caf->header.info; - - ACQUIRE_SM_LOCK; // caf_list is global, locked by sm_mutex - ((StgIndStatic *)caf)->static_link = caf_list; - caf_list = caf; - RELEASE_SM_LOCK; - } - else - { - // Put this CAF on the mutable list for the old generation. - ((StgIndStatic *)caf)->saved_info = NULL; - if (oldest_gen->no != 0) { - recordMutableCap(caf, regTableToCapability(reg), oldest_gen->no); + StgIndStatic *caf_ind = (StgIndStatic *)caf; + Capability *cap = regTableToCapability(reg); + + if(keepCAFs) + { + // HACK: + // If we are in GHCi _and_ we are using dynamic libraries, + // then we can't redirect newCAF calls to newDynCAF (see below), + // so we make newCAF behave almost like newDynCAF. + // The dynamic libraries might be used by both the interpreted + // program and GHCi itself, so they must not be reverted. + // This also means that in GHCi with dynamic libraries, CAFs are not + // garbage collected. If this turns out to be a problem, we could + // do another hack here and do an address range test on caf to figure + // out whether it is from a dynamic library. + caf_ind->saved_info = caf_ind->header.info; + + ACQUIRE_SM_LOCK; // caf_list is global, locked by sm_mutex + caf_ind->static_link = caf_list; + caf_list = caf; + RELEASE_SM_LOCK; } hunk ./rts/sm/Storage.c 357 - } + + caf_ind->indirectee = publish_gen(cap,bh,oldest_gen); + SET_HDR(caf_ind, &stg_IND_STATIC_info, CCS_SYSTEM); } // External API for setting the keepCAFs flag. see #3900. hunk ./rts/sm/Storage.c 379 // The linker hackily arranges that references to newCaf from dynamic // code end up pointing to newDynCAF. void -newDynCAF (StgRegTable *reg STG_UNUSED, StgClosure *caf) +newDynCAF(StgRegTable *reg, StgClosure* caf, StgClosure *bh) { hunk ./rts/sm/Storage.c 381 + StgIndStatic *caf_ind = (StgIndStatic *)caf; + Capability *cap = regTableToCapability(reg); + ACQUIRE_SM_LOCK; ((StgIndStatic *)caf)->saved_info = (StgInfoTable *)caf->header.info; hunk ./rts/sm/Storage.c 391 revertible_caf_list = caf; RELEASE_SM_LOCK; + + caf_ind->indirectee = publish_gen(cap,bh,oldest_gen); + SET_HDR(caf_ind, &stg_IND_STATIC_info, CCS_SYSTEM); } /* ----------------------------------------------------------------------------- hunk ./rts/sm/Storage.c 401 -------------------------------------------------------------------------- */ static bdescr * -allocNursery (bdescr *tail, nat blocks) +allocNursery (bdescr *tail, nat blocks, generation *gen) { bdescr *bd = NULL; nat i, n; hunk ./rts/sm/Storage.c 406 + ASSERT_LOCK_HELD(&sm_mutex); + // We allocate the nursery as a single contiguous block and then // divide it into single blocks manually. This way we guarantee // that the nursery blocks are adjacent, so that the processor's hunk ./rts/sm/Storage.c 420 bd = allocGroup(n); for (i = 0; i < n; i++) { - initBdescr(&bd[i], g0, g0); + initBdescr(&bd[i], gen, gen); bd[i].blocks = 1; bd[i].flags = 0; hunk ./rts/sm/Storage.c 450 } static void -assignNurseriesToCapabilities (void) +assignNurseryToCapability (nat n) { hunk ./rts/sm/Storage.c 452 - nat i; - - for (i = 0; i < n_capabilities; i++) { - capabilities[i].r.rNursery = &nurseries[i]; - capabilities[i].r.rCurrentNursery = nurseries[i].blocks; - capabilities[i].r.rCurrentAlloc = NULL; - } + capabilities[n].r.rNursery = &nurseries[n]; + capabilities[n].r.rG0 = &all_generations[n]; + capabilities[n].r.rCurrentNursery = nurseries[n].blocks; + capabilities[n].r.rCurrentAlloc = NULL; } static void hunk ./rts/sm/Storage.c 463 { nat i; - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < n_capabilities; i++) + { nurseries[i].blocks = hunk ./rts/sm/Storage.c 466 - allocNursery(NULL, RtsFlags.GcFlags.minAllocAreaSize); + allocNursery(NULL, + RtsFlags.GcFlags.minAllocAreaSize, + &all_generations[i]); + nurseries[i].n_blocks = RtsFlags.GcFlags.minAllocAreaSize; hunk ./rts/sm/Storage.c 472 + + assignNurseryToCapability(i); } hunk ./rts/sm/Storage.c 475 - assignNurseriesToCapabilities(); } lnat // words allocated hunk ./rts/sm/Storage.c 478 -clearNurseries (void) +clearNursery (nat n) { lnat allocated = 0; hunk ./rts/sm/Storage.c 481 + bdescr *bd; + + for (bd = nurseries[n].blocks; bd; bd = bd->link) { + allocated += (lnat)(bd->free - bd->start); + bd->free = bd->start; + ASSERT(bd->gen_no == 0); + ASSERT(bd->gen == &all_generations[n]); + IF_DEBUG(sanity,memset(bd->start, 0xaa, BLOCK_SIZE)); + } + + return allocated; +} + +lnat // words allocated +clearNurseries (void) +{ + lnat allocated = 0; + nat i; + + for (i = 0; i < n_capabilities; i++) { + allocated += clearNursery(i); + } + + return allocated; +} + +void +resetNursery (nat n) +{ + assignNurseryToCapability(n); +} + +void +resetNurseries (void) +{ nat i; hunk ./rts/sm/Storage.c 517 - bdescr *bd; for (i = 0; i < n_capabilities; i++) { hunk ./rts/sm/Storage.c 519 - for (bd = nurseries[i].blocks; bd; bd = bd->link) { - allocated += (lnat)(bd->free - bd->start); - bd->free = bd->start; - ASSERT(bd->gen_no == 0); - ASSERT(bd->gen == g0); - IF_DEBUG(sanity,memset(bd->start, 0xaa, BLOCK_SIZE)); - } + resetNursery(i); } hunk ./rts/sm/Storage.c 521 - - return allocated; -} - -void -resetNurseries (void) -{ - assignNurseriesToCapabilities(); - } lnat hunk ./rts/sm/Storage.c 535 return blocks; } -static void -resizeNursery ( nursery *nursery, nat blocks ) +void +resizeNursery ( Capability *cap, nat blocks ) { bdescr *bd; nat nursery_blocks; hunk ./rts/sm/Storage.c 540 + nursery *nursery; + nat cap_no; + + ASSERT_LOCK_HELD(&sm_mutex); + + cap_no = cap->no; + nursery = &nurseries[cap_no]; nursery_blocks = nursery->n_blocks; if (nursery_blocks == blocks) return; hunk ./rts/sm/Storage.c 554 if (nursery_blocks < blocks) { debugTrace(DEBUG_gc, "increasing size of nursery to %d blocks", blocks); - nursery->blocks = allocNursery(nursery->blocks, blocks-nursery_blocks); + nursery->blocks = allocNursery(nursery->blocks, + blocks-nursery_blocks, + &all_generations[cap_no]); } else { bdescr *next_bd; hunk ./rts/sm/Storage.c 576 // might have gone just under, by freeing a large block, so make // up the difference. if (nursery_blocks < blocks) { - nursery->blocks = allocNursery(nursery->blocks, blocks-nursery_blocks); + nursery->blocks = allocNursery(nursery->blocks, + blocks-nursery_blocks, + &all_generations[cap_no]); } } hunk ./rts/sm/Storage.c 583 nursery->n_blocks = blocks; + IF_DEBUG(sanity, checkNurserySanity(cap->no)); ASSERT(countBlocks(nursery->blocks) == nursery->n_blocks); } hunk ./rts/sm/Storage.c 595 { nat i; for (i = 0; i < n_capabilities; i++) { - resizeNursery(&nurseries[i], blocks); + resizeNursery(&capabilities[i], blocks); } } hunk ./rts/sm/Storage.c 644 { bdescr *bd; StgPtr p; + generation *gen; if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) { lnat req_blocks = (lnat)BLOCK_ROUND_UP(n*sizeof(W_)) / BLOCK_SIZE; hunk ./rts/sm/Storage.c 665 ACQUIRE_SM_LOCK bd = allocGroup(req_blocks); - dbl_link_onto(bd, &g0->large_objects); - g0->n_large_blocks += bd->blocks; // might be larger than req_blocks - g0->n_new_large_words += n; + gen = cap->r.rG0; // local gen, no need to lock + dbl_link_onto(bd, &gen->large_objects); + gen->n_large_blocks += bd->blocks; // might be larger than req_blocks + gen->n_new_large_words += n; RELEASE_SM_LOCK; hunk ./rts/sm/Storage.c 670 - initBdescr(bd, g0, g0); + initBdescr(bd, gen, gen); bd->flags = BF_LARGE; bd->free = bd->start + n; return bd->start; hunk ./rts/sm/Storage.c 696 bd = allocBlock(); cap->r.rNursery->n_blocks++; RELEASE_SM_LOCK; - initBdescr(bd, g0, g0); + initBdescr(bd, cap->r.rG0, cap->r.rG0); bd->flags = 0; // If we had to allocate a new block, then we'll GC // pretty quickly now, because MAYBE_GC() will hunk ./rts/sm/Storage.c 712 } dbl_link_onto(bd, &cap->r.rNursery->blocks); cap->r.rCurrentAlloc = bd; - IF_DEBUG(sanity, checkNurserySanity(cap->r.rNursery)); + IF_DEBUG(sanity, checkNurserySanity(cap->no)); } p = bd->free; bd->free += n; hunk ./rts/sm/Storage.c 749 { StgPtr p; bdescr *bd; + generation *gen; // If the request is for a large object, then allocate() // will give us a pinned object anyway. hunk ./rts/sm/Storage.c 769 if (bd == NULL || (bd->free + n) > (bd->start + BLOCK_SIZE_W)) { ACQUIRE_SM_LOCK; cap->pinned_object_block = bd = allocBlock(); - dbl_link_onto(bd, &g0->large_objects); - g0->n_large_blocks++; + gen = cap->r.rG0; // use our local G0 + dbl_link_onto(bd, &gen->large_objects); + gen->n_large_blocks++; RELEASE_SM_LOCK; hunk ./rts/sm/Storage.c 773 - initBdescr(bd, g0, g0); + initBdescr(bd, gen, gen); bd->flags = BF_PINNED | BF_LARGE; bd->free = bd->start; } hunk ./rts/sm/Storage.c 784 return p; } +StgPtr +allocatePrim (Capability *cap, lnat n) +{ + StgPtr p; + bdescr *bd; + + // If the request is for a large object, then allocate() + // will give us a BF_LARGE object anyway and these are safe to be + // returned from allocatePrim. + if (n >= LARGE_OBJECT_THRESHOLD/sizeof(W_)) { + p = allocate(cap, n); + ASSERT(Bdescr(p)->flags & BF_LARGE); + return p; + } + + // allocate an extra word for the global flag + n = n + 1; + + TICK_ALLOC_HEAP_NOCTR(n); + CCS_ALLOC(CCCS,n); + + bd = cap->r.rG0->prim_blocks; + if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) { + + // The current prim block is full, we need to find another + // one. First, we try taking the next block from the + // nursery: + bd = cap->r.rCurrentNursery->link; + + if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) { + // The nursery is empty, or the next block is already + // full: allocate a fresh block (we can't fail here). + ACQUIRE_SM_LOCK; + bd = allocBlock(); + RELEASE_SM_LOCK; + // the bd->dest for prim blocks is the global gen. If we + // didn't do this, then a global collection + // would have to copy live non-global prim objects into a + // new BF_PRIM block. + initBdescr(bd, cap->r.rG0, global_gen); + // If we had to allocate a new block, then we'll GC + // pretty quickly now, because MAYBE_GC() will + // notice that CurrentNursery->link is NULL. + } else { + // we have a block in the nursery: take it out and use it to + // to allocatePrim() from. + cap->r.rCurrentNursery->link = bd->link; + cap->r.rNursery->n_blocks -= bd->blocks; + if (bd->link != NULL) { + bd->link->u.back = cap->r.rCurrentNursery; + } + } + bd->flags = BF_PRIM; + bd->link = cap->r.rG0->prim_blocks; + bd->dest_ix = global_gen_ix; + cap->r.rG0->prim_blocks = bd; // local gen, no need to lock + cap->r.rG0->n_prim_blocks += bd->blocks; + IF_DEBUG(sanity, checkNurserySanity(cap->no)); + IF_DEBUG(sanity, ASSERT(countBlocks(cap->r.rG0->prim_blocks) == cap->r.rG0->n_prim_blocks)); + } + + *bd->free = 0; // set the global flag to 0 + p = bd->free + 1; + bd->free += n; + + IF_DEBUG(sanity, ASSERT(*((StgWord8*)p) == 0xaa)); + return p; +} + + /* ----------------------------------------------------------------------------- Write Barriers -------------------------------------------------------------------------- */ hunk ./rts/sm/Storage.c 864 is. When written to, a MUT_VAR_CLEAN turns into a MUT_VAR_DIRTY and is put on the mutable list. */ -void -dirty_MUT_VAR(StgRegTable *reg, StgClosure *p) +StgClosure * +dirty_MUT_VAR (StgRegTable *reg, StgClosure *p) { hunk ./rts/sm/Storage.c 867 - Capability *cap = regTableToCapability(reg); - if (p->header.info == &stg_MUT_VAR_CLEAN_info) { - p->header.info = &stg_MUT_VAR_DIRTY_info; - recordClosureMutated(cap,p); - } + // experimentally, we'll globaliseFull here. We might be writing + // a thunk into the IORef (e.g. in atomicModifyIORef) and it would + // bad if another processor had to immediately send a message on + // reading the IORef in that case. + return MUT_VAR_GLOBALISE(regTableToCapability(reg), p); +} + +// The write barrier for arrays (or part of it). +StgClosure * +dirty_MUT_ARR (StgRegTable *reg, StgClosure *p) +{ + return MUT_ARR_GLOBALISE(regTableToCapability(reg), p); } // Setting a TSO's link field with a write barrier. hunk ./rts/sm/Storage.c 884 // It is *not* necessary to call this function when // * setting the link field to END_TSO_QUEUE -// * putting a TSO on the blackhole_queue // * setting the link field of the currently running TSO, as it // will already be dirty. void hunk ./rts/sm/Storage.c 891 { if (tso->dirty == 0) { tso->dirty = 1; - recordClosureMutated(cap,(StgClosure*)tso); + recordClosureMutated_(cap,(StgClosure*)tso); } tso->_link = target; } hunk ./rts/sm/Storage.c 901 { if (tso->dirty == 0) { tso->dirty = 1; - recordClosureMutated(cap,(StgClosure*)tso); + recordClosureMutated_(cap,(StgClosure*)tso); } tso->block_info.prev = target; } hunk ./rts/sm/Storage.c 911 { if (tso->dirty == 0) { tso->dirty = 1; - recordClosureMutated(cap,(StgClosure*)tso); + recordClosureMutated_(cap,(StgClosure*)tso); } } hunk ./rts/sm/Storage.c 920 { if (stack->dirty == 0) { stack->dirty = 1; - recordClosureMutated(cap,(StgClosure*)stack); + recordClosureMutated_(cap,(StgClosure*)stack); } } hunk ./rts/sm/Storage.c 932 this really does make a difference on concurrency-heavy benchmarks such as Chaneneos and cheap-concurrency. */ -void -dirty_MVAR(StgRegTable *reg, StgClosure *p) -{ - recordClosureMutated(regTableToCapability(reg),p); -} +/* + * Not currently called, but might be required in the future if we + * have multiple local generations. + */ +// void +// dirty_MVAR(StgRegTable *reg, StgClosure *p) +// { +// recordClosureMutated(regTableToCapability(reg),p); +// } /* ----------------------------------------------------------------------------- * Stats and stuff hunk ./rts/sm/Storage.c 954 * This leaves a little slop at the end of each block. * -------------------------------------------------------------------------- */ +lnat +calcAllocatedCap (Capability *cap, rtsBool include_nursery) +{ + nat allocated = 0; + bdescr *bd; + + if (include_nursery) + { + for (bd = cap->r.rNursery->blocks; bd; bd = bd->link) { + allocated += (lnat)(bd->free - bd->start); + } + } + + allocated += cap->r.rG0->n_new_large_words; + + return allocated; +} + lnat calcAllocated (rtsBool include_nurseries) { hunk ./rts/sm/Storage.c 975 - nat allocated = 0; - bdescr *bd; + nat allocated; nat i; hunk ./rts/sm/Storage.c 978 - // When called from GC.c, we already have the allocation count for - // the nursery from resetNurseries(), so we don't need to walk - // through these block lists again. - if (include_nurseries) - { - for (i = 0; i < n_capabilities; i++) { - for (bd = nurseries[i].blocks; bd; bd = bd->link) { - allocated += (lnat)(bd->free - bd->start); - } - } + allocated = 0; + for (i = 0; i < n_capabilities; i++) { + allocated += calcAllocatedCap(&capabilities[i], include_nurseries); } hunk ./rts/sm/Storage.c 983 - // add in sizes of new large and pinned objects - allocated += g0->n_new_large_words; - return allocated; } hunk ./rts/sm/Storage.c 992 lnat calcLiveBlocks (void) { nat g; - lnat live = 0; - generation *gen; + lnat live; hunk ./rts/sm/Storage.c 994 - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - /* approximate amount of live data (doesn't take into account slop - * at end of each block). - */ - gen = &generations[g]; - live += gen->n_large_blocks + gen->n_blocks; + live = 0; + for (g = 0; g < total_generations; g++) { + // approximate amount of live data (doesn't take into account slop + // at end of each block). + live += all_generations[g].n_blocks + all_generations[g].n_large_blocks; } return live; } hunk ./rts/sm/Storage.c 1015 return words; } -// Return an accurate count of the live data in the heap, excluding -// generation 0. +// Return an accurate count of the live data in the heap lnat calcLiveWords (void) { nat g; hunk ./rts/sm/Storage.c 1023 generation *gen; live = 0; - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - gen = &generations[g]; - live += gen->n_words + countOccupied(gen->large_objects); + for (g = 0; g < total_generations; g++) { + gen = &all_generations[g]; + live += gen->n_words + gen->n_prim_words + countOccupied(gen->large_objects); } return live; } hunk ./rts/sm/Storage.c 1044 nat g; generation *gen; - for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - gen = &generations[g]; + for (g = 0; g < total_generations; g++) { + gen = &all_generations[g]; // we need at least this much space needed += gen->n_blocks + gen->n_large_blocks; hunk ./rts/sm/Storage.c 1051 // any additional space needed to collect this gen next time? - if (g == 0 || // always collect gen 0 + if (gen->no == 0 || // always collect gen 0 (gen->n_blocks + gen->n_large_blocks > gen->max_blocks)) { // we will collect this gen next time if (gen->mark) { hunk ./rts/sm/Storage.h 41 /* ----------------------------------------------------------------------------- Generational garbage collection support - recordMutable(StgPtr p) Informs the garbage collector that a - previously immutable object has - become (permanently) mutable. Used - by thawArray and similar. - updateWithIndirection(p1,p2) Updates the object at p1 with an indirection pointing to p2. This is normally called for objects in an old hunk ./rts/sm/Storage.h 46 generation (>0) when they are updated. - updateWithPermIndirection(p1,p2) As above but uses a permanent indir. - -------------------------------------------------------------------------- */ /* hunk ./rts/sm/Storage.h 65 #define ASSERT_SM_LOCK() #endif -INLINE_HEADER void -recordMutableGen(StgClosure *p, nat gen_no) -{ - bdescr *bd; - - bd = generations[gen_no].mut_list; - if (bd->free >= bd->start + BLOCK_SIZE_W) { - bdescr *new_bd; - new_bd = allocBlock(); - new_bd->link = bd; - bd = new_bd; - generations[gen_no].mut_list = bd; - } - *bd->free++ = (StgWord)p; - -} - -INLINE_HEADER void -recordMutableGenLock(StgClosure *p, nat gen_no) -{ - ACQUIRE_SM_LOCK; - recordMutableGen(p,gen_no); - RELEASE_SM_LOCK; -} - -INLINE_HEADER void -recordMutable(StgClosure *p) -{ - bdescr *bd; - ASSERT(closure_MUTABLE(p)); - bd = Bdescr((P_)p); - if (bd->gen_no > 0) recordMutableGen(p, bd->gen_no); -} - -INLINE_HEADER void -recordMutableLock(StgClosure *p) -{ - ACQUIRE_SM_LOCK; - recordMutable(p); - RELEASE_SM_LOCK; -} - -/* ----------------------------------------------------------------------------- - The write barrier for MVARs - -------------------------------------------------------------------------- */ - -void dirty_MVAR(StgRegTable *reg, StgClosure *p); - /* ----------------------------------------------------------------------------- Nursery manipulation -------------------------------------------------------------------------- */ hunk ./rts/sm/Storage.h 71 extern nursery *nurseries; +void resetNursery ( nat n ); void resetNurseries ( void ); hunk ./rts/sm/Storage.h 73 + +lnat clearNursery ( nat n ); lnat clearNurseries ( void ); hunk ./rts/sm/Storage.h 76 + +void resizeNursery ( Capability *cap, nat blocks ); void resizeNurseries ( nat blocks ); void resizeNurseriesFixed ( nat blocks ); hunk ./rts/sm/Storage.h 80 + lnat countNurseryBlocks ( void ); /* ----------------------------------------------------------------------------- hunk ./rts/sm/Storage.h 88 -------------------------------------------------------------------------- */ lnat calcAllocated (rtsBool count_nurseries); +lnat calcAllocatedCap (Capability *cap, rtsBool count_nursery); lnat calcLiveBlocks (void); lnat calcLiveWords (void); lnat countOccupied (bdescr *bd); hunk ./rts/sm/Sweep.c 19 #include "BlockAlloc.h" #include "Sweep.h" +#include "Compact.h" #include "Trace.h" hunk ./rts/sm/Sweep.c 21 +#include "GCUtils.h" void sweep(generation *gen) hunk ./rts/sm/Sweep.c 89 ASSERT(countBlocks(gen->old_blocks) == gen->n_old_blocks); } + + +typedef enum { Empty, LiveLocal, LiveGlobal } IsEmpty; + +static IsEmpty +empty (bdescr *bd) +{ + StgPtr p; + StgWord flag, size; + IsEmpty is_empty; + + is_empty = Empty; + p = bd->start; + while (p < bd->free) { + flag = *p; + p += 1; + switch (flag) { + case 0: // not global + size = closure_sizeW((StgClosure*)p); + if (!is_marked(p,bd)) { + *(p-1) = size+2; + } else { + is_empty = LiveLocal; + } + p += size; + break; + case 1: // global +#ifndef DEBUG + // in DEBUG, we need to sweep the whole lot, for sanity checking + return LiveGlobal; +#else + is_empty = LiveGlobal; + p += closure_sizeW((StgClosure*)p); +#endif + break; + default: // reclaimed free space of size 'flag-2' + p += flag-2; + break; + } + } + + return is_empty; +} + +void +sweepPrimArea (generation *gen) +{ + bdescr *bd, *prev, *next; + + prev = NULL; + for (bd = gen->prim_blocks; bd != NULL; bd = next) + { + next = bd->link; + + // The BF_GLOBAL flag indicates that this block contains one + // or more global objects. There's no point in sweeping it, + // because we can't free the block. +#ifndef DEBUG + // Not in DEBUG mode: we might need to sanity-check the prim + // heap, so we need to sweep the free areas. + if (bd->flags & BF_GLOBAL) { + prev = bd; + continue; + } +#endif + + switch (empty(bd)) { + case Empty: + if (prev == NULL) { + gen->prim_blocks = next; + } else { + prev->link = next; + } + debugTrace(DEBUG_gc, "sweepPrimArea: free block at %p", bd->start); + freeGroup_sync(bd); + gen->n_prim_blocks--; + break; + + case LiveGlobal: + bd->flags |= BF_GLOBAL; + // fall through + case LiveLocal: + prev = bd; + break; + } + } + + IF_DEBUG(sanity, ASSERT(countBlocks(gen->prim_blocks) == gen->n_prim_blocks)); + +} hunk ./rts/sm/Sweep.h 18 #define SM_SWEEP_H RTS_PRIVATE void sweep(generation *gen); +RTS_PRIVATE void sweepPrimArea (generation *gen); #endif /* SM_SWEEP_H */ hunk ./utils/genapply/GenApply.hs 383 reg_doc, text " Sp_adj(" <> int sp' <> text ");", -- enter, but adjust offset with tag - text " jump " <> text "%GET_ENTRY(R1-" <> int tag <> text ");", + text " info = %INFO_PTR(R1-" <> int tag <> text ");", + text " if (IS_FORWARDING_PTR(info)) { R1 = UN_FORWARDING_PTR(info) + " <> int tag <> text "; info = %INFO_PTR(R1-" <> int tag <> text "); }", + text " jump " <> text "%ENTRY_CODE(info);", text "}" ] -- I don't totally understand this code, I copied it from hunk ./utils/genapply/GenApply.hs 570 text "case AP,", text " AP_STACK,", text " BLACKHOLE,", + text " IND_LOCAL,", text " WHITEHOLE,", text " THUNK,", text " THUNK_1_0,", hunk ./utils/genapply/GenApply.hs 644 -- Functions can be tagged, so we untag them! text "R1 = UNTAG(R1);", - text "info = %GET_STD_INFO(R1);", + text "info = %INFO_PTR(R1);", + -- check for forwarding pointers + text "if (IS_FORWARDING_PTR(info)) { R1 = UN_FORWARDING_PTR(info); info = %INFO_PTR(R1); }", + text "info = %STD_INFO(info);", text "switch [INVALID_OBJECT .. N_CLOSURE_TYPES] (TO_W_(%INFO_TYPE(info))) {", nest 4 (vcat [ text "case FUN,", } [fix an assertion Simon Marlow **20110118123326 Ignore-this: 955826b876d18367cf0cb064d23fcac2 ] hunk ./rts/Messages.c 157 g->req, (lnat)g->tso->id); p = UNTAG_CLOSURE(g->req); - ASSERT(isGlobal((StgClosure*)p)); + ASSERT(!HEAP_ALLOCED(p) || isGlobal((StgClosure*)p)); info = get_itbl(p); // paranoia [allocate 4 blocks at a time to reduce contention and overhead Simon Marlow **20110118123354 Ignore-this: 6ecf78cb3f08e000d4f7555980a964d0 ] { hunk ./rts/sm/GCUtils.c 66 } -#define ALLOCBLOCKS 0 +#define ALLOCBLOCKS 4 #if ALLOCBLOCKS static void hunk ./rts/sm/GCUtils.c 69 -allocBlocks_sync(nat n, bdescr **hd, bdescr **tl, +allocChain_sync(nat n, bdescr **hd, bdescr **tl, generation *gen, StgWord32 flags) { bdescr *bd; hunk ./rts/sm/GCUtils.c 76 nat i; if (gct->gc_type == GC_LOCAL) { - bd = allocGroup_lock(n); + ACQUIRE_SM_LOCK; } else { ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync); hunk ./rts/sm/GCUtils.c 79 - bd = allocGroup(n); - RELEASE_SPIN_LOCK(&gc_alloc_block_sync); } hunk ./rts/sm/GCUtils.c 81 + bd = allocGroup(n); + + // NB. hold the lock while we fiddle with the block group, + // otherwise we can invalidate invariants of the block allocator. + // e.g. the block allocator assumes that bd->blocks doesn't change + // for allocated blocks, so that it can do free list coalescing. for (i = 0; i < n; i++) { bd[i].blocks = 1; initBdescr(&bd[i], gen, gen->to); hunk ./rts/sm/GCUtils.c 96 } *hd = bd; *tl = &bd[n-1]; + + if (gct->gc_type == GC_LOCAL) { + RELEASE_SM_LOCK; + } else { + RELEASE_SPIN_LOCK(&gc_alloc_block_sync); + } } #endif hunk ./rts/sm/GCUtils.c 313 #if ALLOCBLOCKS bdescr *hd, *tl; - allocBlocks_sync(16, &hd, &tl, ws->gen, BF_EVACUATED); + allocChain_sync(ALLOCBLOCKS, &hd, &tl, ws->gen, BF_EVACUATED); tl->link = ws->part_list; ws->part_list = hd->link; hunk ./rts/sm/GCUtils.c 316 - ws->n_part_blocks += 15; + ws->n_part_blocks += ALLOCBLOCKS-1; bd = hd; #else } [sanity check fix Simon Marlow **20110118123403 Ignore-this: 664d21f85fef60212704eb0b2f0e6e5b ] hunk ./rts/sm/Sanity.c 427 return arr_words_sizeW((StgArrWords *)p); case MUT_ARR_PTRS_LOCAL: + case MUT_ARR_PTRS_GLOBAL: case MUT_ARR_PTRS_FROZEN: case MUT_ARR_PTRS_FROZEN0: { [sweep fix Simon Marlow **20110118123419 Ignore-this: 6cdc753fcb60e92fce7bd228eee1f035 ] { hunk ./rts/sm/Sweep.c 163 prev->link = next; } debugTrace(DEBUG_gc, "sweepPrimArea: free block at %p", bd->start); + gen->n_prim_blocks -= bd->blocks; freeGroup_sync(bd); hunk ./rts/sm/Sweep.c 165 - gen->n_prim_blocks--; break; case LiveGlobal: } [more comments on the bdescr type Simon Marlow **20110120123923 Ignore-this: 44904b77b1096573fa0f8e2f0f354b4b ] { hunk ./includes/rts/storage/Block.h 49 * on a 32-bit machine. */ +// Note: fields marked with [READ ONLY] must not be modified by the +// client of the block allocator API. All other fields can be +// freely modified. + #ifndef CMINUSMINUS typedef struct bdescr_ { hunk ./includes/rts/storage/Block.h 55 - StgPtr start; /* start addr of memory */ - StgPtr free; /* first free byte of memory */ - struct bdescr_ *link; /* used for chaining blocks together */ + + StgPtr start; // [READ ONLY] start addr of memory + + StgPtr free; // first free byte of memory. + // NB. during use this value should lie + // between start and start + blocks * + // BLOCK_SIZE. Values outside this + // range are reserved for use by the + // block allocator. In particular, the + // value (StgPtr)(-1) is used to + // indicate that a block is unallocated. + + struct bdescr_ *link; // used for chaining blocks together + union { hunk ./includes/rts/storage/Block.h 70 - struct bdescr_ *back; /* used (occasionally) for doubly-linked lists*/ - StgWord *bitmap; - StgPtr scan; /* scan pointer for copying GC */ + struct bdescr_ *back; // used (occasionally) for doubly-linked lists + StgWord *bitmap; // bitmap for marking GC + StgPtr scan; // scan pointer for copying GC } u; hunk ./includes/rts/storage/Block.h 75 - struct generation_ *gen; /* generation */ + struct generation_ *gen; // generation StgWord16 gen_no; // gen->no, cached StgWord16 gen_ix; // gen->ix, cached hunk ./includes/rts/storage/Block.h 81 StgWord16 dest_ix; // ix of destination generation - StgWord16 flags; /* block flags, see below */ + StgWord16 flags; // block flags, see below hunk ./includes/rts/storage/Block.h 83 - StgWord32 blocks; /* no. of blocks (if grp head, 0 otherwise) */ + StgWord32 blocks; // [READ ONLY] no. of blocks in a group + // (if group head, 0 otherwise) #if SIZEOF_VOID_P == 8 StgWord32 _padding[2]; } [don't use the global g0, use cap->r.rG0 instead Simon Marlow **20110120123948 Ignore-this: 88225035a0e53ff80c7799b9310936e9 ] hunk ./rts/sm/Storage.c 778 bd->free = bd->start; } - g0->n_new_large_words += n; + cap->r.rG0->n_new_large_words += n; p = bd->free; bd->free += n; return p; [fix slop calculation Simon Marlow **20110120124235 Ignore-this: 6788facb495c998363d6aa90cb7d7a0 ] hunk ./rts/sm/Storage.c 998 for (g = 0; g < total_generations; g++) { // approximate amount of live data (doesn't take into account slop // at end of each block). - live += all_generations[g].n_blocks + all_generations[g].n_large_blocks; + live += all_generations[g].n_blocks + + all_generations[g].n_prim_blocks + + all_generations[g].n_large_blocks; } return live; } [80 columnize Simon Marlow **20110120124247 Ignore-this: f64d8231852e27b159b15697f6aa79e1 ] hunk ./rts/sm/Storage.c 1027 live = 0; for (g = 0; g < total_generations; g++) { gen = &all_generations[g]; - live += gen->n_words + gen->n_prim_words + countOccupied(gen->large_objects); + live += gen->n_words + + gen->n_prim_words + + countOccupied(gen->large_objects); } return live; } [use allocatePrim for MUT_ARR_PTRS_FROZEN Simon Marlow **20110120150518 Ignore-this: 73a090243ba1a186c3e29f7539da704e ] hunk ./rts/Weak.c 129 debugTrace(DEBUG_weak, "weak: batching %d finalizers", n); size = n + mutArrPtrsCardTableSize(n); - arr = (StgMutArrPtrs *)allocate(cap, sizeofW(StgMutArrPtrs) + size); + arr = (StgMutArrPtrs *)allocatePrim(cap, sizeofW(StgMutArrPtrs) + size); TICK_ALLOC_PRIM(sizeofW(StgMutArrPtrs), n, 0); SET_HDR(arr, &stg_MUT_ARR_PTRS_FROZEN_info, CCS_SYSTEM); arr->ptrs = n; [PLAN updates Simon Marlow **20110120160928 Ignore-this: 306282622ba39f1d523a5772261c148e ] { hunk ./PLAN 4 ----------------------------------------------------------------------------- -- Plan - - allocate multiple blocks at the same time to reduce sm_mutex - contention. This caused crashes before when I tried it - why? + - fix hangs in ray with localgc N=16/20 hunk ./PLAN 6 - - fix crashes on the 24-core machine. + - nbody slower with -N1? Something recent in HEAD? - measure CnC properly hunk ./PLAN 15 - try to improve the message-queue locking. Multiple writers should be non-blocking. - - parTree: do we have deep stacks? - - - Push down local_gc_lock further? Do we have contention problems? - Measure. - - BLOCKING_QUEUE can point to IND_LOCAL instead of the BLACKHOLE, assertion at Message.c:300 fails hunk ./PLAN 33 a bit of a hack, and the nursery has to be resized after each collection. - - run tests, stabilise - - calcAllocated: take into account allocations from allocatePrim(). - threadStackOverflow: hacky -1s to account for global flags hunk ./PLAN 58 ----------------------------------------------------------------------------- Tuning + - Push down local_gc_lock further? Do we have contention problems? + Measure. + - ray: using parBuffer 1000 much better than parBuffer 200 for -N8, and we beat HEAD now. hunk ./PLAN 64 - - for some reason, allowing the inbox of a capability to accumulate - messages (up to 50) before context-switching it works much better - than eagerly context-switching. - - instead of setting failed_to_evac, just globalise_evac hunk ./PLAN 65 - immediately. - + immediately +. - sparks: can we do better than just globalising all sparks? e.g. separate the spark pool into local and global pools hunk ./PLAN 231 * XXX we broke LDV profiling in CgTailCall * XXX we probably don't need to duplicate the whole of Scav.c/Evac.c -* XXX slop calculation is wrong: 4,294,967,128 bytes maximum slop * XXX major_gc is global, we rely on it always being rtsFalse for local GC * XXX what to do about heapSizeSuggestion and resize_nursery: should we resize the current nursery accoring to the amount of stuff that } [fix for globalising BLOCKING_QUEUEs Simon Marlow **20110120160948 Ignore-this: 432594d834bf214535b889e1baed76c2 ] { hunk ./rts/sm/Globalise.c 974 case MVAR_DIRTY: case MUT_VAR_GLOBAL: case TREC_CHUNK: - case BLOCKING_QUEUE: { StgPtr end; hunk ./rts/sm/Globalise.c 984 return p + info->layout.payload.nptrs; } + case BLOCKING_QUEUE: + { + StgBlockingQueue *bq = (StgBlockingQueue*)p; + + globalise_evac((StgClosure **)&bq->owner); + globalise_evac((StgClosure **)&bq->bh); + globalise_evac((StgClosure **)&bq->owner); + // no need to globalise bq->queue; that is private + + if (get_itbl(bq->bh)->type == IND_LOCAL) + { + // the BLACKHOLE is in a local heap. We better update the + // BLOCKING_QUEUE to point directly to the BLACKHOLE; this + // is ok, because BLOCKING_QUEUE is a private object + // (except for the bq->tso pointer, which other + // Capabilities might read). + // + // If we are migrating the owner, then the stack will have + // been globalised first, which will have moved the + // BLACKHOLE into the global heap, so we won't encounter + // this problem. + bq->bh = ((StgInd*)bq->bh)->indirectee; + } + + return p + sizeofW(StgBlockingQueue); + } + case WEAK: { StgWeak *w = (StgWeak *)p; } [allocate prim blocks from the block allocator, not the nursery Simon Marlow **20110120161018 Ignore-this: 60d6f17e3a5956b630195e95545b7935 ] { hunk ./rts/sm/Storage.c 807 bd = cap->r.rG0->prim_blocks; if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) { - - // The current prim block is full, we need to find another - // one. First, we try taking the next block from the - // nursery: - bd = cap->r.rCurrentNursery->link; - - if (bd == NULL || bd->free + n > bd->start + BLOCK_SIZE_W) { - // The nursery is empty, or the next block is already - // full: allocate a fresh block (we can't fail here). - ACQUIRE_SM_LOCK; - bd = allocBlock(); - RELEASE_SM_LOCK; - // the bd->dest for prim blocks is the global gen. If we - // didn't do this, then a global collection - // would have to copy live non-global prim objects into a - // new BF_PRIM block. - initBdescr(bd, cap->r.rG0, global_gen); - // If we had to allocate a new block, then we'll GC - // pretty quickly now, because MAYBE_GC() will - // notice that CurrentNursery->link is NULL. - } else { - // we have a block in the nursery: take it out and use it to - // to allocatePrim() from. - cap->r.rCurrentNursery->link = bd->link; - cap->r.rNursery->n_blocks -= bd->blocks; - if (bd->link != NULL) { - bd->link->u.back = cap->r.rCurrentNursery; - } - } + bd = allocBlock_lock(); + initBdescr(bd, cap->r.rG0, global_gen); + bd->free = bd->start; bd->flags = BF_PRIM; bd->link = cap->r.rG0->prim_blocks; bd->dest_ix = global_gen_ix; hunk ./rts/sm/Storage.c 815 cap->r.rG0->prim_blocks = bd; // local gen, no need to lock cap->r.rG0->n_prim_blocks += bd->blocks; - IF_DEBUG(sanity, checkNurserySanity(cap->no)); - IF_DEBUG(sanity, ASSERT(countBlocks(cap->r.rG0->prim_blocks) == cap->r.rG0->n_prim_blocks)); + IF_DEBUG(sanity, ASSERT(countBlocks(cap->r.rG0->prim_blocks) == + cap->r.rG0->n_prim_blocks)); } hunk ./rts/sm/Storage.c 819 + // account for the allocation in n_new_large_words; this both + // triggers GC and counts for allocation. + cap->r.rG0->n_new_large_words += n; *bd->free = 0; // set the global flag to 0 p = bd->free + 1; bd->free += n; } [add -HA, fixedAllHeapSizeSuggestion for measurements Simon Marlow **20110121124916 Ignore-this: f64b3df187b1dae60fc2dd6e90a11d14 ] { hunk ./includes/rts/Flags.h 39 nat minAllocAreaSize; /* in *blocks* */ nat minOldGenSize; /* in *blocks* */ nat heapSizeSuggestion; /* in *blocks* */ + nat fixedAllocHeapSizeSuggestion; /* in *blocks* */ rtsBool heapSizeSuggestionAuto; double oldGenFactor; double pcFreeHeap; hunk ./rts/RtsFlags.c 79 RtsFlags.GcFlags.minOldGenSize = (1024 * 1024) / BLOCK_SIZE; RtsFlags.GcFlags.maxHeapSize = 0; /* off by default */ RtsFlags.GcFlags.heapSizeSuggestion = 0; /* none */ + RtsFlags.GcFlags.fixedAllocHeapSizeSuggestion = 0; /* none */ RtsFlags.GcFlags.heapSizeSuggestionAuto = rtsFalse; RtsFlags.GcFlags.pcFreeHeap = 3; /* 3% */ RtsFlags.GcFlags.oldGenFactor = 2; hunk ./rts/RtsFlags.c 763 case 'H': if (rts_argv[arg][2] == '\0') { RtsFlags.GcFlags.heapSizeSuggestionAuto = rtsTrue; + } else if (rts_argv[arg][2] == 'A') { + RtsFlags.GcFlags.fixedAllocHeapSizeSuggestion = + (nat)(decodeSize(rts_argv[arg], 3, BLOCK_SIZE, HS_WORD_MAX) / BLOCK_SIZE); } else { RtsFlags.GcFlags.heapSizeSuggestion = (nat)(decodeSize(rts_argv[arg], 2, BLOCK_SIZE, HS_WORD_MAX) / BLOCK_SIZE); hunk ./rts/sm/GC.c 1765 live = (words + BLOCK_SIZE_W - 1) / BLOCK_SIZE_W + oldest_gen->n_large_blocks; - // default max size for all generations except zero - size = stg_max(live * RtsFlags.GcFlags.oldGenFactor, - RtsFlags.GcFlags.minOldGenSize); + // default max size for all generations except zero + if (RtsFlags.GcFlags.fixedAllocHeapSizeSuggestion) + { + size = (RtsFlags.GcFlags.heapSizeSuggestion + - live + - n_capabilities * RtsFlags.GcFlags.minAllocAreaSize) + / (2 * (RtsFlags.GcFlags.generations - 1)); + } + else + { + size = live * RtsFlags.GcFlags.oldGenFactor; + } + + size = stg_max(size, RtsFlags.GcFlags.minOldGenSize); if (RtsFlags.GcFlags.heapSizeSuggestionAuto) { RtsFlags.GcFlags.heapSizeSuggestion = size; } [fix live/slop stats Simon Marlow **20110125125303 Ignore-this: 18464335b17298693be1150383cb372c ] { hunk ./rts/Stats.c 886 for (bd = gen->large_objects, lge = 0; bd; bd = bd->link) { lge++; } - gen_live = gen->n_words + gen->n_prim_words + countOccupied(gen->large_objects); - gen_blocks = gen->n_blocks + gen->n_large_blocks + gen->n_prim_blocks; + + gen_live = genLiveWords(gen); + gen_blocks = genLiveBlocks(gen); slop = gen_blocks * BLOCK_SIZE_W - gen_live; hunk ./rts/Stats.c 899 for (i = 0; i < n_capabilities; i++) { cap_mut = countOccupied(capabilities[i].mut_lists[g]); - cap_live = countOccupied(gc_threads[i]->gens[n].todo_bd); - cap_live += countOccupied(gc_threads[i]->gens[n].part_list); - cap_live += countOccupied(gc_threads[i]->gens[n].scavd_list); - - cap_blocks = countBlocks(gc_threads[i]->gens[n].todo_bd); - cap_blocks += gc_threads[i]->gens[n].n_part_blocks; - cap_blocks += gc_threads[i]->gens[n].n_scavd_blocks; + cap_live = gcThreadLiveWords(i,n); + cap_blocks = gcThreadLiveBlocks(i,n); slop = cap_blocks * BLOCK_SIZE_W - cap_live; hunk ./rts/sm/GC.c 159 static void wakeup_gc_threads (nat me, nat N); static void shutdown_gc_threads (nat me); static void collect_gct_blocks (void); -static lnat count_part_lists (nat start_gen); static void freeMarkStack (void); #if 0 && defined(DEBUG) hunk ./rts/sm/GC.c 178 { bdescr *bd; generation *gen; - lnat live, allocated, copied, max_copied, avg_copied, slop; + lnat live_words, live_blocks; + lnat allocated, copied, max_copied, avg_copied, slop; gc_thread *saved_gct; nat g, n; hunk ./rts/sm/GC.c 487 // Run through all the generations/steps and tidy up. // We're going to: - // - count the amount of "live" data (live) + // - count the amount of "live" data (live_words, live_blocks) // - count the amount of "copied" data in this GC (copied) // - free from-space // - make to-space the new from-space (set BF_EVACUATED on all blocks) hunk ./rts/sm/GC.c 493 // - sweep the prim area // - live = 0; + live_words = 0; + live_blocks = 0; + for (n = 0; n < total_generations; n++) { gen = &all_generations[n]; hunk ./rts/sm/GC.c 512 if (n_gc_threads > 1) gen->par_collections++; } - // Count "live" data. Do it here rather than in calcLiveWords - // because we're inside the gen->sync lock. - live += gen->n_words + gen->n_prim_words + - countOccupied(gen->large_objects); - // Count the mutable list as bytes "copied" for the purposes of // stats. Every mutable list is copied during every GC. if (g > 0) { hunk ./rts/sm/GC.c 615 gen->large_objects = gen->scavenged_large_objects; gen->n_large_blocks = gen->n_scavenged_large_blocks; gen->n_new_large_words = 0; - ASSERT(countBlocks(gen->large_objects) == gen->n_large_blocks); } else // for generations > N { hunk ./rts/sm/GC.c 636 gen->scavenged_large_objects = NULL; gen->n_scavenged_large_blocks = 0; + // Count "live" data. Do it here rather than in calcLiveWords + // because we're inside the gen->sync lock. + live_words += genLiveWords(gen); + live_blocks += genLiveBlocks(gen); + + // add in the partial blocks in the gen_workspaces, but ignore gen 0 + // if this is a local GC (we can't count another capability's part_list) + if (gc_type == GC_LOCAL) { + live_words += gcThreadLiveWords(gct->index, gen->no); + live_blocks += gcThreadLiveBlocks(gct->index, gen->no); + } else { + nat i; + for (i = 0; i < n_capabilities; i++) { + live_words += gcThreadLiveWords(i, gen->no); + live_blocks += gcThreadLiveBlocks(i, gen->no); + } + } + RELEASE_SPIN_LOCK(&gen->sync); } // for all generations hunk ./rts/sm/GC.c 660 // update the max size of older generations after a major GC resize_generations(); - // add in the partial blocks in the gen_workspaces, but ignore gen 0 - // if this is a local GC (we can't count another capability's part_list) - live += count_part_lists( (gc_type == GC_LOCAL) ? n_capabilities : 0 ); - // Start a new pinned_object_block if (gc_type == GC_LOCAL) { cap->pinned_object_block = NULL; hunk ./rts/sm/GC.c 729 if (gc_type == GC_LOCAL) { ACQUIRE_LOCK(&gc_local_mutex); } // ok, GC over: tell the stats department what happened. - slop = calcLiveBlocks() * BLOCK_SIZE_W - live; - stat_endGC(gct, allocated, live, copied, N, max_copied, avg_copied, slop); + slop = live_blocks * BLOCK_SIZE_W - live_words; + stat_endGC(gct, allocated, live_words, + copied, N, max_copied, avg_copied, slop); // make sure our mut_lists don't point to anything local. This step // also moves mut_list entries to the right mut_list if they ended hunk ./rts/sm/GC.c 1094 } } -static lnat -count_part_lists (nat start_gen) -{ - nat n, g; - lnat words; - - words = 0; - for (n = 0; n < n_capabilities; n++) { - for (g = start_gen; g < total_generations; g++) { - words += countOccupied(gc_threads[n]->gens[g].part_list); - } - } - return words; -} - /* ---------------------------------------------------------------------------- Start GC threads ------------------------------------------------------------------------- */ hunk ./rts/sm/Storage.c 18 #include "Rts.h" #include "Storage.h" +#include "GCThread.h" #include "RtsUtils.h" #include "Stats.h" #include "BlockAlloc.h" hunk ./rts/sm/Storage.c 964 return allocated; } -/* Approximate the amount of live data in the heap. To be called just - * after garbage collection (see GarbageCollect()). - */ -lnat calcLiveBlocks (void) -{ - nat g; - lnat live; - - live = 0; - for (g = 0; g < total_generations; g++) { - // approximate amount of live data (doesn't take into account slop - // at end of each block). - live += all_generations[g].n_blocks - + all_generations[g].n_prim_blocks - + all_generations[g].n_large_blocks; - } - return live; -} - lnat countOccupied (bdescr *bd) { lnat words; hunk ./rts/sm/Storage.c 976 return words; } +lnat genLiveWords (generation *gen) +{ + return gen->n_words + + gen->n_prim_words + + countOccupied(gen->large_objects); +} + +lnat genLiveBlocks (generation *gen) +{ + return gen->n_blocks + + gen->n_prim_blocks + + gen->n_large_blocks; +} + +lnat gcThreadLiveWords (nat i, nat g) +{ + lnat words; + + words = countOccupied(gc_threads[i]->gens[g].todo_bd); + words += countOccupied(gc_threads[i]->gens[g].part_list); + words += countOccupied(gc_threads[i]->gens[g].scavd_list); + + return words; +} + +lnat gcThreadLiveBlocks (nat i, nat g) +{ + lnat blocks; + + blocks = countBlocks(gc_threads[i]->gens[g].todo_bd); + blocks += gc_threads[i]->gens[g].n_part_blocks; + blocks += gc_threads[i]->gens[g].n_scavd_blocks; + + return blocks; +} + // Return an accurate count of the live data in the heap hunk ./rts/sm/Storage.c 1013 +// NB. this doesn't include live data held in the gc_thread structures +// (see gcThreadLiveWords()). lnat calcLiveWords (void) { nat g; hunk ./rts/sm/Storage.c 1019 lnat