rbdlsim/3rdparty/tracy/examples/ToyPathTracer/Source/enkiTS/TaskScheduler.cpp

// Copyright (c) 2013 Doug Binks
// 
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
// 
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
// 
// 1. The origin of this software must not be misrepresented; you must not
//    claim that you wrote the original software. If you use this software
//    in a product, an acknowledgement in the product documentation would be
//    appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
//    misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.

#include <assert.h>

#include "TaskScheduler.h"
#include "LockLessMultiReadPipe.h"


using namespace enki;


static const uint32_t PIPESIZE_LOG2              = 8;
static const uint32_t SPIN_COUNT                 = 100;
static const uint32_t SPIN_BACKOFF_MULTIPLIER    = 10;
static const uint32_t MAX_NUM_INITIAL_PARTITIONS = 8;

// each software thread gets it's own copy of gtl_threadNum, so this is safe to use as a static variable
static THREAD_LOCAL uint32_t                             gtl_threadNum       = 0;

namespace enki 
{
	struct SubTaskSet
	{
		ITaskSet*           pTask;
		TaskSetPartition    partition;
	};

	// we derive class TaskPipe rather than typedef to get forward declaration working easily
	class TaskPipe : public LockLessMultiReadPipe<PIPESIZE_LOG2,enki::SubTaskSet> {};

	struct ThreadArgs
	{
		uint32_t		threadNum;
		TaskScheduler*  pTaskScheduler;
	};
}

namespace
{
	SubTaskSet       SplitTask( SubTaskSet& subTask_, uint32_t rangeToSplit_ )
	{
		SubTaskSet splitTask = subTask_;
		uint32_t rangeLeft = subTask_.partition.end - subTask_.partition.start;

        if( rangeToSplit_ > rangeLeft )
        {
            rangeToSplit_ = rangeLeft;
        }
        splitTask.partition.end = subTask_.partition.start + rangeToSplit_;
		subTask_.partition.start = splitTask.partition.end;
		return splitTask;
	}

	#if defined _WIN32
		#if defined _M_IX86  || defined _M_X64
			#pragma intrinsic(_mm_pause)
			inline void Pause() { _mm_pause(); }
		#endif
	#elif defined __i386__ || defined __x86_64__
		inline void Pause() { __asm__ __volatile__("pause;"); }
	#else
		inline void Pause() { ;} // may have NOP or yield equiv
	#endif
}


static void SafeCallback(ProfilerCallbackFunc func_, uint32_t threadnum_)
{
	if( func_ )
	{
		func_(threadnum_);
	}
}

ProfilerCallbacks* TaskScheduler::GetProfilerCallbacks()
{
	return &m_ProfilerCallbacks;
}

THREADFUNC_DECL TaskScheduler::TaskingThreadFunction( void* pArgs )
{
	ThreadArgs args					= *(ThreadArgs*)pArgs;
	uint32_t threadNum				= args.threadNum;
	TaskScheduler*  pTS				= args.pTaskScheduler;
    gtl_threadNum      = threadNum;

	SafeCallback( pTS->m_ProfilerCallbacks.threadStart, threadNum );
    
    uint32_t spinCount = 0;
	uint32_t hintPipeToCheck_io = threadNum + 1;	// does not need to be clamped.
    while( pTS->m_bRunning )
    {
        if(!pTS->TryRunTask( threadNum, hintPipeToCheck_io ) )
        {
            // no tasks, will spin then wait
            ++spinCount;
            if( spinCount > SPIN_COUNT )
            {
				pTS->WaitForTasks( threadNum );
				spinCount = 0;
            }
			else
			{
				uint32_t spinBackoffCount = spinCount * SPIN_BACKOFF_MULTIPLIER;
				while( spinBackoffCount )
				{
					Pause();
					--spinBackoffCount;
				}
			}
        }
        else
        {
            spinCount = 0;
        }
    }

    AtomicAdd( &pTS->m_NumThreadsRunning, -1 );
	SafeCallback( pTS->m_ProfilerCallbacks.threadStop, threadNum );

    return 0;
}


void TaskScheduler::StartThreads()
{
    if( m_bHaveThreads )
    {
        return;
    }
    m_bRunning = true;

    SemaphoreCreate( m_NewTaskSemaphore );

    // we create one less thread than m_NumThreads as the main thread counts as one
    m_pThreadNumStore = new ThreadArgs[m_NumThreads];
    m_pThreadIDs      = new threadid_t[m_NumThreads];
	m_pThreadNumStore[0].threadNum      = 0;
	m_pThreadNumStore[0].pTaskScheduler = this;
	m_pThreadIDs[0] = 0;
    m_NumThreadsWaiting = 0;
    m_NumThreadsRunning = 1;// acount for main thread
    for( uint32_t thread = 1; thread < m_NumThreads; ++thread )
    {
		m_pThreadNumStore[thread].threadNum      = thread;
		m_pThreadNumStore[thread].pTaskScheduler = this;
        ThreadCreate( &m_pThreadIDs[thread], TaskingThreadFunction, &m_pThreadNumStore[thread] );
        ++m_NumThreadsRunning;
    }

    // ensure we have sufficient tasks to equally fill either all threads including main
    // or just the threads we've launched, this is outside the firstinit as we want to be able
    // to runtime change it
	if( 1 == m_NumThreads )
	{
		m_NumPartitions = 1;
		m_NumInitialPartitions = 1;
	}
	else
	{
		m_NumPartitions = m_NumThreads * (m_NumThreads - 1);
		m_NumInitialPartitions = m_NumThreads - 1;
		if( m_NumInitialPartitions > MAX_NUM_INITIAL_PARTITIONS )
		{
			m_NumInitialPartitions = MAX_NUM_INITIAL_PARTITIONS;
		}
	}

    m_bHaveThreads = true;
}

void TaskScheduler::StopThreads( bool bWait_ )
{
    if( m_bHaveThreads )
    {
        // wait for them threads quit before deleting data
        m_bRunning = false;
        while( bWait_ && m_NumThreadsRunning > 1 )
        {
            // keep firing event to ensure all threads pick up state of m_bRunning
            SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsRunning );
        }

        for( uint32_t thread = 1; thread < m_NumThreads; ++thread )
        {
            ThreadTerminate( m_pThreadIDs[thread] );
        }

		m_NumThreads = 0;
        delete[] m_pThreadNumStore;
        delete[] m_pThreadIDs;
        m_pThreadNumStore = 0;
        m_pThreadIDs = 0;
        SemaphoreClose( m_NewTaskSemaphore );

        m_bHaveThreads = false;
		m_NumThreadsWaiting = 0;
		m_NumThreadsRunning = 0;
    }
}

bool TaskScheduler::TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ )
{
    // check for tasks
    SubTaskSet subTask;
    bool bHaveTask = m_pPipesPerThread[ threadNum ].WriterTryReadFront( &subTask );

	uint32_t threadToCheck = hintPipeToCheck_io_;
	uint32_t checkCount = 0;
    while( !bHaveTask && checkCount < m_NumThreads )
    {
		threadToCheck = ( hintPipeToCheck_io_ + checkCount ) % m_NumThreads;
		if( threadToCheck != threadNum )
		{
			bHaveTask = m_pPipesPerThread[ threadToCheck ].ReaderTryReadBack( &subTask );
		}
		++checkCount;
    }
        
    if( bHaveTask )
    {
		// update hint, will preserve value unless actually got task from another thread.
		hintPipeToCheck_io_ = threadToCheck;

		uint32_t partitionSize = subTask.partition.end - subTask.partition.start;
		if( subTask.pTask->m_RangeToRun < partitionSize )
		{
			SubTaskSet taskToRun = SplitTask( subTask, subTask.pTask->m_RangeToRun );
			SplitAndAddTask( gtl_threadNum, subTask, subTask.pTask->m_RangeToRun, 0 );
			taskToRun.pTask->ExecuteRange( taskToRun.partition, threadNum );
			AtomicAdd( &taskToRun.pTask->m_RunningCount, -1 );
		}
		else
		{

			// the task has already been divided up by AddTaskSetToPipe, so just run it
			subTask.pTask->ExecuteRange( subTask.partition, threadNum );
			AtomicAdd( &subTask.pTask->m_RunningCount, -1 );
		}
    }

    return bHaveTask;

}

void TaskScheduler::WaitForTasks( uint32_t threadNum )
{
	// We incrememt the number of threads waiting here in order
	// to ensure that the check for tasks occurs after the increment
	// to prevent a task being added after a check, then the thread waiting.
	// This will occasionally result in threads being mistakenly awoken,
	// but they will then go back to sleep.
	AtomicAdd( &m_NumThreadsWaiting, 1 );

    bool bHaveTasks = false;
    for( uint32_t thread = 0; thread < m_NumThreads; ++thread )
    {
        if( !m_pPipesPerThread[ thread ].IsPipeEmpty() )
        {
            bHaveTasks = true;
            break;
        }
    }
    if( !bHaveTasks )
    {
        SafeCallback( m_ProfilerCallbacks.waitStart, threadNum );
        SemaphoreWait( m_NewTaskSemaphore );
        SafeCallback( m_ProfilerCallbacks.waitStop, threadNum );
    }

    int32_t prev = AtomicAdd( &m_NumThreadsWaiting, -1 );
    assert( prev != 0 );
}

void TaskScheduler::WakeThreads()
{
	SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsWaiting );
}

void TaskScheduler::SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_,
	uint32_t rangeToSplit_, int32_t runningCountOffset_ )
{
    int32_t numAdded = 0;
    while( subTask_.partition.start != subTask_.partition.end )
    {
        SubTaskSet taskToAdd = SplitTask( subTask_, rangeToSplit_ );

        // add the partition to the pipe
        ++numAdded;
        if( !m_pPipesPerThread[ gtl_threadNum ].WriterTryWriteFront( taskToAdd ) )
        {
			if( numAdded > 1 )
			{
				WakeThreads();
			}
			// alter range to run the appropriate fraction
			if( taskToAdd.pTask->m_RangeToRun < rangeToSplit_ )
			{
				taskToAdd.partition.end = taskToAdd.partition.start + taskToAdd.pTask->m_RangeToRun;
				subTask_.partition.start = taskToAdd.partition.end;
			}
            taskToAdd.pTask->ExecuteRange( taskToAdd.partition, threadNum_ );
            --numAdded;
        }
    }

    // increment running count by number added
    AtomicAdd( &subTask_.pTask->m_RunningCount, numAdded + runningCountOffset_ );

	WakeThreads();
}

void    TaskScheduler::AddTaskSetToPipe( ITaskSet* pTaskSet )
{
	// set running count to -1 to guarantee it won't be found complete until all subtasks added
    pTaskSet->m_RunningCount = -1;

    // divide task up and add to pipe
    pTaskSet->m_RangeToRun = pTaskSet->m_SetSize / m_NumPartitions;
    if( pTaskSet->m_RangeToRun < pTaskSet->m_MinRange ) { pTaskSet->m_RangeToRun = pTaskSet->m_MinRange; }

	uint32_t rangeToSplit = pTaskSet->m_SetSize / m_NumInitialPartitions;
	if( rangeToSplit < pTaskSet->m_MinRange ) { rangeToSplit = pTaskSet->m_MinRange; }

    SubTaskSet subTask;
    subTask.pTask = pTaskSet;
    subTask.partition.start = 0;
    subTask.partition.end = pTaskSet->m_SetSize;
	SplitAndAddTask( gtl_threadNum, subTask, rangeToSplit, 1 );
}

void    TaskScheduler::WaitforTaskSet( const ITaskSet* pTaskSet )
{
	uint32_t hintPipeToCheck_io = gtl_threadNum + 1;	// does not need to be clamped.
	if( pTaskSet )
	{
		while( pTaskSet->m_RunningCount )
		{
			TryRunTask( gtl_threadNum, hintPipeToCheck_io );
			// should add a spin then wait for task completion event.
		}
	}
	else
	{
			TryRunTask( gtl_threadNum, hintPipeToCheck_io );
	}
}

void    TaskScheduler::WaitforAll()
{
    bool bHaveTasks = true;
 	uint32_t hintPipeToCheck_io = gtl_threadNum  + 1;	// does not need to be clamped.
	int32_t threadsRunning = m_NumThreadsRunning - 1;
    while( bHaveTasks || m_NumThreadsWaiting < threadsRunning )
    {
        TryRunTask( gtl_threadNum, hintPipeToCheck_io );
        bHaveTasks = false;
        for( uint32_t thread = 0; thread < m_NumThreads; ++thread )
        {
            if( !m_pPipesPerThread[ thread ].IsPipeEmpty() )
            {
                bHaveTasks = true;
                break;
            }
        }
     }
}

void    TaskScheduler::WaitforAllAndShutdown()
{
    WaitforAll();
    StopThreads(true);
	delete[] m_pPipesPerThread;
    m_pPipesPerThread = 0;
}

uint32_t        TaskScheduler::GetNumTaskThreads() const
{
    return m_NumThreads;
}

TaskScheduler::TaskScheduler()
		: m_pPipesPerThread(NULL)
		, m_NumThreads(0)
		, m_pThreadNumStore(NULL)
		, m_pThreadIDs(NULL)
		, m_bRunning(false)
		, m_NumThreadsRunning(0)
		, m_NumThreadsWaiting(0)
		, m_NumPartitions(0)
		, m_bHaveThreads(false)
{
	memset(&m_ProfilerCallbacks, 0, sizeof(m_ProfilerCallbacks));
}

TaskScheduler::~TaskScheduler()
{
    StopThreads( true ); // Stops threads, waiting for them.

    delete[] m_pPipesPerThread;
    m_pPipesPerThread = 0;
}

void    TaskScheduler::Initialize( uint32_t numThreads_ )
{
	assert( numThreads_ );
    StopThreads( true ); // Stops threads, waiting for them.
    delete[] m_pPipesPerThread;

	m_NumThreads = numThreads_;

    m_pPipesPerThread = new TaskPipe[ m_NumThreads ];

    StartThreads();
}

void   TaskScheduler::Initialize()
{
	Initialize( GetNumHardwareThreads() );
}
Added Tracy profiler 2021-08-06 12:10:32 +02:00			`// Copyright (c) 2013 Doug Binks`
			`//`
			`// This software is provided 'as-is', without any express or implied`
			`// warranty. In no event will the authors be held liable for any damages`
			`// arising from the use of this software.`
			`//`
			`// Permission is granted to anyone to use this software for any purpose,`
			`// including commercial applications, and to alter it and redistribute it`
			`// freely, subject to the following restrictions:`
			`//`
			`// 1. The origin of this software must not be misrepresented; you must not`
			`// claim that you wrote the original software. If you use this software`
			`// in a product, an acknowledgement in the product documentation would be`
			`// appreciated but is not required.`
			`// 2. Altered source versions must be plainly marked as such, and must not be`
			`// misrepresented as being the original software.`
			`// 3. This notice may not be removed or altered from any source distribution.`

			`#include <assert.h>`

			`#include "TaskScheduler.h"`
			`#include "LockLessMultiReadPipe.h"`



			`using namespace enki;`


			`static const uint32_t PIPESIZE_LOG2 = 8;`
			`static const uint32_t SPIN_COUNT = 100;`
			`static const uint32_t SPIN_BACKOFF_MULTIPLIER = 10;`
			`static const uint32_t MAX_NUM_INITIAL_PARTITIONS = 8;`

			`// each software thread gets it's own copy of gtl_threadNum, so this is safe to use as a static variable`
			`static THREAD_LOCAL uint32_t gtl_threadNum = 0;`

			`namespace enki`
			`{`
			`struct SubTaskSet`
			`{`
			`ITaskSet* pTask;`
			`TaskSetPartition partition;`
			`};`

			`// we derive class TaskPipe rather than typedef to get forward declaration working easily`
			`class TaskPipe : public LockLessMultiReadPipe<PIPESIZE_LOG2,enki::SubTaskSet> {};`

			`struct ThreadArgs`
			`{`
			`uint32_t threadNum;`
			`TaskScheduler* pTaskScheduler;`
			`};`
			`}`

			`namespace`
			`{`
			`SubTaskSet SplitTask( SubTaskSet& subTask_, uint32_t rangeToSplit_ )`
			`{`
			`SubTaskSet splitTask = subTask_;`
			`uint32_t rangeLeft = subTask_.partition.end - subTask_.partition.start;`

			`if( rangeToSplit_ > rangeLeft )`
			`{`
			`rangeToSplit_ = rangeLeft;`
			`}`
			`splitTask.partition.end = subTask_.partition.start + rangeToSplit_;`
			`subTask_.partition.start = splitTask.partition.end;`
			`return splitTask;`
			`}`

			`#if defined _WIN32`
			`#if defined _M_IX86 \|\| defined _M_X64`
			`#pragma intrinsic(_mm_pause)`
			`inline void Pause() { _mm_pause(); }`
			`#endif`
			`#elif defined __i386__ \|\| defined __x86_64__`
			`inline void Pause() { __asm__ __volatile__("pause;"); }`
			`#else`
			`inline void Pause() { ;} // may have NOP or yield equiv`
			`#endif`
			`}`


			`static void SafeCallback(ProfilerCallbackFunc func_, uint32_t threadnum_)`
			`{`
			`if( func_ )`
			`{`
			`func_(threadnum_);`
			`}`
			`}`

			`ProfilerCallbacks* TaskScheduler::GetProfilerCallbacks()`
			`{`
			`return &m_ProfilerCallbacks;`
			`}`

			`THREADFUNC_DECL TaskScheduler::TaskingThreadFunction( void* pArgs )`
			`{`
			`ThreadArgs args = (ThreadArgs)pArgs;`
			`uint32_t threadNum = args.threadNum;`
			`TaskScheduler* pTS = args.pTaskScheduler;`
			`gtl_threadNum = threadNum;`

			`SafeCallback( pTS->m_ProfilerCallbacks.threadStart, threadNum );`

			`uint32_t spinCount = 0;`
			`uint32_t hintPipeToCheck_io = threadNum + 1; // does not need to be clamped.`
			`while( pTS->m_bRunning )`
			`{`
			`if(!pTS->TryRunTask( threadNum, hintPipeToCheck_io ) )`
			`{`
			`// no tasks, will spin then wait`
			`++spinCount;`
			`if( spinCount > SPIN_COUNT )`
			`{`
			`pTS->WaitForTasks( threadNum );`
			`spinCount = 0;`
			`}`
			`else`
			`{`
			`uint32_t spinBackoffCount = spinCount * SPIN_BACKOFF_MULTIPLIER;`
			`while( spinBackoffCount )`
			`{`
			`Pause();`
			`--spinBackoffCount;`
			`}`
			`}`
			`}`
			`else`
			`{`
			`spinCount = 0;`
			`}`
			`}`

			`AtomicAdd( &pTS->m_NumThreadsRunning, -1 );`
			`SafeCallback( pTS->m_ProfilerCallbacks.threadStop, threadNum );`

			`return 0;`
			`}`


			`void TaskScheduler::StartThreads()`
			`{`
			`if( m_bHaveThreads )`
			`{`
			`return;`
			`}`
			`m_bRunning = true;`

			`SemaphoreCreate( m_NewTaskSemaphore );`

			`// we create one less thread than m_NumThreads as the main thread counts as one`
			`m_pThreadNumStore = new ThreadArgs[m_NumThreads];`
			`m_pThreadIDs = new threadid_t[m_NumThreads];`
			`m_pThreadNumStore[0].threadNum = 0;`
			`m_pThreadNumStore[0].pTaskScheduler = this;`
			`m_pThreadIDs[0] = 0;`
			`m_NumThreadsWaiting = 0;`
			`m_NumThreadsRunning = 1;// acount for main thread`
			`for( uint32_t thread = 1; thread < m_NumThreads; ++thread )`
			`{`
			`m_pThreadNumStore[thread].threadNum = thread;`
			`m_pThreadNumStore[thread].pTaskScheduler = this;`
			`ThreadCreate( &m_pThreadIDs[thread], TaskingThreadFunction, &m_pThreadNumStore[thread] );`
			`++m_NumThreadsRunning;`
			`}`

			`// ensure we have sufficient tasks to equally fill either all threads including main`
			`// or just the threads we've launched, this is outside the firstinit as we want to be able`
			`// to runtime change it`
			`if( 1 == m_NumThreads )`
			`{`
			`m_NumPartitions = 1;`
			`m_NumInitialPartitions = 1;`
			`}`
			`else`
			`{`
			`m_NumPartitions = m_NumThreads * (m_NumThreads - 1);`
			`m_NumInitialPartitions = m_NumThreads - 1;`
			`if( m_NumInitialPartitions > MAX_NUM_INITIAL_PARTITIONS )`
			`{`
			`m_NumInitialPartitions = MAX_NUM_INITIAL_PARTITIONS;`
			`}`
			`}`

			`m_bHaveThreads = true;`
			`}`

			`void TaskScheduler::StopThreads( bool bWait_ )`
			`{`
			`if( m_bHaveThreads )`
			`{`
			`// wait for them threads quit before deleting data`
			`m_bRunning = false;`
			`while( bWait_ && m_NumThreadsRunning > 1 )`
			`{`
			`// keep firing event to ensure all threads pick up state of m_bRunning`
			`SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsRunning );`
			`}`

			`for( uint32_t thread = 1; thread < m_NumThreads; ++thread )`
			`{`
			`ThreadTerminate( m_pThreadIDs[thread] );`
			`}`

			`m_NumThreads = 0;`
			`delete[] m_pThreadNumStore;`
			`delete[] m_pThreadIDs;`
			`m_pThreadNumStore = 0;`
			`m_pThreadIDs = 0;`
			`SemaphoreClose( m_NewTaskSemaphore );`

			`m_bHaveThreads = false;`
			`m_NumThreadsWaiting = 0;`
			`m_NumThreadsRunning = 0;`
			`}`
			`}`

			`bool TaskScheduler::TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ )`
			`{`
			`// check for tasks`
			`SubTaskSet subTask;`
			`bool bHaveTask = m_pPipesPerThread[ threadNum ].WriterTryReadFront( &subTask );`

			`uint32_t threadToCheck = hintPipeToCheck_io_;`
			`uint32_t checkCount = 0;`
			`while( !bHaveTask && checkCount < m_NumThreads )`
			`{`
			`threadToCheck = ( hintPipeToCheck_io_ + checkCount ) % m_NumThreads;`
			`if( threadToCheck != threadNum )`
			`{`
			`bHaveTask = m_pPipesPerThread[ threadToCheck ].ReaderTryReadBack( &subTask );`
			`}`
			`++checkCount;`
			`}`

			`if( bHaveTask )`
			`{`
			`// update hint, will preserve value unless actually got task from another thread.`
			`hintPipeToCheck_io_ = threadToCheck;`

			`uint32_t partitionSize = subTask.partition.end - subTask.partition.start;`
			`if( subTask.pTask->m_RangeToRun < partitionSize )`
			`{`
			`SubTaskSet taskToRun = SplitTask( subTask, subTask.pTask->m_RangeToRun );`
			`SplitAndAddTask( gtl_threadNum, subTask, subTask.pTask->m_RangeToRun, 0 );`
			`taskToRun.pTask->ExecuteRange( taskToRun.partition, threadNum );`
			`AtomicAdd( &taskToRun.pTask->m_RunningCount, -1 );`
			`}`
			`else`
			`{`

			`// the task has already been divided up by AddTaskSetToPipe, so just run it`
			`subTask.pTask->ExecuteRange( subTask.partition, threadNum );`
			`AtomicAdd( &subTask.pTask->m_RunningCount, -1 );`
			`}`
			`}`

			`return bHaveTask;`

			`}`

			`void TaskScheduler::WaitForTasks( uint32_t threadNum )`
			`{`
			`// We incrememt the number of threads waiting here in order`
			`// to ensure that the check for tasks occurs after the increment`
			`// to prevent a task being added after a check, then the thread waiting.`
			`// This will occasionally result in threads being mistakenly awoken,`
			`// but they will then go back to sleep.`
			`AtomicAdd( &m_NumThreadsWaiting, 1 );`

			`bool bHaveTasks = false;`
			`for( uint32_t thread = 0; thread < m_NumThreads; ++thread )`
			`{`
			`if( !m_pPipesPerThread[ thread ].IsPipeEmpty() )`
			`{`
			`bHaveTasks = true;`
			`break;`
			`}`
			`}`
			`if( !bHaveTasks )`
			`{`
			`SafeCallback( m_ProfilerCallbacks.waitStart, threadNum );`
			`SemaphoreWait( m_NewTaskSemaphore );`
			`SafeCallback( m_ProfilerCallbacks.waitStop, threadNum );`
			`}`

			`int32_t prev = AtomicAdd( &m_NumThreadsWaiting, -1 );`
			`assert( prev != 0 );`
			`}`

			`void TaskScheduler::WakeThreads()`
			`{`
			`SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsWaiting );`
			`}`

			`void TaskScheduler::SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_,`
			`uint32_t rangeToSplit_, int32_t runningCountOffset_ )`
			`{`
			`int32_t numAdded = 0;`
			`while( subTask_.partition.start != subTask_.partition.end )`
			`{`
			`SubTaskSet taskToAdd = SplitTask( subTask_, rangeToSplit_ );`

			`// add the partition to the pipe`
			`++numAdded;`
			`if( !m_pPipesPerThread[ gtl_threadNum ].WriterTryWriteFront( taskToAdd ) )`
			`{`
			`if( numAdded > 1 )`
			`{`
			`WakeThreads();`
			`}`
			`// alter range to run the appropriate fraction`
			`if( taskToAdd.pTask->m_RangeToRun < rangeToSplit_ )`
			`{`
			`taskToAdd.partition.end = taskToAdd.partition.start + taskToAdd.pTask->m_RangeToRun;`
			`subTask_.partition.start = taskToAdd.partition.end;`
			`}`
			`taskToAdd.pTask->ExecuteRange( taskToAdd.partition, threadNum_ );`
			`--numAdded;`
			`}`
			`}`

			`// increment running count by number added`
			`AtomicAdd( &subTask_.pTask->m_RunningCount, numAdded + runningCountOffset_ );`

			`WakeThreads();`
			`}`

			`void TaskScheduler::AddTaskSetToPipe( ITaskSet* pTaskSet )`
			`{`
			`// set running count to -1 to guarantee it won't be found complete until all subtasks added`
			`pTaskSet->m_RunningCount = -1;`

			`// divide task up and add to pipe`
			`pTaskSet->m_RangeToRun = pTaskSet->m_SetSize / m_NumPartitions;`
			`if( pTaskSet->m_RangeToRun < pTaskSet->m_MinRange ) { pTaskSet->m_RangeToRun = pTaskSet->m_MinRange; }`

			`uint32_t rangeToSplit = pTaskSet->m_SetSize / m_NumInitialPartitions;`
			`if( rangeToSplit < pTaskSet->m_MinRange ) { rangeToSplit = pTaskSet->m_MinRange; }`

			`SubTaskSet subTask;`
			`subTask.pTask = pTaskSet;`
			`subTask.partition.start = 0;`
			`subTask.partition.end = pTaskSet->m_SetSize;`
			`SplitAndAddTask( gtl_threadNum, subTask, rangeToSplit, 1 );`
			`}`

			`void TaskScheduler::WaitforTaskSet( const ITaskSet* pTaskSet )`
			`{`
			`uint32_t hintPipeToCheck_io = gtl_threadNum + 1; // does not need to be clamped.`
			`if( pTaskSet )`
			`{`
			`while( pTaskSet->m_RunningCount )`
			`{`
			`TryRunTask( gtl_threadNum, hintPipeToCheck_io );`
			`// should add a spin then wait for task completion event.`
			`}`
			`}`
			`else`
			`{`
			`TryRunTask( gtl_threadNum, hintPipeToCheck_io );`
			`}`
			`}`

			`void TaskScheduler::WaitforAll()`
			`{`
			`bool bHaveTasks = true;`
			`uint32_t hintPipeToCheck_io = gtl_threadNum + 1; // does not need to be clamped.`
			`int32_t threadsRunning = m_NumThreadsRunning - 1;`
			`while( bHaveTasks \|\| m_NumThreadsWaiting < threadsRunning )`
			`{`
			`TryRunTask( gtl_threadNum, hintPipeToCheck_io );`
			`bHaveTasks = false;`
			`for( uint32_t thread = 0; thread < m_NumThreads; ++thread )`
			`{`
			`if( !m_pPipesPerThread[ thread ].IsPipeEmpty() )`
			`{`
			`bHaveTasks = true;`
			`break;`
			`}`
			`}`
			`}`
			`}`

			`void TaskScheduler::WaitforAllAndShutdown()`
			`{`
			`WaitforAll();`
			`StopThreads(true);`
			`delete[] m_pPipesPerThread;`
			`m_pPipesPerThread = 0;`
			`}`

			`uint32_t TaskScheduler::GetNumTaskThreads() const`
			`{`
			`return m_NumThreads;`
			`}`

			`TaskScheduler::TaskScheduler()`
			`: m_pPipesPerThread(NULL)`
			`, m_NumThreads(0)`
			`, m_pThreadNumStore(NULL)`
			`, m_pThreadIDs(NULL)`
			`, m_bRunning(false)`
			`, m_NumThreadsRunning(0)`
			`, m_NumThreadsWaiting(0)`
			`, m_NumPartitions(0)`
			`, m_bHaveThreads(false)`
			`{`
			`memset(&m_ProfilerCallbacks, 0, sizeof(m_ProfilerCallbacks));`
			`}`

			`TaskScheduler::~TaskScheduler()`
			`{`
			`StopThreads( true ); // Stops threads, waiting for them.`

			`delete[] m_pPipesPerThread;`
			`m_pPipesPerThread = 0;`
			`}`

			`void TaskScheduler::Initialize( uint32_t numThreads_ )`
			`{`
			`assert( numThreads_ );`
			`StopThreads( true ); // Stops threads, waiting for them.`
			`delete[] m_pPipesPerThread;`

			`m_NumThreads = numThreads_;`

			`m_pPipesPerThread = new TaskPipe[ m_NumThreads ];`

			`StartThreads();`
			`}`

			`void TaskScheduler::Initialize()`
			`{`
			`Initialize( GetNumHardwareThreads() );`
			`}`