// Cogs.Core: Source/Services/TaskManager.cpp Source File

#include "TaskManager.h"
#include "Services/Time.h"
#include "Platform/Instrumentation.h"
#include "Foundation/Platform/Timer.h"
 
#include "Foundation/Collections/Pool.h"
#include "Foundation/Logging/Logger.h"
#include "Foundation/Platform/Threads.h"
 
#include <atomic>
#include <cassert>
#include <condition_variable>
#include <queue>
#include <string>
#include <vector>
 
#ifdef EMSCRIPTEN
#include <emscripten/threading.h>
#endif
 
using namespace Cogs::Collections;
 
namespace {
  // File-local log handle; messages from this file are tagged "TaskMngr".
  Cogs::Logging::Log logger = Cogs::Logging::getLogger("TaskMngr");
 
  // Contains the worker thread index in a queue + 1 for worker threads, and zero for main thread.
  // The value is assigned by TaskWorker::operator() when a worker thread starts; any thread that
  // never assigns it keeps the initial value zero. TaskQueue::workOnTask() uses it to select the
  // per-worker statistics slot.
  thread_local uint32_t workerIndex = 0;
}
 
#if !defined(COGS_SINGLETHREADED)
namespace Cogs
{
  namespace Core
  {
    // Bit flags describing what kind of task a Task slot holds.
    enum class TaskFlags : uint16_t
    {
      // Regular task that carries a kernel function.
      None = 0,
      // Kernel-less task (created with a null function pointer). Groups are not destroyed
      // automatically by finishTask(); they are released through TaskQueue::destroy().
      Group = 1,
    };
    // Project macro; presumably provides the bitwise operators (|=, &) used on TaskFlags below.
    ENABLE_ENUM_FLAGS(TaskFlags);
 
    // A unit of work (or a group of units) stored in a TaskQueue's pool.
    class Task
    {
    public:
      Task() = default;

      // Function executed when the task runs. Only assigned for non-group tasks.
      TaskFunction kernel;

      // Task this task was created as a child of, or nullptr when it has no parent.
      Task *parent = nullptr;

      // Number of outstanding units of work: one for the task's own kernel (non-group tasks
      // only) plus one for every child task that has not finished yet.
      Atomic<int> active{0};

      TaskFlags flags = TaskFlags::None;

      // Stamp taken from the owning queue's generation counter. A TaskId is only valid for
      // this slot while the generations match.
      std::atomic<uint16_t> generation = 0;

      // Explicitly zero-initialized: before C++20 a default-constructed std::atomic holds an
      // indeterminate value. Not referenced elsewhere in this file.
      std::atomic<int> continuationCount{0};

      // Not referenced elsewhere in this file.
      TaskId continuations[5];
    };

#ifdef _WIN64
    static_assert(sizeof(Task) == 128, "Size of the task struct should be an even 128 bytes.");
#endif
 
    // Raw per-worker counters accumulated by TaskQueue::workOnTask() and drained (reset to zero)
    // by TaskQueue::updateState().
    struct QueueWorkerStateData
    {
      // Accumulated time spent in outermost kernel invocations, in microseconds.
      std::atomic<uint64_t> timeInKernel = 0;
      // Number of outermost kernel invocations.
      std::atomic<uint32_t> kernelLaunches = 0;
      // Current kernel nesting depth for this slot. Not atomic; only modified in workOnTask().
      uint32_t kernelNesting = 0;
    };
 
    // Pool of Task objects used by a TaskQueue.
    class TaskPool : public Pool<Task>
    {
    public:
      // NOTE(review): the two 1024 arguments are presumably initial size and growth step of the
      // pool - confirm against Foundation/Collections/Pool.h.
      TaskPool() : Pool(1024, 1024) {}
    };
 
    class TaskQueue
    {
    public:
      // Creates the queue and immediately starts its worker threads via initializeWorkers().
      //   id         - identifier stored in every TaskId created by this queue.
      //   name       - queue name, also used as prefix for the worker thread names.
      //   numThreads - number of worker threads. With zero threads, enqueued tasks are executed
      //                inline on the calling thread (see enqueueTask() and enqueue()).
      TaskQueue(TaskQueueId id, std::string_view name, size_t numThreads) : name(name), id(id), threads(numThreads)
      {
        initializeWorkers();
      }
 
      // Not copyable: every TaskWorker stores a raw pointer to the queue it was created for.
      TaskQueue(const TaskQueue&) = delete;
      TaskQueue& operator=(const TaskQueue&) = delete;
 
      // Signals shutdown to the workers and blocks until every worker thread has been joined.
      // Tasks still sitting in the queue at this point are not executed (getNextTask() returns
      // nullptr as soon as `done` is set).
      ~TaskQueue()
      {
        // Raise the flag while holding the task mutex so it is observed consistently by the
        // predicate in getNextTask().
        {
          LockGuard shutdownLock(taskMutex);
          done = true;
        }

        // Wake all sleeping workers so each of them can see the flag and leave its loop.
        taskVariable.notify_all();

        for (Thread &worker : threads)
          worker.join();
      }
 
      // Allocates a task from the pool and initializes it.
      //   func         - kernel to run, or nullptr to create a group task.
      //   handle       - receives the pool handle of the new task.
      //   parentTaskId - optional parent; the parent's active count is incremented so it is
      //                  not considered finished before this child.
      // Returns the new task. The task is NOT put on the work queue by this function.
      Task *createTask(const TaskFunction *func, ElementHandle &handle, const TaskId *parentTaskId = nullptr)
      {
        const bool hasKernel = (func != nullptr);

        Task *task = nullptr;
        {
          // Pool access is serialized by poolMutex.
          LockGuard poolLock(poolMutex);

          task = taskPool.create();
          handle = taskPool.getHandle(task);
        }

        if (hasKernel)
          task->kernel = *func;
        else
          task->flags |= TaskFlags::Group;

        // Stamp the task with a fresh generation so TaskIds referring to an earlier occupant of
        // the same pool slot can be told apart from this task.
        task->generation = ++generation;

        assert(task->active == 0 && "Task should not have active workers.");

        // A task with a kernel counts itself among its active sub-tasks.
        if (hasKernel)
          ++task->active;

        if (parentTaskId != nullptr)
        {
          Task *parentTask = getTask(*parentTaskId);

          assert(parentTask && "Parent task not valid.");

          // Parenting to a recycled task is a programmer error: the caller must keep the parent
          // alive until all children have been added.
          assert(parentTask->generation == parentTaskId->generation && "Parent task not in generation.");

          task->parent = parentTask;

          // The child counts as an active sub-task of its parent.
          ++parentTask->active;
        }

        // Track the number of queued/executing kernel tasks in the whole queue.
        if (hasKernel)
          active++;

        return task;
      }
 
      // Returns the pool slot addressed by taskId.taskHandle. Only the handle is used; the
      // generation is NOT checked here, so callers must compare generations themselves.
      Task *getTask(const TaskId &taskId)
      {
        LockGuard poolLock(poolMutex);

        Task *slot = taskPool[taskId.taskHandle];
        return slot;
      }
 
      void destroy(const TaskId &taskId)
      {
        auto task = getTask(taskId);
 
        if (task->generation != taskId.generation)
        {
          assert(false && "Should never happen!");
          return;
        }
        else
        {
          // We need to be sure the task has completed before removing it from the pool.
          wait(taskId);
 
          destroyTask(task);
        }
      }
 
      // Creates (but does not enqueue) a task running func and returns its id.
      TaskId create(TaskFunctionRef func)
      {
        ElementHandle slotHandle;
        Task *created = createTask(&func, slotHandle);

        return TaskId{slotHandle, created->generation, id};
      }
 
      // Creates a kernel-less group task that child tasks can be parented to, and returns its id.
      TaskId createGroup()
      {
        ElementHandle slotHandle;
        Task *group = createTask(nullptr, slotHandle);

        return TaskId{slotHandle, group->generation, id};
      }
 
      // Creates (but does not enqueue) a task running func as a child of parentTask and returns
      // its id.
      TaskId create(TaskFunctionRef func, const TaskId &parentTask)
      {
        ElementHandle slotHandle;
        Task *child = createTask(&func, slotHandle, &parentTask);

        return TaskId{slotHandle, child->generation, id};
      }
 
      // Schedules a previously created task for execution. Without worker threads the kernel is
      // run right away on the calling thread.
      void enqueueTask(Task *task)
      {
        const bool hasWorkers = !threads.empty();
        if (hasWorkers)
        {
          LockGuard taskLock(taskMutex);
          taskQueue.push(task);
          // Wake up a single waiting TaskWorker instance (if any) to execute the enqueued task.
          taskVariable.notify_one();
          return;
        }

        // No worker threads: execute synchronously and complete the task immediately.
        task->kernel();
        finishTask(task);
      }
 
      // Schedules the task identified by taskId (previously returned by create()).
      // The generation stored in taskId is not verified here.
      void enqueue(const TaskId &taskId)
      {
        enqueueTask(getTask(taskId));
      }
 
      // Creates a task running func and schedules it.
      // Returns the id of the scheduled task, or NoTask when the queue has no worker threads, in
      // which case func has already been executed on the calling thread.
      TaskId enqueue(TaskFunctionRef func)
      {
        if (threads.empty()) {
          func();
          return NoTask;
        }

        ElementHandle handle;
        Task* task = createTask(&func, handle);

        // Build the id BEFORE handing the task to the workers. Once enqueued, a worker may run
        // and destroy the task at any time; destroyTask() bumps task->generation and returns the
        // slot to the pool, so reading the generation afterwards could yield an id that refers
        // to a finished or recycled task.
        TaskId taskId{handle, task->generation, id};

        enqueueTask(task);

        return taskId;
      }
 
      // Creates a task running func as a child of parentTaskId and schedules it.
      // Returns the id of the scheduled task, or NoTask when the queue has no worker threads, in
      // which case func has already been executed on the calling thread and the parent is left
      // untouched.
      TaskId enqueue(TaskFunctionRef func, const TaskId &parentTaskId)
      {
        if (threads.empty()) {
          func();
          return NoTask;
        }

        ElementHandle handle;
        Task* task = createTask(&func, handle, &parentTaskId);

        // Build the id BEFORE handing the task to the workers. Once enqueued, a worker may run
        // and destroy the task at any time; destroyTask() bumps task->generation and returns the
        // slot to the pool, so reading the generation afterwards could yield an id that refers
        // to a finished or recycled task.
        TaskId taskId{handle, task->generation, id};

        enqueueTask(task);

        return taskId;
      }
 
      // Blocks until a task is available or the queue is shutting down.
      // Returns the next queued task, or nullptr once `done` has been set (even if tasks are
      // still queued).
      Task *getNextTask()
      {
        UniqueLock taskLock(taskMutex);

        taskVariable.wait(taskLock, [this]() { return done || !taskQueue.empty(); });

        if (done)
          return nullptr;

        Task *next = taskQueue.front();
        taskQueue.pop();
        return next;
      }
 
      // Non-blocking variant of getNextTask(): pops and returns the next queued task, or
      // returns nullptr when the queue is currently empty.
      Task *getAvailableTask()
      {
        LockGuard taskLock(taskMutex);

        if (taskQueue.empty())
          return nullptr;

        Task *next = taskQueue.front();
        taskQueue.pop();
        return next;
      }
 
      // A task may run once its own kernel is the only outstanding unit of work, i.e. all of
      // its children have finished.
      bool canExecute(const Task *task) const
      {
        assert(task->active >= 1 && "Cannot execute inactive task.");

        const bool onlySelfOutstanding = (task->active == 1);
        return onlySelfOutstanding;
      }
 
      // Executes the kernel of a dequeued task on the calling thread and completes the task.
      // While the task still has unfinished children, the calling thread helps out by running
      // other queued tasks (see yield()).
      void workOnTask(Task *task)
      {
        while (!canExecute(task))
        {
          yield();
        }

        // workerIndex is a per-thread value shared by ALL queues, but this queue only owns
        // threads.size() + 1 statistics slots. A worker thread belonging to a larger queue can
        // end up here (e.g. through wait() -> yield()) with an index beyond this queue's slots;
        // indexing the vector unchecked would then be out of bounds. Such launches are executed
        // without being measured.
        std::vector<QueueWorkerStateData>& allStateData = *workerStateData;
        QueueWorkerStateData* stateData = (workerIndex < allStateData.size()) ? &allStateData[workerIndex] : nullptr;

        Timer timer;
        const bool measure = stateData && (stateData->kernelNesting++ == 0);
        if (measure) {
          // Running kernels can be nested if the kernel waits for another task, so we only measure
          // the outermost kernel launch to avoid measuring the same timespan multiple times.
          timer = Timer::startNew();
        }
        task->kernel();
        if (measure) {
          // Accumulated in microseconds.
          stateData->timeInKernel.fetch_add(static_cast<uint64_t>(1e6 * timer.elapsedSeconds()));
          stateData->kernelLaunches.fetch_add(1);
        }
        if (stateData) {
          assert(stateData->kernelNesting);
          stateData->kernelNesting--;
        }
        finishTask(task);
      }
 
      void yield()
      {
        auto task = getAvailableTask();
 
        if (task)
        {
          workOnTask(task);
        }
        else
        {
          Threads::yield();
        }
      }
 
      // Removes one unit of outstanding work from task. When a non-group task reaches zero it
      // notifies its parent (recursively) and is then destroyed.
      void finishTask(Task *task)
      {
        assert(task->active && "Task must have active tasks when calling finishTask().");

        const bool isGroup = (task->flags & TaskFlags::Group) == TaskFlags::Group;
        if (isGroup)
        {
          // Groups are not destroyed automatically when the active count reaches zero.
          --task->active;
          return;
        }

        // Remove the task itself from its count of active sub-tasks; nothing more to do while
        // other units are still outstanding.
        if (--task->active != 0)
          return;

        // The parent has one less active child; this may in turn complete (and destroy) it.
        if (Task *parentTask = task->parent)
          finishTask(parentTask);

        destroyTask(task);
      }
 
      // Invalidates all outstanding TaskIds for task and returns its slot to the pool.
      void destroyTask(Task *task)
      {
        assert(!task->active && "Task cannot have active sub tasks when destroyed.");

        // Only kernel tasks were counted in the queue-wide active counter (see createTask()).
        const bool isGroup = (task->flags & TaskFlags::Group) == TaskFlags::Group;
        if (!isGroup) {
          --active;
        }

        // A new generation makes every TaskId still referring to this task mismatch.
        task->generation = ++generation;

        LockGuard poolLock(poolMutex);
        taskPool.destroy(task);
      }
 
      bool isActive(const TaskId& taskId)
      {
        assert(taskId.queueId == id && "Task id not valid for current queue.");
        Task* task = getTask(taskId);
        return task->active.load();
      }
 
      // Blocks until the task identified by taskId has no outstanding work left.
      // The calling thread does not sleep: it keeps executing other queued tasks via yield().
      void wait(const TaskId &taskId)
      {
        assert(taskId.queueId == id && "Task id not valid for current queue.");
        CpuInstrumentationScope(SCOPE_TASKMANAGER, "Wait");
 
        auto task = getTask(taskId);
 
        // A generation mismatch means the task has already been destroyed (its slot may have been
        // reused), so there is nothing left to wait for.
        if (task->generation != taskId.generation)
        {
          CpuInstrumentationScope(SCOPE_TASKMANAGER, "Unblocked wait");
          return;
        }
        else
        {
          CpuInstrumentationScope(SCOPE_TASKMANAGER, "Blocked wait");
 
          // Help out with queued work until the task (including its children) is done.
          while (task->active.load())
          {
            yield();
          }
        }
      }
 
      // Blocks until the queue-wide count of queued/executing kernel tasks reaches zero,
      // executing queued tasks on the calling thread in the meantime.
      void waitAll()
      {
        CpuInstrumentationScope(SCOPE_TASKMANAGER, "WaitAll");
        while (active)
        {
          yield();
        }
      }
 
      // Returns the number of worker threads owned by this queue (zero means tasks run inline).
      size_t getConcurrency() const
      {
        return threads.size();
      }
 
      // Returns the identifier this queue was constructed with.
      TaskQueueId getId() const
      {
        return id;
      }
 
      // Returns the queue name given at construction. The reference stays valid for the
      // lifetime of the queue.
      const std::string& getName() const
      {
        return name;
      }
 
      // Copies the statistics most recently computed by updateState() into the output arguments.
      // No locking is performed here.
      void getQueueState(QueueState& queueState, std::vector<QueueWorkerState>& workerStates) const
      {
        queueState = this->queueState;
        workerStates = this->workerStates;
      }
 
      // Recomputes the published queue/worker statistics from the raw per-worker counters.
      // Does nothing unless at least one second has passed since the previous recomputation;
      // otherwise the raw counters are drained (reset to zero) and converted to per-second and
      // per-frame figures.
      void updateState(Context* context)
      {
        float elapsed = static_cast<float>(timer.elapsedSeconds());
        if (elapsed < 1.0) return;
        timer = Timer::startNew();
        uint32_t frameCount = context->time->getFrame() - lastFrame;
        lastFrame = context->time->getFrame();

        const float perSecondScale = 1.f / elapsed;
        // No frame may have been rendered during the interval; avoid dividing by zero, which
        // would publish inf/NaN values. Per-frame figures are reported as zero in that case.
        const float perFrameScale = frameCount ? 1.f / static_cast<float>(frameCount) : 0.f;

        float aggregateTimeInKernel = 0.f;
        uint32_t aggregateKernelLaunches = 0;
        size_t workerStateCount = workerStates.size();
        for (size_t i = 0; i < workerStateCount; i++) {
          // timeInKernel is accumulated in microseconds; convert to seconds while draining.
          float timeInKernel = 1e-6f * static_cast<float>((*workerStateData)[i].timeInKernel.exchange(0));
          uint32_t kernelLaunches = (*workerStateData)[i].kernelLaunches.exchange(0);
          workerStates[i].utilization = perSecondScale * timeInKernel;
          workerStates[i].tasksPerSecond = perSecondScale * kernelLaunches;
          workerStates[i].tasksPerFrame = perFrameScale * kernelLaunches;
          aggregateTimeInKernel += timeInKernel;
          aggregateKernelLaunches += kernelLaunches;
        }

        queueState.load = perSecondScale * aggregateTimeInKernel;
        queueState.tasksPerSecond = perSecondScale * aggregateKernelLaunches;
        queueState.tasksPerFrame = perFrameScale * aggregateKernelLaunches;
      }
 
    private:
      // Measures the interval between two statistics recomputations in updateState().
      Timer timer = Timer::startNew();
      // Frame number seen by the previous statistics recomputation.
      uint32_t lastFrame = 0;
      
      // Published aggregate statistics, refreshed by updateState().
      QueueState queueState{};
 
      // Published per-worker statistics; one entry per worker thread plus entry 0.
      std::vector<QueueWorkerState> workerStates;
 
      // Raw per-worker counters, indexed by the thread-local workerIndex.
      std::unique_ptr<std::vector<QueueWorkerStateData>> workerStateData;
 
      // Allocates the statistics slots and starts the worker threads; called by the constructor.
      void initializeWorkers();
 
      // Storage for all tasks of this queue; guarded by poolMutex.
      TaskPool taskPool;
 
      std::string name;
 
      // Number of created kernel (non-group) tasks that have not been destroyed yet.
      Atomic<int> active{0};
 
      // Shutdown flag, set by the destructor while holding taskMutex.
      bool done = false;
 
      // Source of the generation stamps written to tasks on creation and destruction.
      Atomic<uint16_t> generation = 0;
 
      // Tasks waiting to be executed; guarded by taskMutex.
      std::queue<Task *> taskQueue;
 
      Mutex poolMutex;
 
      Mutex taskMutex;
 
      // Signalled when a task is enqueued or the queue shuts down.
      std::condition_variable taskVariable;
 
      TaskQueueId id{0};
 
      // Worker threads; empty when the queue executes tasks inline.
      std::vector<Thread> threads;
    };
 
    class TaskWorker
    {
    public:
      TaskWorker(TaskQueue *taskQueue, uint32_t workerIx, const std::string &name) : taskQueue(taskQueue), workerIx(workerIx), name(name)
      {
      }
 
      void operator()()
      {
        Instrumentation::initializeThread(name.c_str());
        CpuInstrumentationScope(SCOPE_TASKMANAGER, "WorkerThread");
 
        assert(workerIx);
        workerIndex = workerIx;
 
        while (auto task = taskQueue->getNextTask())
        {
          taskQueue->workOnTask(task);
        }
 
        Instrumentation::destroyThread();
      }
 
      TaskQueue *taskQueue;
      uint32_t workerIx = 0;
      std::string name;
    };
 
    void TaskQueue::initializeWorkers()
    {
      size_t workerStateCount = threads.size() + 1;
      workerStates.resize(workerStateCount);
      workerStateData = std::make_unique<std::vector<QueueWorkerStateData>>(workerStateCount);
 
      std::string n;
      n.reserve(120);
      for (size_t i = 0; i < threads.size(); ++i)
      {
        n.clear();
        n += name;
        n += " Thread ";
        n += std::to_string(i);
 
        TaskWorker worker(this, static_cast<uint32_t>(i + 1), n);
        threads[i] = Thread(worker);
 
        Threads::setName(threads[i], n);
      }
    }
 
    // Owns all TaskQueue instances of a TaskManager, indexed by TaskQueueId.
    class TaskQueues
    {
    public:
      TaskQueues() = default;

      // Returns the queue with the given id. The id is not range-checked.
      TaskQueue *operator[](TaskQueueId id) { return queues[id].get(); }

      // Creates a new queue with numThreads worker threads and returns its id. Ids are handed
      // out sequentially starting at zero.
      TaskQueueId createQueue(std::string_view name, const size_t numThreads)
      {
        const auto id = nextId++;

        // Grow the container if it cannot hold the new queue index yet.
        const size_t requiredSize = static_cast<size_t>(id + 1);
        if (queues.size() < requiredSize)
          queues.resize(requiredSize);

        queues[id] = std::make_unique<TaskQueue>(id, name, numThreads);

        return id;
      }

      // Id of the thread that constructed the TaskManager.
      std::thread::id mainThreadId;

      std::vector<std::unique_ptr<TaskQueue>> queues;

      Atomic<TaskQueueId> nextId{0};
    };
  }
}
#endif
 
 
#if defined(COGS_SINGLETHREADED)
#pragma message( "Building single threaded TaskManager." )
namespace Cogs
{
  namespace Core
  {
    // Empty placeholder in the single-threaded build: no queues exist, the type is only
    // defined so that TaskManager can still own a TaskQueues instance.
    class TaskQueues
    {
    };
  }
}
 
// Single-threaded build: only allocates the (empty) TaskQueues placeholder.
Cogs::Core::TaskManager::TaskManager(Context* /*context*/)
{
  taskQueues = std::make_unique<TaskQueues>();
}
 
// Single-threaded build: nothing to shut down.
Cogs::Core::TaskManager::~TaskManager()
{
}
 
void Cogs::Core::TaskManager::updateState(Context* context)
{
}
 
// Single-threaded build: no queue is created; GlobalQueue is always returned.
Cogs::Core::TaskQueueId Cogs::Core::TaskManager::createQueue(std::string_view /*name*/, const size_t /*numThreads*/)
{
  return GlobalQueue;
}
 
// Single-threaded build: there are no queues.
size_t Cogs::Core::TaskManager::getQueueCount() const
{
  return 0;
}
 
// Single-threaded build: no worker threads exist, so the concurrency is always zero.
size_t Cogs::Core::TaskManager::getQueueConcurrency(TaskQueueId /*queue*/)
{
  return 0;
}
 
// Single-threaded build: queues have no names; a reference to a static empty string is
// returned. The parameter name is commented out like in the other stubs to avoid an
// unused-parameter warning.
const std::string& Cogs::Core::TaskManager::getQueueName(TaskQueueId /*queue*/) const
{
  static const std::string noname = "";
  return noname;
}
 
// Single-threaded build: reports a value-initialized queue state and no worker states.
void Cogs::Core::TaskManager::getQueueState(QueueState& queueState, std::vector<QueueWorkerState>& workerStates, TaskQueueId /*queue*/) const
{
  queueState = {};
  workerStates.clear();
}
 
// Single-threaded build: every caller is treated as being on the main thread.
bool Cogs::Core::TaskManager::onMainThread() const
{
  return true;
}
 
// Single-threaded build: groups are not tracked; a default-constructed TaskId is returned.
Cogs::Core::TaskId Cogs::Core::TaskManager::createGroup(TaskQueueId /*queue*/)
{
  return TaskId();
}
 
// Single-threaded build: func is executed immediately on the calling thread (at creation
// time, not when enqueue() is called) and a default-constructed TaskId is returned.
Cogs::Core::TaskId Cogs::Core::TaskManager::create(TaskQueueId /*queue*/, TaskFunctionRef func)
{
  func();
 
  return TaskId();
}
 
// Single-threaded build: the parent is ignored; func is executed immediately on the calling
// thread and a default-constructed TaskId is returned.
Cogs::Core::TaskId Cogs::Core::TaskManager::createChild(const TaskId &/*parentTask*/, TaskFunctionRef func)
{
  func();
 
  return TaskId();
}
 
// Single-threaded build: no-op, since create()/createChild() already executed the function.
void Cogs::Core::TaskManager::enqueue(const TaskId &/*taskId*/)
{
}
 
// Single-threaded build: the parent is ignored; func is executed immediately on the calling
// thread and a default-constructed TaskId is returned.
Cogs::Core::TaskId Cogs::Core::TaskManager::enqueueChild(const TaskId &/*parentTask*/, TaskFunctionRef func)
{
  func();
 
  return TaskId();
}
 
// Single-threaded build: func is executed immediately on the calling thread and a
// default-constructed TaskId is returned.
Cogs::Core::TaskId Cogs::Core::TaskManager::enqueue(TaskQueueId /*queue*/, TaskFunctionRef func)
{
  func();
 
  return TaskId();
}
 
// Single-threaded build: no-op, tasks are never stored.
void Cogs::Core::TaskManager::destroy(const TaskId &/*taskId*/)
{
}
 
// Single-threaded build: always reports the task as active.
// NOTE(review): every task in this build has already run to completion when its id is
// returned, so `false` looks like the more natural answer (a caller polling isActive() would
// never terminate) - confirm the intended contract with callers before changing.
bool Cogs::Core::TaskManager::isActive(const TaskId& /*taskId*/)
{
  return true;
}
 
// Single-threaded build: no-op, tasks are executed synchronously when created/enqueued.
void Cogs::Core::TaskManager::wait(const TaskId &/*taskId*/)
{
}
 
// Single-threaded build: no-op, there is never any pending work in a queue.
void Cogs::Core::TaskManager::waitAll(TaskQueueId /*queue*/)
{
}
 
// Single-threaded build: no-op, there is never any pending work.
void Cogs::Core::TaskManager::waitAll()
{
}
 
#else
#pragma message( "Building multithreaded TaskManager." )
 
namespace {
  // Names of the variables that control the thread count of the Global queue:
  // a factor multiplied with the hardware concurrency, and an upper limit.
  static constexpr Cogs::StringView globalQueueFactorName = "taskManager.globalQueue.threadCountFactor";
  static constexpr Cogs::StringView globalQueueMaxName = "taskManager.globalQueue.threadCountMax";
 
  // Same pair of variables for the Resource queue.
  static constexpr Cogs::StringView resourceQueueFactorName = "taskManager.resourceQueue.threadCountFactor";
  static constexpr Cogs::StringView resourceQueueMaxName = "taskManager.resourceQueue.threadCountMax";
 
  // Computes the number of worker threads for a queue as
  //   min(factor * concurrency, max)
  // where factor and max are read from the context variables factorName / maxName.
  //   concurrency   - hardware concurrency; may be zero.
  //   factorDefault - factor used when the factor variable is not set.
  // Negative factor or max values are clamped to zero.
  size_t getWorkerCount(Cogs::Core::Context* context, Cogs::StringView factorName, Cogs::StringView maxName, size_t concurrency, float factorDefault)
  {
    // We do not want the workers to compete with the main thread when busy,
    // hence the default limit is one less than the concurrency (and never negative).
    const size_t defaultMaxWorkers = std::max(size_t(1), concurrency) - 1;

    const auto factor = context->variables->get(factorName, factorDefault);
    const size_t workerCount = static_cast<size_t>(std::max(0.f, factor * concurrency));

    const auto configuredMax = context->variables->get(maxName, int(defaultMaxWorkers));
    const size_t maxWorkers = static_cast<size_t>(std::max(0, configuredMax));

    return std::min(workerCount, maxWorkers);
  }
 
  // Reports whether worker threads can be used at runtime. On Emscripten this is queried from
  // the runtime; on every other platform threads are assumed to be available.
  bool hasThreadingSupport()
  {
#ifdef EMSCRIPTEN
    return emscripten_has_threading_support();
#else
    return true;
#endif
  }
 
}
 
// Records the constructing thread as the main thread and creates the two predefined queues
// ("Global" and "Resource") with thread counts derived from the hardware concurrency and the
// taskManager.* variables. Without runtime threading support both queues get zero threads.
Cogs::Core::TaskManager::TaskManager(Context* context) : taskQueues(new TaskQueues())
{
  taskQueues->mainThreadId = std::this_thread::get_id();

  assert(context->variables);

  // Usually the number of logical processors available. However, the value _can_
  // be zero when it is not well-defined or not computable.
  const size_t concurrency = Threads::hardwareConcurrency();

  size_t globalThreads = 0;
  size_t resourceThreads = 0;
  if (!hasThreadingSupport()) {
    LOG_WARNING(logger, "Multithreaded cogs, but no threading available.");
  }
  else {
    globalThreads = getWorkerCount(context, globalQueueFactorName, globalQueueMaxName, concurrency, 1.f);
    resourceThreads = getWorkerCount(context, resourceQueueFactorName, resourceQueueMaxName, concurrency, 0.5);
  }
  LOG_INFO(logger, "Concurrency: %zu globalThreads=%zu resourceThreads=%zu", concurrency, globalThreads, resourceThreads);

  // Create the predefined queues. They MUST be created in this order, see the definitions of
  // GlobalQueue and ResourceQueue.
  const TaskQueueId globalQueueId = createQueue("Global", globalThreads);
  assert(globalQueueId == GlobalQueue);

  const TaskQueueId resourceQueueId = createQueue("Resource", resourceThreads);
  assert(resourceQueueId == ResourceQueue);
}
 
// Drains all queues before tearing them down: a TaskQueue destructor stops its workers
// without executing tasks that are still queued, so pending work is completed first.
Cogs::Core::TaskManager::~TaskManager()
{
  waitAll();
 
  taskQueues->queues.clear();
}
 
// Lets every queue refresh its published statistics.
void Cogs::Core::TaskManager::updateState(Context* context)
{
  for (auto& taskQueue : taskQueues->queues)
    taskQueue->updateState(context);
}
 
// Returns the number of queues created so far (including the predefined ones).
size_t Cogs::Core::TaskManager::getQueueCount() const
{
  return taskQueues->queues.size();
}
 
// Returns the name of the given queue. The queue id must be valid (asserted in debug builds).
const std::string& Cogs::Core::TaskManager::getQueueName(TaskQueueId queue) const
{
  assert(queue < taskQueues->queues.size());
  return taskQueues->queues[queue]->getName();
}
 
// Copies the most recently computed statistics of the given queue into the output arguments.
// The queue id must be valid (asserted in debug builds).
void Cogs::Core::TaskManager::getQueueState(QueueState& queueState, std::vector<QueueWorkerState>& workerState, TaskQueueId queue) const
{
  assert(queue < taskQueues->queues.size());

  const auto& taskQueue = taskQueues->queues[queue];
  taskQueue->getQueueState(queueState, workerState);
}
 
// Returns the number of worker threads of the given queue.
size_t Cogs::Core::TaskManager::getQueueConcurrency(TaskQueueId queue)
{
  return getQueue(queue)->getConcurrency();
}
 
// Returns true when called from the thread that constructed the TaskManager.
bool Cogs::Core::TaskManager::onMainThread() const
{
  return std::this_thread::get_id() == taskQueues->mainThreadId;
}
 
// Looks up the queue with the given id. The id is not range-checked.
inline Cogs::Core::TaskQueue* Cogs::Core::TaskManager::getQueue(TaskQueueId queueId)
{
  return (*taskQueues)[queueId];
}
 
// Creates a task running func in the given queue without scheduling it; schedule it with
// enqueue(const TaskId&).
Cogs::Core::TaskId Cogs::Core::TaskManager::create(TaskQueueId queue, TaskFunctionRef func)
{
  return getQueue(queue)->create(func);
}
 
// Creates a kernel-less group task in the given queue.
Cogs::Core::TaskId Cogs::Core::TaskManager::createGroup(TaskQueueId queue)
{
  return getQueue(queue)->createGroup();
}
 
// Creates a task running func as a child of `task`, in the parent's queue, without
// scheduling it.
Cogs::Core::TaskId Cogs::Core::TaskManager::createChild(const TaskId &task, TaskFunctionRef func)
{
  TaskQueue* parentQueue = getQueue(task.queueId);
  return parentQueue->create(func, task);
}
 
// Schedules a previously created task on the queue it was created in.
void Cogs::Core::TaskManager::enqueue(const TaskId &task)
{
  getQueue(task.queueId)->enqueue(task);
}
 
// Creates a task running func and schedules it on the given queue. The result is NoTask when
// the queue has no worker threads (func has then already run on the calling thread).
Cogs::Core::TaskId Cogs::Core::TaskManager::enqueue(TaskQueueId queue, TaskFunctionRef func)
{
  return getQueue(queue)->enqueue(func);
}
 
// Creates a task running func as a child of `group` and schedules it on the group's queue.
Cogs::Core::TaskId Cogs::Core::TaskManager::enqueueChild(const TaskId &group, TaskFunctionRef func)
{
  TaskQueue* groupQueue = getQueue(group.queueId);
  return groupQueue->enqueue(func, group);
}
 
// Waits for the given task to complete and releases it. Invalid ids are ignored.
void Cogs::Core::TaskManager::destroy(const TaskId &taskId)
{
  if (taskId.isValid()) {
    getQueue(taskId.queueId)->destroy(taskId);
  }
}
 
bool Cogs::Core::TaskManager::isActive(const TaskId& task)
{
  return getQueue(task.queueId)->isActive(task);
}
 
// Blocks until the given task has completed; the calling thread executes queued tasks of the
// same queue while waiting. Invalid ids are ignored. Waiting on a resource-queue task from
// the main thread is logged as an error (once) but still performed.
void Cogs::Core::TaskManager::wait(const TaskId &task)
{
  if (!task.isValid()) return;

  TaskQueue* queue = getQueue(task.queueId);

  const bool resourceWaitOnMainThread = onMainThread() && (queue->getId() == ResourceQueue);
  if (resourceWaitOnMainThread) {
    LOG_ERROR_ONCE(logger, "Waited on a task in the resource queue from the main thread");
  }

  queue->wait(task);
}
 
// Blocks until the given queue has no queued or executing tasks left.
void Cogs::Core::TaskManager::waitAll(TaskQueueId queue)
{
  getQueue(queue)->waitAll();
}
 
void Cogs::Core::TaskManager::waitAll()
{
  for (auto &q : taskQueues->queues)
  {
    q->waitAll();
  }
}
 
// Creates an additional queue with numThreads worker threads and returns its id.
Cogs::Core::TaskQueueId Cogs::Core::TaskManager::createQueue(std::string_view name, const size_t numThreads)
{
  return taskQueues->createQueue(name, numThreads);
}
 
#endif