GTPin
GTPin: Funtime Sample Tool

The Funtime tool counts the number of cycles each kernel takes to execute, from kernel entry to kernel exit.

Running Funtime tool

To run the Funtime tool in its default configuration, use this command:

Profilers/Bin/gtpin -t funtime -- app

How to understand Funtime results

When you run the in-house GTPin Funtime tool in its default configuration, the directory GTPIN_PROFILE_FUNTIME0 is generated. GTPin saves the profiling results in the file GTPIN_PROFILE_FUNTIME0\Session_Final\funtime.txt. The results are presented in the following format:

### Kernel/Shader execution-time profile generated by GTPin ###

Legend:
NA - kernel was not instrumented.
          Name              HashID      SIMD      Type          Freq.    Total-Cycle     Avg-Cycles        Skipped            Platform     Execution descriptor

L3_SLM_8x8_8x16    f54af91315561f54         8        CS            512      164965992         322199              0              OpenCL            0 0
L3_SLM_8x8_8x16    f54af91315561f54         8        CS            512      164442942         321177              0              OpenCL            0 1
L3_SLM_8x8_8x16    f54af91315561f54         8        CS            512      165458952         323162              0              OpenCL            0 2
L3_SLM_8x8_8x16    f54af91315561f54         8        CS            512      168150199         328418              0              OpenCL            0 3
L3_SLM_8x8_8x16    f54af91315561f54         8        CS            512      165136147         322531              0              OpenCL            0 4
L3_SLM_8x8_8x16    f54af91315561f54         8        CS            512      167331603         326819              0              OpenCL            0 5

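Each line represents a single run (dispatch to the HW device) of a specific kernel. The fields have the following meaning:

Name - name of the kernel
HashID - hash identifier of the kernel, printed in hexadecimal
SIMD - SIMD width to which the kernel was compiled
Type - type of the kernel (for example, CS)
Freq. - total number of executions recorded for this dispatch
Total-Cycle - total number of cycles accumulated over these executions
Avg-Cycles - Total-Cycle divided by Freq. (0 when Freq. is 0)
Skipped - number of executions counted as skipped because the timer overflowed
Platform - platform of the kernel (for example, OpenCL)
Tile - identifier of the subdevice (tile); this column is printed only when the per_tile_profiling knob is set
Execution descriptor - execution descriptor of the dispatch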

If the name of the kernel is not known to GTPin, GTPin creates an artificial name in the format: CS_asmf54af91315561f54_simd8 where the prefix indicates the kernel type; the suffix indicates the SIMD width to which this kernel was compiled; and the 16-digit number is the hash ID of the IR representation of this kernel.

(Back to the list of all GTPin Sample Tools)

funtime.h

00001 /*========================== begin_copyright_notice ============================
00002 Copyright (C) 2018-2022 Intel Corporation
00003 
00004 SPDX-License-Identifier: MIT
00005 ============================= end_copyright_notice ===========================*/
00006 
00007 /*!
00008  * @file Funtime tool definitions
00009  */
00010 
00011 #ifndef FUNTIME_H_
00012 #define FUNTIME_H_
00013 
00014 #include <list>
00015 #include <map>
00016 #include <set>
00017 #include <string>
00018 
00019 #include "gtpin_api.h"
00020 #include "gtpin_tool_utils.h"
00021 
00022 using namespace gtpin;
00023 
00024 /* ============================================================================================= */
00025 // Struct FuntimeRecord
00026 /* ============================================================================================= */
/*!
 * Layout of one record in the profile buffer of the funtime tool.
 *
 * The instrumentation generated by Funtime::GeneratePostCode addresses the fields through
 * offsetof() and updates them with atomic operations, while the host reads whole records
 * back in Funtime::OnKernelComplete. The profile array is created with sizeof(FuntimeRecord)
 * as the record size, so this struct is the single definition of the device/host layout.
 */
struct FuntimeRecord
{
    uint64_t cycles;    ///< Total number of cycles
    uint32_t freq;      ///< Total number of executions
    uint32_t skipped;   ///< Total number of skipped executions
};

// Pin the layout at compile time: the record must contain no padding, and its size must keep
// 'cycles' on a 64-bit boundary in every element of an array of records.
// NOTE(review): natural alignment of 'cycles' is presumably required by the 64-bit atomic add - confirm.
static_assert(sizeof(FuntimeRecord) == sizeof(uint64_t) + 2 * sizeof(uint32_t),
              "FuntimeRecord must not contain padding");
static_assert(sizeof(FuntimeRecord) % sizeof(uint64_t) == 0,
              "FuntimeRecord size must be a multiple of the 64-bit counter size");
00036 
00037 /* ============================================================================================= */
00038 // Struct FuntimeDispatchProfile
00039 /* ============================================================================================= */
00040 /*!
00041  * Profiling data collected during a single kernel dispatch
00042  */
00043 struct FuntimeDispatchProfile
00044 {
00045     explicit FuntimeDispatchProfile(const IGtKernelDispatch& kernelDispatch, uint32_t tile = 0);
00046     void Accumulate(const FuntimeRecord& record);
00047 
00048     GtKernelExecDesc  kernelExecDesc; ///< Kernel execution descriptor
00049     uint32_t          tileId;         ///< Identifier of the subdevice (tile) assigned to this kernel dispatch
00050     uint64_t          cycles;         ///< Total number of cycles
00051     uint64_t          freq;           ///< Total number of executions
00052     uint64_t          skipped;        ///< Total number of skipped executions
00053 };
00054 
00055 /* ============================================================================================= */
00056 // Class FuntimeKernelProfile
00057 /* ============================================================================================= */
00058 /*!
00059  * Aggregated profile of all instrumented kernel dispatches
00060  */
00061 class FuntimeKernelProfile
00062 {
00063 public:
00064     FuntimeKernelProfile(const IGtKernel& kernel, const GtProfileArray& profileArray);
00065 
00066     /// Add new dispatched kernel instance,  and return reference to its (empty) profile
00067     FuntimeDispatchProfile& AddKernelDispatch(const IGtKernelDispatch& kernelDispatch, uint32_t tile = 0);
00068 
00069     std::string           ToString()        const;                         ///< @return Text representation of the profile data
00070     const GtProfileArray& GetProfileArray() const { return _profileArray; }///< @return Profile buffer accessor
00071 
00072 private:
00073     std::string                         _name;              ///< Kernel's name
00074     GtKernelType                        _type;              ///< Kernel's type
00075     GtGpuPlatform                       _platform;          ///< Kernel's platform
00076     uint64_t                            _hashId;            ///< Kernel's hash identifier
00077     GtSimdWidth                         _simd;              ///< Kernel's SIMD width
00078     GtProfileArray                      _profileArray;      ///< Profile buffer accessor
00079     std::list<FuntimeDispatchProfile>   _dispatchProfiles;  ///< Profiles per kernel dispatch
00080 };
00081 
00082 /* ============================================================================================= */
00083 // Class Funtime
00084 /* ============================================================================================= */
00085 /*!
00086  * Implementation of the IGtTool interface for the funtime tool
00087  */
00088 class Funtime : public GtTool
00089 {
00090 public:
00091     /// Implementation of the IGtTool interface
00092     const char* Name() const { return "funtime"; }
00093 
00094     void OnKernelBuild(IGtKernelInstrument& instrumentor);
00095     void OnKernelRun(IGtKernelDispatch& dispatcher);
00096     void OnKernelComplete(IGtKernelDispatch& dispatcher);
00097 
00098 public:
00099     static void OnFini();                        ///< Callback function registered with atexit()
00100     std::string ToString() const;                ///< @return Text representation of the profile data
00101 
00102     static Funtime* Instance();                  ///< @return Single instance of this class
00103 
00104 private:
00105     Funtime() = default;
00106     Funtime(const Funtime&) = delete;
00107     Funtime& operator = (const Funtime&) = delete;
00108     ~Funtime() = default;
00109 
00110     /// Generate code at entry/exits of the kernel
00111     void GeneratePreCode(GtGenProcedure& proc, const IGtGenCoder& coder);
00112     void GeneratePostCode(GtGenProcedure& proc, const IGtGenCoder& coder, const GtProfileArray& profileArray);
00113 
00114     /// @return true/false - use 64-bit/32-bit integer for the cycle counter
00115     static bool Use64BitCounters(const IGtGenCoder& coder);
00116 
00117 private:
00118     std::map<GtKernelId, FuntimeKernelProfile> _kernels;  ///< Collection of kernel profiles
00119 
00120     GtReg _addrReg;     ///< Virtual register that holds address within profile buffer
00121     GtReg _dataReg;     ///< Virtual register that holds data to be read from/written to profile buffer 
00122     GtReg _timeReg;     ///< Virtual timer register
00123     GtReg _tmpReg32;    ///< Virtual 32-bit scratch register
00124 };
00125 
00126 #endif

funtime.cpp

00001 /*========================== begin_copyright_notice ============================
00002 Copyright (C) 2015-2025 Intel Corporation
00003 
00004 SPDX-License-Identifier: MIT
00005 ============================= end_copyright_notice ===========================*/
00006 
00007 /*!
00008  * @file Implementation of the Funtime tool
00009  */
00010 
00011 #include <fstream>
00012 #include <sstream>
00013 #include <iomanip>
00014 #include <algorithm>
00015 
00016 #include "funtime.h"
00017 
00018 using namespace gtpin;
00019 using namespace std;
00020 
00021 /* ============================================================================================= */
00022 // Configuration
00023 /* ============================================================================================= */
// Number of thread buckets of the profile buffer allocated in Funtime::OnKernelBuild;
// zero selects the maximum reported by the kernel's GEN model
00024 Knob<int>  knobNumThreadBuckets("num_thread_buckets", 32, "Number of thread buckets. Default - 32, zero - maximum thread buckets");
// When set (and the coder supports tile IDs), one record per tile is allocated; also adds the "Tile" column to the report
00025 Knob<bool> knobPerTileProfiling("per_tile_profiling", false, "Enable per-tile (subdevice) profiling");
// When set, FuntimeKernelProfile::ToString omits dispatch rows whose execution count is zero
00026 Knob<bool> knobSkipZeroResults("skip_zero_results", false, "Skip zero results in the Funtime output");
00027 
00028 /* ============================================================================================= */
00029 // Funtime implementation
00030 /* ============================================================================================= */
00031 Funtime* Funtime::Instance()
00032 {
00033     static Funtime instance;
00034     return &instance;
00035 }
00036 
00037 void Funtime::OnKernelBuild(IGtKernelInstrument& instrumentor)
00038 {
00039     const IGtKernel&            kernel          = instrumentor.Kernel();
00040     const IGtCfg&               cfg             = instrumentor.Cfg();
00041     const IGtGenCoder&          coder           = instrumentor.Coder();
00042     const IGtGenArch&           genArch         = GTPin_GetCore()->GenArch();
00043     const IGtGenModel&          genModel        = kernel.GenModel();
00044     IGtProfileBufferAllocator&  allocator       = instrumentor.ProfileBufferAllocator();
00045     IGtVregFactory&             vregs           = coder.VregFactory();
00046     bool                        is64BitCounter  = Use64BitCounters(coder);
00047 
00048     // Allocate the profile buffer. It will hold single FuntimeRecord per each thread bucket
00049     uint32_t numThreadBuckets = (knobNumThreadBuckets == 0) ? genModel.MaxThreadBuckets() : knobNumThreadBuckets;
00050     uint32_t numTiles         = (knobPerTileProfiling && coder.IsTileIdSupported()) ? genArch.MaxTiles(kernel.GpuPlatform()) : 1;
00051     GtProfileArray profileArray(sizeof(FuntimeRecord), numTiles, numThreadBuckets);
00052     profileArray.Allocate(allocator);
00053 
00054     // Initialize virtual registers
00055     _timeReg   = vregs.Make(VREG_TYPE_DWORD);
00056     _tmpReg32  = vregs.MakeScratch();
00057     _addrReg   = vregs.MakeMsgAddrScratch();
00058     _dataReg   = vregs.MakeMsgDataScratch(is64BitCounter? VREG_TYPE_QWORD : VREG_TYPE_DWORD);
00059 
00060     // Generate code that starts/stops timer at entry/exit of the kernel
00061     GtGenProcedure preCode;  GeneratePreCode(preCode, coder);
00062     GtGenProcedure postCode; GeneratePostCode(postCode, coder, profileArray);
00063 
00064     // Instrument kernel entries
00065     instrumentor.InstrumentEntries(preCode);
00066 
00067     // Instrument kernel exits
00068     for (auto bblPtr : cfg.ExitBbls())
00069     {
00070         const IGtIns& ins = bblPtr->LastIns(); GTPIN_ASSERT(ins.IsEot());
00071         GtGenProcedure fakeConsumers;
00072         coder.GenerateFakeSrcConsumers(fakeConsumers, ins);
00073         instrumentor.InstrumentInstruction(ins, GtIpoint::Before(), fakeConsumers);
00074         instrumentor.InstrumentInstruction(ins, GtIpoint::Before(), postCode);
00075     }
00076 
00077     _kernels.emplace(kernel.Id(), FuntimeKernelProfile(kernel, profileArray));
00078 }
00079 
00080 void Funtime::OnKernelRun(IGtKernelDispatch& dispatcher)
00081 {
00082     bool isProfileEnabled = false;
00083 
00084     const IGtKernel& kernel = dispatcher.Kernel();
00085     GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc);
00086     if (kernel.IsInstrumented() && IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform(), kernel.Name().Get()))
00087     {
00088         auto it = _kernels.find(kernel.Id());
00089 
00090         if (it != _kernels.end())
00091         {
00092             IGtProfileBuffer*       buffer          = dispatcher.CreateProfileBuffer(); GTPIN_ASSERT(buffer);
00093             FuntimeKernelProfile&   kernelProfile   = it->second;
00094             const GtProfileArray&   profileArray    = kernelProfile.GetProfileArray();
00095             if (profileArray.Initialize(*buffer))
00096             {
00097                 isProfileEnabled = true;
00098             }
00099             else
00100             {
00101                 GTPIN_ERROR_MSG(string("FUNTIME : ") + string(kernel.Name()) + " : Failed to write into memory buffer");
00102             }
00103         }
00104     }
00105     dispatcher.SetProfilingMode(isProfileEnabled);
00106 }
00107 
00108 void Funtime::OnKernelComplete(IGtKernelDispatch& dispatcher)
00109 {
00110     const IGtKernel& kernel = dispatcher.Kernel();
00111     GtKernelExecDesc execDesc; dispatcher.GetExecDescriptor(execDesc);
00112     bool isProfilingEnabled = dispatcher.IsProfilingEnabled();
00113     if (!isProfilingEnabled || !IsKernelExecProfileEnabled(execDesc, kernel.GpuPlatform(), kernel.Name().Get()))
00114     {
00115         return; // Do nothing with unprofiled kernel dispatches
00116     }
00117 
00118     auto it = _kernels.find(kernel.Id());
00119 
00120     if (it != _kernels.end())
00121     {
00122         const IGtProfileBuffer* buffer          = dispatcher.GetProfileBuffer(); GTPIN_ASSERT(buffer);
00123         FuntimeKernelProfile&   kernelProfile   = it->second;
00124         const GtProfileArray&   profileArray    = kernelProfile.GetProfileArray();
00125 
00126         uint32_t numTiles = profileArray.NumRecords(); // There is a single record for each tile in each thread bucket
00127         for (uint32_t tileId = 0; tileId < numTiles; tileId++)
00128         {
00129             FuntimeDispatchProfile& dispatchProfile = kernelProfile.AddKernelDispatch(dispatcher, tileId);
00130 
00131             for (uint32_t threadBucket = 0; threadBucket < profileArray.NumThreadBuckets(); ++threadBucket)
00132             {
00133                 FuntimeRecord record;
00134                 if (!profileArray.Read(*buffer, &record, tileId, 1, threadBucket))
00135                 {
00136                     GTPIN_ERROR_MSG(string("FUNTIME : ") + string(kernel.Name()) + " : Failed to read from memory buffer");
00137                 }
00138                 else
00139                 {
00140                     dispatchProfile.Accumulate(record);
00141                 }
00142             }
00143         }
00144     }
00145 }
00146 
00147 void Funtime::OnFini()
00148 {
00149     Funtime&    me     = *Instance();
00150     string profileDir  = GTPin_GetCore()->ProfileDir();
00151     string filePath    = JoinPath(profileDir, "funtime.txt");
00152 
00153     ofstream fs(filePath);
00154     if (fs.is_open())
00155     {
00156         fs << me.ToString();
00157         fs.close();
00158     }
00159     else
00160     {
00161         GTPIN_WARNING("FUNTIME : could not create file: " + filePath);
00162     }
00163 }
00164 
00165 string Funtime::ToString() const
00166 {
00167     ostringstream ostr;
00168     ostr << "### Kernel/Shader execution-time profile generated by GTPin ###" << endl << endl;
00169     ostr << "Legend:" << endl;
00170     ostr << "NA - kernel was not instrumented." << endl << endl;
00171     ostr << setw(30) << "Name" << setw(20) << "HashID" << setw(10) << "SIMD" << setw(10) << "Type";
00172     ostr << setw(15) << "Freq." << setw(15) << "Total-Cycle" << setw(15) << "Avg-Cycles" << setw(15) << "Skipped";
00173     ostr << setw(20) << "Platform";
00174     if (knobPerTileProfiling)
00175     {
00176         ostr << setw(10) << "Tile";
00177     }
00178     ostr << " " << setw(35) << "Execution descriptor";
00179     ostr << endl;
00180     for (const auto& kernelEntry : _kernels)
00181     {
00182         ostr << kernelEntry.second.ToString();
00183     }
00184     return ostr.str();
00185 }
00186 
00187 bool Funtime::Use64BitCounters(const IGtGenCoder& coder)
00188 {
00189     return coder.InstructionFactory().CanAccessAtomically(GED_DATA_TYPE_uq);
00190 }
00191 
00192 void Funtime::GeneratePreCode(GtGenProcedure& proc, const IGtGenCoder& coder)
00193 {
00194     coder.StartTimer(proc, _timeReg);
00195     if (!proc.empty()) { proc.front()->AppendAnnotation(__func__); }
00196 }
00197 
00198 void Funtime::GeneratePostCode(GtGenProcedure& proc, const IGtGenCoder& coder, const GtProfileArray& profileArray)
00199 {
00200     IGtInsFactory&  insF            = coder.InstructionFactory();
00201     bool            is64BitCounter  = Use64BitCounters(coder);
00202     GtReg           flagReg         = FlagReg(0);
00203     GtReg           dataRegL        = {_dataReg, sizeof(uint32_t), 0};  // Low 32-bits of the data payload register
00204 
00205     // Generate code that computes elapsed time, and sets flagReg in case of timer overflow
00206     coder.StopTimerExt(proc, _timeReg);
00207 
00208     // _addrReg =  address of the current thread's FuntimeRecord in the profile buffer
00209     if (profileArray.NumRecords() > 1)
00210     {
00211         // The record number = tile ID
00212         GtReg& offsetReg = _tmpReg32;
00213         coder.LoadTileId(proc, offsetReg);
00214         proc += insF.MakeMul(offsetReg, offsetReg, sizeof(FuntimeRecord));
00215         profileArray.ComputeAddress(coder, proc, _addrReg, offsetReg);
00216     }
00217     else
00218     {
00219         profileArray.ComputeAddress(coder, proc, _addrReg);
00220     }
00221 
00222     int32_t base = 0;
00223     int32_t offset;
00224 
00225     // cycles += _timeReg
00226     offset = offsetof(FuntimeRecord, cycles) - base;
00227     profileArray.ComputeRelAddress(coder, proc, _addrReg, _addrReg, offset); base += offset;
00228     proc += insF.MakeMov(dataRegL, _timeReg);   // Move timer value to the low 32-bits of the data register
00229     if (is64BitCounter)
00230     {
00231         // Clear the high 32-bits of the data payload register
00232         GtReg dataRegH = {_dataReg, sizeof(uint32_t), 1};
00233         proc += insF.MakeMov(dataRegH, 0);
00234     }
00235     proc += insF.MakeAtomicAdd(NullReg(), _addrReg, _dataReg, (is64BitCounter? GED_DATA_TYPE_uq : GED_DATA_TYPE_ud));
00236 
00237     // freq++
00238     offset = offsetof(FuntimeRecord, freq) - base;
00239     profileArray.ComputeRelAddress(coder, proc, _addrReg, _addrReg, offset); base += offset;
00240     proc += insF.MakeAtomicInc(NullReg(), _addrReg, GED_DATA_TYPE_ud);
00241 
00242     // if (flagReg) skipped++
00243     offset = offsetof(FuntimeRecord, skipped) - base;
00244     profileArray.ComputeRelAddress(coder, proc, _addrReg, _addrReg, offset); base += offset;
00245     proc += insF.MakeAtomicInc(NullReg(), _addrReg, GED_DATA_TYPE_ud).SetPredicate(flagReg);
00246 
00247     if (!proc.empty()) { proc.front()->AppendAnnotation(__func__); }
00248 }
00249 
00250 /* ============================================================================================= */
00251 // FuntimeDispatchProfile implementation
00252 /* ============================================================================================= */
00253 FuntimeDispatchProfile::FuntimeDispatchProfile(const IGtKernelDispatch& kernelDispatch, uint32_t tile) :
00254     tileId(tile), cycles(0), freq(0), skipped(0)
00255 {
00256     kernelDispatch.GetExecDescriptor(kernelExecDesc);
00257 }
00258 
00259 void FuntimeDispatchProfile::Accumulate(const FuntimeRecord& record)
00260 {
00261     cycles  += record.cycles;
00262     freq    += record.freq;
00263     skipped += record.skipped;
00264 }
00265 
00266 /* ============================================================================================= */
00267 // FuntimeKernelProfile implementation
00268 /* ============================================================================================= */
00269 FuntimeKernelProfile::FuntimeKernelProfile(const IGtKernel& kernel, const GtProfileArray& profileArray) :
00270     _name(GlueString(kernel.Name())), _type(kernel.Type()), _platform(kernel.GpuPlatform()), _hashId(kernel.HashId()),
00271     _simd(kernel.SimdWidth()), _profileArray(profileArray) {}
00272 
00273 FuntimeDispatchProfile& FuntimeKernelProfile::AddKernelDispatch(const IGtKernelDispatch& kernelDispatch, const uint32_t tile)
00274 {
00275     _dispatchProfiles.emplace_back(kernelDispatch, tile);
00276     return _dispatchProfiles.back();
00277 }
00278 
00279 string FuntimeKernelProfile::ToString() const
00280 {
00281     ostringstream ostr;
00282     if (!_dispatchProfiles.empty())
00283     {
00284         for (const auto& dp: _dispatchProfiles)
00285         {
00286             if (knobSkipZeroResults && (dp.freq == 0))
00287             {
00288                 continue; // Skip zero results if the knob is set
00289             }
00290 
00291             uint64_t avgCycles = (dp.freq ? (dp.cycles / dp.freq) : 0);
00292 
00293             ostr << setw(30) << _name << setw(20) << hex << _hashId << dec << setw(10) << _simd << setw(10) << _type.ToString();
00294             ostr << setw(15) << dp.freq << setw(15) << dp.cycles << setw(15) << avgCycles << setw(15) << dp.skipped;
00295             ostr << setw(20) << _platform.ToString();
00296             if (knobPerTileProfiling)
00297             {
00298                 ostr << setw(10) << dp.tileId;
00299             }
00300             ostr << " " << setw(35) << dp.kernelExecDesc.ToString(_platform, ExecDescAlignedFormat());
00301             ostr << endl;
00302         }
00303     }
00304     else
00305     {
00306         ostr << setw(30) << _name << setw(20) << hex << _hashId << dec << setw(10) << _simd << setw(10) << _type.ToString();
00307         ostr << setw(15) << "NA" << setw(15) << "NA" << setw(15) << "NA" << setw(15) << "NA" << setw(20) << "NA";
00308         if (knobPerTileProfiling)
00309         {
00310             ostr << setw(10) << "NA";
00311         }
00312         ostr << " " << setw(35) << "NA";
00313     }
00314     ostr << endl;
00315     return ostr.str();
00316 }
00317 
00318 /* ============================================================================================= */
00319 // GTPin_Entry
00320 /* ============================================================================================= */
00321 EXPORT_C_FUNC void GTPin_Entry(int argc, const char *argv[])
00322 {
00323     ConfigureGTPin(argc, argv);
00324     Funtime::Instance()->Register();
00325     atexit(Funtime::OnFini);
00326 }

(Back to the list of all GTPin Sample Tools)


 All Data Structures Functions Variables Typedefs Enumerations Enumerator


  Copyright (C) 2013-2025 Intel Corporation
SPDX-License-Identifier: MIT