summaryrefslogtreecommitdiffhomepage
path: root/Opcode/Ice/IceRevisitedRadix.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'Opcode/Ice/IceRevisitedRadix.cpp')
-rw-r--r--Opcode/Ice/IceRevisitedRadix.cpp1040
1 files changed, 520 insertions, 520 deletions
diff --git a/Opcode/Ice/IceRevisitedRadix.cpp b/Opcode/Ice/IceRevisitedRadix.cpp
index c9eca90..b654995 100644
--- a/Opcode/Ice/IceRevisitedRadix.cpp
+++ b/Opcode/Ice/IceRevisitedRadix.cpp
@@ -1,520 +1,520 @@
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- * Contains source code from the article "Radix Sort Revisited".
- * \file IceRevisitedRadix.cpp
- * \author Pierre Terdiman
- * \date April, 4, 2000
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- * Revisited Radix Sort.
- * This is my new radix routine:
- * - it uses indices and doesn't recopy the values anymore, hence wasting less ram
- * - it creates all the histograms in one run instead of four
- * - it sorts words faster than dwords and bytes faster than words
- * - it correctly sorts negative floating-point values by patching the offsets
- * - it automatically takes advantage of temporal coherence
- * - multiple keys support is a side effect of temporal coherence
- * - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
- *
- * History:
- * - 08.15.98: very first version
- * - 04.04.00: recoded for the radix article
- * - 12.xx.00: code lifting
- * - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
- * - 10.11.01: added local ram support
- * - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
- * - 01.02.02: - "mIndices" renamed => "mRanks". That's a rank sorter after all.
- * - ranks are not "reset" anymore, but implicit on first calls
- * - 07.05.02: - offsets rewritten with one less indirection.
- * - 11.03.02: - "bool" replaced with RadixHint enum
- *
- * \class RadixSort
- * \author Pierre Terdiman
- * \version 1.4
- * \date August, 15, 1998
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-/*
-To do:
- - add an offset parameter between two input values (avoid some data recopy sometimes)
- - unroll ? asm ?
- - 11 bits trick & 3 passes as Michael did
- - prefetch stuff the day I have a P3
- - make a version with 16-bits indices ?
-*/
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-// Precompiled Header
-#include "StdAfx.h"
-
-using namespace IceCore;
-
-#define INVALIDATE_RANKS mCurrentSize|=0x80000000
-#define VALIDATE_RANKS mCurrentSize&=0x7fffffff
-#define CURRENT_SIZE (mCurrentSize&0x7fffffff)
-#define INVALID_RANKS (mCurrentSize&0x80000000)
-
-#define CHECK_RESIZE(n) \
- if(n!=mPreviousSize) \
- { \
- if(n>mCurrentSize) Resize(n); \
- else ResetRanks(); \
- mPreviousSize = n; \
- }
-
-#define CREATE_HISTOGRAMS(type, buffer) \
- /* Clear counters/histograms */ \
- ZeroMemory(mHistogram, 256*4*sizeof(udword)); \
- \
- /* Prepare to count */ \
- ubyte* p = (ubyte*)input; \
- ubyte* pe = &p[nb*4]; \
- udword* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \
- udword* h1= &mHistogram[256]; /* Histogram for second pass */ \
- udword* h2= &mHistogram[512]; /* Histogram for third pass */ \
- udword* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \
- \
- bool AlreadySorted = true; /* Optimism... */ \
- \
- if(INVALID_RANKS) \
- { \
- /* Prepare for temporal coherence */ \
- type* Running = (type*)buffer; \
- type PrevVal = *Running; \
- \
- while(p!=pe) \
- { \
- /* Read input buffer in previous sorted order */ \
- type Val = *Running++; \
- /* Check whether already sorted or not */ \
- if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
- /* Update for next iteration */ \
- PrevVal = Val; \
- \
- /* Create histograms */ \
- h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
- } \
- \
- /* If all input values are already sorted, we just have to return and leave the */ \
- /* previous list unchanged. That way the routine may take advantage of temporal */ \
- /* coherence, for example when used to sort transparent faces. */ \
- if(AlreadySorted) \
- { \
- mNbHits++; \
- for(udword i=0;i<nb;i++) mRanks[i] = i; \
- return *this; \
- } \
- } \
- else \
- { \
- /* Prepare for temporal coherence */ \
- udword* Indices = mRanks; \
- type PrevVal = (type)buffer[*Indices]; \
- \
- while(p!=pe) \
- { \
- /* Read input buffer in previous sorted order */ \
- type Val = (type)buffer[*Indices++]; \
- /* Check whether already sorted or not */ \
- if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
- /* Update for next iteration */ \
- PrevVal = Val; \
- \
- /* Create histograms */ \
- h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
- } \
- \
- /* If all input values are already sorted, we just have to return and leave the */ \
- /* previous list unchanged. That way the routine may take advantage of temporal */ \
- /* coherence, for example when used to sort transparent faces. */ \
- if(AlreadySorted) { mNbHits++; return *this; } \
- } \
- \
- /* Else there has been an early out and we must finish computing the histograms */ \
- while(p!=pe) \
- { \
- /* Create histograms without the previous overhead */ \
- h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
- }
-
-#define CHECK_PASS_VALIDITY(pass) \
- /* Shortcut to current counters */ \
- udword* CurCount = &mHistogram[pass<<8]; \
- \
- /* Reset flag. The sorting pass is supposed to be performed. (default) */ \
- bool PerformPass = true; \
- \
- /* Check pass validity */ \
- \
- /* If all values have the same byte, sorting is useless. */ \
- /* It may happen when sorting bytes or words instead of dwords. */ \
- /* This routine actually sorts words faster than dwords, and bytes */ \
- /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
- /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
- \
- /* Get first byte */ \
- ubyte UniqueVal = *(((ubyte*)input)+pass); \
- \
- /* Check that byte's counter */ \
- if(CurCount[UniqueVal]==nb) PerformPass=false;
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- * Constructor.
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-RadixSort::RadixSort() : mRanks(null), mRanks2(null), mCurrentSize(0), mTotalCalls(0), mNbHits(0)
-{
-#ifndef RADIX_LOCAL_RAM
- // Allocate input-independent ram
- mHistogram = new udword[256*4];
- mOffset = new udword[256];
-#endif
- // Initialize indices
- INVALIDATE_RANKS;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- * Destructor.
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-RadixSort::~RadixSort()
-{
- // Release everything
-#ifndef RADIX_LOCAL_RAM
- DELETEARRAY(mOffset);
- DELETEARRAY(mHistogram);
-#endif
- DELETEARRAY(mRanks2);
- DELETEARRAY(mRanks);
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- * Resizes the inner lists.
- * \param nb [in] new size (number of dwords)
- * \return true if success
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-bool RadixSort::Resize(udword nb)
-{
- // Free previously used ram
- DELETEARRAY(mRanks2);
- DELETEARRAY(mRanks);
-
- // Get some fresh one
- mRanks = new udword[nb]; CHECKALLOC(mRanks);
- mRanks2 = new udword[nb]; CHECKALLOC(mRanks2);
-
- return true;
-}
-
-inline_ void RadixSort::CheckResize(udword nb)
-{
- udword CurSize = CURRENT_SIZE;
- if(nb!=CurSize)
- {
- if(nb>CurSize) Resize(nb);
- mCurrentSize = nb;
- INVALIDATE_RANKS;
- }
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- * Main sort routine.
- * This one is for integer values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
- * \param input [in] a list of integer values to sort
- * \param nb [in] number of values to sort, must be < 2^31
- * \param hint [in] RADIX_SIGNED to handle negative values, RADIX_UNSIGNED if you know your input buffer only contains positive values
- * \return Self-Reference
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)
-{
- // Checkings
- if(!input || !nb || nb&0x80000000) return *this;
-
- // Stats
- mTotalCalls++;
-
- // Resize lists if needed
- CheckResize(nb);
-
-#ifdef RADIX_LOCAL_RAM
- // Allocate histograms & offsets on the stack
- udword mHistogram[256*4];
-// udword mOffset[256];
- udword* mLink[256];
-#endif
-
- // Create histograms (counters). Counters for all passes are created in one run.
- // Pros: read input buffer once instead of four times
- // Cons: mHistogram is 4Kb instead of 1Kb
- // We must take care of signed/unsigned values for temporal coherence.... I just
- // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
- if(hint==RADIX_UNSIGNED) { CREATE_HISTOGRAMS(udword, input); }
- else { CREATE_HISTOGRAMS(sdword, input); }
-
- // Compute #negative values involved if needed
- udword NbNegativeValues = 0;
- if(hint==RADIX_SIGNED)
- {
- // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
- // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
- // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
- udword* h3= &mHistogram[768];
- for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
- }
-
- // Radix sort, j is the pass number (0=LSB, 3=MSB)
- for(udword j=0;j<4;j++)
- {
- CHECK_PASS_VALIDITY(j);
-
- // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
- // not a problem, numbers are correctly sorted anyway.
- if(PerformPass)
- {
- // Should we care about negative values?
- if(j!=3 || hint==RADIX_UNSIGNED)
- {
- // Here we deal with positive values only
-
- // Create offsets
-// mOffset[0] = 0;
-// for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
- mLink[0] = mRanks2;
- for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
- }
- else
- {
- // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
-
- // Create biased offsets, in order for negative numbers to be sorted as well
-// mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
- mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
-// for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
- for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
-
- // Fixing the wrong place for negative values
-// mOffset[128] = 0;
- mLink[128] = mRanks2;
-// for(i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
- for(udword i=129;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
- }
-
- // Perform Radix Sort
- ubyte* InputBytes = (ubyte*)input;
- InputBytes += j;
- if(INVALID_RANKS)
- {
-// for(udword i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
- for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
- VALIDATE_RANKS;
- }
- else
- {
- udword* Indices = mRanks;
- udword* IndicesEnd = &mRanks[nb];
- while(Indices!=IndicesEnd)
- {
- udword id = *Indices++;
-// mRanks2[mOffset[InputBytes[id<<2]]++] = id;
- *mLink[InputBytes[id<<2]]++ = id;
- }
- }
-
- // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
- udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
- }
- }
- return *this;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- * Main sort routine.
- * This one is for floating-point values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
- * \param input [in] a list of floating-point values to sort
- * \param nb [in] number of values to sort, must be < 2^31
- * \return Self-Reference
- * \warning only sorts IEEE floating-point values
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-RadixSort& RadixSort::Sort(const float* input2, udword nb)
-{
- // Checkings
- if(!input2 || !nb || nb&0x80000000) return *this;
-
- // Stats
- mTotalCalls++;
-
- udword* input = (udword*)input2;
-
- // Resize lists if needed
- CheckResize(nb);
-
-#ifdef RADIX_LOCAL_RAM
- // Allocate histograms & offsets on the stack
- udword mHistogram[256*4];
-// udword mOffset[256];
- udword* mLink[256];
-#endif
-
- // Create histograms (counters). Counters for all passes are created in one run.
- // Pros: read input buffer once instead of four times
- // Cons: mHistogram is 4Kb instead of 1Kb
- // Floating-point values are always supposed to be signed values, so there's only one code path there.
- // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
- // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
- // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
- // wouldn't work with mixed positive/negative values....
- { CREATE_HISTOGRAMS(float, input2); }
-
- // Compute #negative values involved if needed
- udword NbNegativeValues = 0;
- // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
- // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
- // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
- udword* h3= &mHistogram[768];
- for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
-
- // Radix sort, j is the pass number (0=LSB, 3=MSB)
- for(udword j=0;j<4;j++)
- {
- // Should we care about negative values?
- if(j!=3)
- {
- // Here we deal with positive values only
- CHECK_PASS_VALIDITY(j);
-
- if(PerformPass)
- {
- // Create offsets
-// mOffset[0] = 0;
- mLink[0] = mRanks2;
-// for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
- for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
-
- // Perform Radix Sort
- ubyte* InputBytes = (ubyte*)input;
- InputBytes += j;
- if(INVALID_RANKS)
- {
-// for(i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
- for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
- VALIDATE_RANKS;
- }
- else
- {
- udword* Indices = mRanks;
- udword* IndicesEnd = &mRanks[nb];
- while(Indices!=IndicesEnd)
- {
- udword id = *Indices++;
-// mRanks2[mOffset[InputBytes[id<<2]]++] = id;
- *mLink[InputBytes[id<<2]]++ = id;
- }
- }
-
- // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
- udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
- }
- }
- else
- {
- // This is a special case to correctly handle negative values
- CHECK_PASS_VALIDITY(j);
-
- if(PerformPass)
- {
- // Create biased offsets, in order for negative numbers to be sorted as well
-// mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
- mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
-// for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
- for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
-
- // We must reverse the sorting order for negative numbers!
-// mOffset[255] = 0;
- mLink[255] = mRanks2;
-// for(i=0;i<127;i++) mOffset[254-i] = mOffset[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
- for(udword i=0;i<127;i++) mLink[254-i] = mLink[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
-// for(i=128;i<256;i++) mOffset[i] += CurCount[i]; // Fixing the wrong place for negative values
- for(udword i=128;i<256;i++) mLink[i] += CurCount[i]; // Fixing the wrong place for negative values
-
- // Perform Radix Sort
- if(INVALID_RANKS)
- {
- for(udword i=0;i<nb;i++)
- {
- udword Radix = input[i]>>24; // Radix byte, same as above. AND is useless here (udword).
- // ### cmp to be killed. Not good. Later.
-// if(Radix<128) mRanks2[mOffset[Radix]++] = i; // Number is positive, same as above
-// else mRanks2[--mOffset[Radix]] = i; // Number is negative, flip the sorting order
- if(Radix<128) *mLink[Radix]++ = i; // Number is positive, same as above
- else *(--mLink[Radix]) = i; // Number is negative, flip the sorting order
- }
- VALIDATE_RANKS;
- }
- else
- {
- for(udword i=0;i<nb;i++)
- {
- udword Radix = input[mRanks[i]]>>24; // Radix byte, same as above. AND is useless here (udword).
- // ### cmp to be killed. Not good. Later.
-// if(Radix<128) mRanks2[mOffset[Radix]++] = mRanks[i]; // Number is positive, same as above
-// else mRanks2[--mOffset[Radix]] = mRanks[i]; // Number is negative, flip the sorting order
- if(Radix<128) *mLink[Radix]++ = mRanks[i]; // Number is positive, same as above
- else *(--mLink[Radix]) = mRanks[i]; // Number is negative, flip the sorting order
- }
- }
- // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
- udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
- }
- else
- {
- // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
- if(UniqueVal>=128)
- {
- if(INVALID_RANKS)
- {
- // ###Possible?
- for(udword i=0;i<nb;i++) mRanks2[i] = nb-i-1;
- VALIDATE_RANKS;
- }
- else
- {
- for(udword i=0;i<nb;i++) mRanks2[i] = mRanks[nb-i-1];
- }
-
- // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
- udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
- }
- }
- }
- }
- return *this;
-}
-
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-/**
- * Gets the ram used.
- * \return memory used in bytes
- */
-///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-udword RadixSort::GetUsedRam() const
-{
- udword UsedRam = sizeof(RadixSort);
-#ifndef RADIX_LOCAL_RAM
- UsedRam += 256*4*sizeof(udword); // Histograms
- UsedRam += 256*sizeof(udword); // Offsets
-#endif
- UsedRam += 2*CURRENT_SIZE*sizeof(udword); // 2 lists of indices
- return UsedRam;
-}
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ * Contains source code from the article "Radix Sort Revisited".
+ * \file IceRevisitedRadix.cpp
+ * \author Pierre Terdiman
+ * \date April, 4, 2000
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ * Revisited Radix Sort.
+ * This is my new radix routine:
+ * - it uses indices and doesn't recopy the values anymore, hence wasting less ram
+ * - it creates all the histograms in one run instead of four
+ * - it sorts words faster than dwords and bytes faster than words
+ * - it correctly sorts negative floating-point values by patching the offsets
+ * - it automatically takes advantage of temporal coherence
+ * - multiple keys support is a side effect of temporal coherence
+ * - it may be worth recoding in asm... (mainly to use FCOMI, FCMOV, etc) [it's probably memory-bound anyway]
+ *
+ * History:
+ * - 08.15.98: very first version
+ * - 04.04.00: recoded for the radix article
+ * - 12.xx.00: code lifting
+ * - 09.18.01: faster CHECK_PASS_VALIDITY thanks to Mark D. Shattuck (who provided other tips, not included here)
+ * - 10.11.01: added local ram support
+ * - 01.20.02: bugfix! In very particular cases the last pass was skipped in the float code-path, leading to incorrect sorting......
+ * - 01.02.02: - "mIndices" renamed => "mRanks". That's a rank sorter after all.
+ * - ranks are not "reset" anymore, but implicit on first calls
+ * - 07.05.02: - offsets rewritten with one less indirection.
+ * - 11.03.02: - "bool" replaced with RadixHint enum
+ *
+ * \class RadixSort
+ * \author Pierre Terdiman
+ * \version 1.4
+ * \date August, 15, 1998
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+/*
+To do:
+ - add an offset parameter between two input values (avoid some data recopy sometimes)
+ - unroll ? asm ?
+ - 11 bits trick & 3 passes as Michael did
+ - prefetch stuff the day I have a P3
+ - make a version with 16-bits indices ?
+*/
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Precompiled Header
+#include "StdAfx.h"
+
+using namespace IceCore;
+
+#define INVALIDATE_RANKS mCurrentSize|=0x80000000
+#define VALIDATE_RANKS mCurrentSize&=0x7fffffff
+#define CURRENT_SIZE (mCurrentSize&0x7fffffff)
+#define INVALID_RANKS (mCurrentSize&0x80000000)
+
+#define CHECK_RESIZE(n) \
+ if(n!=mPreviousSize) \
+ { \
+ if(n>mCurrentSize) Resize(n); \
+ else ResetRanks(); \
+ mPreviousSize = n; \
+ }
+
+#define CREATE_HISTOGRAMS(type, buffer) \
+ /* Clear counters/histograms */ \
+ ZeroMemory(mHistogram, 256*4*sizeof(udword)); \
+ \
+ /* Prepare to count */ \
+ ubyte* p = (ubyte*)input; \
+ ubyte* pe = &p[nb*4]; \
+ udword* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */ \
+ udword* h1= &mHistogram[256]; /* Histogram for second pass */ \
+ udword* h2= &mHistogram[512]; /* Histogram for third pass */ \
+ udword* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */ \
+ \
+ bool AlreadySorted = true; /* Optimism... */ \
+ \
+ if(INVALID_RANKS) \
+ { \
+ /* Prepare for temporal coherence */ \
+ type* Running = (type*)buffer; \
+ type PrevVal = *Running; \
+ \
+ while(p!=pe) \
+ { \
+ /* Read input buffer in previous sorted order */ \
+ type Val = *Running++; \
+ /* Check whether already sorted or not */ \
+ if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
+ /* Update for next iteration */ \
+ PrevVal = Val; \
+ \
+ /* Create histograms */ \
+ h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
+ } \
+ \
+ /* If all input values are already sorted, we just have to return and leave the */ \
+ /* previous list unchanged. That way the routine may take advantage of temporal */ \
+ /* coherence, for example when used to sort transparent faces. */ \
+ if(AlreadySorted) \
+ { \
+ mNbHits++; \
+ for(udword i=0;i<nb;i++) mRanks[i] = i; \
+ return *this; \
+ } \
+ } \
+ else \
+ { \
+ /* Prepare for temporal coherence */ \
+ udword* Indices = mRanks; \
+ type PrevVal = (type)buffer[*Indices]; \
+ \
+ while(p!=pe) \
+ { \
+ /* Read input buffer in previous sorted order */ \
+ type Val = (type)buffer[*Indices++]; \
+ /* Check whether already sorted or not */ \
+ if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */ \
+ /* Update for next iteration */ \
+ PrevVal = Val; \
+ \
+ /* Create histograms */ \
+ h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
+ } \
+ \
+ /* If all input values are already sorted, we just have to return and leave the */ \
+ /* previous list unchanged. That way the routine may take advantage of temporal */ \
+ /* coherence, for example when used to sort transparent faces. */ \
+ if(AlreadySorted) { mNbHits++; return *this; } \
+ } \
+ \
+ /* Else there has been an early out and we must finish computing the histograms */ \
+ while(p!=pe) \
+ { \
+ /* Create histograms without the previous overhead */ \
+ h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++; \
+ }
+
+#define CHECK_PASS_VALIDITY(pass) \
+ /* Shortcut to current counters */ \
+ udword* CurCount = &mHistogram[pass<<8]; \
+ \
+ /* Reset flag. The sorting pass is supposed to be performed. (default) */ \
+ bool PerformPass = true; \
+ \
+ /* Check pass validity */ \
+ \
+ /* If all values have the same byte, sorting is useless. */ \
+ /* It may happen when sorting bytes or words instead of dwords. */ \
+ /* This routine actually sorts words faster than dwords, and bytes */ \
+ /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
+ /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
+ \
+ /* Get first byte */ \
+ ubyte UniqueVal = *(((ubyte*)input)+pass); \
+ \
+ /* Check that byte's counter */ \
+ if(CurCount[UniqueVal]==nb) PerformPass=false;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ * Constructor.
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort::RadixSort() : mRanks(null), mRanks2(null), mCurrentSize(0), mTotalCalls(0), mNbHits(0)
+{
+#ifndef RADIX_LOCAL_RAM
+ // Allocate input-independent ram
+ mHistogram = new udword[256*4];
+ mOffset = new udword[256];
+#endif
+ // Initialize indices
+ INVALIDATE_RANKS;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ * Destructor.
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort::~RadixSort()
+{
+ // Release everything
+#ifndef RADIX_LOCAL_RAM
+ DELETEARRAY(mOffset);
+ DELETEARRAY(mHistogram);
+#endif
+ DELETEARRAY(mRanks2);
+ DELETEARRAY(mRanks);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ * Resizes the inner lists.
+ * \param nb [in] new size (number of dwords)
+ * \return true if success
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+bool RadixSort::Resize(udword nb)
+{
+ // Free previously used ram
+ DELETEARRAY(mRanks2);
+ DELETEARRAY(mRanks);
+
+ // Get some fresh one
+ mRanks = new udword[nb]; CHECKALLOC(mRanks);
+ mRanks2 = new udword[nb]; CHECKALLOC(mRanks2);
+
+ return true;
+}
+
+inline_ void RadixSort::CheckResize(udword nb)
+{
+ udword CurSize = CURRENT_SIZE;
+ if(nb!=CurSize)
+ {
+ if(nb>CurSize) Resize(nb);
+ mCurrentSize = nb;
+ INVALIDATE_RANKS;
+ }
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ * Main sort routine.
+ * This one is for integer values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
+ * \param input [in] a list of integer values to sort
+ * \param nb [in] number of values to sort, must be < 2^31
+ * \param hint [in] RADIX_SIGNED to handle negative values, RADIX_UNSIGNED if you know your input buffer only contains positive values
+ * \return Self-Reference
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort& RadixSort::Sort(const udword* input, udword nb, RadixHint hint)
+{
+ // Checkings
+ if(!input || !nb || nb&0x80000000) return *this;
+
+ // Stats
+ mTotalCalls++;
+
+ // Resize lists if needed
+ CheckResize(nb);
+
+#ifdef RADIX_LOCAL_RAM
+ // Allocate histograms & offsets on the stack
+ udword mHistogram[256*4];
+// udword mOffset[256];
+ udword* mLink[256];
+#endif
+
+ // Create histograms (counters). Counters for all passes are created in one run.
+ // Pros: read input buffer once instead of four times
+ // Cons: mHistogram is 4Kb instead of 1Kb
+ // We must take care of signed/unsigned values for temporal coherence.... I just
+ // have 2 code paths even if just a single opcode changes. Self-modifying code, someone?
+ if(hint==RADIX_UNSIGNED) { CREATE_HISTOGRAMS(udword, input); }
+ else { CREATE_HISTOGRAMS(sdword, input); }
+
+ // Compute #negative values involved if needed
+ udword NbNegativeValues = 0;
+ if(hint==RADIX_SIGNED)
+ {
+ // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
+ // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
+ // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
+ udword* h3= &mHistogram[768];
+ for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
+ }
+
+ // Radix sort, j is the pass number (0=LSB, 3=MSB)
+ for(udword j=0;j<4;j++)
+ {
+ CHECK_PASS_VALIDITY(j);
+
+ // Sometimes the fourth (negative) pass is skipped because all numbers are negative and the MSB is 0xFF (for example). This is
+ // not a problem, numbers are correctly sorted anyway.
+ if(PerformPass)
+ {
+ // Should we care about negative values?
+ if(j!=3 || hint==RADIX_UNSIGNED)
+ {
+ // Here we deal with positive values only
+
+ // Create offsets
+// mOffset[0] = 0;
+// for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
+ mLink[0] = mRanks2;
+ for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
+ }
+ else
+ {
+ // This is a special case to correctly handle negative integers. They're sorted in the right order but at the wrong place.
+
+ // Create biased offsets, in order for negative numbers to be sorted as well
+// mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
+ mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
+// for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
+ for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
+
+ // Fixing the wrong place for negative values
+// mOffset[128] = 0;
+ mLink[128] = mRanks2;
+// for(i=129;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
+ for(udword i=129;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
+ }
+
+ // Perform Radix Sort
+ ubyte* InputBytes = (ubyte*)input;
+ InputBytes += j;
+ if(INVALID_RANKS)
+ {
+// for(udword i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
+ for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
+ VALIDATE_RANKS;
+ }
+ else
+ {
+ udword* Indices = mRanks;
+ udword* IndicesEnd = &mRanks[nb];
+ while(Indices!=IndicesEnd)
+ {
+ udword id = *Indices++;
+// mRanks2[mOffset[InputBytes[id<<2]]++] = id;
+ *mLink[InputBytes[id<<2]]++ = id;
+ }
+ }
+
+ // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
+ udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
+ }
+ }
+ return *this;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ * Main sort routine.
+ * This one is for floating-point values. After the call, mRanks contains a list of indices in sorted order, i.e. in the order you may process your data.
+ * \param input [in] a list of floating-point values to sort
+ * \param nb [in] number of values to sort, must be < 2^31
+ * \return Self-Reference
+ * \warning only sorts IEEE floating-point values
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+RadixSort& RadixSort::Sort(const float* input2, udword nb)
+{
+ // Checkings
+ if(!input2 || !nb || nb&0x80000000) return *this;
+
+ // Stats
+ mTotalCalls++;
+
+ udword* input = (udword*)input2;
+
+ // Resize lists if needed
+ CheckResize(nb);
+
+#ifdef RADIX_LOCAL_RAM
+ // Allocate histograms & offsets on the stack
+ udword mHistogram[256*4];
+// udword mOffset[256];
+ udword* mLink[256];
+#endif
+
+ // Create histograms (counters). Counters for all passes are created in one run.
+ // Pros: read input buffer once instead of four times
+ // Cons: mHistogram is 4Kb instead of 1Kb
+ // Floating-point values are always supposed to be signed values, so there's only one code path there.
+ // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
+ // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
+ // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
+ // wouldn't work with mixed positive/negative values....
+ { CREATE_HISTOGRAMS(float, input2); }
+
+ // Compute #negative values involved if needed
+ udword NbNegativeValues = 0;
+ // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
+ // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
+ // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
+ udword* h3= &mHistogram[768];
+ for(udword i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
+
+ // Radix sort, j is the pass number (0=LSB, 3=MSB)
+ for(udword j=0;j<4;j++)
+ {
+ // Should we care about negative values?
+ if(j!=3)
+ {
+ // Here we deal with positive values only
+ CHECK_PASS_VALIDITY(j);
+
+ if(PerformPass)
+ {
+ // Create offsets
+// mOffset[0] = 0;
+ mLink[0] = mRanks2;
+// for(udword i=1;i<256;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1];
+ for(udword i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
+
+ // Perform Radix Sort
+ ubyte* InputBytes = (ubyte*)input;
+ InputBytes += j;
+ if(INVALID_RANKS)
+ {
+// for(i=0;i<nb;i++) mRanks2[mOffset[InputBytes[i<<2]]++] = i;
+ for(udword i=0;i<nb;i++) *mLink[InputBytes[i<<2]]++ = i;
+ VALIDATE_RANKS;
+ }
+ else
+ {
+ udword* Indices = mRanks;
+ udword* IndicesEnd = &mRanks[nb];
+ while(Indices!=IndicesEnd)
+ {
+ udword id = *Indices++;
+// mRanks2[mOffset[InputBytes[id<<2]]++] = id;
+ *mLink[InputBytes[id<<2]]++ = id;
+ }
+ }
+
+ // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
+ udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
+ }
+ }
+ else
+ {
+ // This is a special case to correctly handle negative values
+ CHECK_PASS_VALIDITY(j);
+
+ if(PerformPass)
+ {
+ // Create biased offsets, in order for negative numbers to be sorted as well
+// mOffset[0] = NbNegativeValues; // First positive number takes place after the negative ones
+ mLink[0] = &mRanks2[NbNegativeValues]; // First positive number takes place after the negative ones
+// for(udword i=1;i<128;i++) mOffset[i] = mOffset[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
+ for(udword i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
+
+ // We must reverse the sorting order for negative numbers!
+// mOffset[255] = 0;
+ mLink[255] = mRanks2;
+// for(i=0;i<127;i++) mOffset[254-i] = mOffset[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
+ for(udword i=0;i<127;i++) mLink[254-i] = mLink[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
+// for(i=128;i<256;i++) mOffset[i] += CurCount[i]; // Fixing the wrong place for negative values
+ for(udword i=128;i<256;i++) mLink[i] += CurCount[i]; // Fixing the wrong place for negative values
+
+ // Perform Radix Sort
+ if(INVALID_RANKS)
+ {
+ for(udword i=0;i<nb;i++)
+ {
+ udword Radix = input[i]>>24; // Radix byte, same as above. AND is useless here (udword).
+ // ### cmp to be killed. Not good. Later.
+// if(Radix<128) mRanks2[mOffset[Radix]++] = i; // Number is positive, same as above
+// else mRanks2[--mOffset[Radix]] = i; // Number is negative, flip the sorting order
+ if(Radix<128) *mLink[Radix]++ = i; // Number is positive, same as above
+ else *(--mLink[Radix]) = i; // Number is negative, flip the sorting order
+ }
+ VALIDATE_RANKS;
+ }
+ else
+ {
+ for(udword i=0;i<nb;i++)
+ {
+ udword Radix = input[mRanks[i]]>>24; // Radix byte, same as above. AND is useless here (udword).
+ // ### cmp to be killed. Not good. Later.
+// if(Radix<128) mRanks2[mOffset[Radix]++] = mRanks[i]; // Number is positive, same as above
+// else mRanks2[--mOffset[Radix]] = mRanks[i]; // Number is negative, flip the sorting order
+ if(Radix<128) *mLink[Radix]++ = mRanks[i]; // Number is positive, same as above
+ else *(--mLink[Radix]) = mRanks[i]; // Number is negative, flip the sorting order
+ }
+ }
+ // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
+ udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
+ }
+ else
+ {
+ // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
+ if(UniqueVal>=128)
+ {
+ if(INVALID_RANKS)
+ {
+ // ###Possible?
+ for(udword i=0;i<nb;i++) mRanks2[i] = nb-i-1;
+ VALIDATE_RANKS;
+ }
+ else
+ {
+ for(udword i=0;i<nb;i++) mRanks2[i] = mRanks[nb-i-1];
+ }
+
+ // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
+ udword* Tmp = mRanks; mRanks = mRanks2; mRanks2 = Tmp;
+ }
+ }
+ }
+ }
+ return *this;
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+/**
+ * Gets the ram used.
+ * \return memory used in bytes
+ */
+///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+udword RadixSort::GetUsedRam() const
+{
+ udword UsedRam = sizeof(RadixSort);
+#ifndef RADIX_LOCAL_RAM
+ UsedRam += 256*4*sizeof(udword); // Histograms
+ UsedRam += 256*sizeof(udword); // Offsets
+#endif
+ UsedRam += 2*CURRENT_SIZE*sizeof(udword); // 2 lists of indices
+ return UsedRam;
+}