大数据的Quicksort实施似乎太慢

我编写了一个程序来比较数组大小分别为10、1,000、100,000、1,000,000和10,000,000的不同排序算法。我当然希望Insertion在10上胜出并合并堆，并迅速真正在较高级别上发挥作用。

但是，下面是我得到的机会。（请注意，即使它们都是\ $ \ theta（n \ log n）\ $，快速排序的速度也比堆和合并快得多。）

我考虑过的事情：

每种算法都使用相同的种子来初始化每个数组，因此数字是相同的。
我只是在计时算法而已。并且不知道出了什么问题，但是也许我们俩都缺少了一些东西。
我将程序从闪存驱动器移到了台式机上，以测试可能的数据传输问题。）仅在夜间运行，没有其他任何作用。

 Key: Hours:Minutes:Seconds:Milliseconds:Microseconds

测试3

 n        Insertion        Merge            Heap             Quick            
10       0:0:0:0:3        0:0:0:0:28       0:0:0:0:35       0:0:0:0:43       
1000     0:0:0:11:470     0:0:0:2:358      0:0:0:7:787      0:0:0:3:596      
100000   0:0:1:911:865    0:0:0:51:506     0:0:0:24:257     0:0:0:55:519     
1000000  0:3:59:769:105   0:0:0:351:129    0:0:0:238:878    0:0:0:916:885    
10000000 8:11:44:552:820  0:0:3:521:108    0:0:3:560:178    0:1:13:709:830

测试2

 n              Insertion      Merge          Heap           Quick          
10             0:0:0:0:5      0:0:0:0:49     0:0:0:0:37     0:0:0:0:50     
1000           0:0:0:15:473   0:0:0:2:893    0:0:0:8:402    0:0:0:5:230    
100000         0:0:2:518:566  0:0:0:57:845   0:0:0:32:917   0:0:0:71:243   
1000000        0:5:38:538:795 0:0:0:460:796  0:0:0:312:66   0:0:1:398:508  
10000000       11:48:6:630:6390:0:3:690:329  0:0:3:518:281  0:1:18:180:11

测试1

          n    Insertion        Merge         Heap        Quick
       10       2676ns      19626ns      26316ns      33454ns
     1000   11504040ns    2298935ns    6835250ns    3741456ns
   100000 1849274815ns   47654052ns   23620952ns   52819295ns
  1000000     0:3:58ns      0:0:0ns      0:0:0ns      0:0:0ns
 10000000    8:10:25ns      0:0:3ns      0:0:3ns     0:1:15ns

H这是我的快速排序实现（35行）：

 public static long quick(int[] list) {
    long startTime = System.nanoTime();

    quickSort(list, 0, list.length - 1);

    long endTime = System.nanoTime();
    return endTime - startTime;
}

public static void quickSort(int[] A, int p, int r) {
    if(p < r) {
        int q = randomizedPartition(A, p, r);
        quickSort(A, p, q-1);
        quickSort(A, q+1, r);
    }
}

public static int randomizedPartition(int[] A, int p, int r) {
    Random random = new Random();
    int i = random.nextInt((r-p)+1)+p;
    swap(A,r,i);
    return partition(A,p,r);
}

public static int partition(int[] A, int p, int r) {
    int x = A[r];
    int i = p-1;
     for(int j = p; j < r; j++) {
        if(A[j] <= x) {
            i++;
            swap(A, i, j);
        }
    }
    swap(A, i+1, r);
    return i+1;
}

如果需要（267行），这是我的整个代码：

 import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.io.*;

public class algComp {
public static void main(String[] args) {
    driver(10); // Sort array of length 10
    driver(1_000); // Sort array of length 1000
    driver(100_000);

    /* WARNING: Running program with the below values takes a lot of time!! */
    driver(1_000_000);
    //driver(10_000_000);
    /* You are now leaving the danger zone. */

    System.out.println("-----------------------------------------------");
    content = String.format(content + "\nKey: Hours:Minutes:Seconds:Milliseconds:Microseconds");
    printToFile(); // Prints data to times.txt
}

public static void driver(int n) {

    // Insertion sort
    int[] list = data(n);
    if(n == 10) {
        System.out.format("%10s","Unsorted: ");
        printList(list);
    }
    long iTime = insertion(list);
    if(n == 10) {
        System.out.format("%10s","iSorted: ");
        printList(list);
    }

    // Merge sort
    list = data(n);
    long mTime = mergeSort(list);
    if(n == 10) {
        System.out.format("%10s","mSorted: ");
        printList(list);
    }

    // Heap sort
    list=data(n);
    long hTime = heap(list);
    if(n == 10) {
        System.out.format("%10s","hSorted: ");
        printList(list);
    }

    // Quick sort
    list=data(n);
    long qTime = quick(list);
    if(n == 10) {
        System.out.format("%10s","qSorted: ");
        printList(list);
    }

    if(n == 10) { // This will only print once
        // Print prettifying stuff
        System.out.println("Data is being written to times.txt...");
        content = String.format(content + "%-9s%-17s%-17s%-17s%-17s\n",
            "n","Insertion","Merge","Heap","Quick");
    }

    content = String.format(content + "%-9d%-17s%-17s%-17s%-17s%-1s",n,displayTime(iTime),
    displayTime(mTime),displayTime(hTime),displayTime(qTime),"\n");
}

public static long insertion(int[] A) {
    long startTime = System.nanoTime();
    int i, j, key;
    for(j = 1; j < A.length; j++) {
        key = A[j];
        // If previous is greater than selected (key) swap
        for(i = j - 1; (i >= 0) && (A[i] > key); i--) {
            A[i+1] = A[i];
        }
        A[i+1] = key;
    }
    long endTime = System.nanoTime();
    return endTime - startTime;
}

public static long mergeSort(int[] A) {
    long startTime = System.nanoTime();

    if(A.length > 1) {
        // First Half
        int[] firstHalf = new int[A.length/2];
        System.arraycopy(A, 0, firstHalf, 0, A.length/2);
        mergeSort(firstHalf);

        // Second Half
        int secondHalfLength = A.length - A.length/2;
        int[] secondHalf = new int[secondHalfLength];
        System.arraycopy(A, A.length/2, secondHalf, 0, secondHalfLength);
        mergeSort(secondHalf);

        // Merge two arrays
        merge(firstHalf,secondHalf,A);
    }

    long endTime = System.nanoTime();
    return endTime - startTime;
}

public static void merge(int[] A1, int[] A2, int[] temp) {
    int current1 = 0; // Current index in list 1
    int current2 = 0; // Current index in list 2
    int current3 = 0; // Current index in temp

    // Compares elemets in A1 and A2 and sorts them
    while(current1 < A1.length && current2 < A2.length) {
        if(A1[current1] < A2[current2])
            temp[current3++] = A1[current1++];
        else
            temp[current3++] = A2[current2++];
    }

    // Merge two arrays into temp
    while(current1 < A1.length)
        temp[current3++] = A1[current1++];

    while(current2 < A2.length)
        temp[current3++] = A2[current2++];
}

public static long heap(int[] A) {
    long startTime = System.nanoTime();
    int temp;

    int heapSize = A.length-1;
    buildMaxHeap(A);
    for(int i = A.length-1; i >= 1; i--) {
        swap(A,0,i); // Root is now biggest element, swap to end of array
        heapSize--; // Reduce heapSize to ignore sorted elements
        maxHeapify(A,0,heapSize);
    }


    long endTime = System.nanoTime();
    return endTime - startTime;
}

public static void buildMaxHeap(int[] A) {
    int heapSize = A.length-1;
    // Bottom up, check parents children, sort and move up tree
    for(int i = (heapSize/2); i >= 0; i--)
        maxHeapify(A,i,heapSize);
}

public static void maxHeapify(int[] A, int i, int heapSize) {
    int temp,largest;
    int l = left(i); // 2i
    int r = right(i); // 2i + 1

    if(l <= heapSize && A[l] > A[i]) // Check left child (which is largest?)
        largest = l;
    else largest = i;
    if(r <= heapSize && A[r] > A[largest]) // Check right child
        largest = r;
    if(largest != i) { // If parent is biggest do nothing, else make parent largest
        swap(A,i,largest);
        maxHeapify(A,largest,heapSize);
    }
}

public static int left(int i) {
    return 2*i;
}

public static int right(int i) {
    return 2*i+1;
}

public static long quick(int[] list) {
    long startTime = System.nanoTime();

    quickSort(list, 0, list.length - 1);

    long endTime = System.nanoTime();
    return endTime - startTime;
}

public static void quickSort(int[] A, int p, int r) {
    if(p < r) {
        int q = randomizedPartition(A, p, r);
        quickSort(A, p, q-1);
        quickSort(A, q+1, r);
    }
}

public static int randomizedPartition(int[] A, int p, int r) {
    Random random = new Random();
    int i = random.nextInt((r-p)+1)+p;
    swap(A,r,i);
    return partition(A,p,r);
}

public static int partition(int[] A, int p, int r) {
    int x = A[r];
    int i = p-1;
    for(int j = p; j < r; j++) {
        if(A[j] <= x) {
            i++;
            swap(A, i, j);
        }
    }
    swap(A, i+1, r);
    return i+1;
}

public static void swap(int[] list, int i, int j) {
    int temp = list[i];
    list[i] = list[j];
    list[j] = temp;
}

public static String displayTime(long n) {
    long hours = TimeUnit.NANOSECONDS.toHours(n);
    long minutes = TimeUnit.NANOSECONDS.toMinutes(n) - (TimeUnit.NANOSECONDS.toHours(n)*60);
    long seconds = TimeUnit.NANOSECONDS.toSeconds(n) - (TimeUnit.NANOSECONDS.toMinutes(n) *60);
    long milliseconds = TimeUnit.NANOSECONDS.toMillis(n) - (TimeUnit.NANOSECONDS.toSeconds(n)*1000);
    long microseconds = TimeUnit.NANOSECONDS.toMicros(n) - (TimeUnit.NANOSECONDS.toMillis(n)*1000);
    String displayThis = (hours + ":" + minutes + ":" + seconds + ":" + milliseconds + ":" + microseconds);
    return displayThis;
}

public static int[] data(int n) {
    Random random = new Random(seed); // Random seed stays same for all sorts
    int[] list = new int[n];

    for(int i = 0; i < list.length; i++) {
        list[i] = random.nextInt(1000);
    }

    return list;
}

public static void printList(int[] list) {
    for(int i = 0; i < list.length; i++) {
            System.out.format("%5d",list[i]);
    }
    System.out.println();
}

public static void printToFile() {
    // Print to file
    try {
        File file = new File("times.txt");
        if(!file.exists())
            file.createNewFile();
        FileWriter fw = new FileWriter(file.getAbsoluteFile());
        BufferedWriter bw = new BufferedWriter(fw);
        bw.write(content);
        bw.close();
        System.out.println("Done.");
    } catch (IOException e) {
        e.printStackTrace();
    }
}

// Global variables
public static String content = ""; // Used to print data to text file times.txt
public static int seed = (int)(Math.random()*10_000); // Seed for generating lists
}

做什么您认为？当然，快速排序应该在3秒钟（1000万分钟）而不是一分钟的时间内运行。我在做什么错？

欢迎使用代码审查！我喜欢您已经包含了整个代码+1！希望整个事情都能得到审查；）

@Mats Mug谢谢，尽管我最担心为什么我得到意想不到的输出，但我还是感谢任何评论。期待任何想法。 ;）

@mjolka这是标准实现。稍后我会尝试您说的内容，但不确定如何获取。有什么想法吗？

我明天（早期）将使用它，但是按原样使用我的代码不会导致任何错误。

更新了我的答案，解释了为什么它很慢。

#1 楼

@mjolka能够在我之前确定根本问题；-)除了他指出的问题之外，还有两个问题。

第一个问题是创建新的Random实例的形式非常糟糕。在每个分区上。随机实例在某些逻辑结构上进行内部同步，并在创建过程中造成大量开销。相反，您应该使用不同的分区机制。

此外，您的命名也很糟糕：

您的类名以小写字母开头>您的变量的长度为1个字母，有些为大写（A，p，q，r，...）。数据生成器：

public static int[] data(int n) {
    Random random = new Random(seed); // Random seed stays same for all sorts
    int[] list = new int[n];

    for(int i = 0; i < list.length; i++) {
        list[i] = random.nextInt(list.length * 10);
    }

    return list;
}

当您用此代码运行代码时，得到的结果是：

n        Insertion        Merge            Heap             Quick            
10       0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:0:34   
1000     0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:2:480  
100000   0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:21:987 
1000000  0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:139:194
10000000 0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:1:468:183

结果，您不会遇到太多的重复数据问题，而且排序很快。值，结果是：

n        Insertion        Merge            Heap             Quick        
10       0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:0:17   
1000     0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:0:708  
100000   0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:18:462 
1000000  0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:103:790
10000000 0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:1:182:385

最后，我使用等值分组实现了自己的排序，并将数据限制为1000，结果是：

n        Insertion        Merge            Heap             Quick            Monkey           
10       0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:0:16       0:0:0:0:9        
1000     0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:0:562      0:0:0:0:684      
100000   0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:16:675     0:0:0:18:119     
1000000  0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:532:437    0:0:0:65:608     
10000000 0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:48:533:774   0:0:0:694:184

随着随机数据的增多，猴子的排序速度变慢，而快速排序的速度加快：

n        Insertion        Merge            Heap             Quick            Monkey           
10       0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:0:16       0:0:0:0:8        
1000     0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:0:631      0:0:0:0:819      
100000   0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:16:834     0:0:0:22:353     
1000000  0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:0:103:699    0:0:0:127:421    
10000000 0:0:0:0:0        0:0:0:0:0        0:0:0:0:0        0:0:1:223:673    0:0:1:372:614

最重要的一点是，即使有良好的数据，您仍有20％的时间用于创建新的Random实例。我一直在使用randomizePartition方法：

public static int randomizedPartition(int[] data, int first, int last) {
    // Random random = new Random();
    // int i = random.nextInt((last-first)+1)+first;
    swap(data,last,first + (last - first)/2);
    return partition(data,first,last);

然后，如果您对猴子排序感兴趣：

public static long monkey(int[] list) {
    long startTime = System.nanoTime();
    monkeySort(list, 0, list.length - 1);
    return System.nanoTime() - startTime;
}

private static void monkeySort(final int[] data, final int left, final int right) {

    if (left >= right) {
        return;
    }

    // partition in this method because we need two outputs, the 'same' and the 'lft'.
    // swap values the same as the partition to the end as well.
    final int val = data[right];
    int lft = left;
    int rht = right - 1;
    int same = right;
    while (lft <= rht) {
        if (data[lft] > val) {
            swap(data, lft, rht);
            rht--;
        } else if (data[rht] == val) {
            same--;
            swap(data, rht, same);
            rht--;
        } else {
            lft++;
        }
    }

    // move all the 'same' values in to the pivot point.
    int ntop = lft - 1;
    while (same <= right) {
        swap(data, lft++, same++);
    }
    monkeySort(data, left, ntop);
    monkeySort(data, lft, right);

}

有关Random（）的更多详细信息

我引用了Random，值得对我的意思有更多的了解。这是java.util.Random的（略有重组）源代码：

private static final AtomicLong seedUniquifier
    = new AtomicLong(8682522807148012L);

public Random() {
    this(seedUniquifier() ^ System.nanoTime());
}

private static long seedUniquifier() {
    // L'Ecuyer, "Tables of Linear Congruential Generators of
    // Different Sizes and Good Lattice Structure", 1999
    for (;;) {
        long current = seedUniquifier.get();
        long next = current * 181783497276652981L;
        if (seedUniquifier.compareAndSet(current, next))
            return next;
    }
}

public Random(long seed) {
    if (getClass() == Random.class)
        this.seed = new AtomicLong(initialScramble(seed));
    else {
        // subclass might have overriden setSeed
        this.seed = new AtomicLong();
        setSeed(seed);
    }
}

请注意，有一个名为new Random()的静态AtomicLong。每次创建新的seedUniquifier时，都会对该AtomicLong进行两个引用，从而导致许多不必要的内存影响。此外，其中存在潜在的竞争条件，可能导致进程循环并重试。

Random具有单个AtomicLong引用来确保Random类是线程安全的，这已经有点不好了但是整个Random构造函数本质上也是全局安全的（实际上，您不能同时使用相同的种子创建两个Random实例）。该要求的实施成本高昂。

\ $ \ begingroup \ $
我不确定你的意思是什么。您是在说您完全取消了随机处理并使用了常规的快速排序吗？如果是这样，我同意这在某种程度上可能会更好，因为我们已经可以假设数据足够随机，因此不需要它。
\ $ \ endgroup \ $
– Ethan
2014年10月21日在16:06

\ $ \ begingroup \ $
关于随机的要点是，您不应在每个分区上创建一个新分区。如果您可以在堆栈中传递一个实例，那将是一个更好的解决方案。新的Random（）应该非常谨慎地使用。
\ $ \ endgroup \ $
–rolfl
2014年10月21日在16:09

\ $ \ begingroup \ $
我从random partition（）中删除了新的Random（），然后继续用洗澡水将婴儿扔出去，消除了random.nextInt（）。一种替代方法是实际实例化QuickSorter并使用Random数据成员。
\ $ \ endgroup \ $
–灰胡子
17年12月21日在15:46

#2 楼

Quicksort的此实现对具有许多重复元素的阵列的性能较差。来自Wikipedia，重点在于

使用上述分区算法（即使使用选择好的枢轴值的偶数
），quicksort的表现也很差。用于包含许多重复元素的输入。当所有输入元素都相等时，
问题就很明显了：在
每次递归时，左分区为空（输入值不小于枢轴），而右分区具有仅减少了一个
元素（枢轴已删除）。因此，该算法需要二次时间来对等值数组进行排序。将值分为三组：小于枢轴的值，等于枢轴的值和大于枢轴的值。

通过在输入上运行它可以看到这种二次行为例如，new int[10000]。实际上，您很可能会得到StackOverflowError。现在，在测试数据中，您有\ $ 10 {，} 000 {，} 000 \ $个元素，但您只是在选择随机值在\ $ [0,1 {，} 000）\ $范围内。所以...您有很多重复的元素！

让它在我的计算机上按原样运行（由于速度太慢，我没有运行插入排序）

$ java AlgComp && cat times.txt
Unsorted:   323  653  751   33  350  378  913  280  243  792
 mSorted:    33  243  280  323  350  378  653  751  792  913
 hSorted:    33  243  280  323  350  378  653  751  792  913
 qSorted:    33  243  280  323  350  378  653  751  792  913
Data is being written to times.txt...
-----------------------------------------------
Done.
n        Insertion        Merge            Heap             Quick
10       0:0:0:0:0        0:0:0:0:27       0:0:0:0:35       0:0:0:0:38
1000     0:0:0:0:0        0:0:0:1:852      0:0:0:0:822      0:0:0:3:224
100000   0:0:0:0:0        0:0:0:28:421     0:0:0:20:784     0:0:0:37:872
1000000  0:0:0:0:0        0:0:0:233:576    0:0:0:202:443    0:0:0:905:65
10000000 0:0:0:0:0        0:0:2:359:261    0:0:2:752:239    0:1:20:914:310

现在让我们在data中更改这一行：

list[i] = random.nextInt(1000);

为此

list[i] = random.nextInt(1000000);

现在结果更加符合我们的期望：

$ java AlgComp && cat times.txt
Unsorted: 1662389001535502665295332468356126461089888942823562420254
 mSorted: 2356224683166238420254529533550266561264610898889428900153
 hSorted: 2356224683166238420254529533550266561264610898889428900153
 qSorted: 2356224683166238420254529533550266561264610898889428900153
Data is being written to times.txt...
-----------------------------------------------
Done.
n        Insertion        Merge            Heap             Quick
10       0:0:0:0:0        0:0:0:0:21       0:0:0:0:98       0:0:0:0:56
1000     0:0:0:0:0        0:0:0:1:997      0:0:0:1:14       0:0:0:2:41
100000   0:0:0:0:0        0:0:0:27:223     0:0:0:22:562     0:0:0:21:587
1000000  0:0:0:0:0        0:0:0:283:939    0:0:0:215:551    0:0:0:137:658
10000000 0:0:0:0:0        0:0:2:899:176    0:0:3:681:388    0:0:1:845:255

当然，真正的解决方法是不改变data ，但要更改分区算法。

\ $ \ begingroup \ $
你知道吗？当我们谈论它时，我在课堂上提出了这个（显然，我是唯一完成该程序的人。
\ $ \ endgroup \ $
– Ethan
2014年10月21日在12:35

\ $ \ begingroup \ $
顺便说一句，我意识到声明一个新的Random对象有一个问题（根据folfl的回答），但您的解决方案或其他任何问题仍然无法解决。不知道您是否要重写任何内容，但是我已经运行了100遍，没有问题。无论如何，我很欣赏答案，这很好。 :)
\ $ \ endgroup \ $
– Ethan
2014年10月21日在17:15

\ $ \ begingroup \ $
@Ethan请在这里查看：http://en.wikipedia.org/wiki/Quicksort#Repeated_elements
\ $ \ endgroup \ $
–chbaker0
14-10-21在23:45

\ $ \ begingroup \ $
我从这里清除了很多分心的评论。
\ $ \ endgroup \ $
–rolfl
14-10-22在0:45

编程黑洞网

大数据的Quicksort实施似乎太慢

评论

#1 楼

评论

#2 楼

评论