Hello group,

I've come from C/C++ and am now trying to code some Python because I
absolutely love the language. However I still have trouble getting
Python code to run efficiently. Right now I have a easy task: Get a
file, split it up into a million chunks, count the most prominent
character in each chunk and output that value into a file - in other
words: Say we have a 2 GB file, we evaluate what character is most
prominent in filepos [0, 2048[ - say it's a "A", then put a 65 in there
(ord("A")).

I've first tried Python. Please don't beat me, it's slow as hell and
probably a horrible solution:

#!/usr/bin/python
import sys
import os

f = open(sys.argv[1], "r")
filesize = os.stat(sys.argv[1])[6]

width = 1024
height = 1024
pixels = width * height
blocksize = filesize / width / height

print("Filesize       : %d" % (filesize))
print("Image size     : %dx%d" % (width, height))
print("Bytes per Pixel: %d" % (blocksize))

picture = { }
havepixels = 0
while True:
        data = f.read(blocksize)
        if len(data) <= 0: break

        datamap = { }
        for i in range(len(data)):
                datamap[ord(data[i])] = datamap.get(data[i], 0) + 1

        maxchr = None
        maxcnt = None
        for (char, count) in datamap.items():
                if (maxcnt is None) or (count > maxcnt):
                        maxcnt = count
                        maxchr = char

        most = maxchr
        
        posx = havepixels % width
        posy = havepixels / width
        
        havepixels += 1
        if (havepixels % 1024) == 0:
                print("Progresss %s: %.1f%%" % (sys.argv[1], 100.0 * havepixels 
/ pixels))

        picture[(posx, posy)] = most

pic = open(sys.argv[1] + ".pgm", "w")
pic.write("P2\n")
pic.write("# CREATOR: Crappyass Python Script\n")
pic.write("%d %d\n" % (width, height))
pic.write("255\n")
for y in range(height):
        for x in range(width):
                pos = (x, y)
                most = picture.get(pos, -1)
                pic.write("%d\n" % (most))

As this was horribly slow (20 Minutes for a 2GB file) I coded the whole
thing in C also:

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>

#define BLOCKSIZE 2048

int main(int argc, char **argv) {
        unsigned int count[256];
        int width, height;
        FILE *f;
        FILE *in;
        width = 1024;
        height = 1024;
        char temp[2048];

        if (argc != 2) { fprintf(stderr, "Argument?\n"); exit(2); }

        in = fopen(argv[1], "r");
        if (!in) { perror("fopen"); exit(1); }

        snprintf(temp, 255, "%s.pgm", argv[1]);
        f = fopen(temp, "w");
        if (!f) { perror("fopen"); exit(1); }
        
        fprintf(f, "P2\n");
        fprintf(f, "# CREATOR: C\n");
        fprintf(f, "%d %d\n", width, height);
        fprintf(f, "255\n");

        width = 1024;
        height = 1024;
        while (fread(temp, 1, sizeof(temp), in) == sizeof(temp)) {
                int i;
                memset(count, 0, sizeof(count));
                for (i = 0; i < sizeof(temp); i++) {
                        count[(int)temp[i]]++;
                }

                int greatest;
                int maxcount;

                greatest = 0;
                maxcount = count[0];
                for (i = 1; i < 256; i++) {
                        if (count[i] > maxcount) {
                                maxcount = count[i];
                                greatest = i;
                        }
                }

                fprintf(f, "%d\n", greatest);
        }

        fclose(f);
        fclose(in);
        return 0;
}

Which takes about 40 seconds. I want the niceness of Python but a little
more speed than I'm getting (I'd settle for factor 2 or 3 slower, but
factor 30 is just too much).

Can anyone point out how to solve this efficiently in Python?

Kind regards,
Johannes

-- 
"Meine Gegenklage gegen dich lautet dann auf bewusste Verlogenheit,
verlästerung von Gott, Bibel und mir und bewusster Blasphemie."
         -- Prophet und Visionär Hans Joss aka HJP in de.sci.physik
                         <48d8bf1d$0$7510$54022...@news.sunrise.ch>
--
http://mail.python.org/mailman/listinfo/python-list

Reply via email to