On Tue, Apr 05, 2011 at 11:57:46PM -0700, elij wrote:
> - remove need to use mysql for generating the sql
> - just consider categories an integer range, specified to the size
>   of that in the aur-schema.
> - use the logging module instead of writing directly to stderr
>   this makes the code cleaner as it removes the numerous tests for the value
>   of DBUG, yet allows devs to control the level of output verbosity.
> ---
>  support/schema/gendummydata.py |  106 +++++++++------------------------------
>  1 files changed, 25 insertions(+), 81 deletions(-)
> 

I agree with both changes, but please split this into two separate
patches: one that removes the MySQL dependency and one that switches to
the logging module.

> diff --git a/support/schema/gendummydata.py b/support/schema/gendummydata.py
> index 7b1d0cf..8ed9f69 100755
> --- a/support/schema/gendummydata.py
> +++ b/support/schema/gendummydata.py
> @@ -15,9 +15,9 @@ import os
>  import sys
>  import cStringIO
>  import commands
> +import logging
>  
> -
> -DBUG      = 1
> +log_level = logging.DEBUG # logging level. set to logging.INFO to reduce output

I'm not a Python coder, but is there any reason to use lowercase here
when we use uppercase for all the other constants?

>  SEED_FILE = "/usr/share/dict/words"
>  DB_HOST   = os.getenv("DB_HOST", "localhost")
>  DB_NAME   = os.getenv("DB_NAME", "AUR")
> @@ -33,6 +33,7 @@ PKG_FILES = (8, 30)    # min/max number of files in a package
>  PKG_DEPS  = (1, 5)     # min/max depends a package has
>  PKG_SRC   = (1, 3)     # min/max sources a package has
>  PKG_CMNTS = (1, 5)     # min/max number of comments a package has
> +CATEGORIES_COUNT = 17  # the number of categories from aur-schema
>  VOTING    = (0, .30)   # percentage range for package voting
>  RANDOM_PATHS = (       # random path locations for package files
>       "/usr/bin", "/usr/lib", "/etc", "/etc/rc.d", "/usr/share", "/lib",
> @@ -45,44 +46,25 @@ RANDOM_URL = ("http://www.", "ftp://ftp.", "http://", "ftp://")
>  RANDOM_LOCS = ("pub", "release", "files", "downloads", "src")
>  FORTUNE_CMD = "/usr/bin/fortune -l"
>  
> +# setup logging
> +logformat = "%(levelname)s: %(message)s"
> +logging.basicConfig(format=logformat, level=log_level)
> +log = logging.getLogger()
>  
>  if len(sys.argv) != 2:
> -     sys.stderr.write("Missing output filename argument");
> +     log.error("Missing output filename argument")
>       raise SystemExit
>  
>  # make sure the seed file exists
>  #
>  if not os.path.exists(SEED_FILE):
> -     sys.stderr.write("Please install the 'words' Arch package\n");
> -     raise SystemExit
> -
> -# Make sure database access will be available
> -#
> -try:
> -     import MySQLdb
> -except:
> -     sys.stderr.write("Please install the 'mysql-python' Arch package\n");
> -     raise SystemExit
> -
> -# try to connect to database
> -#
> -try:
> -     db = MySQLdb.connect(host = DB_HOST, user = DB_USER,
> -                     db = DB_NAME, passwd = DB_PASS)
> -     dbc = db.cursor()
> -except:
> -     sys.stderr.write("Could not connect to database\n");
> +     log.error("Please install the 'words' Arch package")
>       raise SystemExit

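Shouldn't we use "sys.exit(1)" here instead of raising a bare SystemExit
exception? A bare "raise SystemExit" exits with status 0, so callers cannot
tell that the script failed; with sys.exit(1) we'd get a proper non-zero
exit status. This might be something to include in the debugging/error
handling patch.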

>  
> -esc = db.escape_string
> -
> -
>  # track what users/package names have been used
>  #
>  seen_users = {}
>  seen_pkgs = {}
> -categories = {}
> -category_keys = []
>  user_keys = []
>  
>  # some functions to generate random data
> @@ -95,14 +77,14 @@ def genVersion():
>               ver.append("%d" % random.randrange(0,100))
>       return ".".join(ver) + "-u%d" % random.randrange(1,11)
>  def genCategory():
> -     return categories[category_keys[random.randrange(0,len(category_keys))]]
> +     return random.randrange(0,CATEGORIES_COUNT)
>  def genUID():
>       return seen_users[user_keys[random.randrange(0,len(user_keys))]]
>  
>  
>  # load the words, and make sure there are enough words for users/pkgs
>  #
> -if DBUG: print "Grabbing words from seed file..."
> +log.debug("Grabbing words from seed file...")
>  fp = open(SEED_FILE, "r")
>  contents = fp.readlines()
>  fp.close()
> @@ -117,7 +99,7 @@ else:
>  
>  # select random usernames
>  #
> -if DBUG: print "Generating random user names..."
> +log.debug("Generating random user names...")
>  user_id = USER_ID
>  while len(seen_users) < MAX_USERS:
>       user = random.randrange(0, len(contents))
> @@ -130,7 +112,7 @@ user_keys = seen_users.keys()
>  
>  # select random package names
>  #
> -if DBUG: print "Generating random package names..."
> +log.debug("Generating random package names...")
>  num_pkgs = PKG_ID
>  while len(seen_pkgs) < MAX_PKGS:
>       pkg = random.randrange(0, len(contents))
> @@ -149,22 +131,6 @@ while len(seen_pkgs) < MAX_PKGS:
>  #
>  contents = None
>  
> -# Load package categories from database
> -#
> -if DBUG: print "Loading package categories..."
> -q = "SELECT * FROM PackageCategories"
> -dbc.execute(q)
> -row = dbc.fetchone()
> -while row:
> -     categories[row[1]] = row[0]
> -     row = dbc.fetchone()
> -category_keys = categories.keys()
> -
> -# done with the database
> -#
> -dbc.close()
> -db.close()
> -
>  # developer/tu IDs
>  #
>  developers = []
> @@ -179,8 +145,7 @@ out.write("BEGIN;\n")
>  
>  # Begin by creating the User statements
>  #
> -if DBUG: print "Creating SQL statements for users.",
> -count = 0
> +log.debug("Creating SQL statements for users.")
>  for u in user_keys:
>       account_type = 1  # default to normal user
>       if not has_devs or not has_tus:
> @@ -201,22 +166,18 @@ for u in user_keys:
>                       # a normal user account
>                       #
>                       pass
> -     
> +
>       s = "INSERT INTO Users (ID, AccountTypeID, Username, Email, Passwd) VALUES (%d, %d, '%s', '%s@example.com', MD5('%s'));\n" % (seen_users[u], account_type, u, u, u)
>       out.write(s)
> -     if count % 10 == 0:
> -             if DBUG: print ".",
> -     count += 1
> -if DBUG: print "."
> -if DBUG:
> -     print "Number of developers:", len(developers)
> -     print "Number of trusted users:", len(trustedusers)
> -     print "Number of users:", (MAX_USERS-len(developers)-len(trustedusers))
> -     print "Number of packages:", MAX_PKGS
> +
> +log.debug("Number of developers: %d" % len(developers))
> +log.debug("Number of trusted users: %d" % len(trustedusers))
> +log.debug("Number of users: %d" % (MAX_USERS-len(developers)-len(trustedusers)))
> +log.debug("Number of packages: %d" % MAX_PKGS)
>  
>  # Create the package statements
>  #
> -if DBUG: print "Creating SQL statements for packages.",
> +log.debug("Creating SQL statements for packages.")
>  count = 0
>  for p in seen_pkgs.keys():
>       NOW = int(time.time())
> @@ -237,26 +198,21 @@ for p in seen_pkgs.keys():
>                       genCategory(), NOW, uuid, muid)
>  
>       out.write(s)
> -     if count % 100 == 0:
> -             if DBUG: print ".",
>       count += 1
>  
>       # create random comments for this package
>       #
>       num_comments = random.randrange(PKG_CMNTS[0], PKG_CMNTS[1])
>       for i in range(0, num_comments):
> -             fortune = esc(commands.getoutput(FORTUNE_CMD).replace("'",""))
> +             fortune = commands.getoutput(FORTUNE_CMD).replace("'","")

Why did you drop escape_string() here?

>               now = NOW + random.randrange(400, 86400*3)
>               s = "INSERT INTO PackageComments (PackageID, UsersID, Comments, CommentTS) VALUES (%d, %d, '%s', %d);\n" % (seen_pkgs[p], genUID(), fortune, now)
>               out.write(s)
>  
> -if DBUG: print "."
> -
>  # Cast votes
>  #
>  track_votes = {}
> -if DBUG: print "Casting votes for packages.",
> -count = 0
> +log.debug("Casting votes for packages.")
>  for u in user_keys:
>       num_votes = random.randrange(int(len(seen_pkgs)*VOTING[0]),
>                       int(len(seen_pkgs)*VOTING[1]))
> @@ -270,9 +226,6 @@ for u in user_keys:
>                               track_votes[pkg] = 0
>                       track_votes[pkg] += 1
>                       out.write(s)
> -                     if count % 100 == 0:
> -                             if DBUG: print ".",
> -                     count += 1
>  
>  # Update statements for package votes
>  #
> @@ -282,8 +235,7 @@ for p in track_votes.keys():
>  
>  # Create package dependencies and sources
>  #
> -if DBUG: print "."; print "Creating statements for package depends/sources.",
> -count = 0
> +log.debug("Creating statements for package depends/sources.")
>  for p in seen_pkgs.keys():
>       num_deps = random.randrange(PKG_DEPS[0], PKG_DEPS[1])
>       this_deps = {}
> @@ -307,17 +259,9 @@ for p in seen_pkgs.keys():
>                               seen_pkgs[p], src)
>               out.write(s)
>  
> -     if count % 100 == 0:
> -             if DBUG: print ".",
> -     count += 1
> -
> -
>  # close output file
>  #
>  out.write("COMMIT;\n")
>  out.write("\n")
>  out.close()
> -
> -if DBUG: print "."
> -if DBUG: print "Done."
> -
> +log.debug("Done.")
> -- 
> 1.7.4.1

Reply via email to