Author: johan
Date: Sat Jun 12 13:08:51 2010
New Revision: 954000

URL: http://svn.apache.org/viewvc?rev=954000&view=rev
Log:
Remove references to -dev version of pig, add example script, use comparator 
singletons. Patch by Jeremy Hanna, review by johan. CASSANDRA-1150

Added:
    cassandra/trunk/contrib/pig/cassandra.yaml
    cassandra/trunk/contrib/pig/example-script.pig
Removed:
    cassandra/trunk/contrib/pig/storage-conf.xml
Modified:
    cassandra/trunk/NEWS.txt
    cassandra/trunk/contrib/pig/README.txt
    cassandra/trunk/contrib/pig/bin/pig_cassandra
    cassandra/trunk/contrib/pig/build.xml
    cassandra/trunk/contrib/word_count/src/WordCountSetup.java
    cassandra/trunk/src/java/org/apache/cassandra/cli/CliClient.java
    cassandra/trunk/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java
    
cassandra/trunk/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java

Modified: cassandra/trunk/NEWS.txt
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/NEWS.txt?rev=954000&r1=953999&r2=954000&view=diff
==============================================================================
--- cassandra/trunk/NEWS.txt (original)
+++ cassandra/trunk/NEWS.txt Sat Jun 12 13:08:51 2010
@@ -48,6 +48,12 @@ Thrift API
     - The get_string_property() method has been removed.
     - The get_string_list_property() method has been removed.
 
+Other
+-----
+    - If extending AbstractType, make sure you follow the singleton pattern
+      used by Cassandra's core AbstractType extensions: expose a public static
+      field called 'instance' (as BytesType does) and give the class an empty
+      constructor with default access.
 
 0.6.0
 =====

Modified: cassandra/trunk/contrib/pig/README.txt
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/contrib/pig/README.txt?rev=954000&r1=953999&r2=954000&view=diff
==============================================================================
--- cassandra/trunk/contrib/pig/README.txt (original)
+++ cassandra/trunk/contrib/pig/README.txt Sat Jun 12 13:08:51 2010
@@ -4,14 +4,22 @@ Setup:
 
 First build and start a Cassandra server with the default
 configuration* and set the PIG_HOME and JAVA_HOME environment
-variables to the location of a Pig >= 0.7.0-dev install and your Java
+variables to the location of a Pig >= 0.7.0 install and your Java
 install. If you would like to run using the Hadoop backend, you should
 also set PIG_CONF_DIR to the location of your Hadoop config.
 
 Run:
 
 contrib/pig$ ant
-contrib/pig$ bin/pig_cassandra
+contrib/pig$ bin/pig_cassandra -x local example-script.pig
+
+This will run the test script against your Cassandra instance
+and will assume that there is a Keyspace1/Standard1 with some
+data in it. It will run in local mode (see pig docs for more info).
+
+If you'd like to get to a 'grunt>' shell prompt, run:
+
+contrib/pig$ bin/pig_cassandra -x local
 
 Once the 'grunt>' shell has loaded, try a simple program like the
 following, which will determine the top 50 column names:
@@ -26,4 +34,4 @@ grunt> topnames = LIMIT orderednames 50;
 grunt> dump topnames;
 
 *If you want to point Pig at a real cluster, modify the seed
-address in storage-conf.xml and re-run the build step.
+address in cassandra.yaml and re-run the build step.

Modified: cassandra/trunk/contrib/pig/bin/pig_cassandra
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/contrib/pig/bin/pig_cassandra?rev=954000&r1=953999&r2=954000&view=diff
==============================================================================
--- cassandra/trunk/contrib/pig/bin/pig_cassandra (original)
+++ cassandra/trunk/contrib/pig/bin/pig_cassandra Sat Jun 12 13:08:51 2010
@@ -33,7 +33,7 @@ fi
 CLASSPATH=$CLASSPATH:$LOADFUNC_JAR
 
 if [ "x$PIG_HOME" = "x" ]; then
-    echo "PIG_HOME not set: requires Pig >= 0.7.0-dev" >&2
+    echo "PIG_HOME not set: requires Pig >= 0.7.0" >&2
     exit 1
 fi
 

Modified: cassandra/trunk/contrib/pig/build.xml
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/contrib/pig/build.xml?rev=954000&r1=953999&r2=954000&view=diff
==============================================================================
--- cassandra/trunk/contrib/pig/build.xml (original)
+++ cassandra/trunk/contrib/pig/build.xml Sat Jun 12 13:08:51 2010
@@ -49,7 +49,7 @@
     </target>
 
     <target depends="init" name="build">
-        <fail unless="env.PIG_HOME" message="Please set PIG_HOME to the 
location of a Pig >= 0.7.0-dev install." />
+        <fail unless="env.PIG_HOME" message="Please set PIG_HOME to the 
location of a Pig >= 0.7.0 install." />
         <javac destdir="${build.classes}">
             <src path="${build.src}" />
             <classpath refid="classpath" />

Added: cassandra/trunk/contrib/pig/cassandra.yaml
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/contrib/pig/cassandra.yaml?rev=954000&view=auto
==============================================================================
--- cassandra/trunk/contrib/pig/cassandra.yaml (added)
+++ cassandra/trunk/contrib/pig/cassandra.yaml Sat Jun 12 13:08:51 2010
@@ -0,0 +1,236 @@
+# Cassandra storage config YAML 
+# See http://wiki.apache.org/cassandra/StorageConfiguration for
+# explanations of configuration directives.
+
+# name of the cluster
+cluster_name: 'Test Cluster'
+
+# Set to true to make new [non-seed] nodes automatically migrate data
+# to themselves from the pre-existing nodes in the cluster.  Defaults
+# to false because you can only bootstrap N machines at a time from
+# an existing cluster of N, so if you are bringing up a cluster of
+# 10 machines with 3 seeds you would have to do it in stages.  Leaving
+# this off for the initial start simplifies that.
+auto_bootstrap: false
+
+# See http://wiki.apache.org/cassandra/HintedHandoff
+hinted_handoff_enabled: true
+
+# authentication backend, implementing IAuthenticator; used to limit keyspace access
+authenticator: org.apache.cassandra.auth.AllowAllAuthenticator
+
+# any IPartitioner may be used, including your own as long as it is on
+# the classpath.  Out of the box, Cassandra provides
+# org.apache.cassandra.dht.RandomPartitioner,
+# org.apache.cassandra.dht.OrderPreservingPartitioner, and
+# org.apache.cassandra.dht.CollatingOrderPreservingPartitioner.
+partitioner: org.apache.cassandra.dht.RandomPartitioner
+
+# directories where Cassandra should store data on disk.
+data_file_directories:
+    - /var/lib/cassandra/data
+
+# Addresses of hosts that are deemed contact points. 
+# Cassandra nodes use this list of hosts to find each other and learn
+# the topology of the ring.  You must change this if you are running
+# multiple nodes!
+seeds:
+    - 127.0.0.1
+
+# Access mode.  mmapped i/o is substantially faster, but only practical on
+# a 64bit machine (which notably does not include EC2 "small" instances)
+# or relatively small datasets.  "auto", the safe choice, will enable
+# mmapping on a 64bit JVM.  Other values are "mmap", "mmap_index_only"
+# (which may allow you to get part of the benefits of mmap on a 32bit
+# machine by mmapping only index files) and "standard".
+# (The buffer size settings that follow only apply to standard,
+# non-mmapped i/o.)
+disk_access_mode: auto
+
+# Unlike most systems, in Cassandra writes are faster than reads, so
+# you can afford more of those in parallel.  A good rule of thumb is 2
+# concurrent reads per processor core.  Increase concurrent_writes to
+# the number of clients writing at once if you enable CommitLogSync +
+# CommitLogSyncDelay.
+concurrent_reads: 8
+concurrent_writes: 32
+
+# This sets the number of memtable flush writer threads.  These will
+# be blocked by disk io, and each one will hold a memtable in memory
+# while blocked. If you have a large heap and many data directories,
+# you can increase this value for better flush performance.
+# By default this will be set to the number of data directories defined.
+#memtable_flush_writers: 1
+
+# Buffer size to use when performing contiguous column slices. 
+# Increase this to the size of the column slices you typically perform
+sliced_buffer_size_in_kb: 64
+
+# TCP port, for commands and data
+storage_port: 7000
+
+# Address to bind to and tell other nodes to connect to. You _must_
+# change this if you want multiple nodes to be able to communicate!
+listen_address: localhost
+
+# The address to bind the Thrift RPC service to
+rpc_address: localhost
+# port for Thrift to listen on
+rpc_port: 9160
+# Whether or not to use a framed transport for Thrift.
+thrift_framed_transport: false
+snapshot_before_compaction: false
+
+# The threshold size in megabytes the binary memtable must grow to,
+# before it's submitted for flushing to disk.
+binary_memtable_throughput_in_mb: 256
+# Number of minutes to keep a memtable in memory
+memtable_flush_after_mins: 60
+# Size of the memtable in memory before it is dumped
+memtable_throughput_in_mb: 64
+# Number of objects in millions in the memtable before it is dumped
+memtable_operations_in_millions: 0.3
+# Buffer size to use when flushing memtables to disk.
+flush_data_buffer_size_in_mb: 32
+# Increase (decrease) the index buffer size relative to the data
+# buffer if you have few (many) columns per key.
+flush_index_buffer_size_in_mb: 8
+
+column_index_size_in_kb: 64
+row_warning_threshold_in_mb: 512
+
+# commit log
+commitlog_directory: /var/lib/cassandra/commitlog
+
+# Size to allow commitlog to grow to before creating a new segment 
+commitlog_rotation_threshold_in_mb: 128
+
+# commitlog_sync may be either "periodic" or "batch." 
+# When in batch mode, Cassandra won't ack writes until the commit log
+# has been fsynced to disk.  It will wait up to
+# CommitLogSyncBatchWindowInMS milliseconds for other writes, before
+# performing the sync.
+commitlog_sync: periodic
+
+# the other option is "periodic," where writes may be acked immediately
+# and the CommitLog is simply synced every commitlog_sync_period_in_ms
+# milliseconds.
+commitlog_sync_period_in_ms: 10000
+
+# Time to wait for a reply from other nodes before failing the command 
+rpc_timeout_in_ms: 10000
+
+# phi value that must be reached for a host to be marked down.
+# most users should never need to adjust this.
+# phi_convict_threshold: 8
+
+# time to wait before garbage collecting tombstones (deletion markers)
+gc_grace_seconds: 864000
+
+# endpoint_snitch -- Set this to a class that implements
+# IEndpointSnitch, which will let Cassandra know enough
+# about your network topology to route requests efficiently.
+# Out of the box, Cassandra provides
+# org.apache.cassandra.locator.SimpleSnitch,
+# org.apache.cassandra.locator.RackInferringSnitch, and
+# org.apache.cassandra.locator.PropertyFileSnitch.
+endpoint_snitch: org.apache.cassandra.locator.SimpleSnitch
+
+# A ColumnFamily is the Cassandra concept closest to a relational table. 
+#
+# Keyspaces are separate groups of ColumnFamilies.  Except in very
+# unusual circumstances you will have one Keyspace per application.
+#
+# Keyspace required parameters:
+# - name: name of the keyspace; "system" and "definitions" are 
+#   reserved for Cassandra Internals.
+# - replica_placement_strategy: the class that determines how replicas
+#   are distributed among nodes.  Must implement IReplicaPlacementStrategy.
+#   Out of the box, Cassandra provides 
+#     * org.apache.cassandra.locator.RackUnawareStrategy 
+#     * org.apache.cassandra.locator.RackAwareStrategy
+#     * org.apache.cassandra.locator.DatacenterShardStrategy
+#
+#   RackUnawareStrategy is the simplest; it simply places the first
+#   replica at the node whose token is closest to the key (as determined
+#   by the Partitioner), and additional replicas on subsequent nodes
+#   along the ring in increasing Token order.
+# 
+#   RackAwareStrategy is special cased for replication_factor of 3.  It
+#   places one replica in each of two datacenters, and the third on a
+#   different rack in the first.
+#
+#   DatacenterShardStrategy is a generalization of RackAwareStrategy.
+#   For each datacenter, you can specify (in `datacenter.properties`)
+#   how many replicas you want on a per-keyspace basis.  Replicas are
+#   placed on different racks within each DC, if possible.
+# 
+# - replication_factor: Number of replicas of each row
+# - column_families: column families associated with this keyspace
+#
+# ColumnFamily required parameters:
+# - name: name of the ColumnFamily.  Must not contain the character "-".
+# - compare_with: tells Cassandra how to sort the columns for slicing
+#   operations. The default is BytesType, which is a straightforward
+#   lexical comparison of the bytes in each column.  Other options are
+#   AsciiType, UTF8Type, LexicalUUIDType, TimeUUIDType, and LongType.
+#   You can also specify the fully-qualified class name to a class of
+#   your choice extending org.apache.cassandra.db.marshal.AbstractType.
+#
+# ColumnFamily optional parameters:
+# - keys_cached: specifies the number of keys per sstable whose
+#   locations we keep in memory in "mostly LRU" order.  (JUST the key
+#   locations, NOT any column values.) Specify a fraction (value less
+#   than 1) or an absolute number of keys to cache.  Defaults to 200000
+#   keys.
+# - rows_cached: specifies the number of rows whose entire contents we
+#   cache in memory. Do not use this on ColumnFamilies with large rows,
+#   or ColumnFamilies with high write:read ratios. Specify a fraction
+#   (value less than 1) or an absolute number of rows to cache.
+#   Defaults to 0. (i.e. row caching is off by default)
+# - comment: used to attach additional human-readable information about 
+#   the column family to its definition.
+# - read_repair_chance: specifies the probability with which read
+#   repairs should be invoked on non-quorum reads.  must be between 0
+#   and 1. defaults to 1.0 (always read repair).
+# - preload_row_cache: If true, will populate row cache on startup.
+#   Defaults to false.
+#
+# NOTE: this keyspace definition is for demonstration purposes only.
+#       Cassandra will not load these definitions during startup. See
+#       http://wiki.apache.org/cassandra/FAQ#no_keyspaces for an explanation.
+keyspaces:
+    - name: Keyspace1
+      replica_placement_strategy: org.apache.cassandra.locator.RackUnawareStrategy
+      replication_factor: 1
+      column_families:
+        - name: Standard1
+          compare_with: BytesType
+
+        - name: Standard2
+          compare_with: UTF8Type
+          read_repair_chance: 0.1
+          keys_cached: 100
+
+        - name: StandardByUUID1
+          compare_with: TimeUUIDType
+          clock_type: Timestamp
+          reconciler: TimestampReconciler          
+
+        - name: Super1
+          column_type: Super
+          compare_with: BytesType
+          compare_subcolumns_with: BytesType
+
+        - name: Super2
+          column_type: Super
+          compare_subcolumns_with: UTF8Type
+          preload_row_cache: true
+          rows_cached: 10000
+          keys_cached: 50
+          comment: 'A column family with supercolumns, whose column and subcolumn names are UTF8 strings'
+
+        - name: Super3
+          column_type: Super
+          compare_with: LongType
+          comment: 'A column family with supercolumns, whose column names are Longs (8 bytes)'

Added: cassandra/trunk/contrib/pig/example-script.pig
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/contrib/pig/example-script.pig?rev=954000&view=auto
==============================================================================
--- cassandra/trunk/contrib/pig/example-script.pig (added)
+++ cassandra/trunk/contrib/pig/example-script.pig Sat Jun 12 13:08:51 2010
@@ -0,0 +1,8 @@
+rows = LOAD 'cassandra://Keyspace1/Standard1' USING CassandraStorage();
+cols = FOREACH rows GENERATE flatten($1);
+colnames = FOREACH cols GENERATE $0;
+namegroups = GROUP colnames BY $0;
+namecounts = FOREACH namegroups GENERATE COUNT($1), group;
+orderednames = ORDER namecounts BY $0;
+topnames = LIMIT orderednames 50;
+dump topnames;
\ No newline at end of file

Modified: cassandra/trunk/contrib/word_count/src/WordCountSetup.java
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/contrib/word_count/src/WordCountSetup.java?rev=954000&r1=953999&r2=954000&view=diff
==============================================================================
--- cassandra/trunk/contrib/word_count/src/WordCountSetup.java (original)
+++ cassandra/trunk/contrib/word_count/src/WordCountSetup.java Sat Jun 12 
13:08:51 2010
@@ -46,6 +46,8 @@ public class WordCountSetup
         Map<byte[], Map<String,List<Mutation>>> mutationMap;
         Column c;
 
+        // text0: no rows
+
         // text1: 1 row, 1 word
         c = new Column("text1".getBytes(), "word1".getBytes(), new 
Clock(System.currentTimeMillis()));
         mutationMap = getMutationMap("key0".getBytes(), 
WordCount.COLUMN_FAMILY, c);

Modified: cassandra/trunk/src/java/org/apache/cassandra/cli/CliClient.java
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/src/java/org/apache/cassandra/cli/CliClient.java?rev=954000&r1=953999&r2=954000&view=diff
==============================================================================
--- cassandra/trunk/src/java/org/apache/cassandra/cli/CliClient.java (original)
+++ cassandra/trunk/src/java/org/apache/cassandra/cli/CliClient.java Sat Jun 12 
13:08:51 2010
@@ -47,7 +47,7 @@ public class CliClient 
     }
 
     // Execute a CLI Statement 
-    public void executeCLIStmt(String stmt) throws TException, 
NotFoundException, InvalidRequestException, UnavailableException, 
TimedOutException, IllegalAccessException, ClassNotFoundException, 
InstantiationException
+    public void executeCLIStmt(String stmt) throws TException, 
NotFoundException, InvalidRequestException, UnavailableException, 
TimedOutException, IllegalAccessException, ClassNotFoundException, 
InstantiationException, NoSuchFieldException
     {
         CommonTree ast = null;
 
@@ -243,7 +243,7 @@ public class CliClient 
     }
 
     private void doSlice(String keyspace, String key, String columnFamily, 
byte[] superColumnName)
-            throws InvalidRequestException, UnavailableException, 
TimedOutException, TException, UnsupportedEncodingException, 
IllegalAccessException, NotFoundException, InstantiationException, 
ClassNotFoundException
+            throws InvalidRequestException, UnavailableException, 
TimedOutException, TException, UnsupportedEncodingException, 
IllegalAccessException, NotFoundException, InstantiationException, 
NoSuchFieldException
     {
         SliceRange range = new SliceRange(ArrayUtils.EMPTY_BYTE_ARRAY, 
ArrayUtils.EMPTY_BYTE_ARRAY, true, 1000000);
         List<ColumnOrSuperColumn> columns = 
thriftClient_.get_slice(key.getBytes(),
@@ -276,34 +276,36 @@ public class CliClient 
         css_.out.println("Returned " + size + " results.");
     }
  
-    private String formatSuperColumnName(String keyspace, String columnFamily, 
SuperColumn column) throws NotFoundException, TException, 
ClassNotFoundException, IllegalAccessException, InstantiationException
+    private String formatSuperColumnName(String keyspace, String columnFamily, 
SuperColumn column) throws NotFoundException, TException, 
IllegalAccessException, InstantiationException, NoSuchFieldException
     {
         return 
getFormatTypeForColumn(keyspacesMap.get(keyspace).get(columnFamily).get("CompareWith")).getString(column.name);
     }
 
-    private String formatSubcolumnName(String keyspace, String columnFamily, 
Column subcolumn) throws NotFoundException, TException, ClassNotFoundException, 
IllegalAccessException, InstantiationException
+    private String formatSubcolumnName(String keyspace, String columnFamily, 
Column subcolumn) throws NotFoundException, TException, IllegalAccessException, 
InstantiationException, NoSuchFieldException
     {
         return 
getFormatTypeForColumn(keyspacesMap.get(keyspace).get(columnFamily).get("CompareSubcolumnsWith")).getString(subcolumn.name);
     }
 
-    private String formatColumnName(String keyspace, String columnFamily, 
Column column) throws ClassNotFoundException, NotFoundException, TException, 
IllegalAccessException, InstantiationException
+    private String formatColumnName(String keyspace, String columnFamily, 
Column column) throws NotFoundException, TException, IllegalAccessException, 
InstantiationException, NoSuchFieldException
     {
         return 
getFormatTypeForColumn(keyspacesMap.get(keyspace).get(columnFamily).get("CompareWith")).getString(column.name);
     }
 
-    private AbstractType getFormatTypeForColumn(String compareWith) throws 
ClassNotFoundException, IllegalAccessException, InstantiationException
+    private AbstractType getFormatTypeForColumn(String compareWith) throws 
IllegalAccessException, InstantiationException, NoSuchFieldException
     {
         AbstractType type;
         try {
-            type = (AbstractType) Class.forName(compareWith).newInstance();
+            // Get the singleton instance of the AbstractType subclass
+            Class c = Class.forName(compareWith);
+            type = (AbstractType) c.getField("instance").get(c);
         } catch (ClassNotFoundException e) {
-            type = BytesType.class.newInstance();
+            type = BytesType.instance;
         }
         return type;
     }
 
     // Execute GET statement
-    private void executeGet(CommonTree ast) throws TException, 
NotFoundException, InvalidRequestException, UnavailableException, 
TimedOutException, UnsupportedEncodingException, IllegalAccessException, 
InstantiationException, ClassNotFoundException
+    private void executeGet(CommonTree ast) throws TException, 
NotFoundException, InvalidRequestException, UnavailableException, 
TimedOutException, UnsupportedEncodingException, IllegalAccessException, 
InstantiationException, ClassNotFoundException, NoSuchFieldException
     {
         if (!CliMain.isConnected() || !hasKeySpace())
             return;

Modified: 
cassandra/trunk/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java?rev=954000&r1=953999&r2=954000&view=diff
==============================================================================
--- 
cassandra/trunk/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java 
(original)
+++ 
cassandra/trunk/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java 
Sat Jun 12 13:08:51 2010
@@ -128,7 +128,9 @@ public class ColumnFamilySerializer impl
 
         try
         {
-            return 
(AbstractType)Class.forName(className).getConstructor().newInstance();
+            // Get the singleton instance of the AbstractType subclass
+            Class c = Class.forName(className);
+            return (AbstractType) c.getField("instance").get(c);
         }
         catch (ClassNotFoundException e)
         {

Modified: 
cassandra/trunk/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java
URL: 
http://svn.apache.org/viewvc/cassandra/trunk/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java?rev=954000&r1=953999&r2=954000&view=diff
==============================================================================
--- 
cassandra/trunk/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java
 (original)
+++ 
cassandra/trunk/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java
 Sat Jun 12 13:08:51 2010
@@ -187,7 +187,7 @@ public class ColumnFamilyRecordReader ex
          */
         private void maybeConnect() throws InvalidRequestException, 
TException, AuthenticationException, 
             AuthorizationException, NotFoundException, InstantiationException, 
IllegalAccessException, 
-            ClassNotFoundException
+            ClassNotFoundException, NoSuchFieldException
         {
             // only need to connect once
             if (socket != null && socket.isOpen())
@@ -213,7 +213,9 @@ public class ColumnFamilyRecordReader ex
                 Map<String, Map<String,String>> desc = 
client.describe_keyspace(keyspace);
                 Map<String,String> ksProps = desc.get(cfName);
                 String compClass = ksProps.get("CompareWith");
-                comparator = (AbstractType) 
Class.forName(compClass).newInstance();
+                // Get the singleton instance of the AbstractType subclass
+                Class c = Class.forName(compClass);
+                comparator = (AbstractType) c.getField("instance").get(c);
             }
         }
 


Reply via email to