Author: johan Date: Sat Jun 12 13:08:51 2010 New Revision: 954000 URL: http://svn.apache.org/viewvc?rev=954000&view=rev Log: Remove references to -dev version of pig, add example script, use comparators singletons. Patch by Jeremy Hanna, review by johan. CASSANDRA-1150
Added: cassandra/trunk/contrib/pig/cassandra.yaml cassandra/trunk/contrib/pig/example-script.pig Removed: cassandra/trunk/contrib/pig/storage-conf.xml Modified: cassandra/trunk/NEWS.txt cassandra/trunk/contrib/pig/README.txt cassandra/trunk/contrib/pig/bin/pig_cassandra cassandra/trunk/contrib/pig/build.xml cassandra/trunk/contrib/word_count/src/WordCountSetup.java cassandra/trunk/src/java/org/apache/cassandra/cli/CliClient.java cassandra/trunk/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java cassandra/trunk/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java Modified: cassandra/trunk/NEWS.txt URL: http://svn.apache.org/viewvc/cassandra/trunk/NEWS.txt?rev=954000&r1=953999&r2=954000&view=diff ============================================================================== --- cassandra/trunk/NEWS.txt (original) +++ cassandra/trunk/NEWS.txt Sat Jun 12 13:08:51 2010 @@ -48,6 +48,12 @@ Thrift API - The get_string_property() method has been removed. - The get_string_list_property() method has been removed. +Other +----- + - If extending AbstractType, make sure you follow the singleton pattern + followed by Cassandra core AbstractType extensions. + e.g. BytesType has a variable called 'instance' and an empty constructor + with default access 0.6.0 ===== Modified: cassandra/trunk/contrib/pig/README.txt URL: http://svn.apache.org/viewvc/cassandra/trunk/contrib/pig/README.txt?rev=954000&r1=953999&r2=954000&view=diff ============================================================================== --- cassandra/trunk/contrib/pig/README.txt (original) +++ cassandra/trunk/contrib/pig/README.txt Sat Jun 12 13:08:51 2010 @@ -4,14 +4,22 @@ Setup: First build and start a Cassandra server with the default configuration* and set the PIG_HOME and JAVA_HOME environment -variables to the location of a Pig >= 0.7.0-dev install and your Java +variables to the location of a Pig >= 0.7.0 install and your Java install. 
If you would like to run using the Hadoop backend, you should also set PIG_CONF_DIR to the location of your Hadoop config. Run: contrib/pig$ ant -contrib/pig$ bin/pig_cassandra +contrib/pig$ bin/pig_cassandra -x local example-script.pig + +This will run the test script against your Cassandra instance +and will assume that there is a Keyspace1/Standard1 with some +data in it. It will run in local mode (see pig docs for more info). + +If you'd like to get to a 'grunt>' shell prompt, run: + +contrib/pig$ bin/pig_cassandra -x local Once the 'grunt>' shell has loaded, try a simple program like the following, which will determine the top 50 column names: @@ -26,4 +34,4 @@ grunt> topnames = LIMIT orderednames 50; grunt> dump topnames; *If you want to point Pig at a real cluster, modify the seed -address in storage-conf.xml and re-run the build step. +address in cassandra.yaml and re-run the build step. Modified: cassandra/trunk/contrib/pig/bin/pig_cassandra URL: http://svn.apache.org/viewvc/cassandra/trunk/contrib/pig/bin/pig_cassandra?rev=954000&r1=953999&r2=954000&view=diff ============================================================================== --- cassandra/trunk/contrib/pig/bin/pig_cassandra (original) +++ cassandra/trunk/contrib/pig/bin/pig_cassandra Sat Jun 12 13:08:51 2010 @@ -33,7 +33,7 @@ fi CLASSPATH=$CLASSPATH:$LOADFUNC_JAR if [ "x$PIG_HOME" = "x" ]; then - echo "PIG_HOME not set: requires Pig >= 0.7.0-dev" >&2 + echo "PIG_HOME not set: requires Pig >= 0.7.0" >&2 exit 1 fi Modified: cassandra/trunk/contrib/pig/build.xml URL: http://svn.apache.org/viewvc/cassandra/trunk/contrib/pig/build.xml?rev=954000&r1=953999&r2=954000&view=diff ============================================================================== --- cassandra/trunk/contrib/pig/build.xml (original) +++ cassandra/trunk/contrib/pig/build.xml Sat Jun 12 13:08:51 2010 @@ -49,7 +49,7 @@ </target> <target depends="init" name="build"> - <fail unless="env.PIG_HOME" message="Please set PIG_HOME to the 
location of a Pig >= 0.7.0-dev install." /> + <fail unless="env.PIG_HOME" message="Please set PIG_HOME to the location of a Pig >= 0.7.0 install." /> <javac destdir="${build.classes}"> <src path="${build.src}" /> <classpath refid="classpath" /> Added: cassandra/trunk/contrib/pig/cassandra.yaml URL: http://svn.apache.org/viewvc/cassandra/trunk/contrib/pig/cassandra.yaml?rev=954000&view=auto ============================================================================== --- cassandra/trunk/contrib/pig/cassandra.yaml (added) +++ cassandra/trunk/contrib/pig/cassandra.yaml Sat Jun 12 13:08:51 2010 @@ -0,0 +1,236 @@ +# Cassandra storage config YAML +# See http://wiki.apache.org/cassandra/StorageConfiguration for +# explanations of configuration directives. + +# name of the cluster +cluster_name: 'Test Cluster' + +# Set to true to make new [non-seed] nodes automatically migrate data +# to themselves from the pre-existing nodes in the cluster. Defaults +# to false because you can only bootstrap N machines at a time from +# an existing cluster of N, so if you are bringing up a cluster of +# 10 machines with 3 seeds you would have to do it in stages. Leaving +# this off for the initial start simplifies that. +auto_bootstrap: false + +# See http://wiki.apache.org/cassandra/HintedHandoff +hinted_handoff_enabled: true + +# authentication backend, implementing IAuthenticator; used to limit keyspace access +authenticator: org.apache.cassandra.auth.AllowAllAuthenticator + +# any IPartitioner may be used, including your own as long as it is on +# the classpath. Out of the box, Cassandra provides +# org.apache.cassandra.dht.RandomPartitioner +# org.apache.cassandra.dht.OrderPreservingPartitioner, and +# org.apache.cassandra.dht.CollatingOrderPreservingPartitioner. +partitioner: org.apache.cassandra.dht.RandomPartitioner + +# directories where Cassandra should store data on disk. 
+data_file_directories: + - /var/lib/cassandra/data + +# Addresses of hosts that are deemed contact points. +# Cassandra nodes use this list of hosts to find each other and learn +# the topology of the ring. You must change this if you are running +# multiple nodes! +seeds: + - 127.0.0.1 + +# Access mode. mmapped i/o is substantially faster, but only practical on +# a 64bit machine (which notably does not include EC2 "small" instances) +# or relatively small datasets. "auto", the safe choice, will enable +# mmapping on a 64bit JVM. Other values are "mmap", "mmap_index_only" +# (which may allow you to get part of the benefits of mmap on a 32bit +# machine by mmapping only index files) and "standard". +# (The buffer size settings that follow only apply to standard, +# non-mmapped i/o.) +disk_access_mode: auto + +# Unlike most systems, in Cassandra writes are faster than reads, so +# you can afford more of those in parallel. A good rule of thumb is 2 +# concurrent reads per processor core. Increase ConcurrentWrites to +# the number of clients writing at once if you enable CommitLogSync + +# CommitLogSyncDelay. +concurrent_reads: 8 +concurrent_writes: 32 + +# This sets the number of memtable flush writer threads. These will +# be blocked by disk io, and each one will hold a memtable in memory +# while blocked. If you have a large heap and many data directories, +# you can increase this value for better flush performance. +# By default this will be set to the number of data directories defined. +#memtable_flush_writers: 1 + +# Buffer size to use when performing contiguous column slices. +# Increase this to the size of the column slices you typically perform +sliced_buffer_size_in_kb: 64 + +# TCP port, for commands and data +storage_port: 7000 + +# Address to bind to and tell other nodes to connect to. You _must_ +# change this if you want multiple nodes to be able to communicate! 
+listen_address: localhost + +# The address to bind the Thrift RPC service to +rpc_address: localhost +# port for Thrift to listen on +rpc_port: 9160 +# Whether or not to use a framed transport for Thrift. +thrift_framed_transport: false +snapshot_before_compaction: false + +# The threshold size in megabytes the binary memtable must grow to, +# before it's submitted for flushing to disk. +binary_memtable_throughput_in_mb: 256 +# Number of minutes to keep a memtable in memory +memtable_flush_after_mins: 60 +# Size of the memtable in memory before it is dumped +memtable_throughput_in_mb: 64 +# Number of objects in millions in the memtable before it is dumped +memtable_operations_in_millions: 0.3 +# Buffer size to use when flushing memtables to disk. +flush_data_buffer_size_in_mb: 32 +# Increase (decrease) the index buffer size relative to the data +# buffer if you have few (many) columns per key. +flush_index_buffer_size_in_mb: 8 + +column_index_size_in_kb: 64 +row_warning_threshold_in_mb: 512 + +# commit log +commitlog_directory: /var/lib/cassandra/commitlog + +# Size to allow commitlog to grow to before creating a new segment +commitlog_rotation_threshold_in_mb: 128 + +# commitlog_sync may be either "periodic" or "batch." +# When in batch mode, Cassandra won't ack writes until the commit log +# has been fsynced to disk. It will wait up to +# CommitLogSyncBatchWindowInMS milliseconds for other writes, before +# performing the sync. +commitlog_sync: periodic + +# the other option is "periodic," where writes may be acked immediately +# and the CommitLog is simply synced every commitlog_sync_period_in_ms +# milliseconds. +commitlog_sync_period_in_ms: 10000 + +# Time to wait for a reply from other nodes before failing the command +rpc_timeout_in_ms: 10000 + +# phi value that must be reached for a host to be marked down. +# most users should never need to adjust this. 
+# phi_convict_threshold: 8 + +# time to wait before garbage collecting tombstones (deletion markers) +gc_grace_seconds: 864000 + +# endpoint_snitch -- Set this to a class that implements +# IEndpointSnitch, which will let Cassandra know enough +# about your network topology to route requests efficiently. +# Out of the box, Cassandra provides +# org.apache.cassandra.locator.SimpleSnitch, +# org.apache.cassandra.locator.RackInferringSnitch, and +# org.apache.cassandra.locator.PropertyFileSnitch. +endpoint_snitch: org.apache.cassandra.locator.SimpleSnitch + +# A ColumnFamily is the Cassandra concept closest to a relational table. +# +# Keyspaces are separate groups of ColumnFamilies. Except in very +# unusual circumstances you will have one Keyspace per application. +# +# Keyspace required parameters: +# - name: name of the keyspace; "system" and "definitions" are +# reserved for Cassandra Internals. +# - replica_placement_strategy: the class that determines how replicas +# are distributed among nodes. Must implement IReplicaPlacementStrategy. +# Out of the box, Cassandra provides +# * org.apache.cassandra.locator.RackUnawareStrategy +# * org.apache.cassandra.locator.RackAwareStrategy +# * org.apache.cassandra.locator.DatacenterShardStrategy +# +# RackUnawareStrategy is the simplest; it simply places the first +# replica at the node whose token is closest to the key (as determined +# by the Partitioner), and additional replicas on subsequent nodes +# along the ring in increasing Token order. +# +# RackAwareStrategy is special cased for replication_factor of 3. It +# places one replica in each of two datacenters, and the third on a +# different rack in the first. +# +# DatacenterShardStrategy is a generalization of RackAwareStrategy. +# For each datacenter, you can specify (in `datacenter.properties`) +# how many replicas you want on a per-keyspace basis. Replicas are +# placed on different racks within each DC, if possible. 
+# +# - replication_factor: Number of replicas of each row +# - column_families: column families associated with this keyspace +# +# ColumnFamily required parameters: +# - name: name of the ColumnFamily. Must not contain the character "-". +# - compare_with: tells Cassandra how to sort the columns for slicing +# operations. The default is BytesType, which is a straightforward +# lexical comparison of the bytes in each column. Other options are +# AsciiType, UTF8Type, LexicalUUIDType, TimeUUIDType, and LongType. +# You can also specify the fully-qualified class name to a class of +# your choice extending org.apache.cassandra.db.marshal.AbstractType. +# +# ColumnFamily optional parameters: +# - keys_cached: specifies the number of keys per sstable whose +# locations we keep in memory in "mostly LRU" order. (JUST the key +# locations, NOT any column values.) Specify a fraction (value less +# than 1) or an absolute number of keys to cache. Defaults to 200000 +# keys. +# - rows_cached: specifies the number of rows whose entire contents we +# cache in memory. Do not use this on ColumnFamilies with large rows, +# or ColumnFamilies with high write:read ratios. Specify a fraction +# (value less than 1) or an absolute number of rows to cache. +# Defaults to 0. (i.e. row caching is off by default) +# - comment: used to attach additional human-readable information about +# the column family to its definition. +# - read_repair_chance: specifies the probability with which read +# repairs should be invoked on non-quorum reads. must be between 0 +# and 1. defaults to 1.0 (always read repair). +# - preload_row_cache: If true, will populate row cache on startup. +# Defaults to false. +# +# NOTE: this keyspace definition is for demonstration purposes only. +# Cassandra will not load these definitions during startup. See +# http://wiki.apache.org/cassandra/FAQ#no_keyspaces for an explanation. 
+keyspaces: + - name: Keyspace1 + replica_placement_strategy: org.apache.cassandra.locator.RackUnawareStrategy + replication_factor: 1 + column_families: + - name: Standard1 + compare_with: BytesType + + - name: Standard2 + compare_with: UTF8Type + read_repair_chance: 0.1 + keys_cached: 100 + + - name: StandardByUUID1 + compare_with: TimeUUIDType + clock_type: Timestamp + reconciler: TimestampReconciler + + - name: Super1 + column_type: Super + compare_with: BytesType + compare_subcolumns_with: BytesType + + - name: Super2 + column_type: Super + compare_subcolumns_with: UTF8Type + preload_row_cache: true + rows_cached: 10000 + keys_cached: 50 + comment: 'A column family with supercolumns, whose column and subcolumn names are UTF8 strings' + + - name: Super3 + column_type: Super + compare_with: LongType + comment: 'A column family with supercolumns, whose column names are Longs (8 bytes)' Added: cassandra/trunk/contrib/pig/example-script.pig URL: http://svn.apache.org/viewvc/cassandra/trunk/contrib/pig/example-script.pig?rev=954000&view=auto ============================================================================== --- cassandra/trunk/contrib/pig/example-script.pig (added) +++ cassandra/trunk/contrib/pig/example-script.pig Sat Jun 12 13:08:51 2010 @@ -0,0 +1,8 @@ +rows = LOAD 'cassandra://Keyspace1/Standard1' USING CassandraStorage(); +cols = FOREACH rows GENERATE flatten($1); +colnames = FOREACH cols GENERATE $0; +namegroups = GROUP colnames BY $0; +namecounts = FOREACH namegroups GENERATE COUNT($1), group; +orderednames = ORDER namecounts BY $0; +topnames = LIMIT orderednames 50; +dump topnames; \ No newline at end of file Modified: cassandra/trunk/contrib/word_count/src/WordCountSetup.java URL: http://svn.apache.org/viewvc/cassandra/trunk/contrib/word_count/src/WordCountSetup.java?rev=954000&r1=953999&r2=954000&view=diff ============================================================================== --- 
cassandra/trunk/contrib/word_count/src/WordCountSetup.java (original) +++ cassandra/trunk/contrib/word_count/src/WordCountSetup.java Sat Jun 12 13:08:51 2010 @@ -46,6 +46,8 @@ public class WordCountSetup Map<byte[], Map<String,List<Mutation>>> mutationMap; Column c; + // text0: no rows + // text1: 1 row, 1 word c = new Column("text1".getBytes(), "word1".getBytes(), new Clock(System.currentTimeMillis())); mutationMap = getMutationMap("key0".getBytes(), WordCount.COLUMN_FAMILY, c); Modified: cassandra/trunk/src/java/org/apache/cassandra/cli/CliClient.java URL: http://svn.apache.org/viewvc/cassandra/trunk/src/java/org/apache/cassandra/cli/CliClient.java?rev=954000&r1=953999&r2=954000&view=diff ============================================================================== --- cassandra/trunk/src/java/org/apache/cassandra/cli/CliClient.java (original) +++ cassandra/trunk/src/java/org/apache/cassandra/cli/CliClient.java Sat Jun 12 13:08:51 2010 @@ -47,7 +47,7 @@ public class CliClient } // Execute a CLI Statement - public void executeCLIStmt(String stmt) throws TException, NotFoundException, InvalidRequestException, UnavailableException, TimedOutException, IllegalAccessException, ClassNotFoundException, InstantiationException + public void executeCLIStmt(String stmt) throws TException, NotFoundException, InvalidRequestException, UnavailableException, TimedOutException, IllegalAccessException, ClassNotFoundException, InstantiationException, NoSuchFieldException { CommonTree ast = null; @@ -243,7 +243,7 @@ public class CliClient } private void doSlice(String keyspace, String key, String columnFamily, byte[] superColumnName) - throws InvalidRequestException, UnavailableException, TimedOutException, TException, UnsupportedEncodingException, IllegalAccessException, NotFoundException, InstantiationException, ClassNotFoundException + throws InvalidRequestException, UnavailableException, TimedOutException, TException, UnsupportedEncodingException, IllegalAccessException, 
NotFoundException, InstantiationException, NoSuchFieldException { SliceRange range = new SliceRange(ArrayUtils.EMPTY_BYTE_ARRAY, ArrayUtils.EMPTY_BYTE_ARRAY, true, 1000000); List<ColumnOrSuperColumn> columns = thriftClient_.get_slice(key.getBytes(), @@ -276,34 +276,36 @@ public class CliClient css_.out.println("Returned " + size + " results."); } - private String formatSuperColumnName(String keyspace, String columnFamily, SuperColumn column) throws NotFoundException, TException, ClassNotFoundException, IllegalAccessException, InstantiationException + private String formatSuperColumnName(String keyspace, String columnFamily, SuperColumn column) throws NotFoundException, TException, IllegalAccessException, InstantiationException, NoSuchFieldException { return getFormatTypeForColumn(keyspacesMap.get(keyspace).get(columnFamily).get("CompareWith")).getString(column.name); } - private String formatSubcolumnName(String keyspace, String columnFamily, Column subcolumn) throws NotFoundException, TException, ClassNotFoundException, IllegalAccessException, InstantiationException + private String formatSubcolumnName(String keyspace, String columnFamily, Column subcolumn) throws NotFoundException, TException, IllegalAccessException, InstantiationException, NoSuchFieldException { return getFormatTypeForColumn(keyspacesMap.get(keyspace).get(columnFamily).get("CompareSubcolumnsWith")).getString(subcolumn.name); } - private String formatColumnName(String keyspace, String columnFamily, Column column) throws ClassNotFoundException, NotFoundException, TException, IllegalAccessException, InstantiationException + private String formatColumnName(String keyspace, String columnFamily, Column column) throws NotFoundException, TException, IllegalAccessException, InstantiationException, NoSuchFieldException { return getFormatTypeForColumn(keyspacesMap.get(keyspace).get(columnFamily).get("CompareWith")).getString(column.name); } - private AbstractType getFormatTypeForColumn(String compareWith) 
throws ClassNotFoundException, IllegalAccessException, InstantiationException + private AbstractType getFormatTypeForColumn(String compareWith) throws IllegalAccessException, InstantiationException, NoSuchFieldException { AbstractType type; try { - type = (AbstractType) Class.forName(compareWith).newInstance(); + // Get the singleton instance of the AbstractType subclass + Class c = Class.forName(compareWith); + type = (AbstractType) c.getField("instance").get(c); } catch (ClassNotFoundException e) { - type = BytesType.class.newInstance(); + type = BytesType.instance; } return type; } // Execute GET statement - private void executeGet(CommonTree ast) throws TException, NotFoundException, InvalidRequestException, UnavailableException, TimedOutException, UnsupportedEncodingException, IllegalAccessException, InstantiationException, ClassNotFoundException + private void executeGet(CommonTree ast) throws TException, NotFoundException, InvalidRequestException, UnavailableException, TimedOutException, UnsupportedEncodingException, IllegalAccessException, InstantiationException, ClassNotFoundException, NoSuchFieldException { if (!CliMain.isConnected() || !hasKeySpace()) return; Modified: cassandra/trunk/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java URL: http://svn.apache.org/viewvc/cassandra/trunk/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java?rev=954000&r1=953999&r2=954000&view=diff ============================================================================== --- cassandra/trunk/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java (original) +++ cassandra/trunk/src/java/org/apache/cassandra/db/ColumnFamilySerializer.java Sat Jun 12 13:08:51 2010 @@ -128,7 +128,9 @@ public class ColumnFamilySerializer impl try { - return (AbstractType)Class.forName(className).getConstructor().newInstance(); + // Get the singleton instance of the AbstractType subclass + Class c = Class.forName(className); + return (AbstractType) 
c.getField("instance").get(c); } catch (ClassNotFoundException e) { Modified: cassandra/trunk/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java URL: http://svn.apache.org/viewvc/cassandra/trunk/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java?rev=954000&r1=953999&r2=954000&view=diff ============================================================================== --- cassandra/trunk/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java (original) +++ cassandra/trunk/src/java/org/apache/cassandra/hadoop/ColumnFamilyRecordReader.java Sat Jun 12 13:08:51 2010 @@ -187,7 +187,7 @@ public class ColumnFamilyRecordReader ex */ private void maybeConnect() throws InvalidRequestException, TException, AuthenticationException, AuthorizationException, NotFoundException, InstantiationException, IllegalAccessException, - ClassNotFoundException + ClassNotFoundException, NoSuchFieldException { // only need to connect once if (socket != null && socket.isOpen()) @@ -213,7 +213,9 @@ public class ColumnFamilyRecordReader ex Map<String, Map<String,String>> desc = client.describe_keyspace(keyspace); Map<String,String> ksProps = desc.get(cfName); String compClass = ksProps.get("CompareWith"); - comparator = (AbstractType) Class.forName(compClass).newInstance(); + // Get the singleton instance of the AbstractType subclass + Class c = Class.forName(compClass); + comparator = (AbstractType) c.getField("instance").get(c); } }