Changeset: 3de8927d961b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3de8927d961b
Modified Files:
MonetDB5/src/mal/mal_errors.mx
sql/src/backends/monet5/vaults/mseed.mx
sql/src/backends/monet5/vaults/vault.mx
sql/src/backends/monet5/vaults/vault.sql
Branch: default
Log Message:
Safe the state of the vault code
diffs (truncated from 494 to 300 lines):
diff -r 22d492452582 -r 3de8927d961b MonetDB5/src/mal/mal_errors.mx
--- a/MonetDB5/src/mal/mal_errors.mx Fri Dec 10 05:59:03 2010 +0100
+++ b/MonetDB5/src/mal/mal_errors.mx Sat Dec 11 17:37:44 2010 +0100
@@ -86,6 +86,7 @@
#define RUNTIME_IO_EOF "Attempt to read beyond end-of-file"
#define RUNTIME_FILE_NOT_FOUND "File not found"
+#define RUNTIME_UNLINK "File could not be unlinked"
#define RUNTIME_DIR_ERROR "Unable to open directory"
#define RUNTIME_STREAM_FAILED "Could not create stream"
#define RUNTIME_STREAM_WRITE "Could not write to stream"
diff -r 22d492452582 -r 3de8927d961b sql/src/backends/monet5/vaults/mseed.mx
--- a/sql/src/backends/monet5/vaults/mseed.mx Fri Dec 10 05:59:03 2010 +0100
+++ b/sql/src/backends/monet5/vaults/mseed.mx Sat Dec 11 17:37:44 2010 +0100
@@ -21,15 +21,78 @@
@a Martin Kersten
@v 0.1
@+ Mseed
-These routines are meant to interpret mseed files in the context of a vault.
+These routines are meant to interpret mseed files stored in the vault.
The simplifying situation is that mseed has a single model.
The code base assumes that libmseed has been installed on your system.
-Furthermore, the mseed catalog initialization script should have been run.
+The mseed catalog initialization script should have been run.
+...@begin verbatim
+-- this schema is intended to experiment with accessing mseed files
+drop FUNCTION mseedImport();
+drop table mseedCatalog;
+drop table mseedRepository;
+
+-- all records in the mseed files correspond to a row in the catalog
+CREATE TABLE mseedCatalog (
+mseed int, -- Vault file id
+chunk varchar(255), -- SQL volume storage container name
+seqno int, -- SEED record sequence number, should be between 0 and 999999
+ PRIMARY KEY (mseed,seqno),
+dataquality char, -- Data record indicator, should be 'D=data unknown qual',
+ -- 'R=raw no quality', 'Q= quality controlled' or 'M'
+network varchar(11), -- Network
+station varchar(11), -- Station
+location varchar(11), -- Location
+channel varchar(11), -- Channel
+starttime timestamp, -- Record start time, the time of the first sample, as a high precision epoch time
+samplerate double, -- Nominal sample rate (Hz)
+samplecnt int, -- Number of samples in record
+sampletype string, -- storage type in mseed record
+minval float, -- statistics for search later
+maxval float
+);
+
+-- this function inserts the mseed record information into the catalog
+-- errors are returned for off-line analysis.
+CREATE FUNCTION mseedImport(vid int, source string, target string)
+ RETURNS string
+EXTERNAL NAME mseed.import;
+
+-- mseed data volumes may appear in different formats
+-- we try to postpone them, assuming the optimizer can guide JIT.
+--CREATE TABLE chunkname (
+--time timestamp,
+--mseed int,
+--adata varchar(20), dependent on type
+--idata int,
+--fdata float,
+--ddata double
+--);
+...@end verbatim
+
+...@- How to use the mseed catalog.
+First, the vault directory is populated with the location of the mseed source files
+in a remote site. The corresponding local name is set using the basename property,
+and all file creation and access times are set to null.
+Next, a limited number of files are loaded into the vault and analysed.
+The information extracted ends up in the catalog, and remains there forever.
+The underlying mseed file is not decrypted directly, it will be done as soon
+as a query requests it.
+
+A test sequence (after the vault directory has been populated)
+...@begin verbatim
+create table batch(vid int, source string, target string);
+
+
+insert into batch
+select vid, source, target from vault where created is null limit 2;
+call batchload( select vid, source, target from vault where created is null
limit 2);
+drop table batch;
+...@end verbatim
@mal
module mseed;
-pattern import(vid:int, fnme:str):str
+pattern import(vid:int, source:str, target:str):str
address MseedImport
comment "Dump the record content of an mseed file from the vault into the
mseed catalog";
@@ -38,7 +101,7 @@
in the vault table for import and to apply the mseed import operation.
It returns the id list of successful imports.
@verbatim
-select import(vid) from vault where ...
+select mseedImport(vid,source,target) from vault where ...
@end verbatim
@{
@h
@@ -84,6 +147,7 @@
str msg = MAL_SUCCEED;
int *vid = (int*) getArgReference(stk,pci,1);
str *sourcefile = (str*) getArgReference(stk,pci,2);
+ str *targetfile = (str*) getArgReference(stk,pci,3);
MSRecord *msr = 0;
int verbose = 1;
@@ -96,7 +160,8 @@
int j;
time_t t;
struct tm *tm;
- char *s, *basename,*kind;
+ char *s, *kind;
+ timestamp answ;
char file[BUFSIZ];
char buf[BUFSIZ];
char starttime[BUFSIZ];
@@ -105,28 +170,30 @@
(void) mb;
- if ( vaultpath[0] == 0){
- msg= createException(MAL,"mseed.dump","Vault not initialized");
- *ret = GDKstrdup(msg);
- return msg;
- }
- basename = strrchr(*sourcefile,DIR_SEP);
- basename = basename ? basename: *sourcefile;
- snprintf(file,BUFSIZ,"%s%c%s",vaultpath, DIR_SEP,basename);
+ if ( strcmp(*sourcefile, *targetfile) ) {
+ if ( vaultpath[0] == 0){
+ msg= createException(MAL,"mseed.dump","Vault not
initialized");
+ *ret = GDKstrdup(msg);
+ return msg;
+ }
+ snprintf(file,BUFSIZ,"%s%c%s",vaultpath, DIR_SEP,*targetfile);
- /* only fetch the file if it is not already in the local vault */
- if ( access(file, R_OK) ){
- mnstr_printf(cntxt->fdout, "FTP fetch %s ->
%s\n",*sourcefile,file);
- msg= VLTftpget(&j, sourcefile, &basename);
- if ( msg)
- return msg;
- /* remember the location of the copy */
- s= buf;
- snprintf(buf,BUFSIZ,"UPDATE vault SET target='%s' WHERE vid =
%d;", file,*vid);
- msg =SQLstatementIntern(cntxt,&s,"mseed.import
file",TRUE,FALSE);
- if ( msg)
- return msg;
- }
+ /* only fetch the file if it is not already in the local vault
*/
+ if ( access(file, R_OK) ){
+ mnstr_printf(cntxt->fdout, "FTP fetch %s ->
%s\n",*sourcefile,file);
+ msg= VLTimport(&answ, sourcefile, targetfile);
+ if ( msg)
+ return msg;
+ /* remember the location of the copy */
+ s= buf;
+ snprintf(buf,BUFSIZ,"UPDATE vault SET created=now()
WHERE vid = %d;", *vid);
+ msg =SQLstatementIntern(cntxt,&s,"mseed.import
file",TRUE,FALSE);
+ if ( msg)
+ return msg;
+ }
+ } else
+ /* no caching needed or allowed */
+ snprintf(file,BUFSIZ,"%s",*targetfile);
/* Loop over the input file */
s= buf;
diff -r 22d492452582 -r 3de8927d961b sql/src/backends/monet5/vaults/vault.mx
--- a/sql/src/backends/monet5/vaults/vault.mx Fri Dec 10 05:59:03 2010 +0100
+++ b/sql/src/backends/monet5/vaults/vault.mx Sat Dec 11 17:37:44 2010 +0100
@@ -23,53 +23,114 @@
@+ Data Vaults
The Data Vault module provides the basic infrastructure to manage
a repository with datafiles whose integrity and use is shared between
-MonetDB and a client repository.
+MonetDB and the repository owner.
-Once a vault is created, the system administrator can manually add files to
the
-vault catalog. A target directory, when set, denotes the location where
-temporary copies are to be stored.
+Once a vault is created, the system administrator or crontab job adds files to the vault catalog.
+The source attribute provides the universal resource identifier (URI)
+in a format understood by the CURL library. In most cases, it represents a file
+to be accessed using FTP.
-A import() operation will perform a batch copy of the files marked as to be
copied
-from the remote site using a path to an (ftp) directory.
+A target denotes its name in the staging area, i.e. a local cache where copies are to be stored.
+The local cache can be hierarchically structured to spread the load over multiple volumes
+and to retain the structure of the source repository.
+Files are dropped from the local cache using a SQL vacuum() operation based on an LRU time stamp.
+The retention period depends on the amount of disk resources available.
+The vacuum() operation should be automatically triggered when disk space becomes a scarce resource.
-If source and target files reside on the same file system then a symbolic link
is sufficient.
-Alternatively, the vault can be populated using normal updates to the vault
catalog.
+An import() operation copies a file from the remote repository into the staging area.
-The SQL view on the vault merely describes the files being imported,
-their freshness and properties required to fetch them from a remote source.
-It is an ordinary SQL table, which can be the target for updates using SQL.
-See the source file for the script.
+The basename() operation extracts the tail of the argument. It can be used to derive
+target filename locations.
+
+If source and target files reside on the same file system then a symbolic link is sufficient
+and vacuum() need not be triggered.
+
+The file mapping catalog is kept lean. The attribute 'created' marks the actual time
+the file was obtained from the remote source. The lru attribute is set each time we access its content.
+Files that are bound to internal database structures may want to set it into the future.
@verbatim
CREATE TABLE sys.vault (
vid int PRIMARY KEY,-- Internal key
kind string, -- vault kind (CSV, MSEED, FITS,..)
source string, -- remote file name for cURL to access
-refresh boolean, -- refresh each time of
access
-cached timestamp, -- copy stored locally
-target string -- file name of source file in
vault
+target string, -- file name of source file in vault
+created timestamp, -- timestamp upon entering the cache
+lru timestamp -- time of the most recent access (used for least-recently-used eviction)
);
+
+create function getvault() -- SQL-level wrapper bound to the MAL routine vault.getdirectory
+returns string
+external name vault.getdirectory;
+
+create function setvault(dir string) -- SQL-level wrapper bound to the MAL routine vault.setdirectory
+returns string
+external name vault.setdirectory;
+
+
+create function basename(fnme string, split string) -- splits fnme just before the split marker (per the MAL comment on vault.basename)
+returns string
+external name vault.basename;
+
+create function import(source string, target string) -- copies a remote file to the cache using FTP (per the MAL comment on vault.import)
+returns timestamp
+external name vault.import;
+
+create function remove(target string) -- drops a file from the local cache (per the MAL comment on vault.remove)
+returns timestamp
+external name vault.remove;
+
+create procedure vacuum( t timestamp) -- clears cache bookkeeping for vault rows selected by the cutoff timestamp t
+begin
+update vault
+ set created= remove(target), -- remove() is bound to vault.remove; its timestamp result is stored back into created
+ lru = null
+ where created < t; -- NOTE(review): filters on created, while the accompanying text describes eviction by LRU time stamp -- confirm which is intended
+end;
@end verbatim
The module is developed solely for a Linux environment.
The vault root is a subdirectory of the dbfarm/<dbname>/vault/ and contains
a subdirectory for each vault kind. In turn, each vault kind comes
-with a refinement of the catalog identified above using the vid
-to relate the two.
+with a refinement of the catalog identified above using the vid to relate the two.
+
+For large staging pools it may be advisable to pre-create the repository
+structure, e.g. mounting multiple volumes for its partitions.
+
+The session structure would be something like:
+...@begin verbatim
+insert into vault(vid,kind,source)
values(0,'dummy','ftp://ftp.rep.edu/repos/station-1'),
+ (1,'dummy','ftp://ftp.rep.edu/repos/station-2');
+update vault
+ set target = basename(source,'repos');
+update vault
+ set created= import(source,target)
+ where created is null;
+select * from vault limit 2;
+call vacuum(now());
+...@end
@mal
module vault;
+command import(src:str,trg:str):timestamp
+address VLTimport
+comment "Use FTP to copy a remote file to the cache";
+
+command remove(t:str):timestamp
+address VLTremove
+comment "Drop a file from the local cache";
+
+command basename( fnme:str, split:str):str
+address VLTbasename
+comment "Split the fnme string just before the split marker.";
+
+command setdirectory(src:str):str
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list