Changeset: 3de8927d961b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3de8927d961b
Modified Files:
        MonetDB5/src/mal/mal_errors.mx
        sql/src/backends/monet5/vaults/mseed.mx
        sql/src/backends/monet5/vaults/vault.mx
        sql/src/backends/monet5/vaults/vault.sql
Branch: default
Log Message:

Save the state of the vault code


diffs (truncated from 494 to 300 lines):

diff -r 22d492452582 -r 3de8927d961b MonetDB5/src/mal/mal_errors.mx
--- a/MonetDB5/src/mal/mal_errors.mx    Fri Dec 10 05:59:03 2010 +0100
+++ b/MonetDB5/src/mal/mal_errors.mx    Sat Dec 11 17:37:44 2010 +0100
@@ -86,6 +86,7 @@
 
 #define RUNTIME_IO_EOF "Attempt to read beyond end-of-file"
 #define RUNTIME_FILE_NOT_FOUND "File not found"
+#define RUNTIME_UNLINK "File could not be unlinked"
 #define RUNTIME_DIR_ERROR "Unable to open directory"
 #define RUNTIME_STREAM_FAILED "Could not create stream"
 #define RUNTIME_STREAM_WRITE "Could not write to stream"
diff -r 22d492452582 -r 3de8927d961b sql/src/backends/monet5/vaults/mseed.mx
--- a/sql/src/backends/monet5/vaults/mseed.mx   Fri Dec 10 05:59:03 2010 +0100
+++ b/sql/src/backends/monet5/vaults/mseed.mx   Sat Dec 11 17:37:44 2010 +0100
@@ -21,15 +21,78 @@
 @a Martin Kersten
 @v 0.1
 @+ Mseed
-These routines are meant to interpret mseed files in the context of a vault.
+These routines are meant to interpret mseed files stored in the vault.
 The simplifying situation is that mseed has a single model.
 The code base assumes that libmseed has been installed on your system.
 
-Furthermore, the mseed catalog initialization script should have been run.
+The mseed catalog initialization script should have been run.
+...@begin verbatim
+-- this schema is intended to experiment with accessing mseed files
+drop FUNCTION mseedImport();
+drop table mseedCatalog;
+drop table mseedRepository;
+
+-- all records in the mseed files correspond to a row in the catalog
+CREATE TABLE mseedCatalog (
+mseed                  int,                    -- Vault file id
+chunk                  varchar(255),   -- SQL volume storage container name
+seqno                  int,                    -- SEED record sequence number, 
should be between 0 and 999999
+                PRIMARY KEY (mseed,seqno),
+dataquality    char,                   -- Data record indicator, should be 
'D=data unknown qual', 
+                                                               -- 'R=raw no 
quality', 'Q= quality controlled' or 'M'
+network                        varchar(11),    -- Network
+station                        varchar(11),    -- Station
+location               varchar(11),    -- Location
+channel                        varchar(11),    -- Channel
+starttime              timestamp,              -- Record start time, the time 
of the first sample, as a high precision epoch time 
+samplerate             double,                 -- Nominal sample rate (Hz) 
+samplecnt              int,                    -- Number of samples in record 
+sampletype             string,                 -- storage type in mseed record
+minval                 float,                  -- statistics for search later
+maxval                 float
+); 
+
+-- this function inserts the mseed record information into the catalog
+-- errors are returned for off-line analysis.
+CREATE FUNCTION mseedImport(vid int, source string, target string)
+       RETURNS string
+EXTERNAL NAME mseed.import;
+
+-- mseed data volumes may appear in different formats
+-- we try to postpone them, assuming the optimizer can guide JIT.
+--CREATE TABLE chunkname (
+--time timestamp,
+--mseed        int,
+--adata        varchar(20),    dependent on type
+--idata        int,
+--fdata        float,
+--ddata        double
+--); 
+...@end verbatim
+
+...@- How to use the mseed catalog.
+First, the vault directory is populated with the location of the mseed source 
files
+in a remote site. The corresponding local name is set using the basename 
property,
+and all file creation and access times are set to null.
+Following, a limited number of files are loaded into the vault and analysed.
+The information extracted ends up in the catalog, and remains there forever.
+The underlying mseed file is not decrypted directly; this will be done as soon
+as a query requests it.
+
+A test sequence (after the vault directory has been populated)
+...@begin verbatim
+create table batch(vid int, source string, target string);
+
+
+insert into batch
+select vid, source, target from vault where created is null limit 2;
+call batchload( select vid, source, target from vault where created is null 
limit 2);
+drop table batch;
+...@end verbatim
 
 @mal
 module mseed;
-pattern import(vid:int, fnme:str):str
+pattern import(vid:int, source:str, target:str):str
 address MseedImport
 comment "Dump the record content of an mseed file from the vault into the 
mseed catalog";
 
@@ -38,7 +101,7 @@
 in the vault table for import and to apply the mseed import operation.
 It returns the id list of successful imports.
 @verbatim
-select import(vid) from vault where ...
+select mseedImport(vid,source,target) from vault where ...
 @end verbatim
 @{
 @h
@@ -84,6 +147,7 @@
        str msg = MAL_SUCCEED;
        int *vid = (int*) getArgReference(stk,pci,1); 
        str *sourcefile = (str*) getArgReference(stk,pci,2); 
+       str *targetfile = (str*) getArgReference(stk,pci,3); 
        MSRecord *msr = 0;
 
        int verbose   = 1;
@@ -96,7 +160,8 @@
        int j;
        time_t t;
        struct tm *tm;
-       char *s, *basename,*kind;
+       char *s, *kind;
+       timestamp answ;
        char file[BUFSIZ];
        char buf[BUFSIZ];
        char starttime[BUFSIZ];
@@ -105,28 +170,30 @@
 
        (void) mb;
 
-       if ( vaultpath[0] == 0){
-               msg= createException(MAL,"mseed.dump","Vault not initialized");
-               *ret = GDKstrdup(msg);
-               return msg;
-       }
-       basename = strrchr(*sourcefile,DIR_SEP);
-       basename = basename ? basename:  *sourcefile;
-       snprintf(file,BUFSIZ,"%s%c%s",vaultpath, DIR_SEP,basename);
+       if ( strcmp(*sourcefile, *targetfile) ) {
+               if ( vaultpath[0] == 0){
+                       msg= createException(MAL,"mseed.dump","Vault not 
initialized");
+                       *ret = GDKstrdup(msg);
+                       return msg;
+               }
+               snprintf(file,BUFSIZ,"%s%c%s",vaultpath, DIR_SEP,*targetfile);
 
-       /* only fetch the file if it is not already in the local vault */
-       if ( access(file, R_OK) ){
-               mnstr_printf(cntxt->fdout, "FTP fetch %s -> 
%s\n",*sourcefile,file);
-               msg= VLTftpget(&j, sourcefile, &basename);
-               if ( msg)
-                       return msg;
-               /* remember the location of the copy */
-               s= buf;
-               snprintf(buf,BUFSIZ,"UPDATE vault SET target='%s' WHERE vid = 
%d;", file,*vid);
-               msg =SQLstatementIntern(cntxt,&s,"mseed.import 
file",TRUE,FALSE);
-               if ( msg)
-                       return msg;
-       }
+               /* only fetch the file if it is not already in the local vault 
*/
+               if ( access(file, R_OK) ){
+                       mnstr_printf(cntxt->fdout, "FTP fetch %s -> 
%s\n",*sourcefile,file);
+                       msg= VLTimport(&answ, sourcefile, targetfile);
+                       if ( msg)
+                               return msg;
+                       /* remember the location of the copy */
+                       s= buf;
+                       snprintf(buf,BUFSIZ,"UPDATE vault SET created=now() 
WHERE vid = %d;", *vid);
+                       msg =SQLstatementIntern(cntxt,&s,"mseed.import 
file",TRUE,FALSE);
+                       if ( msg)
+                               return msg;
+               }
+       } else 
+               /* no caching  needed or allowed */
+               snprintf(file,BUFSIZ,"%s",*targetfile);
 
        /* Loop over the input file */
        s= buf;
diff -r 22d492452582 -r 3de8927d961b sql/src/backends/monet5/vaults/vault.mx
--- a/sql/src/backends/monet5/vaults/vault.mx   Fri Dec 10 05:59:03 2010 +0100
+++ b/sql/src/backends/monet5/vaults/vault.mx   Sat Dec 11 17:37:44 2010 +0100
@@ -23,53 +23,114 @@
 @+ Data Vaults
 The Data Vault module provides the basic infrastructure to manage
 a repository with datafiles whose integrity and use is shared between
-MonetDB and a client repository.
+MonetDB and the repository owner. 
 
-Once a vault is created, the system administrator can manually add files to 
the 
-vault catalog. A target directory, when set, denotes the location where
-temporary copies are to be stored.
+Once a vault is created, the system administrator or crontab job adds files to 
the vault catalog.
+The source attribute provides the universal resource identifier (URI)
+in a format understood by the CURL library. In most cases, it represents a file
+to be accessed using FTP.
 
-A import() operation will perform a batch copy of the files marked as to be 
copied
-from the remote site using a path to an (ftp) directory.
+A target denotes its name in the staging area, i.e. a local cache where copies 
are to be stored.
+The local cache can be hierarchically structured to spread the load over 
multiple volumes
+and to retain the structure of the source repository.
+Files are dropped from the local cache using a SQL vacuum() operation based on 
a LRU time stamp.
+The retention period depends on the amount of disk resources available. 
+The vacuum() operation should be automatically triggered when disk space 
becomes a scarce resource.
 
-If source and target files reside on the same file system then a symbolic link 
is sufficient.
-Alternatively, the vault can be populated using normal updates to the vault 
catalog.
+An import() operation copies a file from the remote repository into the 
staging area.
 
-The SQL view on the vault merely describes the files being imported,
-their freshness and properties required to fetch them from a remote source.
-It is an ordinary SQL table, which can be the target for updates using SQL.
-See the source file for the script.
+The basename() operation extracts the tail of the argument. It can be used to 
derive
+target filename locations.
+
+If source and target files reside on the same file system then a symbolic link 
is sufficient
+and vacuum() need not be triggered.
+
+The file mapping catalog is kept lean. The attribute 'created' marks the 
actual time
+the file was obtained from the remote source. The lru attribute is set each 
time we access its content.  
+Files that are bound to internal database structures may want to set it into 
the future.
 @verbatim
 CREATE TABLE sys.vault (
 vid                    int PRIMARY KEY,-- Internal key
 kind                   string,                 -- vault kind (CSV, MSEED, 
FITS,..)
 source                 string,                 -- remote file name for cURL to 
access
-refresh                        boolean,                -- refresh each time of 
access
-cached                 timestamp,              -- copy stored locally
-target                 string                  -- file name of source file in 
vault
+target                 string,                 -- file name of source file in 
vault
+created                        timestamp,              -- timestamp upon 
entering the cache
+lru                            timestamp               -- least recently used
 );
+
+create function getvault()
+returns string
+external name vault.getdirectory;
+
+create function setvault(dir string)
+returns string
+external name vault.setdirectory;
+
+
+create function basename(fnme string, split string)
+returns string
+external name vault.basename;
+
+create function import(source string, target string)
+returns timestamp
+external name vault.import;
+
+create function remove(target string)
+returns timestamp
+external name vault.remove;
+
+create procedure vacuum( t timestamp)
+begin
+update vault
+  set created= remove(target),
+  lru = null
+  where  created < t;
+end;
 @end verbatim
 
 The module is developed solely for a Linux environment.
 The vault root is a subdirectory of the dbfarm/<dbname>/vault/ and contains
 a subdirectory for each vault kind. In turn, each vault kind comes
-with a refinement of the catalog identified above using the vid
-to relate the two.
+with a refinement of the catalog identified above using the vid to relate the 
two.
+
+For large staging pools it may be advisable to pre-create the repository
+structure, e.g. mounting multiple volumes for its partitions.
+
+The session structure would be something like:
+...@begin verbatim
+insert into vault(vid,kind,source) 
values(0,'dummy','ftp://ftp.rep.edu/repos/station-1'),
+       (1,'dummy','ftp://ftp.rep.edu/repos/station-2');
+update vault
+  set target = basename(source,'repos');
+update vault
+  set created= import(source,target)
+  where created is null;
+select * from vault limit 2;
+call vacuum(now());
+...@end
 @mal
 module vault;
 
+command import(src:str,trg:str):timestamp
+address VLTimport
+comment "Use FTP to copy a remote file to the cache";
+
+command remove(t:str):timestamp
+address VLTremove
+comment "Drop a file from the local cache";
+
+command basename( fnme:str, split:str):str
+address VLTbasename
+comment "Split the fnme string just before the split marker.";
+
+command setdirectory(src:str):str
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to