Hi Neil,

I've attached my class function for uploading multiple files.

 def upload_files(self, fullpaths):
        """
            Uploads files from a disk location to a Galaxy library
            Accepts an array of full path filenames
            Example: fullpaths = ['/home/username/file1.txt',
'/home/username/files2.txt']
        """
        if self.jsonstring == None:
            self.get_library()

        library_id = self.library_id
        library_folder_id = self.library_folder_id
        api_key = self.api_key
        api_url = self.api_url

        #Galaxy needs to read the pathnames as a new line delimited string
        #so we do that transformation here
        fullpaths_string = ""
        for path in fullpaths:
            fullpaths_string = fullpaths_string + path + "\n"

        fullpaths_string = fullpaths_string[:-1]
        data = {}
        data['folder_id'] = library_folder_id
        data['file_type'] = 'auto'
        data['dbkey'] = ''
        data['upload_option'] = 'upload_paths'
        data['filesystem_paths'] = fullpaths_string
        data['create_type'] = 'file'
        #Start the upload. This will return right away, but it may take
awhile
        libset = submit(api_key, api_url + "libraries/%s/contents" %
library_id, data, return_formatted = False)

        #Iterate through each dataset we just uploaded and block until all
files have been written to the Galaxy database
        for ds in libset:
            last_filesize = 0
            while True:
                #If file_size != 0 and the file_size is different after a
second iteration, then we assume the disk write is finished
                ds_id = ds['id']
                uploaded_file = display(api_key, api_url +
'libraries/%s/contents/%s' %(library_id, ds_id), return_formatted=False)
                print uploaded_file
                if uploaded_file['file_size'] != 0 and
uploaded_file['file_size'] == last_filesize:
                    break
                else:
                    last_filesize = uploaded_file['file_size']
                    time.sleep(2)
        self.libset = libset
        return libset


Rob Leclerc, PhD
<http://www.linkedin.com/in/robleclerc> <https://twitter.com/#!/robleclerc>
P: (US) +1-(917)-873-3037
P: (Shanghai) +86-1-(861)-612-5469
Personal Email: rob.lecl...@aya.yale.edu


On Wed, May 29, 2013 at 12:45 AM, <neil.burd...@csiro.au> wrote:

> Hi Guys,
>          Did you manage to get multiple datasets working? I can't seem to
> upload multiple files. Only the last file appears in the history. I changed
> my code as mentioned in the thread below in "example_watch_folder.py" to
> add multiple files separated by a new line and increased the sleep time:
>
> for fname in os.listdir(in_folder):
>             fullpath = os.path.join(in_folder, fname)
>             print ' fullpath is [%s] ' % fullpath
>             if os.path.isfile(fullpath):
>                 data = {}
>                 data['folder_id'] = library_folder_id
>                 data['file_type'] = 'auto'
>                 data['dbkey'] = ''
>                 data['upload_option'] = 'upload_paths'
>                 data['filesystem_paths'] =
> "/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz\n
> /home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz"
>                 print ' data is [%s] ' % str(data['filesystem_paths'])
>                 data['create_type'] = 'file'
>                 libset = submit(api_key, api_url + "libraries/%s/contents"
> % library_id, data, return_formatted = False)
>                 #TODO Handle this better, but the datatype isn't always
>                 # set for the followup workflow execution without this
>                 # pause.
>                 time.sleep(65)
>
> However, I get the following crash:
>
> ./example_watch_folder.py 64f3209856a3cf4f2d034a1ad5bf851c
> http://barium-rbh/csiro/api/ /home/galaxy/galaxy-drop/input
> /home/galaxy/galaxy-drop/output "This One" f2db41e1fa331b3e
>
>  fullpath is
> [/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz]
>  data is [/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz
>  /home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz]
> url is :
> http://barium-rbh/csiro/api/libraries/33b43b4e7093c91f/contents?key=64f3209856a3cf4f2d034a1ad5bf851c
> data is : {'file_type': 'auto', 'dbkey': '', 'create_type': 'file',
> 'folder_id': 'F33b43b4e7093c91f', 'upload_option': 'upload_paths',
> 'filesystem_paths':
> '/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz\n
> /home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz'}
> url is :
> http://barium-rbh/csiro/api/workflows?key=64f3209856a3cf4f2d034a1ad5bf851c
> data is : {'workflow_id': 'f2db41e1fa331b3e', 'ds_map': {'14': {'src':
> 'ld', 'id': 'ff5476bcf6c921fa'}}, 'history':
> '141_S_0851_MRI_T2_Screening.nii.gz - apiFullCTE'}
> {'outputs': ['daecbdd824e1c349', '358eb58cd5463e0d', 'c0279aab05812500'],
> 'history': '3cc0effd29705aa3'}
> url is :
> http://barium-rbh/csiro/api/workflows?key=64f3209856a3cf4f2d034a1ad5bf851c
> data is : {'workflow_id': 'f2db41e1fa331b3e', 'ds_map': {'14': {'src':
> 'ld', 'id': '79966582feb6c081'}}, 'history':
> '141_S_0851_MRI_T2_Screening.nii.gz - apiFullCTE'}
> {'outputs': ['19c51286b777bc04', '0f71f1fc170d4ab9', '256444f6e7017e58'],
> 'history': 'b701da857886499b'}
> Traceback (most recent call last):
>   File "./example_watch_folder.py", line 89, in <module>
>     main(api_key, api_url, in_folder, out_folder, data_library, workflow )
>   File "./example_watch_folder.py", line 75, in main
>     shutil.move(fullpath, os.path.join(out_folder, fname))
>   File "/usr/lib/python2.7/shutil.py", line 299, in move
>     copy2(src, real_dst)
>   File "/usr/lib/python2.7/shutil.py", line 128, in copy2
>     copyfile(src, dst)
>   File "/usr/lib/python2.7/shutil.py", line 82, in copyfile
>     with open(src, 'rb') as fsrc:
> IOError: [Errno 2] No such file or directory:
> '/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz'
>
> It says there is no such file, but this file has already been copied from
> the input to the output directory. Any help much appreciated
>
> Neil
>
> ------------------------------
>
> Message: 2
> Date: Mon, 29 Apr 2013 16:11:39 -0400
> From: Rob Leclerc <robert.lecl...@gmail.com>
> To: Dannon Baker <dannon.ba...@gmail.com>
> Cc: "galaxy-...@bx.psu.edu" <galaxy-...@bx.psu.edu>
> Subject: Re: [galaxy-dev] Creating multiple datasets in a libset
> Message-ID:
>         <CAGkd85fHSgO2YC1T+Frctyso9G5rfQb=_mLyHGSdxPM+s3=
> 8...@mail.gmail.com>
> Content-Type: text/plain; charset="iso-8859-1"
>
> Hi Dannon,
>
> I've written some code to (i) query a dataset to ensure that it's been
> uploaded after a submit and (ii) to ensure a resulting dataset has been
> written to the file.
>
> *#Block until all datasets have been uploaded*
> libset = submit(api_key, api_url + "libraries/%s/contents" % library_id,
> data, return_formatted = False)
> for ds in libset:
>     while True:
>         uploaded_file = display(api_key, api_url +
> 'libraries/%s/contents/%s' %(library_id, ds['id']), return_formatted=False)
>         if uploaded_file['misc_info'] == None:
>             time.sleep(1)
>         else:
>             break
>
> *#Block until all result datasets have been saved to the filesystem*
> result_ds_url = api_url + 'histories/' + history_id + '/contents/' +
> dsh['id'];
> while True:
>     result_ds = display(api_key, result_ds_url, return_formatted=False)
>     if result_ds["state"] == 'ok':
>         break
>     else:
>         time.sleep(1)
>
>
> Rob Leclerc, PhD
> <http://www.linkedin.com/in/robleclerc> <https://twitter.com/#!/robleclerc
> >
> P: (US) +1-(917)-873-3037
> P: (Shanghai) +86-1-(861)-612-5469
> Personal Email: rob.lecl...@aya.yale.edu
>
>
> On Mon, Apr 29, 2013 at 11:18 AM, Dannon Baker <dannon.ba...@gmail.com
> >wrote:
>
> > Yep, that example filesystem_paths you suggest should work fine.  The
> > sleep() bit was a complete hack from the start, for simplicity in
> > demonstrating a very basic pipeline, but what you probably want to do
> for a
> > real implementation is query the dataset in question via the API, verify
> > that the datatype/etc have been set, and only after that execute the
> > workflow; instead of relying on sleep.
> >
> >
> > On Mon, Apr 29, 2013 at 9:24 AM, Rob Leclerc <robert.lecl...@gmail.com
> >wrote:
> >
> >> Hi Dannon,
> >>
> >> Thanks for the response. Sorry to be pedantic, but just to make sure
> that
> >> I understand the interpretation of this field on the other side of the
> API,
> >> I would need to have something like the following:
> >>
> >> data['filesystem_paths'] = "/home/me/file1.vcf \n /home/me/file2.vcf \n
> >> /home/me/file3.vcf"
> >>
> >> I assume I should also increase the time.sleep() to reflect the
> uploading
> >> of extra files?
> >>
> >> Cheers,
> >>
> >> Rob
> >>
> >> Rob Leclerc, PhD
> >> <http://www.linkedin.com/in/robleclerc><
> https://twitter.com/#!/robleclerc>
> >> P: (US) +1-(917)-873-3037
> >> P: (Shanghai) +86-1-(861)-612-5469
> >> Personal Email: rob.lecl...@aya.yale.edu
> >>
> >>
> >> On Mon, Apr 29, 2013 at 9:15 AM, Dannon Baker <dannon.ba...@gmail.com
> >wrote:
> >>
> >>> Hey Rob,
> >>>
> >>> That example_watch_folder.py does just submit exactly one at a time,
> >>> executes the workflow, and then does the next all in separate
> transactions.
> >>>  If you wanted to upload multiple filepaths at once, you'd just append
> more
> >>> to the ''filesystem_paths' field (newline separated paths).
> >>>
> >>> -Dannon
> >>>
> >>>
> >>> On Fri, Apr 26, 2013 at 11:54 PM, Rob Leclerc <
> robert.lecl...@gmail.com>wrote:
> >>>
> >>>> I'm looking at example_watch_folder.py and it's not clear from the
> >>>> example how you submit multiple datasets to a library. In the
> example, the
> >>>> first submit returns a libset [] with only a single entry and then
> proceeds
> >>>> to iterate through each dataset in the libset in the following
> section:
> >>>>
> >>>> data = {}
> >>>>
> >>>>    data['folder_id'] = library_folder_id
> >>>>
> >>>>    data['file_type'] = 'auto'
> >>>>
> >>>>    data['dbkey'] = ''
> >>>>
> >>>>    data['upload_option'] = 'upload_paths'
> >>>>
> >>>>
> >>>>
> >>>> *data['filesystem_paths'] = fullpath*
> >>>>
> >>>>    data['create_type'] = 'file'
> >>>>
> >>>>    libset = submit(api_key, api_url + "libraries/%s/contents" %
> >>>> library_id, data, return_formatted = False)
> >>>>
> >>>>    time.sleep(5)
> >>>>
> >>>>    for ds in libset:
> >>>>
> >>>>        if 'id' in ds:
> >>>>
> >>>>                         wf_data = {}
> >>>>
> >>>>                         wf_data['workflow_id'] = workflow['id']
> >>>>
> >>>>                         wf_data['history'] = "%s - %s" % (fname,
> >>>> workflow['name'])
> >>>>
> >>>>                         wf_data['ds_map'] = {}
> >>>>
> >>>>                         for step_id, ds_in in workflow['inputs'
> >>>> ].iteritems():
> >>>>
> >>>>                             wf_data['ds_map'][step_id] = {'src':'ld',
> >>>> 'id':ds['id']}
> >>>>
> >>>>                         res = submit( api_key, api_url + 'workflows',
> >>>> wf_data, return_formatted=False)
> >>>>
> >>>>
> >>>>
> >>>> Rob Leclerc, PhD
> >>>> <http://www.linkedin.com/in/robleclerc><
> https://twitter.com/#!/robleclerc>
> >>>> P: (US) +1-(917)-873-3037
> >>>> P: (Shanghai) +86-1-(861)-612-5469
> >>>> Personal Email: rob.lecl...@aya.yale.edu
> >>>>
> >>>> ___________________________________________________________
> >>>> Please keep all replies on the list by using "reply all"
> >>>> in your mail client.  To manage your subscriptions to this
> >>>> and other Galaxy lists, please use the interface at:
> >>>>   http://lists.bx.psu.edu/
> >>>>
> >>>> To search Galaxy mailing lists use the unified search at:
> >>>>   http://galaxyproject.org/search/mailinglists/
> >>>>
> >>>
> >>>
> >>
> >
> -------------- next part --------------
> An HTML attachment was scrubbed...
> URL: <
> http://lists.bx.psu.edu/pipermail/galaxy-dev/attachments/20130429/383c60a5/attachment-0001.html
> >
>
>
>
___________________________________________________________
Please keep all replies on the list by using "reply all"
in your mail client.  To manage your subscriptions to this
and other Galaxy lists, please use the interface at:
  http://lists.bx.psu.edu/

To search Galaxy mailing lists use the unified search at:
  http://galaxyproject.org/search/mailinglists/

Reply via email to