/* readris.c: functions to read RIS datasets */
/* hoenicka_markus@compuserve.com 3-11-00 */
/* $Id: readris.c,v 1.6 2000/06/18 06:32:25 markus Exp markus $ */

/* ToDo: supply a second FILE ptr to a global.ris file (or NULL). If not null, include the contents before ER  - is written to the complete string. */

#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <limits.h>

#include "readris.h"
#include "strfncs.h"
#include "tokenize.h"

#define RIS_LINE_SIZE 16384
#define FILE_CHUNK_SIZE 4096

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  read_ris_set(): reads one dataset in RIS format from a file and
                  appends its lines to a string kept in a malloc()'ed
                  buffer

  int read_ris_set returns 0 if failed (default file not found, out
                   of memory, or the data do not start with an empty
                   line), 1 if a complete set (terminated by an
                   "ER  - " line) was read, 2 if the end of the file
                   or a read error was hit before the set was
                   complete. Lines appended before the failure or
                   the end of file remain in *inbuffer

  FILE* fp a file pointer to the file containing the dataset. The
                 dataset must be preceded by at least one empty line

  char* deffile the name of a file containing default RIS fields.
                 Unless deffile is an empty string, those fields
                 (except the TY and ER lines) will be inserted in
                 front of the "ER  - " line of the dataset. The name
                 is tried as given first and then relative to $HOME;
                 if the file cannot be opened in either place the
                 function fails

  char** inbuffer a pointer to a pointer to a buffer allocated with
                 malloc() which must hold a \0-terminated string,
                 as the lines are appended with strcat(). The
                 calling function is responsible for freeing the
                 buffer after use. *inbuffer will be modified by
                 the function

  size_t* inbufsize a pointer to a variable holding the buffer size.
                 The buffer is grown by exactly the length of each
                 appended line, so the size passed in must already
                 account for the string in *inbuffer including its
                 terminating \0. *inbufsize will be modified by the
                 function

  int pull_in_default this should always be 0. the fn calls itself with
                this parameter set to 1 to read the default fields

 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int read_ris_set(FILE* fp, char* deffile, char** inbuffer, size_t* inbufsize, int pull_in_default) {
  int retval = 0;
  int setdone = 0;
  int is_end_tag;
  size_t linelen;
  char* linebuffer = NULL;
  char* cr;
  char* newinbuf;
  char* home;
  char* homedefault;
  FILE* default_fp = NULL;

  if (*deffile != '\0') {
    default_fp = fopen(deffile, "rb");
    if (default_fp == NULL) {
      /* second attempt: look for the file in the home directory. The
         path is assembled in a buffer of exactly the required size,
         so neither a long $HOME nor a long file name can overflow it */
      home = getenv("HOME");
      if (home == NULL) {
        return 0;
      }
      homedefault = malloc(strlen(home) + strlen(deffile) + 2);
      if (homedefault == NULL) {
        return 0;
      }
      strcpy(homedefault, home);
      strcat(homedefault, "/");
      strcat(homedefault, deffile);
      default_fp = fopen(homedefault, "rb");
      free(homedefault);
      if (default_fp == NULL) {
        return 0; /* defaults were requested but are not available */
      }
    }
  }

  /* from here on all exits go through cleanup so that neither the
     line buffer nor the default file are leaked */
  linebuffer = malloc(RIS_LINE_SIZE);
  if (linebuffer == NULL) {
    goto cleanup;
  }

  /* read first line and make sure that it's an empty line, otherwise
     the dataset is no real RIS dataset. fgets() returns NULL both at
     the end of the file and after a read error; errno is of no help
     to tell these apart, and both cases are reported as 2 */
  if (fgets(linebuffer, RIS_LINE_SIZE, fp) == NULL) {
    retval = 2;
    goto cleanup;
  }
  if (!(*linebuffer == '\n' || (*linebuffer == '\r' && linebuffer[1] == '\n'))) {
    goto cleanup; /* file doesn't start with empty line */
  }

  /* skip any additional empty lines */
  do {
    if (fgets(linebuffer, RIS_LINE_SIZE, fp) == NULL) {
      retval = 2;
      goto cleanup;
    }
  } while (*linebuffer == '\n' || (*linebuffer == '\r' && linebuffer[1] == '\n'));

  /* first line of the dataset. When reading defaults, the TY and ER
     lines are dropped as the dataset proper supplies its own.
     NOTE(review): unlike the lines handled in the loop below, this
     line keeps a CR and is not checked as a set terminator; this
     mirrors the previous behaviour */
  if (!pull_in_default
      || (strncmp(linebuffer, "TY  - ", 6) != 0 && strncmp(linebuffer, "ER  - ", 6) != 0)) {
    linelen = strlen(linebuffer);
    newinbuf = realloc(*inbuffer, *inbufsize + linelen);
    if (newinbuf == NULL) {
      goto cleanup;
    }
    *inbuffer = newinbuf;
    *inbufsize += linelen; /* update the size only after realloc() succeeded */
    strcat(*inbuffer, linebuffer);
  }

  retval = 1;
  while (!setdone) {
    if (fgets(linebuffer, RIS_LINE_SIZE, fp) == NULL) {
      /* end of file (or read error) before ER. linebuffer still holds
         the previous line, so it must not be processed again */
      retval = 2;
      break;
    }

    is_end_tag = (strncmp(linebuffer, "ER  - ", 6) == 0);
    if (is_end_tag) {
      /* pull in default fields if requested. They are best-effort: a
         failure to read them does not invalidate the dataset */
      if (!pull_in_default && default_fp != NULL) {
        (void)read_ris_set(default_fp, "", inbuffer, inbufsize, 1);
      }
      setdone = 1;
    }

    /* remove any CR.
       NOTE(review): this cuts the line at the CR and thus drops the
       LF following it as well, whereas lines without CR keep their
       LF. Kept as it was - confirm that callers expect this */
    if ((cr = strchr(linebuffer, (int)'\r')) != NULL) {
      *cr = '\0';
    }

    if (!pull_in_default
        || (!is_end_tag && strncmp(linebuffer, "TY  - ", 6) != 0)) {
      linelen = strlen(linebuffer);
      newinbuf = realloc(*inbuffer, *inbufsize + linelen);
      if (newinbuf == NULL) {
        retval = 0;
        break;
      }
      *inbuffer = newinbuf;
      *inbufsize += linelen;
      strcat(*inbuffer, linebuffer);
    }
  }

 cleanup:
  free(linebuffer);
  if (default_fp != NULL) {
    fclose(default_fp);
  }
  return retval;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  add_id_from_ris(): adds the IDs from a RIS file to a provided buffer
                     allocated with malloc(). Each ID (the text
                     following "ID  - " up to the end of the line)
                     is appended followed by a single space

  int add_id_from_ris returns 0 if successful, 1 if an error occurred
                     or if no IDs were found

  char* filename the name of the file that contains the IDs

  char** buffer pointer to a buffer to which the result will be added.
                This pointer will be modified if it is necessary to
                reallocate the buffer.

  size_t *bufsize pointer to current size of buffer; it is handed to
                mstrcat() and may be modified

  int n_read_stdin if 1, data will be read from stdin. in this case
                filename will be ignored

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
int add_id_from_ris(char* filename, char** buffer, size_t *bufsize, int n_read_stdin) {
  FILE *infilefp;
  char* linebuffer;
  char* id;
  char* new_buffer;
  int n_found = 0;
  int retval = 0;

  /* fgets() fills at most RIS_LINE_SIZE bytes including the \0; the
     extra byte makes room for the space we append to the ID */
  linebuffer = malloc((size_t)RIS_LINE_SIZE+1);
  if (linebuffer == NULL) {
    return 1;
  }

  if (n_read_stdin) {
    infilefp = stdin;
  }
  else {
    if ((infilefp = fopen(filename, "rb")) == NULL) {
      free(linebuffer);
      return 1;
    }
  }

  /* loop over all lines in the file */
  while (fgets(linebuffer, RIS_LINE_SIZE, infilefp) != NULL) {
    if (strncmp(linebuffer, "ID  - ", 6) != 0) {
      continue; /* not an ID line */
    }

    /* terminate the ID at the first \n or \r, if any, then add the
       separating space */
    id = linebuffer + 6;
    id[strcspn(id, "\r\n")] = '\0';
    strcat(id, " ");

    /* The caller's size variable is passed on so that it stays in
       sync with the buffer. mstrcat() is used here the same way
       read_tokens() uses it, i.e. on the assumption that it grows
       the buffer as needed and updates the size itself - TODO
       confirm against strfncs.h */
    if ((new_buffer = mstrcat(*buffer, id, bufsize, 0)) == NULL) {
      retval = 1;
      break;
    }
    *buffer = new_buffer;
    n_found++;
  }

  free(linebuffer);
  if (!n_read_stdin) {
    fclose(infilefp);
  }

  if (retval == 0 && n_found == 0) { /* no IDs found */
    retval = 1;
  }
  return retval;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  read_tokens(): reads tokens from an open file. A token is a string
                 between one or more separators on both sides.
                 Separators are the file start, the end of file,
                 a space, a tab, a CR, and a LF. Use it to read in
                 whitespace-delimited lists. The tokens are appended
                 to buffer separated by single spaces; a trailing
                 space is removed when the end of the file is
                 reached.

  char* read_tokens returns NULL if an error occurs, otherwise a
                 pointer to buffer. buffer may get reallocated
                 while reading the file, so it is important that
                 after this function returns *ONLY* the return value
                 is used to address the result and *NEVER* the
                 pointer that was passed in.

  int infile_fd file descriptor of an open file with read access
                 which contains the whitespace-delimited data to read

  char *buffer buffer allocated with malloc() which will receive a
                space-separated string containing the read tokens.
                It must hold a \0-terminated (possibly empty) string
                on entry. buffer will be reallocated as needed,
                therefore use *ONLY* the return value to access the
                data after this function returns, *NEVER* the
                original pointer

  size_t *buffer_len pointer to a variable holding the current size of
                buffer. Will be modified if a realloc() is necessary

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
char* read_tokens(int infile_fd, char* buffer, size_t *buffer_len) {
  char separators[] = " \n\r\t";
  char *queue;
  ssize_t nread; /* read() returns -1 on error, hence a signed type */
  char* token;
  char* next_token;
  char *new_buffer;
  size_t used;
  int token_len;
  int token_open = 0; /* did the previous chunk end inside a token? */
  int at_chunk_end;
  int error = 0;

  queue = malloc((size_t)FILE_CHUNK_SIZE);
  if (queue == NULL) {
    return NULL;
  }

  do { /* loop until file is read completely */
    /* read a chunk from the file, leaving room for the terminator */
    nread = read(infile_fd, (void*)queue, FILE_CHUNK_SIZE - 1);
    if (nread < 0) { /* can't read from this file */
      error = 1;
      break;
    }

    /* terminate to make a string */
    queue[nread] = '\0';

    /* Every token is followed by a space when it is appended. That
       space has to go again in two cases: at the end of the file, and
       if the previous chunk ended in the middle of a token which this
       chunk continues. Only a space is ever removed, and never from
       an empty string */
    if (*buffer_len > 0
        && (nread == 0
            || (token_open && strchr(separators, (int)queue[0]) == NULL))) {
      used = strlen(buffer);
      if (used > 0 && buffer[used - 1] == ' ') {
        buffer[used - 1] = '\0';
      }
    }

    /* remember how this chunk ends before the tokenizing below
       overwrites separators with \0 */
    token_open = (nread > 0 && strchr(separators, (int)queue[nread - 1]) == NULL);

    /* loop over all tokens. nstrtok() is assumed to return a pointer
       to the start of the next token within the string and to store
       the length of the unterminated token in token_len - TODO
       confirm against tokenize.h */
    next_token = queue;
    while ((token = nstrtok(next_token, &token_len, separators)) != NULL) {
      /* a token reaching the end of the chunk is the last one. Going
         on behind the terminator would parse leftovers of an earlier
         chunk or uninitialized memory */
      at_chunk_end = (token + token_len >= queue + nread);
      token[token_len] = '\0';

      /* append token to buffer */
      if ((new_buffer = mstrcat(buffer, token, buffer_len, 0)) == NULL) {
        error = 1;
        break;
      }
      buffer = new_buffer;

      /* append a space to the buffer */
      if ((new_buffer = mstrcat(buffer, " ", buffer_len, 0)) == NULL) {
        error = 1;
        break;
      }
      buffer = new_buffer;

      if (at_chunk_end) {
        break;
      }
      next_token = token + token_len + 1;
    }

  } while (nread > 0 && !error);

  free(queue);

  if (error) {
    return NULL;
  }
  return buffer;
}

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  risdate(): converts a RIS PY string into a year short int and an
             otherinfo string

  short int risdate returns the year or 0 if none was specified. A
              year is recognized if the string is exactly four chars
              long or if its first '/' follows exactly four chars;
              the conversion itself is left to atoi()

  char* otherinfo_buffer will receive the other date information. Must
              hold up to 256 chars including the terminal \0; longer
              information is truncated to 255 chars. If a year and
              more was found, this is the rest of the string starting
              with the '/' after the year. If no year was found, this
              is the whole string

  char* string the PY string to convert

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
short int risdate(char* otherinfo_buffer, char* string) {
  const size_t max_otherinfo = 255; /* buffer size minus the \0 */
  char* monthsep;
  char* otherinfo; /* what goes into otherinfo_buffer, NULL for nothing */
  size_t len;
  short int year = 0;

  monthsep = strchr(string, (int)'/');

  if (monthsep == NULL && strlen(string) == 4) {
    /* have year info only */
    year = (short int)atoi(string);
    otherinfo = NULL;
  }
  else if (monthsep != NULL && monthsep - string == 4) {
    /* there is year info and more */
    year = (short int)atoi(string); /* atoi reads only until the separator */
    otherinfo = monthsep;
  }
  else {
    /* no year info (strictly speaking this is no valid RIS but here
       we should rather try to conserve whatever we find than try
       to rectify) */
    otherinfo = string;
  }

  /* bounded copy: never write more than the documented 256 chars */
  len = (otherinfo != NULL) ? strlen(otherinfo) : 0;
  if (len > max_otherinfo) {
    len = max_otherinfo;
  }
  if (len > 0) {
    memmove(otherinfo_buffer, otherinfo, len);
  }
  otherinfo_buffer[len] = '\0';

  return year;
}
/* The following alternative implementation should work with
   MySQL >= 3.23. It is kept here, commented out, for later use.
   date_buffer is formatted for storage in a DATE column */
/*  int risdate(char* date_buffer, char* otherinfo_buffer, char* string) { */
/*    char* monthsep; */
/*    char* daysep; */
/*    char* infosep = NULL;  */

/*    char year[5]; */
/*    char month[3]; */
/*    char day[3]; */

/*    year[0] = '\0'; */
/*    month[0] = '\0'; */
/*    day[0] = '\0'; */
/*    date_buffer[0] = '\0'; */
/*    otherinfo_buffer[0] = '\0'; */

/*    find the year */ 
/*    if ((monthsep = index(string, (int)'/')) == NULL) { */
/*      if (strlen(string) == 4) { */
/*        strcpy(year, string); */
/*        strcpy(month, "00"); */
/*        strcpy(day, "00"); */
/*      } */
/*      else { */
/*        return 1; */
/*      } */
/*    } */
/*    else { */
/*      if ((monthsep - string) != 4) { */
/*        strcpy(year, "0000"); */
/*      } */
/*      else { */
/*        strncpy(year, string, 4); */
/*        year[4] = '\0'; */
/*      } */
   
/*      find the month */ 
/*      if ((daysep = index(monthsep + 1, (int)'/')) == NULL) { */
/*        if (strlen(monthsep+1) == 2) { */
/*  	strcpy(month, monthsep+1); */
/*  	strcpy(day, "00"); */
/*        } */
/*        else { */
/*  	return 1; */
/*        } */
/*      } */
/*      else { */
/*        if ((daysep - monthsep) != 3) { */
/*  	strcpy(month, "00"); */
/*        } */
/*        else { */
/*  	strncpy(month, monthsep + 1, 2); */
/*  	month[2] = '\0'; */
/*        } */

/*        if ((infosep = index(daysep + 1, (int)'/')) == NULL) { */
/*  	if (strlen(daysep+1) == 2) { */
/*  	  strcpy(day, daysep+1); */
/*  	} */
/*  	else { */
/*  	  return 1; */
/*  	} */
/*        } */
/*        else { */
/*  	if ((infosep - daysep) != 3) { */
/*  	  strcpy(day, "00"); */
/*  	} */
/*  	else { */
/*  	  strncpy(day, daysep + 1, 2); */
/*  	  day[2] = '\0'; */
/*  	} */
/*        } */
/*      } */
/*    } */
/*     assemble datestring */ 
/*    strcpy(date_buffer, year); */
/*    strcat(date_buffer, "/"); */
/*    strcat(date_buffer, month); */
/*    strcat(date_buffer, "/"); */
/*    strcat(date_buffer, day); */

/*     fill in otherinfo string */ 
/*    if (infosep != NULL) { */
/*      strncpy(otherinfo_buffer, infosep + 1, 255); */
/*      otherinfo_buffer[255] = '\0';  if string was truncated */
/*    } */

/*    return 0; */
/*  } */

/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  ris_rpdate(): converts a RIS RP string into a status int and a
             date string

  int ris_rpdate returns -1 if an error occurred, otherwise returns
             the reprint status (0 = IN FILE, 1 = NOT IN FILE,
             2 = ON REQUEST)

  char* date_buffer will receive the date as YYYY-MM-DD. Must hold
              11 chars including the terminal \0. For reprint status
              0 and 1 and if an error occurred, date_buffer will be
              an empty string. For status 2 the date is taken from
              a "(MM/DD/YY" or "(MM/DD/YYYY" part of rp_string;
              two-digit years below 70 are taken as 20YY, all others
              as 19YY. If rp_string has no such part, or if it is
              incomplete or not numeric, the current UTC date is
              used instead

  char* rp_string the string to convert

  ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
static int rp_is_digit(char c) {
  /* plain range check, independent of the locale */
  return c >= '0' && c <= '9';
}

int ris_rpdate(char* date_buffer, char* rp_string) {
  int have_date = 0;
  int have_century = 0;
  size_t datelen;
  char* date;
  struct tm *tm_ptr;
  time_t the_time;

  date_buffer[0] = '\0';

  if (strncmp(rp_string, "IN FILE", 7) == 0) {
    return 0;
  }
  if (strncmp(rp_string, "NOT IN FILE", 11) == 0) {
    return 1;
  }
  if (strncmp(rp_string, "ON REQUEST", 10) != 0) {
    return -1;
  }

  /* on request: look for the date. The length is checked first so
     that a truncated string is never read beyond its end */
  date = strchr(rp_string, (int)'(');
  if (date != NULL) {
    datelen = strlen(date);
    if (datelen >= 9
        && date[3] == '/' && date[6] == '/'
        && rp_is_digit(date[1]) && rp_is_digit(date[2])
        && rp_is_digit(date[4]) && rp_is_digit(date[5])
        && rp_is_digit(date[7]) && rp_is_digit(date[8])) {
      have_date = 1;
      /* two more digits mean that the year came with its century */
      have_century = (datelen >= 11
                      && rp_is_digit(date[9]) && rp_is_digit(date[10]));
    }
  }

  if (!have_date) { /* lacking a better idea, use current UTC */
    time(&the_time);
    tm_ptr = gmtime(&the_time);
    if (tm_ptr == NULL) {
      return -1;
    }
    sprintf(date_buffer, "%04d-%02d-%02d", tm_ptr->tm_year + 1900, tm_ptr->tm_mon + 1, tm_ptr->tm_mday);
    return 2;
  }

  if (have_century) {
    memcpy(date_buffer, date + 7, 4);
  }
  else {
    /* this is the RIS version of the Y2K issue */
    if ((date[7] - '0') * 10 + (date[8] - '0') < 70) {
      memcpy(date_buffer, "20", 2);
    }
    else {
      memcpy(date_buffer, "19", 2);
    }
    memcpy(date_buffer + 2, date + 7, 2);
  }
  date_buffer[4] = '-';
  date_buffer[5] = date[1]; /* month */
  date_buffer[6] = date[2];
  date_buffer[7] = '-';
  date_buffer[8] = date[4]; /* day */
  date_buffer[9] = date[5];
  date_buffer[10] = '\0';

  return 2;
}





