Hi guys, we currently have a pretty rough HTTP codec in MINA 3, based on regular expressions. It's good enough for test purposes, but I wonder whether it would be a good idea to start working on something more robust, and faster too?
I have gathered the whole HTTP 1.1 grammar in one document that could be used as a basis for faster code (see the http.g file). I don't know whether we should use ANTLR or write a parser by hand. I think that using ANTLR would be easier, but it will be at least two times slower than doing it by hand. Another aspect is that we want this parser to be stateless, which means we have to keep the decoding state in the session. Last but not least, we have to think about the thread model: if we consider that a user can only send one request after the other (i.e., the second request can only be processed when the previous one has been completed), then we have no issue. We can go a bit further and consider that once we have decoded a full request, we can accept another one, whatever the server has to do with the previous request: the incoming requests are all serialized until they are decoded, then we spawn a new thread to process each HTTP request. WDYT? -- Regards, Cordialement, Emmanuel Lécharny www.iktek.com
/*
 * HTTP/1.1 message grammar: request and response start lines, general,
 * request, response and entity headers, plus the URI and mailbox syntax
 * those headers depend on. Combined ANTLR grammar: parser rules come first,
 * lexical rules last.
 *
 * NOTE(review): this is still a draft. Many lexer rules accept overlapping
 * character sets (TEXT, ATOM, TOKEN and PCHAR all accept letters, for
 * example), and several of them (TEXT, TEXT_NO_CRLF, ATOM, QTEXT, DTEXT,
 * QDTEXT, CTEXT, BASE64, USERINFO) can match the empty string. The lexer
 * will presumably need context-sensitive tokenisation before a usable
 * parser can be generated -- confirm against the ANTLR version in use.
 */
grammar http;

// ---------------------------------------------------------------------------
// Messages
// ---------------------------------------------------------------------------

httpMessage
    : request
    | response
    ;

request
    : requestLine
      ( ( generalHeader | requestHeader | entityHeader ) CRLF )*
      CRLF
      messageBody?
    ;

response
    : statusLine
      ( ( generalHeader | responseHeader | entityHeader ) CRLF )*
      CRLF
      messageBody?
    ;

// ---------------------------------------------------------------------------
// Response headers
// ---------------------------------------------------------------------------

responseHeader
    : acceptRanges
    | age
    | eTag
    | location
    | proxyAuthenticate
    | retryAfter
    | server
    | vary
    | wwwAuthenticate
    ;

eTag                : 'ETag' ':' entityTag ;

location            : 'Location' ':' absoluteURI ;

// ---------------------------------------------------------------------------
// URIs
// ---------------------------------------------------------------------------

absoluteURI         : SCHEME ':' ( hierPart | opaquePart ) ;

relativeURI         : ( netPath | absPath | relPath ) ( '?' query )? ;

relPath             : REL_SEGMENT ( absPath )? ;

hierPart            : ( netPath | absPath ) ( '?' query )? ;

netPath             : '//' authority ( absPath )? ;

authority           : uriServer | REQNAME ;

uriServer           : ( ( USERINFO '@' )? hostport )? ;

hostport            : host ( ':' port )? ;

absPath             : '/' pathSegments ;

pathSegments        : segment ( '/' segment )* ;

segment             : PCHAR* ( ';' param )* ;

param               : PCHAR* ;

query               : URIC* ;

opaquePart          : URIC_NO_SLASH URIC* ;

// ---------------------------------------------------------------------------
// Authentication challenges and remaining response headers
// ---------------------------------------------------------------------------

proxyAuthenticate   : 'Proxy-Authenticate' ':' challenge ( ',' challenge )* ;

wwwAuthenticate     : 'WWW-Authenticate' ':' challenge ( ',' challenge )* ;

challenge           : authScheme SP+ authParam ( ',' authParam )* ;

authParam           : TOKEN '=' ( TOKEN | quotedString ) ;

authScheme          : TOKEN ;

retryAfter          : 'Retry-After' ':' ( httpDate | deltaSeconds ) ;

server              : 'Server' ':' ( product | comment )+ ;

vary                : 'Vary' ':' ( '*' | fieldName ( ',' fieldName )* ) ;

acceptRanges        : 'Accept-Ranges' ':' acceptableRanges ;

acceptableRanges    : 'none' | rangeUnit ( ',' rangeUnit )* ;

age                 : 'Age' ':' ageValue ;

ageValue            : deltaSeconds ;

deltaSeconds        : DIGIT+ ;

// ---------------------------------------------------------------------------
// Request line
// ---------------------------------------------------------------------------

requestLine         : method SP requestURI SP httpVersion CRLF ;

method
    : 'OPTIONS'
    | 'GET'
    | 'HEAD'
    | 'POST'
    | 'PUT'
    | 'DELETE'
    | 'TRACE'
    | 'CONNECT'
    | extensionMethod
    ;

extensionMethod     : TOKEN ;

requestURI          : '*' | absoluteURI | absPath | authority ;

httpVersion         : 'HTTP' '/' DIGIT+ '.' DIGIT+ ;

// ---------------------------------------------------------------------------
// General headers
// ---------------------------------------------------------------------------

// The 'Trailer' header is ghTrailer. The separate 'trailer' rule is a
// possibly empty run of entity headers, so it must not be listed here:
// it would let a header line match nothing at all.
generalHeader
    : cacheControl
    | connection
    | date
    | pragma
    | ghTrailer
    | transferEncoding
    | upgrade
    | via
    | warning
    ;

cacheControl        : 'Cache-Control' ':' cacheDirective ( ',' cacheDirective )* ;

cacheDirective      : cacheRequestDirective | cacheResponseDirective ;

cacheRequestDirective
    : 'no-cache'
    | 'no-store'
    | 'max-age' '=' deltaSeconds
    | 'max-stale' ( '=' deltaSeconds )?
    | 'min-fresh' '=' deltaSeconds
    | 'no-transform'
    | 'only-if-cached'
    | cacheExtension
    ;

cacheResponseDirective
    : 'public'
    | 'private' ( '=' '"' fieldName ( ',' fieldName )* '"' )?
    | 'no-cache' ( '=' '"' fieldName ( ',' fieldName )* '"' )?
    | 'no-store'
    | 'no-transform'
    | 'must-revalidate'
    | 'proxy-revalidate'
    | 'max-age' '=' deltaSeconds
    | 's-maxage' '=' deltaSeconds
    ;

cacheExtension      : TOKEN ( '=' ( TOKEN | quotedString ) )? ;

connection          : 'Connection' ':' connectionToken ( ',' connectionToken )* ;

connectionToken     : TOKEN ;

date                : 'Date' ':' httpDate ;

// Three accepted date layouts.
httpDate            : rfc1123Date | rfc850Date | asctimeDate ;

rfc1123Date         : wkday ',' SP date1 SP time SP 'GMT' ;

rfc850Date          : weekday ',' SP date2 SP time SP 'GMT' ;

asctimeDate         : wkday SP date3 SP time SP DIGIT DIGIT DIGIT DIGIT ;

wkday               : 'Mon' | 'Tue' | 'Wed' | 'Thu' | 'Fri' | 'Sat' | 'Sun' ;

weekday
    : 'Monday'
    | 'Tuesday'
    | 'Wednesday'
    | 'Thursday'
    | 'Friday'
    | 'Saturday'
    | 'Sunday'
    ;

// day SP month SP 4-digit year
date1               : DIGIT DIGIT SP month SP DIGIT DIGIT DIGIT DIGIT ;

// day '-' month '-' 2-digit year
date2               : DIGIT DIGIT '-' month '-' DIGIT DIGIT ;

// month SP day, where a one-digit day is padded with an extra space
date3               : month SP ( DIGIT DIGIT | ( SP DIGIT ) ) ;

month
    : 'Jan' | 'Feb' | 'Mar' | 'Apr' | 'May' | 'Jun'
    | 'Jul' | 'Aug' | 'Sep' | 'Oct' | 'Nov' | 'Dec'
    ;

time                : DIGIT DIGIT ':' DIGIT DIGIT ':' DIGIT DIGIT ;

pragma              : 'Pragma' ':' pragmaList ;

pragmaList          : pragmaDirective ( ',' pragmaDirective )* ;

pragmaDirective     : 'no-cache' | extensionPragma ;

extensionPragma     : TOKEN ( '=' ( TOKEN | quotedString ) )? ;

// The 'Trailer' general header.
ghTrailer           : 'Trailer' ':' fieldName ( ',' fieldName )* ;

// A run of entity headers. Not referenced by any other rule in this grammar;
// presumably intended for the trailer part of a chunked body -- TODO confirm.
trailer             : ( entityHeader CRLF )* ;

transferEncoding    : 'Transfer-Encoding' ':' transferCodingList ;

transferCodingList  : transferCoding ( ',' transferCoding )* ;

upgrade             : 'Upgrade' ':' productList ;

productList         : product ( ',' product )* ;

via                 : 'Via' ':' viaList ;

viaList
    : receivedProtocol receivedBy ( comment )?
      ( ',' receivedProtocol receivedBy ( comment )? )*
    ;

receivedProtocol    : ( protocolName '/' )? protocolVersion ;

protocolName        : TOKEN ;

protocolVersion     : TOKEN ;

receivedBy          : host ( ':' port )? | pseudonym ;

pseudonym           : TOKEN ;

host                : hostname | ipv4address ;

hostname            : ( domainlabel '.' )* toplabel ( '.' )? ;

domainlabel         : ALPHANUM | ALPHANUM ( ALPHANUM | '-' )* ALPHANUM ;

toplabel            : ALPHA | ALPHA ( ALPHANUM | '-' )* ALPHANUM ;

ipv4address         : DIGIT+ '.' DIGIT+ '.' DIGIT+ '.' DIGIT+ ;

port                : DIGIT* ;

warning             : 'Warning' ':' warningValue ( ',' warningValue )* ;

warningValue        : warnCode SP warnAgent SP warnText ( SP warnDate )? ;

warnCode            : DIGIT DIGIT DIGIT ;

warnAgent           : host ( ':' port )? | pseudonym ;

warnText            : quotedString ;

warnDate            : '"' httpDate '"' ;

// ---------------------------------------------------------------------------
// Request headers
// ---------------------------------------------------------------------------

requestHeader
    : accept
    | acceptCharset
    | acceptEncoding
    | acceptLanguage
    | authorization
    | expect
    | from
    | rhHost
    | ifMatch
    | ifModifiedSince
    | ifNoneMatch
    | ifRange
    | ifUnmodifiedSince
    | maxForwards
    | proxyAuthorization
    | range
    | referer
    | te
    | userAgent
    ;

acceptEncoding      : 'Accept-Encoding' ':' codingsList ;

codingsList
    : codings ( ';' 'q' '=' qvalue )?
      ( ',' codings ( ';' 'q' '=' qvalue )? )*
    ;

codings             : contentCoding | '*' ;

acceptLanguage      : 'Accept-Language' ':' langageList ;

langageList
    : languageRange ( ';' 'q' '=' qvalue )?
      ( ',' languageRange ( ';' 'q' '=' qvalue )? )*
    ;

languageRange       : ( ALPHA18 ( '-' ALPHA18 )* ) | '*' ;

authorization       : 'Authorization' ':' credentials ;

expect              : 'Expect' ':' expectationList ;

expectationList     : expectation ( ',' expectation )* ;

expectation         : '100-continue' | expectationExtension ;

expectationExtension
    : TOKEN ( '=' ( TOKEN | quotedString ) expectParams* )?
    ;

expectParams        : ';' TOKEN ( '=' ( TOKEN | quotedString ) )? ;

from                : 'From' ':' mailbox ;

// Mail address syntax used by the 'From' header.
mailbox             : addrSpec | phrase? routeAddr ;

phrase              : word+ ;

routeAddr           : '<' ( route )? addrSpec '>' ;

route               : '@' domain ( ',' '@' domain )* ':' ;

addrSpec            : localPart '@' domain ;

localPart           : word ( '.' word )* ;

domain              : subDomain ( '.' subDomain )* ;

subDomain           : domainRef | domainLiteral ;

domainLiteral       : '[' ( DTEXT | quotedPair )* ']' ;

domainRef           : ATOM ;

word                : ATOM | mailboxQuotedString ;

mailboxQuotedString : '"' ( QTEXT | quotedPair )* '"' ;

// The 'Host' request header.
rhHost              : 'Host' ':' host ( ':' port )? ;

ifMatch             : 'If-Match' ':' ( '*' | entityTagList ) ;

ifModifiedSince     : 'If-Modified-Since' ':' httpDate ;

ifNoneMatch         : 'If-None-Match' ':' ( '*' | entityTagList ) ;

entityTagList       : entityTag ( ',' entityTag )* ;

ifRange             : 'If-Range' ':' ( entityTag | httpDate ) ;

ifUnmodifiedSince   : 'If-Unmodified-Since' ':' httpDate ;

maxForwards         : 'Max-Forwards' ':' DIGIT+ ;

proxyAuthorization  : 'Proxy-Authorization' ':' credentials ;

credentials         : authScheme ( authParam ( ',' authParam )* )? ;

range               : 'Range' ':' rangesSpecifier ;

rangesSpecifier     : byteRangesSpecifier ;

byteRangesSpecifier : bytesUnit '=' byteRangeSet ;

// First range spec followed by any number of ",spec" continuations.
byteRangeSet
    : ( byteRangeSpec | suffixByteRangeSpec ) byteRangeSetList*
    ;

// One ",spec" continuation. The repetition lives in byteRangeSet, so this
// rule must not call itself: an unconditional self-reference never ends.
byteRangeSetList
    : ',' ( byteRangeSpec | suffixByteRangeSpec )
    ;

byteRangeSpec       : firstBytePos '-' lastBytePos? ;

firstBytePos        : DIGIT+ ;

lastBytePos         : DIGIT+ ;

suffixByteRangeSpec : '-' suffixLength ;

suffixLength        : DIGIT+ ;

referer             : 'Referer' ':' ( absoluteURI | relativeURI ) ;

te                  : 'TE' ':' tCodingsList ;

tCodingsList        : tCodings ( ',' tCodings )* ;

tCodings            : 'trailers' | transferExtension acceptParams? ;

accept              : 'Accept' ':' mediaRangeList ;

// Flat comma-separated list; written iteratively rather than recursing into
// mediaRangeList inside the closure, which would make the rule ambiguous.
mediaRangeList
    : mediaRange acceptParams? ( ',' mediaRange acceptParams? )*
    ;

mediaRange
    : ( '*/*' | ( type '/' '*' ) | ( type '/' subtype ) )
      ( ';' parameter )*
    ;

acceptParams        : ';' 'q' '=' qvalue ( acceptExtension )* ;

type                : TOKEN ;

subtype             : TOKEN ;

acceptExtension     : ';' TOKEN ( '=' ( TOKEN | quotedString ) )? ;

parameter           : attribute '=' value ;

attribute           : TOKEN ;

value               : TOKEN | quotedString ;

acceptCharset
    : 'Accept-Charset' ':'
      ( charset | '*' ) ( ';' 'q' '=' qvalue )?
      acceptCharsetList
    ;

// Uses a bare ',' separator, like every other list rule in this grammar.
acceptCharsetList
    : ( ',' ( charset | '*' ) ( ';' 'q' '=' qvalue )? )*
    ;

charset             : TOKEN ;

contentCoding       : TOKEN ;

transferCoding      : 'chunked' | transferExtension ;

transferExtension   : TOKEN ( ';' parameter )* ;

expires             : 'Expires' ':' httpDate ;

userAgent           : 'User-Agent' ':' ( product | comment )+ ;

// Parenthesised comment; comments may nest.
comment             : '(' ( CTEXT | quotedPair | comment )* ')' ;

quotedPair          : '\\' CHAR ;

// Quality value: 0 or 1 with at most three decimals (only zeros after 1).
qvalue
    : '0' ( '.' digit03? )?
    | '1' ( '.' zero03? )?
    ;

digit03
    : DIGIT
    | DIGIT DIGIT
    | DIGIT DIGIT DIGIT
    ;

zero03              : '0' | '00' | '000' ;

product             : TOKEN ( '/' productVersion )? ;

productVersion      : TOKEN ;

// ---------------------------------------------------------------------------
// Entity headers and body
// ---------------------------------------------------------------------------

entityHeader
    : allow
    | contentEncoding
    | contentLanguage
    | contentLength
    | contentLocation
    | contentMD5
    | contentRange
    | contentType
    | expires
    | lastModified
    | extensionHeader
    ;

contentEncoding     : 'Content-Encoding' ':' contentCoding ( ',' contentCoding )* ;

contentLanguage     : 'Content-Language' ':' languageTag ( ',' languageTag )* ;

contentLength       : 'Content-Length' ':' DIGIT+ ;

contentLocation     : 'Content-Location' ':' ( absoluteURI | relativeURI ) ;

contentMD5          : 'Content-MD5' ':' md5Digest ;

md5Digest           : BASE64 ;

contentRange        : 'Content-Range' ':' contentRangeSpec ;

contentRangeSpec    : byteContentRangeSpec ;

byteContentRangeSpec
    : bytesUnit SP byteRangeRespSpec '/' ( instanceLength | '*' )
    ;

byteRangeRespSpec   : ( firstBytePos '-' lastBytePos ) | '*' ;

instanceLength      : DIGIT+ ;

contentType         : 'Content-Type' ':' mediaType ;

mediaType           : type '/' subtype ( ';' parameter )* ;

lastModified        : 'Last-Modified' ':' httpDate ;

// Catch-all for any header not described by a dedicated rule.
extensionHeader     : messageHeader ;

messageHeader       : fieldName ':' ( fieldValue )? ;

fieldName           : TOKEN ;

fieldValue          : ( fieldContent | LWS )* ;

// Each alternative consumes at least one token: fieldValue already supplies
// the "zero or more", and a nullable alternative inside that closure is
// rejected (or loops) in a generated parser.
fieldContent
    : TEXT+
    | ( TOKEN | SEPARATORS | quotedString )+
    ;

messageBody         : entityBody ;

entityBody          : OCTET* ;

// ---------------------------------------------------------------------------
// Status line
// ---------------------------------------------------------------------------

statusLine          : httpVersion SP statusCode SP reasonPhrase CRLF ;

statusCode
    : '100' | '101'
    | '200' | '201' | '202' | '203' | '204' | '205' | '206'
    | '300' | '301' | '302' | '303' | '304' | '305' | '307'
    | '400' | '401' | '402' | '403' | '404' | '405' | '406' | '407' | '408'
    | '409' | '410' | '411' | '412' | '413' | '414' | '415' | '416' | '417'
    | '500' | '501' | '502' | '503' | '504' | '505'
    | extensionCode
    ;

extensionCode       : DIGIT DIGIT DIGIT ;

reasonPhrase        : ( TEXT_NO_CRLF )* ;

// ---------------------------------------------------------------------------
// Shared building blocks
// ---------------------------------------------------------------------------

languageTag         : primaryTag ( '-' subtag )* ;

primaryTag          : ALPHA18 ;

subtag              : ALPHA18 ;

entityTag           : weak? opaqueTag ;

weak                : 'W/' ;

opaqueTag           : quotedString ;

quotedString        : '"' ( QDTEXT | quotedPair )* '"' ;

rangeUnit           : bytesUnit | otherRangeUnit ;

bytesUnit           : 'bytes' ;

otherRangeUnit      : TOKEN ;

allow               : 'Allow' ':' methodList? ;

methodList          : method ( ',' method )* ;

// ---------------------------------------------------------------------------
// L E X I C A L   R U L E S
//
// The declaration order below is kept exactly as in the original grammar,
// because ANTLR uses it to break ties between lexer rules that match the
// same text.
// ---------------------------------------------------------------------------

CRLF : '\r' '\n' ;

SP   : ' ' ;

HT   : '\t' ;

CR   : '\r' ;

LF   : '\n' ;

// Linear white space: optional line fold followed by spaces/tabs.
LWS  : CRLF? ( SP | HT )+ ;

TEXT
    : ( HT | CR | LF | ('\u0020'..'\u007e') | ('\u0080'..'\u00ff') )*
    ;

TEXT_NO_CRLF
    : ( HT | ('\u0020'..'\u007e') | ('\u0080'..'\u00ff') )*
    ;

ATOM
    : ( '\u0021'
      | ( '\u0023'..'\u0027' )
      | '\u002a'
      | '\u002b'
      | '\u002d'
      | ( '\u002f'..'\u0039' )
      | '\u003d'
      | ( '\u0041'..'\u005a' )
      | ( '\u005e'..'\u007e' )
      )*
    ;

QTEXT
    : ( ('\u0000'..'\u0008')
      | '\u000b'
      | '\u000c'
      | ('\u000e'..'\u001f')
      | '\u0021'
      | ('\u0023'..'\u005b')
      | ('\u005d'..'\u007f')
      )*
    ;

DTEXT
    : ( ( '\u0000'..'\u0008' )
      | '\u000b'
      | '\u000c'
      | ( '\u000e'..'\u001f' )
      | ( '\u0021'..'\u005a' )
      | ( '\u005e'..'\u007f' )
      )*
    ;

QDTEXT
    : ( HT | CR | LF
      | ('\u0020'..'\u0021')
      | ('\u0023'..'\u007e')
      | ('\u0080'..'\u00ff')
      )*
    ;

CTEXT
    : ( HT | CR | LF
      | ('\u0020'..'\u0027')
      | ('\u002a'..'\u007e')
      | ('\u0080'..'\u00ff')
      )*
    ;

DIGIT : '0' .. '9' ;

SEPARATORS
    : HT | SP
    | '"' | '(' | ')' | ',' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@'
    | '[' | '\\' | ']' | '{' | '}'
    ;

CHAR : '\u0000'..'\u007f' ;

BASE64
    : ( '\u002b'
      | '\u002f'
      | ( '\u0030'..'\u0039' )
      | '\u003d'
      | ( '\u0041'..'\u005A' )
      | ( '\u0061'..'\u007A' )
      )*
    ;

// One or more characters that are neither control characters nor
// separators. Control characters are deliberately excluded: accepting them
// would let a token swallow CR and LF, which delimit header lines.
TOKEN
    : ( '!'
      | '\u0023'..'\u0027'
      | '*' | '+' | '-' | '.'
      | '0'..'9'
      | 'A'..'Z'
      | '^' | '_' | '`'
      | 'a'..'z'
      | '|' | '~'
      )+
    ;

fragment UPALPHA : 'A'..'Z' ;

fragment LOALPHA : 'a'..'z' ;

ALPHA    : UPALPHA | LOALPHA ;

ALPHANUM : ALPHA | DIGIT ;

// One to eight letters.
ALPHA18
    : ALPHA
    | ALPHA ALPHA
    | ALPHA ALPHA ALPHA
    | ALPHA ALPHA ALPHA ALPHA
    | ALPHA ALPHA ALPHA ALPHA ALPHA
    | ALPHA ALPHA ALPHA ALPHA ALPHA ALPHA
    | ALPHA ALPHA ALPHA ALPHA ALPHA ALPHA ALPHA
    | ALPHA ALPHA ALPHA ALPHA ALPHA ALPHA ALPHA ALPHA
    ;

OCTET : '\u0000'..'\u00ff' ;

// URI character classes.
fragment MARK
    : '-' | '_' | '.' | '!' | '~' | '*' | '\'' | '(' | ')'
    ;

fragment RESERVED
    : ';' | '/' | '?' | ':' | '@' | '&' | '=' | '+' | '$' | ','
    ;

fragment UNRESERVED : ALPHANUM | MARK ;

fragment HEX        : DIGIT | ( 'a'..'f' ) | ( 'A'..'F' ) ;

fragment ESCAPED    : '%' HEX HEX ;

URIC : RESERVED | UNRESERVED | ESCAPED ;

URIC_NO_SLASH
    : UNRESERVED
    | ESCAPED
    | ';' | '?' | ':' | '@' | '&' | '=' | '+' | '$' | ','
    ;

SCHEME : ALPHA ( ALPHA | DIGIT | '+' | '-' | '.' )* ;

REQNAME
    : ( UNRESERVED | ESCAPED
      | '$' | ',' | ';' | ':' | '@' | '&' | '=' | '+'
      )+
    ;

USERINFO
    : ( UNRESERVED | ESCAPED
      | ';' | ':' | '&' | '=' | '+' | '$' | ','
      )*
    ;

PCHAR
    : UNRESERVED | ESCAPED
    | ':' | '@' | '&' | '=' | '+' | '$' | ','
    ;

REL_SEGMENT
    : ( UNRESERVED | ESCAPED
      | ';' | '@' | '&' | '=' | '+' | '$' | ','
      )+
    ;