blob: 858dd2900990cd86af4e4a4ba59d2c241d3110c7 [file] [log] [blame]
david_williams126339f2005-07-05 05:54:08 +00001/*******************************************************************************
nitinda5549702007-04-17 04:44:07 +00002 * Copyright (c) 2005, 2007 IBM Corporation and others.
david_williams126339f2005-07-05 05:54:08 +00003 * All rights reserved. This program and the accompanying materials
4 * are made available under the terms of the Eclipse Public License v1.0
5 * which accompanies this distribution, and is available at
6 * http://www.eclipse.org/legal/epl-v10.html
7 *
8 * Contributors:
9 * IBM Corporation - initial API and implementation
10 *******************************************************************************/
11/*nlsXXX*/
12package org.eclipse.jst.jsp.core.internal.contenttype;
13import java.io.IOException;
14import java.io.Reader;
15
16import org.eclipse.wst.xml.core.internal.contenttype.EncodingParserConstants;
17import org.eclipse.wst.xml.core.internal.contenttype.XMLHeadTokenizerConstants;
18
19
20
21
22
23%%
24
25%{
26
27
28
29
30 private boolean hasMore = true;
31 private final static int MAX_TO_SCAN = 8000;
32 StringBuffer string = new StringBuffer();
33 // state stack for easier state handling
34 private IntStack fStateStack = new IntStack();
35 private String valueText = null;
david_williamsbd89a752006-09-19 15:47:05 +000036 private boolean isXHTML;
37 private boolean isWML;
david_williams126339f2005-07-05 05:54:08 +000038
39
40 public JSPHeadTokenizer() {
41 super();
42 }
43
44 public void reset (Reader in) {
45 /* the input device */
46 zzReader = in;
47
48 /* the current state of the DFA */
49 zzState = 0;
50
51 /* the current lexical state */
52 zzLexicalState = YYINITIAL;
53
54 /* this buffer contains the current text to be matched and is
55 the source of the yytext() string */
56 java.util.Arrays.fill(zzBuffer, (char)0);
57
58 /* the textposition at the last accepting state */
59 zzMarkedPos = 0;
60
61 /* the textposition at the last state to be included in yytext */
62 zzPushbackPos = 0;
63
64 /* the current text position in the buffer */
65 zzCurrentPos = 0;
66
67 /* startRead marks the beginning of the yytext() string in the buffer */
68 zzStartRead = 0;
69
70 /**
71 * endRead marks the last character in the buffer, that has been read
72 * from input
73 */
74 zzEndRead = 0;
75
76 /* number of newlines encountered up to the start of the matched text */
david_williamsbd89a752006-09-19 15:47:05 +000077 //yyline = 0;
david_williams126339f2005-07-05 05:54:08 +000078
79 /* the number of characters up to the start of the matched text */
80 yychar = 0;
81
82 /**
83 * the number of characters from the last newline up to the start
84 * of the matched text
85 */
david_williamsbd89a752006-09-19 15:47:05 +000086 //yycolumn = 0;
david_williams126339f2005-07-05 05:54:08 +000087
88 /**
89 * yy_atBOL == true <=> the scanner is currently at the beginning
90 * of a line
91 */
92 zzAtBOL = true;
93
94 /* yy_atEOF == true <=> the scanner has returned a value for EOF */
95 zzAtEOF = false;
96
97 /* denotes if the user-EOF-code has already been executed */
98 zzEOFDone = false;
99
100
101 fStateStack.clear();
102
103 hasMore = true;
david_williamsbd89a752006-09-19 15:47:05 +0000104 isXHTML=false;
105 isWML=false;
david_williams126339f2005-07-05 05:54:08 +0000106
107
108 }
109
110
111 public final HeadParserToken getNextToken() throws IOException {
112 String context = null;
113 context = primGetNextToken();
114 HeadParserToken result = null;
115 if (valueText != null) {
116 result = createToken(context, yychar, valueText);
117 valueText = null;
118 } else {
119 result = createToken(context, yychar, yytext());
120 }
121 return result;
122 }
123
124 public final boolean hasMoreTokens() {
125 return hasMore && yychar < MAX_TO_SCAN;
126 }
127 private void pushCurrentState() {
128 fStateStack.push(yystate());
129
130 }
131
132 private void popState() {
133 yybegin(fStateStack.pop());
134 }
135 private HeadParserToken createToken(String context, int start, String text) {
136 return new HeadParserToken(context, start, text);
137 }
david_williamsbd89a752006-09-19 15:47:05 +0000138
139 public boolean isXHTML() {
140 return isXHTML;
141 }
142 public boolean isWML() {
143 return isWML;
144 }
david_williams126339f2005-07-05 05:54:08 +0000145
146%}
147
// user code executed when the scanner reaches the end of the input
%eof{
    hasMore = false;
%eof}

// the generated class is "public class JSPHeadTokenizer"
%public
%class JSPHeadTokenizer
// the scanning method is "String primGetNextToken()"
%function primGetNextToken
%type String
// maintain the character count yychar
%char
%unicode
%ignorecase
//%debug
%switch
%buffer 8192
162
163
// byte-order mark signatures, recognized only at the very start of the input
UTF16BE = \xFE\xFF
UTF16LE = \xFF\xFE
UTF83ByteBOM = \xEF\xBB\xBF

// SpaceChar = [\x20\x09]

// one white space character, taken from the XML production
// [3] S ::= (0x20 | 0x9 | 0xD | 0xA)+
S = [\x20\x09\x0D\x0A]

// the '=' between an attribute name and its value, with optional white
// space on either side
BeginAttributeeValue = {S}* \= {S}*

LineTerminator = \r|\n
177
178
// entered after "<?xml", left at "?>"
%state ST_XMLDecl
// entered after a page/tag directive start, left at "%>" or "/>"
%state ST_PAGE_DIRECTIVE
// entered right after "name =", decides which kind of value follows
%state QuotedAttributeValue
// inside a double-quoted, single-quoted, or unquoted attribute value
%state DQ_STRING
%state SQ_STRING
%state UnDelimitedString
185
186%%
187
188
<YYINITIAL>
{
    // A byte-order mark must start at the beginning of a line (^) and at the
    // beginning of the file (yychar == 0). Once one is found, scanning stops.
    ^ {UTF16BE} {
        if (yychar == 0) {
            hasMore = false;
            return EncodingParserConstants.UTF16BE;
        }
    }
    ^ {UTF16LE} {
        if (yychar == 0) {
            hasMore = false;
            return EncodingParserConstants.UTF16LE;
        }
    }
    ^ {UTF83ByteBOM} {
        if (yychar == 0) {
            hasMore = false;
            return EncodingParserConstants.UTF83ByteBOM;
        }
    }

    // The XML declaration must be the first thing in the file, but
    // preceding white space is allowed (it is part of the match, so the
    // match still starts at yychar == 0).
    ^ {S}* "<\?xml" {S}+ {
        if (yychar == 0) {
            yybegin(ST_XMLDecl);
            return XMLHeadTokenizerConstants.XMLDeclStart;
        }
    }

    // The following are some simple rules to identify JSP content as "XHTML",
    // see http://www.rfc-editor.org/rfc/rfc3236.txt
    // They only set a flag; no token is returned.
    "<!DOCTYPE" {S}* "html" {S}* "PUBLIC" .* "//DTD XHTML" {
        isXHTML = true;
    }
    "<html" {S}* "xmlns" {S}* "=" {S}* (\" | \') "http://www.w3.org/1999/xhtml" {
        isXHTML = true;
    }
    // another case that's part of the "HTML family" is WML 1.0 (WML 2.0 is part of XHTML)
    "<!DOCTYPE" {S}* "wml" {S}* "PUBLIC" .* "//DTD WML" {
        isWML = true;
    }

    // start of a page or tag directive, in JSP syntax ...
    "<%" {S}* "@" {S}* ("page"|"tag") {S}+ {
        yybegin(ST_PAGE_DIRECTIVE);
        return JSPHeadTokenizerConstants.PageDirectiveStart;
    }
    // ... and in XML syntax
    ("<jsp:directive.page"|"<jsp:directive.tag") {S}+ {
        yybegin(ST_PAGE_DIRECTIVE);
        return JSPHeadTokenizerConstants.PageDirectiveStart;
    }
}
212
<ST_XMLDecl>
{
    // An attribute name followed by '=': remember this state, then scan
    // the value in QuotedAttributeValue.
    "version" {BeginAttributeeValue} {
        pushCurrentState();
        yybegin(QuotedAttributeValue);
        return XMLHeadTokenizerConstants.XMLDeclVersion;
    }
    "encoding" {BeginAttributeeValue} {
        pushCurrentState();
        yybegin(QuotedAttributeValue);
        return XMLHeadTokenizerConstants.XMLDelEncoding;
    }

    // A pure XML tokenizer can force an end (via 'hasMore=false') once the
    // end of the XML declaration is found, since non-ascii chars may follow
    // and may cause IOExceptions while the stream is still being read with
    // an incorrect encoding (such as the platform encoding, until the true
    // encoding is detected).
    // That forced end was removed for this JSP case, because in a JSP we
    // must parse past the xmlDecl to get at the JSP page directive.
    // We'll assume all chars in this area are "readable" as is.
    {S}* "\?>" {
        yybegin(YYINITIAL);
        return XMLHeadTokenizerConstants.XMLDeclEnd;
    }
}
225
<ST_PAGE_DIRECTIVE>
{
    // 'language' can really be handled separately from encoding, but it is
    // tokenized here together with the encoding-related attributes.
    "language" {BeginAttributeeValue} {
        pushCurrentState();
        yybegin(QuotedAttributeValue);
        return JSPHeadTokenizerConstants.PageLanguage;
    }
    "contentType" {BeginAttributeeValue} {
        pushCurrentState();
        yybegin(QuotedAttributeValue);
        return JSPHeadTokenizerConstants.PageContentType;
    }
    "pageEncoding" {BeginAttributeeValue} {
        pushCurrentState();
        yybegin(QuotedAttributeValue);
        return JSPHeadTokenizerConstants.PageEncoding;
    }

    // There is no "forced end" (via 'hasMore=false') at the end of a page
    // directive, even though non-ascii chars may follow and may cause
    // IOExceptions until the stream is read in the correct encoding:
    // https://w3.opensource.ibm.com/bugzilla/show_bug.cgi?id=4205 demonstrates how we need to keep parsing,
    // even if we come to the end of one page directive, so hasMore=false
    // was removed from these rules.
    "%>" {
        yybegin(YYINITIAL);
        return JSPHeadTokenizerConstants.PageDirectiveEnd;
    }
    "\/>" {
        yybegin(YYINITIAL);
        return JSPHeadTokenizerConstants.PageDirectiveEnd;
    }
}
241
242
<QuotedAttributeValue>
{
    // an opening quote selects the matching string state; the value
    // buffer is emptied before any characters are collected
    \" {
        string.setLength(0);
        yybegin(DQ_STRING);
    }
    \' {
        string.setLength(0);
        yybegin(SQ_STRING);
    }
    // in this state, anything other than a space character can start an
    // undelimited string; that first character is pushed back so that it
    // becomes part of the value
    {S}*. {
        string.setLength(0);
        yypushback(1);
        yybegin(UnDelimitedString);
    }
}
251
252
<DQ_STRING>
{
    // closing quote: the value is complete
    \" {
        valueText = string.toString();
        popState();
        return EncodingParserConstants.StringValue;
    }

    // Each of the following ends the value early. The terminating text is
    // pushed back so the restored state sees it again, and the value
    // collected so far is reported as invalidly terminated.
    {LineTerminator} {
        valueText = string.toString();
        yypushback(1);
        popState();
        return EncodingParserConstants.InvalidTerminatedStringValue;
    }
    "\?>" {
        valueText = string.toString();
        yypushback(2);
        popState();
        return EncodingParserConstants.InvalidTerminatedStringValue;
    }
    "%>" {
        valueText = string.toString();
        yypushback(2);
        popState();
        return EncodingParserConstants.InvalidTerminatedStringValue;
    }
    // NOTE(review): this pattern is written with single quotes, unlike the
    // double-quoted literals used elsewhere in this file -- confirm that it
    // matches what is intended (presumably a lone '<').
    '<' {
        valueText = string.toString();
        yypushback(1);
        popState();
        return EncodingParserConstants.InvalidTerminatedStringValue;
    }

    // any other character belongs to the value
    . {
        string.append( yytext() );
    }
}
266
<SQ_STRING>
{
    // closing quote: the value is complete
    \' { popState(); valueText = string.toString(); return EncodingParserConstants.StringValue; }

    // Each of the following ends the value early. The terminating text is
    // pushed back so the restored state sees it again, and the value
    // collected so far is reported as invalidly terminated.
    {LineTerminator} { yypushback(1); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue; }
    // "?>" ends a single-quoted value the same way it ends a double-quoted
    // or undelimited one; this state can be entered from the XML declaration
    "\?>" { yypushback(2); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue; }
    // listed once only: a second identical "%>" rule could never be matched
    "%>" { yypushback(2); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue; }
    // NOTE(review): this pattern is written with single quotes, unlike the
    // double-quoted literals used elsewhere in this file -- confirm that it
    // matches what is intended (presumably a lone '<').
    '<' { yypushback(1); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue; }

    // any other character belongs to the value
    . { string.append( yytext() ); }
}
279
<UnDelimitedString>
{
    // White space ends an undelimited value normally. {S} includes \r and
    // \n, so a line terminator also ends the value here; a separate
    // {LineTerminator} rule placed after this one could never be matched
    // and is therefore not listed.
    {S} { yypushback(1); popState(); valueText = string.toString(); return EncodingParserConstants.UnDelimitedStringValue; }

    // The following end the value early; the terminating text is pushed
    // back so the restored state sees it again.
    "\?>" { yypushback(2); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue; }
    "%>" { yypushback(2); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue; }
    // NOTE(review): this pattern is written with single quotes, unlike the
    // double-quoted literals used elsewhere in this file -- confirm that it
    // matches what is intended (presumably a lone '<').
    '<' { yypushback(1); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTerminatedStringValue; }

    // these are a bit special, since we started an undelimited string, but
    // found a quote ... probably indicates a missing beginning quote
    \' { yypushback(1); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTermintatedUnDelimitedStringValue; }
    \" { yypushback(1); popState(); valueText = string.toString(); return EncodingParserConstants.InvalidTermintatedUnDelimitedStringValue; }

    // any other character belongs to the value
    . { string.append( yytext() ); }
}
296
// The "match anything" rule should always be in effect, except while looking for the end of a string.
// (That is, remember to update this state list if/when new states are added.)
<YYINITIAL, ST_XMLDecl, QuotedAttributeValue, ST_PAGE_DIRECTIVE>
{
    // Fallback: unrecognized input is ignored and the position simply
    // advances, until the match starts beyond MAX_TO_SCAN characters.
    .|\n {
        if (yychar > MAX_TO_SCAN) {
            hasMore = false;
            return EncodingParserConstants.MAX_CHARS_REACHED;
        }
    }
}
304
// this rule is always in effect, whatever the lexical state
<<EOF>> {
    hasMore = false;
    return EncodingParserConstants.EOF;
}
307