1 /**
2 * BSD-style license; for more info see http://pmd.sourceforge.net/license.html
3 */
4 package net.sourceforge.pmd.cpd;
5
import java.util.List;
import java.util.Locale;
7
8 /**
9 *
10 * @author Zev Blut zb@ubit.com
11 * @author Romain PELISSE belaran@gmail.com
12 */
13 public abstract class AbstractTokenizer implements Tokenizer {
14
15 // FIXME depending on subclasses to assign local vars is rather fragile -
16 // better to make private and setup via explicit hook methods
17
18 protected List<String> stringToken; // List<String>, should be set by sub
19 // classes
20 protected List<String> ignorableCharacter; // List<String>, should be set by
21 // sub classes
22 // FIXME:Maybe an array of 'char'
23 // would be better for
24 // performance ?
25 protected List<String> ignorableStmt; // List<String>, should be set by sub
26 // classes
27 protected char oneLineCommentChar = '#'; // Most script languages ( shell,
28 // ruby, python,...) use this
29 // symbol for comment line
30
31 private List<String> code;
32 private int lineNumber = 0;
33 private String currentLine;
34
35 protected boolean spanMultipleLinesString = true; // Most languages do, so
36 // default is true
37 protected Character spanMultipleLinesLineContinuationCharacter = null;
38
39 private boolean downcaseString = true;
40
41 public void tokenize(SourceCode tokens, Tokens tokenEntries) {
42 code = tokens.getCode();
43
44 for (lineNumber = 0; lineNumber < code.size(); lineNumber++) {
45 currentLine = code.get(lineNumber);
46 int loc = 0;
47 while (loc < currentLine.length()) {
48 StringBuilder token = new StringBuilder();
49 loc = getTokenFromLine(token, loc);
50 if (token.length() > 0 && !isIgnorableString(token.toString())) {
51 if (downcaseString) {
52 token = new StringBuilder(token.toString().toLowerCase());
53 }
54 // need to re-think how to link this
55 // if ( CPD.debugEnable ) {
56 // System.out.println("Token added:" + token.toString());
57 // }
58 tokenEntries.add(new TokenEntry(token.toString(), tokens.getFileName(), lineNumber));
59
60 }
61 }
62 }
63 tokenEntries.add(TokenEntry.getEOF());
64 }
65
66 private int getTokenFromLine(StringBuilder token, int loc) {
67 for (int j = loc; j < currentLine.length(); j++) {
68 char tok = currentLine.charAt(j);
69 if (!Character.isWhitespace(tok) && !ignoreCharacter(tok)) {
70 if (isComment(tok)) {
71 if (token.length() > 0) {
72 return j;
73 } else {
74 return getCommentToken(token, loc);
75 }
76 } else if (isString(tok)) {
77 if (token.length() > 0) {
78 return j; // we need to now parse the string as a
79 // separate token.
80 } else {
81 // we are at the start of a string
82 return parseString(token, j, tok);
83 }
84 } else {
85 token.append(tok);
86 }
87 } else {
88 if (token.length() > 0) {
89 return j;
90 }
91 }
92 loc = j;
93 }
94 return loc + 1;
95 }
96
97 private int parseString(StringBuilder token, int loc, char stringDelimiter) {
98 boolean escaped = false;
99 boolean done = false;
100 char tok = ' '; // this will be replaced.
101 while (loc < currentLine.length() && !done) {
102 tok = currentLine.charAt(loc);
103 if (escaped && tok == stringDelimiter) { // Found an escaped string
104 escaped = false;
105 } else if (tok == stringDelimiter && token.length() > 0) {
106 // We are done, we found the end of the string...
107 done = true;
108 } else if (tok == '\\') { // Found an escaped char
109 escaped = true;
110 } else { // Adding char...
111 escaped = false;
112 }
113 // Adding char to String:" + token.toString());
114 token.append(tok);
115 loc++;
116 }
117 // Handling multiple lines string
118 if (!done && // ... we didn't find the end of the string
119 loc >= currentLine.length() && // ... we have reach the end of
120 // the line ( the String is
121 // incomplete, for the moment at
122 // least)
123 spanMultipleLinesString && // ... the language allow multiple
124 // line span Strings
125 lineNumber < code.size() - 1 // ... there is still more lines to
126 // parse
127 ) {
128 // removes last character, if it is the line continuation (e.g.
129 // backslash) character
130 if (spanMultipleLinesLineContinuationCharacter != null && token.length() > 0
131 && token.charAt(token.length() - 1) == spanMultipleLinesLineContinuationCharacter.charValue()) {
132 token.deleteCharAt(token.length() - 1);
133 }
134 // parsing new line
135 currentLine = code.get(++lineNumber);
136 // Warning : recursive call !
137 loc = parseString(token, 0, stringDelimiter);
138 }
139 return loc + 1;
140 }
141
142 private boolean ignoreCharacter(char tok) {
143 return ignorableCharacter.contains(String.valueOf(tok));
144 }
145
146 private boolean isString(char tok) {
147 return stringToken.contains(String.valueOf(tok));
148 }
149
150 private boolean isComment(char tok) {
151 return tok == oneLineCommentChar;
152 }
153
154 private int getCommentToken(StringBuilder token, int loc) {
155 while (loc < currentLine.length()) {
156 token.append(currentLine.charAt(loc++));
157 }
158 return loc;
159 }
160
161 private boolean isIgnorableString(String token) {
162 return ignorableStmt.contains(token);
163 }
164 }