/*
 * MainLayer.cxx -- main generic layer between
 * the calling layer and the common processing.
 *
 * Done to make it easier to build scineplex DLLs, XS modules, etc.
 *
 * Copyright (C) 2005 ActiveState Software Inc.  All rights reserved.
 */

#include "BufferAccessor.h"
#include "PropSet.h"
#include "KeyWords.h"
#include "Scintilla.h"
#include "SciLexer.h"
#include "Common.h"
#include "MainLayer.h"
#include "MainLayer_gen.h"

// Standard headers required by this file:
// isspace/ispunct, sprintf/fprintf, strlen/strcpy.
#include <ctype.h>
#include <stdio.h>
#include <string.h>

/**************** Internal Declarations ****************/

static void add_char(char *buf,
                     int &num_written,
                     int max_index,
                     char new_char,
                     print_callback_fn_t cb_linePrint,
                     void *cb_linePrint_data);

static void add_string(char *buf,
                       int &num_written,
                       int max_index,
                       const char *new_string,
                       print_callback_fn_t cb_linePrint,
                       void *cb_linePrint_data);

static void end_style(int curr_style,
                      char *buf,
                      int &num_written,
                      int max_index,
                      print_callback_fn_t cb_linePrint,
                      void *cb_linePrint_data);

static void flush_buf(char *buf,
                      int &num_written,
                      print_callback_fn_t cb_linePrint,
                      void *cb_linePrint_data);

static void html_out(char c,
                     char *buf,
                     int &num_written,
                     int max_index,
                     print_callback_fn_t cb_linePrint,
                     void *cb_linePrint_data);

static void json_out(const char *style,
                     int line,
                     int col,
                     int len,
                     char *buf,
                     int &num_written,
                     int max_index,
                     print_callback_fn_t cb_linePrint,
                     void *cb_linePrint_data);

static BufferAccessor *lex_buffer(const char *buffer,
                                  int lang,
                                  bool verbose,
                                  bool MaintainLineStructure,
                                  int parsingStartState,
                                  const char *keywords);

static bool not_actually_code(const char *bufstr,
                              int lang,
                              BufferAccessor *buffer_obj);

static void print_style(const char *bufstr,
                        int lang,
                        bool utf8_source,
                        BufferAccessor *buffer_obj,
                        print_callback_fn_t cb_linePrint,
                        void *cb_linePrint_data,
                        int outputFormat,
                        int DumpSource,
                        int DumpEndState,
                        bool DumpFoldLevels,
                        int parsingStartState,
                        int StopAfterDataSectionLine1);

static bool set_keywords(int lang,
                         WCONST char *& keywords
                         );

static void start_style(int style_num,
                        char *buf,
                        int &num_written,
                        int max_index,
                        print_callback_fn_t cb_linePrint,
                        void *cb_linePrint_data,
                        int *lang_style_name_parts);

/**************** Local Variables ****************/

// Keyword tables are whitespace-separated word lists.  Each is written as
// adjacent string literals, which the compiler joins into a single string.

// HTML element names, attribute names and input types.
WCONST char* html_keywords =
    "a abbr acronym address applet area b base basefont "
    "bdo big blockquote body br button caption center "
    "cite code col colgroup dd del dfn dir div dl dt em "
    "fieldset font form frame frameset h1 h2 h3 h4 h5 h6 "
    "head hr html i iframe img input ins isindex kbd label "
    "legend li link map menu meta noframes noscript "
    "object ol optgroup option p param pre q s samp "
    "script select small span strike strong style sub sup "
    "table tbody td textarea tfoot th thead title tr tt u ul "
    "var xml xmlns abbr accept-charset accept accesskey action align alink "
    "alt archive axis background bgcolor border "
    "cellpadding cellspacing char charoff charset checked cite "
    "class classid clear codebase codetype color cols colspan "
    "compact content coords "
    "data datafld dataformatas datapagesize datasrc datetime "
    "declare defer dir disabled enctype event "
    "face for frame frameborder "
    "headers height href hreflang hspace http-equiv "
    "id ismap label lang language leftmargin link longdesc "
    "marginwidth marginheight maxlength media method multiple "
    "name nohref noresize noshade nowrap "
    "object onblur onchange onclick ondblclick onfocus "
    "onkeydown onkeypress onkeyup onload onmousedown "
    "onmousemove onmouseover onmouseout onmouseup "
    "onreset onselect onsubmit onunload "
    "profile prompt readonly rel rev rows rowspan rules "
    "scheme scope selected shape size span src standby start style "
    "summary tabindex target text title topmargin type usemap "
    "valign value valuetype version vlink vspace width "
    "text password checkbox radio submit reset "
    "file hidden image";

WCONST char * k_perlKeywords =
    "__FILE__ __LINE__ __PACKAGE__ __DATA__ __END__ AUTOLOAD "
    "BEGIN CHECK CORE DESTROY END INIT UNITCHECK abs accept alarm "
    "and atan2 bind binmode bless break caller chdir chmod chomp chop chown "
    "chr chroot close closedir cmp connect continue cos crypt dbmclose "
    "dbmopen default defined delete die do dump each else elsif endgrent endhostent "
    "endnetent endprotoent endpwent endservent eof eq eval exec exists "
    "exit exp fcntl fileno flock for foreach fork format formline "
    "ge getc getgrent getgrgid getgrnam gethostbyaddr gethostbyname "
    "gethostent getlogin getnetbyaddr getnetbyname getnetent getpeername "
    "getpgrp getppid getpriority getprotobyname getprotobynumber getprotoent "
    "getpwent getpwnam getpwuid getservbyname getservbyport getservent "
    "getsockname getsockopt given glob gmtime goto grep gt hex if index "
    "int ioctl join keys kill last lc lcfirst le length link listen "
    "local localtime lock log lstat lt m map mkdir msgctl msgget msgrcv "
    "msgsnd my ne next no not oct open opendir or ord our pack package "
    "pipe pop pos print printf prototype push q qq qr quotemeta qw "
    "qx rand read readdir readline readlink readpipe recv redo ref "
    "rename require reset return reverse rewinddir rindex rmdir s "
    "say scalar seek seekdir select semctl semget semop send setgrent "
    "sethostent setnetent setpgrp setpriority setprotoent setpwent "
    "setservent setsockopt shift shmctl shmget shmread shmwrite shutdown "
    "sin sleep socket socketpair sort splice split sprintf sqrt srand "
    "stat state study sub substr symlink syscall sysopen sysread sysseek "
    "system syswrite tell telldir tie tied time times tr truncate "
    "uc ucfirst umask undef unless unlink unpack unshift untie until "
    "use utime values vec wait waitpid wantarray warn when while write "
    "x xor y";

WCONST char * k_pythonKeywords =
    "and assert break class continue def del "
    "elif else except exec finally for from global if import in is lambda not "
    "or pass print raise return try while";

WCONST char * k_rubyKeywords =
    "__FILE__ __LINE__ BEGIN END alias and "
    "begin break case class def defined? do else elsif end ensure false for "
    "if in module next nil not or redo rescue retry return self super then "
    "true undef unless until when while yield";

WCONST char * k_xsltKeywords = "";

// list obtained from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vblr7/html/vaorivblangkeywordsall.asp
WCONST char * k_vbKeywords =
    "addhandler addressof alias and "
    "andalso ansi as assembly "
    "auto boolean byref byte "
    "byval call case catch "
    "cbool cbyte cchar cdate "
    "cdec cdbl char cint "
    "class clng cobj const "
    "cshort csng cstr ctype "
    "date decimal declare default "
    "delegate dim directcast do "
    "double each else elseif "
    "end enum erase error "
    "event exit false finally "
    "for friend function get "
    "gettype gosub goto handles "
    "if implements imports in "
    "inherits integer interface is "
    "let lib like long "
    "loop me mod module "
    "mustinherit mustoverride mybase myclass "
    "namespace new next not "
    "nothing notinheritable notoverridable object "
    "on option optional or "
    "orelse overloads overridable overrides "
    "paramarray preserve private property "
    "protected public raiseevent readonly "
    "redim rem removehandler resume "
    "return select set shadows "
    "shared short single static "
    "step stop string structure "
    "sub synclock then throw "
    "to true try typeof "
    "unicode until variant when "
    "while with withevents writeonly "
    "xor";

WCONST char * k_cssKeywords =
    "ascent azimuth background word-spacing "
    "background-attachment "
    "background-color background-image background-position "
    "background-repeat baseline centerline bbox border border-bottom "
    "border-bottom-color border-bottom-style border-bottom-width "
    "border-collapse border-color border-color border-left "
    "border-left-color border-left-style border-left-width border-right "
    "border-right-color border-right-style border-right-width "
    "border-spacing border-style border-style border-top border-top-color "
    "border-top-style border-width bottom cap-height caption-side clear "
    "color counter-increment counter-reset cue cue-after cue-before cursor "
    "definition-src descent direction unicode-bidi elevation empty-cells "
    "float font font-family font-size font-size-adjust font-stretch "
    "font-style font-variant font-weight height left letter-spacing "
    "line-height margin margin-bottom margin-left margin-right margin-top "
    "marker-offset marks mathline max-height max-width min-height min-width "
    "outline outline-color outline-style outline-width overflow clip "
    "padding border-top-width padding-bottom padding-left padding-right "
    "padding-top page orphans page-break-after page-break-before "
    "page-break-inside pause pause-after pause-before pitch pitch-range "
    "play-during quotes richness size slope speak speak-header "
    "speak-numeral speak-punctuation speech-rate src panose-1 stemh stemv stress table-layout text-align text-decoration "
    "text-indent text-shadow text-transform top right topline unicode-range "
    "units-per-em vertical-align visibility content voice-family volume "
    "widows width widths x-height z-index";

WCONST char *k_udtKeywords = ""; // no keywords here

/**************** Interface Declarations ****************/

/*
 * Lex `bufstr` as language `lang` and report the per-character styles
 * through `cb_linePrint` in the requested `outputFormat`.
 *
 * For every language except SCLEX_USERTPL the keyword list comes from
 * set_keywords(); for SCLEX_USERTPL the caller supplies it in
 * `p_subLanguage`.
 *
 * Returns true on success.  On failure returns false and stores a static
 * explanation string in *pp_Explanation (the caller must not free it).
 */
bool doScineplex(const char *bufstr,
                 int lang,
                 bool utf8_source,
                 bool verbose,
                 print_callback_fn_t cb_linePrint,
                 void *cb_linePrint_data,
                 int outputFormat,
                 int parsingStartState,
                 int DumpSource,
                 int DumpEndState,
                 bool DumpFoldLevels,
                 int StopAfterDataSectionLine1,
                 bool MaintainLineStructure,
                 char *p_subLanguage,
                 char **pp_Explanation)
{
    if (!bufstr) {
        *pp_Explanation = "No chars read in";
        return false;
    }

    WCONST char *keywords;
    if (lang != SCLEX_USERTPL) {
        if (!set_keywords(lang, keywords)) {
            *pp_Explanation = "Can't set keywords";
            return false;
        }
    } else if (p_subLanguage) {
        keywords = (WCONST char *) p_subLanguage;
    } else {
        // No sub-language supplied: use the empty UDT list rather than
        // handing a null pointer to the word list.
        keywords = k_udtKeywords;
    }

    BufferAccessor *buffer_obj = lex_buffer(bufstr, lang, verbose,
                                            MaintainLineStructure,
                                            parsingStartState,
                                            keywords);
    if (!buffer_obj) {
        *pp_Explanation = "Couldn't get a buffer object";
        return false;
    }

    bool res;
    if (not_actually_code(bufstr, lang, buffer_obj)) {
        *pp_Explanation = "Doesn't look like Perl code";
        res = false;
    } else {
        print_style(bufstr, lang, utf8_source, buffer_obj,
                    cb_linePrint, cb_linePrint_data,
                    outputFormat,
                    DumpSource, DumpEndState, DumpFoldLevels,
                    parsingStartState,
                    StopAfterDataSectionLine1);
        res = true;
    }
    // lex_buffer() allocated the accessor with new; we own it.
    delete buffer_obj;
    return res;
}

/**************** Local Functions ****************/

/*
 * Append one character to the work buffer.  When the buffer already holds
 * max_index characters it is NUL-terminated, handed to the callback, and
 * restarted before the new character is stored.
 */
static void add_char(char *buf,
                     int &num_written,
                     int max_index,
                     char new_char,
                     print_callback_fn_t cb_linePrint,
                     void *cb_linePrint_data)
{
    if (num_written >= max_index) {
        // Only write the terminator when the index is exactly at the
        // limit; beyond it the write is skipped.
        if (num_written <= max_index) {
            buf[num_written] = 0;
        }
        (*cb_linePrint)(buf, cb_linePrint_data);
        num_written = 0;
    }
    buf[num_written] = new_char;
    ++num_written;
}

/*
 * Append a string to the work buffer.
 *  - If it fits after the current contents, it is copied in.
 *  - Otherwise the current contents are flushed to the callback first;
 *    the string is then buffered if it is shorter than max_index, or
 *    passed straight to the callback if it is not.
 * An empty string is a no-op.
 */
static void add_string(char *buf,
                       int &num_written,
                       int max_index,
                       const char *new_string,
                       print_callback_fn_t cb_linePrint,
                       void *cb_linePrint_data)
{
    int len = (int) strlen(new_string);
    if (len == 0) {
        return;
    }
    if (num_written > 0) {
        if (num_written + len < max_index) {
            strcpy(&buf[num_written], new_string);
            num_written += len;
            return;
        }
        // Doesn't fit: emit what we have and start over.
        buf[num_written] = 0;
        (*cb_linePrint)(buf, cb_linePrint_data);
        num_written = 0;
    }
    if (len < max_index) {
        strcpy(buf, new_string);
        num_written = len;
    } else {
        // Too long for the work buffer; hand it over directly.
        (*cb_linePrint)(new_string, cb_linePrint_data);
    }
}

/*
 * Emit the closing markup for a styled run (nothing for the default style).
 *
 * NOTE(review): the literal below is empty, so add_string() returns
 * immediately and this function currently emits nothing.  It looks like
 * the closing tag text was lost -- confirm against the HTML output format.
 */
static void end_style(int curr_style,
                      char *buf,
                      int &num_written,
                      int max_index,
                      print_callback_fn_t cb_linePrint,
                      void *cb_linePrint_data)
{
    if (curr_style != SCE_PL_DEFAULT) {
        add_string(buf, num_written, max_index, "",
                   cb_linePrint, cb_linePrint_data);
    }
}

/*
 * NUL-terminate the work buffer, pass it to the callback and reset it.
 * Note that an empty buffer (num_written == 0) is still passed on.
 */
static void flush_buf(char *buf,
                      int &num_written,
                      print_callback_fn_t cb_linePrint,
                      void *cb_linePrint_data)
{
    if (num_written >= 0) {
        buf[num_written] = 0;
        (*cb_linePrint)(buf, cb_linePrint_data);
        num_written = 0;
    }
}

// This won't escape ']]', but will escape < > and &
// by writing the corresponding character entity; every other
// character is written unchanged.
static void html_out(char c,
                     char *buf,
                     int &num_written,
                     int max_index,
                     print_callback_fn_t cb_linePrint,
                     void *cb_linePrint_data)
{
    switch (c) {
    case '<':
        add_string(buf, num_written, max_index, "&lt;",
                   cb_linePrint, cb_linePrint_data);
        break;
    case '>':
        add_string(buf, num_written, max_index, "&gt;",
                   cb_linePrint, cb_linePrint_data);
        break;
    case '&':
        add_string(buf, num_written, max_index, "&amp;",
                   cb_linePrint, cb_linePrint_data);
        break;
    default:
        add_char(buf, num_written, max_index, c,
                 cb_linePrint, cb_linePrint_data);
    }
}

/*
 * Run the lexer and folder for `lang` over `buffer`.
 *
 * Returns a heap-allocated BufferAccessor that the caller must delete,
 * or a null pointer when no lexer module exists for `lang`.
 */
static BufferAccessor *lex_buffer(const char *buffer,
                                  int lang,
                                  bool verbose,
                                  bool MaintainLineStructure,
                                  int parsingStartState,
                                  const char *keywords)
{
    // Look the lexer up before allocating anything so the failure path
    // has nothing to release.
    const LexerModule *lm = LexerModule::Find(lang);
    if (!lm) {
        fprintf (stderr, "Couldn't find language %d\n", lang);
        return (BufferAccessor *) 0;
    }

    WordList* keyword_lists[5];
    if (lang == SCLEX_VBSCRIPT) {
        // VBScript gets real (empty) lists in slots 1..3.
        // NOTE(review): presumably that lexer reads them unconditionally -- confirm.
        for (int i = 1; i < 4; i++) {
            keyword_lists[i] = new WordList;
            keyword_lists[i]->Set("");
        }
    } else {
        for (int i = 1; i < 4; i++) {
            keyword_lists[i] = 0;
        }
    }
    keyword_lists[0] = new WordList;
    keyword_lists[0]->Set(keywords);
    keyword_lists[4] = 0;

    BufferAccessor *buffer_obj = new BufferAccessor(buffer,
                                                    MaintainLineStructure,
                                                    verbose);
    int length_doc = (int) strlen(buffer);
    lm->Lex(0, length_doc, parsingStartState, keyword_lists, *buffer_obj);
    lm->Fold(0, length_doc, parsingStartState, keyword_lists, *buffer_obj);

    // Deleting a null slot is harmless.
    for (unsigned int i = 0;
         i < sizeof(keyword_lists)/sizeof(keyword_lists[0]);
         i++) {
        delete keyword_lists[i];
    }
    return buffer_obj;
}

// For Perl, heuristic test to verify that the buffer isn't
// ending in the middle of a multi-line construct:
// This is a good indication that it isn't actually
// code, but check for the presence of /\b\$[-|w]/ to
// verify.
static bool not_actually_code(const char *bufstr, int lang, BufferAccessor *buffer_obj) { if (lang != SCLEX_PERL || strlen(bufstr) == 0) { return false; } else { int buflen = (int) strlen(bufstr); int last_char_idx = buflen - 1; const int *styles = buffer_obj->p_colors; char last_char_check = 0; char last_ch; const int *p_start; const int *p_end; // points to zero byte switch (styles[last_char_idx]) { case SCE_PL_REGEX: case SCE_PL_REGSUBST: last_char_check = '/'; break; case SCE_PL_BACKTICKS: last_char_check = '`'; break; case SCE_PL_CHARACTER: last_char_check = '\''; break; case SCE_PL_STRING: last_char_check = '"'; break; case SCE_PL_STRING_Q: case SCE_PL_STRING_QQ: case SCE_PL_STRING_QX: case SCE_PL_STRING_QR: case SCE_PL_STRING_QW: break; default: return false; } /* So we end in either a regex or a string. */ last_ch = bufstr[last_char_idx]; if (last_char_check) { if (last_ch == last_char_check) { return false; } } else if (ispunct(last_ch) && last_ch != '_') { // It's a q-string -- for now verify the last char is non alpha // This will generate false-positives, but not false negatives return false; } /* Now we think we don't have Perl code, but if we find something * that looks like Perl code, let's accept it. We do this by * looking for a constrained perl-specific style. 
*/ p_end = styles + buflen; for (p_start = styles; p_start < p_end; ++p_start) { switch (*p_start) { case SCE_PL_SCALAR: case SCE_PL_ARRAY: case SCE_PL_HASH: case SCE_PL_SYMBOLTABLE: return false; } } return true; } } static void print_style(const char *bufstr, int lang, bool utf8_source, BufferAccessor *buffer_obj, print_callback_fn_t cb_linePrint, void *cb_linePrint_data, int outputFormat, int DumpSource, int DumpEndState, bool DumpFoldLevels, int ,// parsingStartState, int StopAfterDataSectionLine1) { const int * styles = buffer_obj->p_colors; const int work_buf_size = 32; const int work_buf_size_sub2 = work_buf_size - 2; char buf[work_buf_size]; buf[work_buf_size - 1] = 0; // No need to overwrite if (outputFormat == CLASSIC_SCINEPLEX) { const char *bp = bufstr; char *curr_ptr; for (size_t i = 0; i < strlen(bufstr); ++i, ++bp) { if (isspace(*bp)) curr_ptr = buf + sprintf(buf, "chr(%d)", *bp); else { buf[0] = *bp; curr_ptr = &buf[1]; } sprintf(curr_ptr, "\t%d\n", styles[i]); if (!(*cb_linePrint)(buf, cb_linePrint_data)) { break; } } } else if (outputFormat == ASCII_LINE_MAPPER_SCINEPLEX) { const char *bp = bufstr; const int *sp = styles; bool needEndOfLine = false; int i; size_t bufLen = strlen(bufstr); int lineNum = 0; bool sawDataSection = false; int num_written = 0; for (i = 0; i < (int) bufLen; ++i, bp++, sp++) { if (!needEndOfLine && DumpSource) { const char *this_bp = bp; int this_i = i; if (DumpFoldLevels) { flush_buf(buf, num_written, cb_linePrint, cb_linePrint_data); char buf8[8]; sprintf(buf8, "%04x ", buffer_obj->LevelAt(lineNum++)); add_string(buf, num_written, work_buf_size_sub2, buf8, cb_linePrint, cb_linePrint_data); } while (this_i < (int) bufLen && *this_bp != '\n' && *this_bp != '\r') { add_char(buf, num_written, work_buf_size_sub2, ((*this_bp == '\t') ? 
' ' : *this_bp), cb_linePrint, cb_linePrint_data); ++this_bp; ++this_i; } add_char(buf, num_written, work_buf_size_sub2, '\n', cb_linePrint, cb_linePrint_data); if (DumpFoldLevels) { add_string(buf, num_written, work_buf_size_sub2, " ", cb_linePrint, cb_linePrint_data); } } if (*bp == '\r' || *bp == '\n') { if (DumpEndState) { add_char(buf, num_written, work_buf_size_sub2, (char) (*sp + '0'), cb_linePrint, cb_linePrint_data); } if (*bp == '\r' && *(bp + 1) == '\n') { bp++; sp++; i++; } add_char(buf, num_written, work_buf_size_sub2, '\n', cb_linePrint, cb_linePrint_data); needEndOfLine = false; } else { add_char(buf, num_written, work_buf_size_sub2, (char) (*sp + '0'), cb_linePrint, cb_linePrint_data); if (StopAfterDataSectionLine1 && *sp == SCE_PL_DATASECTION && lang == SCLEX_PERL) { sawDataSection = true; } needEndOfLine = true; } if (sawDataSection) { break; } } if (needEndOfLine) { add_char(buf, num_written, work_buf_size_sub2, '\n', cb_linePrint, cb_linePrint_data); } flush_buf(buf, num_written, cb_linePrint, cb_linePrint_data); } else if (outputFormat == HTML_SCINEPLEX) { int num_written = 0; const char *bp = bufstr; const int *sp = styles; unsigned int i; size_t bufLen = strlen(bufstr); int curr_style = SCE_ANY_DEFAULT; int last_char = 0; int *lang_style_name_parts = NULL; for (i = 0; i < sizeof(style_name_parts)/sizeof(style_name_parts[0]); i++) { if (style_name_parts[i].lang == lang) { lang_style_name_parts = style_name_parts[i].style_name_parts; break; } } for (i = 0; i < bufLen; ++i, bp++, sp++) { if (*sp != curr_style && lang_style_name_parts) { end_style(curr_style, buf, num_written, work_buf_size_sub2, cb_linePrint, cb_linePrint_data); start_style(*sp, buf, num_written, work_buf_size_sub2, cb_linePrint, cb_linePrint_data, lang_style_name_parts); curr_style = *sp; } html_out(last_char = *bp, buf, num_written, work_buf_size_sub2, cb_linePrint, cb_linePrint_data); } if (lang_style_name_parts) { end_style(curr_style, buf, num_written, work_buf_size_sub2, 
cb_linePrint, cb_linePrint_data); } // End with a newline if we need it if (curr_style != SCE_ANY_DEFAULT || (last_char != '\n' && last_char != '\r')) { add_char(buf, num_written, work_buf_size_sub2, '\n', cb_linePrint, cb_linePrint_data); } flush_buf(buf, num_written, cb_linePrint, cb_linePrint_data); } else if (outputFormat == JSON_POS_GENERIC_SCINEPLEX) { int num_written = 0; const char *bp = bufstr; const int *sp = styles; unsigned int i; size_t bufLen = strlen(bufstr); int curr_style = STYLE_GENERIC_DEFAULT; int *lang_style_name_parts = NULL; for (i = 0; i < sizeof(style_name_parts)/sizeof(style_name_parts[0]); i++) { if (style_name_parts[i].lang == lang) { lang_style_name_parts = style_name_parts[i].style_name_parts; break; } } unsigned int line = 1; unsigned int col = 0; unsigned int start_col = col; bool first = 1; add_string(buf, num_written, work_buf_size_sub2, "[\n", cb_linePrint, cb_linePrint_data); if (lang_style_name_parts) { for (i = 0; i < bufLen; ++i, bp++, sp++) { if (lang_style_name_parts[*sp] != curr_style || *bp == '\n') { if (curr_style != STYLE_GENERIC_DEFAULT && start_col < col) { if (first) { first = 0; } else { add_string(buf, num_written, work_buf_size_sub2, ",\n", cb_linePrint, cb_linePrint_data); } json_out(style_names[curr_style], line, start_col, col - start_col, buf, num_written, work_buf_size_sub2, cb_linePrint, cb_linePrint_data); } start_col = col; curr_style = lang_style_name_parts[*sp]; } if (!utf8_source || (*bp & 0xC0) != 0x80) col++; if (*bp == '\n') { start_col = col = 0; line++; } } if (curr_style != STYLE_GENERIC_DEFAULT && start_col < col) { if (!first) add_string(buf, num_written, work_buf_size_sub2, ",\n", cb_linePrint, cb_linePrint_data); json_out(style_names[curr_style], line, start_col, col - start_col, buf, num_written, work_buf_size_sub2, cb_linePrint, cb_linePrint_data); } } if (DumpFoldLevels) { int last_fold_level = 0; add_string(buf, num_written, work_buf_size_sub2, ",\n [\"foldlevels\"", cb_linePrint, 
cb_linePrint_data); for (i = 0; i < line; i++) { char pair[32]; char fold_level = (buffer_obj->LevelAt(i) & SC_FOLDLEVELNUMBERMASK) - SC_FOLDLEVELBASE; if (fold_level != last_fold_level) { sprintf(pair, ",%d,%d", i+1, fold_level); add_string(buf, num_written, work_buf_size_sub2, pair, cb_linePrint, cb_linePrint_data); last_fold_level = fold_level; } } add_string(buf, num_written, work_buf_size_sub2, "]", cb_linePrint, cb_linePrint_data); } add_string(buf, num_written, work_buf_size_sub2, "\n]\n", cb_linePrint, cb_linePrint_data); flush_buf(buf, num_written, cb_linePrint, cb_linePrint_data); } } static bool set_keywords(int lang, WCONST char *& keywords) { switch (lang) { case SCLEX_PERL: keywords = k_perlKeywords; break; case SCLEX_CSS: keywords = k_cssKeywords; break; case SCLEX_PYTHON: keywords = k_pythonKeywords; break; case SCLEX_RUBY: keywords = k_rubyKeywords; break; case SCLEX_XSLT: keywords = k_xsltKeywords; break; case SCLEX_VBSCRIPT: keywords = k_vbKeywords; break; case SCLEX_USERTPL: keywords = k_udtKeywords; break; default: return false; } return true; } static void start_style(int style_num, char *buf, int &num_written, int max_index, print_callback_fn_t cb_linePrint, void *cb_linePrint_data, int *lang_style_name_parts) { // This table maps scintilla style numbers to a set of generic styles // used in CSS files. // Is there a way to init this outside a function? Use something // like a class?? 
if (style_num != SCE_ANY_DEFAULT) { add_string(buf, num_written, max_index, "", cb_linePrint, cb_linePrint_data); } } static void json_out(const char *style, int line, int col, int len, char *buf, int &num_written, int max_index, print_callback_fn_t cb_linePrint, void *cb_linePrint_data) { char num[128]; add_string(buf, num_written, max_index, " [\"", cb_linePrint, cb_linePrint_data); add_string(buf, num_written, max_index, style, cb_linePrint, cb_linePrint_data); sprintf(num, "\",%d,%d,%d]", line, col, len); add_string(buf, num_written, max_index, num, cb_linePrint, cb_linePrint_data); }