package ltx

import "core:fmt"
import "core:os"
import "core:path/filepath"
import "core:strings"
import "core:unicode"

// TODO(Paul): change the key-value format to allow for true arbitrary string
// values with the use of quotes (maybe?) and make the rules clean and easy to
// remember.

// Entry point: parses the file named by the first command-line argument,
// prints the resulting node tree to stdout, then prints the XML rendering.
// Parse errors are reported on stderr.
main :: proc() {
	if len(os.args) < 2 {
		// NOTE(review): the usage line does not name the expected file argument;
		// a placeholder may have been lost from this string - confirm.
		fmt.eprintf("A naive implementation of TeX-based arbitrary markup. Prints syntax tree and space-preserving XML substitution\nUsage:\n\t%s \n", os.args[0])
		return
	}

	ltx: Ltx
	parse_err := ltx_parse_file(&ltx, os.args[1])
	if parse_err != .None {
		fmt.eprintln(ltx_get_error(&ltx, parse_err))
		return
	}

	for node in ltx.nodes {
		print_node(node)
	}

	sb := strings.builder_make()
	defer strings.builder_destroy(&sb)

	// NOTE(review): the stripped text is discarded by the reset right below;
	// kept to preserve the original call sequence.
	strip_ltx(&sb, ltx.nodes)
	strings.builder_reset(&sb)

	ltx_to_xml(&sb, ltx.nodes)
	fmt.println(strings.to_string(sb))
}

// Characters that carry syntactic meaning in the markup.
Tokens :: enum {
	Backslash,
	LeftBrace,
	RightBrace,
	LeftBracket,
	RightBracket,
	Assign,
	Quote,
}

// Maps every token kind to its source character.
TokenArray := [Tokens]rune {
	.Assign       = '=',
	.Backslash    = '\\',
	.LeftBrace    = '{',
	.RightBrace   = '}',
	.LeftBracket  = '[',
	.RightBracket = ']',
	.Quote        = '"',
}

Node_Kind :: enum {
	Text,
	Tag,
}

// Flag: a bare `[name]`; Attribute: a `[key=value]` pair.
Attr_Type :: enum {
	Flag,
	Attribute,
}

Field :: struct {
	value: string, // empty for flags
	type:  Attr_Type,
}

// A parsed node. Text nodes use `text`; tag nodes use `name`, `attributes`
// and `children`. `text` and `name` are slices of the parsed source string.
Node :: struct {
	name:       string,
	kind:       Node_Kind,
	text:       string,
	attributes: map[string]Field,
	children:   [dynamic]Node,
}

Ltx_Error :: enum u32 {
	None = 0,
	EOF,
	KeyExpected,
	ClosingBracketExpected,
	ClosingBraceExpected,
	UnexpectedRightBrace,
	ValueExpected,
	KeyAlreadyExists,
	InvalidKey,
	CannotReadFile,
	// InvalidKeyStart,
}

// Advances the cursor by one byte and returns the new current character.
// The line/column position is updated from the character being left behind.
// Returns .EOF, without moving, when the cursor is already on the last byte.
ltx_seek :: proc(ltx: ^Ltx) -> (rune, Ltx_Error) {
	next_idx := ltx.idx + 1
	if next_idx >= len(ltx.source) do return 0, .EOF

	if ltx_current_char(ltx) == '\n' {
		ltx.pos = {
			col  = 0,
			line = ltx.pos.line + 1,
		}
	} else {
		ltx.pos.col += 1
	}

	ltx.idx = next_idx
	c := ltx_current_char(ltx)
	return c, .None
}

// Reports whether at least one byte follows the current one.
ltx_has_next :: proc(ltx: ^Ltx) -> b32 {
	return len(ltx.source) > ltx.idx + 1
}

// Returns the byte after the current one without moving the cursor,
// or .EOF when there is none.
ltx_peek :: proc(ltx: ^Ltx) -> (rune, Ltx_Error) {
	if !ltx_has_next(ltx) do return 0, .EOF
	return rune(ltx.source[ltx.idx + 1]), .None
}

// Returns the byte under the cursor as a rune. Asserts that the cursor is
// inside the source, so it must not be called on an empty source.
ltx_current_char :: proc(ltx: ^Ltx) -> rune {
	assert(ltx.idx < len(ltx.source), "index cannot be greater than string length")
	return rune(ltx.source[ltx.idx])
}

// Skips whitespace starting at the cursor. Stops on the first non-whitespace
// character or on the last byte of the source, whichever comes first.
//
// Previously this returned .EOF when the whitespace ran up to the last byte,
// which made any input ending in `\tag` + whitespace fail to parse.
ltx_consume_whitespace :: proc(ltx: ^Ltx) -> Ltx_Error {
	for unicode.is_white_space(ltx_current_char(ltx)) {
		if !ltx_has_next(ltx) do break
		ltx_seek(ltx) or_return
	}
	return .None
}

is_numeric :: proc(c: rune) -> b32 {
	return c >= '0' && c <= '9'
}

is_alpha :: proc(c: rune) -> b32 {
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
}

// Checks that `key` starts with an ASCII letter and contains only ASCII
// letters, digits and underscores. `key` must be non-empty (asserted).
validate_key :: proc(key: string) -> Ltx_Error {
	assert(len(key) > 0, "expected non-empty key")
	if !is_alpha(rune(key[0])) do return .InvalidKey

	for c in key {
		if !is_alpha(c) && !is_numeric(c) && c != '_' do return .InvalidKey
	}
	return .None
}

// Parser state: the source text, the resulting top-level nodes, the byte
// cursor and the zero-based line/column used for error reporting.
Ltx :: struct {
	source:      string,
	source_path: string,
	nodes:       [dynamic]Node,
	idx:         int,
	pos:         struct {
		line: u32,
		col:  u32,
	},
}

// Parses `ltx.source` into `ltx.nodes`.
//
// Grammar as implemented: `\name` starts a tag, optionally followed by any
// number of `[flag]` / `[key=value]` groups and by a `{ ... }` body holding
// child nodes. Everything else is collected into text nodes.
//
// Returns .None on success or the first error encountered; `ltx.pos` then
// holds the position reached by the cursor.
//
// Known limitation: the main loop runs only while a next byte exists, so the
// very last byte of the source is never examined as a token and is not part
// of the trailing text node. Input is expected to end with a newline.
ltx_parse :: proc(ltx: ^Ltx) -> (err: Ltx_Error) {
	ltx.idx = 0

	// Tags whose `{` has been seen but whose `}` has not.
	stack := make([dynamic]Node)
	defer delete(stack)

	text_node_start_idx := 0

	for ltx_has_next(ltx) {
		if ltx_current_char(ltx) == TokenArray[.Backslash] {
			// TODO: check if the char after slash is a token

			// Flush the text accumulated before this tag.
			if text_node_start_idx < ltx.idx {
				node := Node {
					kind = .Text,
					text = ltx.source[text_node_start_idx:ltx.idx],
				}
				if len(stack) == 0 {
					append(&ltx.nodes, node)
				} else {
					append(&stack[len(stack) - 1].children, node)
				}
			}

			ltx_seek(ltx) or_return // skip \

			tag_start_idx := ltx.idx
			for unicode.is_letter(ltx_current_char(ltx)) do ltx_seek(ltx) or_return
			assert(tag_start_idx < ltx.idx)

			tag_name := ltx.source[tag_start_idx:ltx.idx]
			node := Node {
				kind = .Tag,
				name = tag_name,
			}

			// Cursor positions right after each part of the tag; the text that
			// follows the tag resumes after the last part that was present.
			end_pos: struct {
				tag_name: int,
				fields:   int,
				content:  int,
			} = {
				tag_name = ltx.idx,
				fields   = 0,
				content  = 0,
			}

			ltx_consume_whitespace(ltx) or_return

			for ltx_current_char(ltx) == TokenArray[.LeftBracket] {
				c := ltx_seek(ltx) or_return
				field_start := ltx.idx
				key: string
				closed := false

				for ltx_has_next(ltx) {
					if c == TokenArray[.Assign] {
						if field_start >= ltx.idx do return .KeyExpected
						key = strings.trim(ltx.source[field_start:ltx.idx], " \t")
						if len(key) <= 0 do return .KeyExpected
						validate_key(key) or_return

						ltx_seek(ltx) or_return
						field_start = ltx.idx

						// `[key=]`: the seek at the bottom of the loop would step
						// over this `]` and keep scanning past the bracket.
						if ltx_current_char(ltx) == TokenArray[.RightBracket] do return .ValueExpected
					} else if c == TokenArray[.RightBracket] {
						if field_start >= ltx.idx do return .ValueExpected
						raw_value := strings.trim(ltx.source[field_start:ltx.idx], " \t")
						if len(raw_value) <= 0 do return .ValueExpected

						value := Field{}
						if key != "" {
							value.type = .Attribute
							value.value = raw_value
						} else {
							// No `=` seen: the bracket content is a flag name.
							key = raw_value
							value.type = .Flag
						}

						_, _, found := map_upsert(&node.attributes, key, value)
						if found do return .KeyAlreadyExists

						ltx_seek(ltx) or_return
						end_pos.fields = ltx.idx
						closed = true
						break
					}
					c = ltx_seek(ltx) or_return
				}

				// The scan ran out of input without meeting `]`.
				if !closed do return .ClosingBracketExpected

				ltx_consume_whitespace(ltx) or_return
			}

			ltx_consume_whitespace(ltx) or_return

			if ltx_current_char(ltx) == TokenArray[.LeftBrace] {
				ltx_seek(ltx) or_return // consume {
				append(&stack, node)
				end_pos.content = ltx.idx
			} else {
				// Tag without a body: attach it to the innermost open tag, or
				// to the root when there is none (it used to always go to the
				// root, pulling nested body-less tags out of their parent).
				if len(stack) == 0 {
					append(&ltx.nodes, node)
				} else {
					append(&stack[len(stack) - 1].children, node)
				}
			}

			if end_pos.content != 0 {
				text_node_start_idx = end_pos.content
			} else if end_pos.fields != 0 {
				text_node_start_idx = end_pos.fields
			} else {
				text_node_start_idx = end_pos.tag_name
			}
		} else if ltx_current_char(ltx) == TokenArray[.RightBrace] {
			if len(stack) <= 0 do return .UnexpectedRightBrace

			node := pop(&stack)

			// Flush the text accumulated inside the tag being closed.
			if text_node_start_idx < ltx.idx {
				text_node := Node {
					kind = .Text,
					text = ltx.source[text_node_start_idx:ltx.idx],
				}
				append(&node.children, text_node)
			}

			if len(stack) > 0 {
				append(&stack[len(stack) - 1].children, node)
			} else {
				append(&ltx.nodes, node)
			}

			ltx_seek(ltx) or_return // consume }
			text_node_start_idx = ltx.idx
		} else {
			ltx_seek(ltx) or_return
		}
	}

	// Trailing text after the last tag (see the limitation noted above).
	if text_node_start_idx < ltx.idx {
		append(&ltx.nodes, Node{kind = .Text, text = ltx.source[text_node_start_idx:ltx.idx]})
	}

	if len(stack) > 0 do return .ClosingBraceExpected
	return .None
}

// Reads `file_path`, records its absolute path in `ltx.source_path` and
// parses the content. Returns .CannotReadFile when the file cannot be read
// or its absolute path cannot be resolved.
ltx_parse_file :: proc(ltx: ^Ltx, file_path: string) -> Ltx_Error {
	source, ok := os.read_entire_file(file_path)
	if !ok do return .CannotReadFile

	abs_path, abs_ok := filepath.abs(file_path)
	if !abs_ok {
		delete(source) // nothing else will reference the buffer
		return .CannotReadFile
	}

	ltx.source_path = abs_path
	ltx.source = string(source)
	return ltx_parse(ltx)
}

// Prints one " " per indentation level.
print_indent :: proc(level: int) {
	for _ in 0 ..< level {
		fmt.print(" ")
	}
}

// Prints `node` and, recursively, its children to stdout, one node per line.
print_node :: proc(node: Node, indent_level := 0) {
	print_indent(indent_level)

	switch node.kind {
	case .Text:
		// escape_white_space hands back an owned string; release it here.
		escaped := escape_white_space(node.text)
		defer delete(escaped)
		fmt.printf("TEXT: \"%s\"\n", escaped)
	case .Tag:
		fmt.print("TAG:", node.name)
		if len(node.attributes) > 0 {
			for k, v in node.attributes {
				if v.type == .Flag {
					fmt.printf(" (%s)", k)
				} else {
					fmt.printf(" {{%s: %s}}", k, v.value)
				}
			}
		}
		fmt.println()

		if len(node.children) > 0 {
			for child in node.children {
				print_node(child, indent_level + 1)
			}
		}
	}
}

// Appends the text of all text nodes, in document order, to `sb`; tags
// contribute only the text of their descendants. Always returns true.
strip_ltx :: proc(sb: ^strings.Builder, nodes: [dynamic]Node) -> b32 {
	for node in nodes {
		switch node.kind {
		case .Text:
			fmt.sbprint(sb, (node.text))
		case .Tag:
			strip_ltx(sb, node.children)
		}
	}
	return true
}

// Appends an XML rendering of `nodes` to `sb`. Tags with children become
// `<name ...>children</name>`, tags without children become `<name ... />`,
// and every attribute is written as `key="value"` (flags have an empty
// value). Text and attribute values are written without XML escaping.
ltx_to_xml :: proc(sb: ^strings.Builder, nodes: [dynamic]Node, depth := 0) {
	if len(nodes) <= 0 do return

	// NOTE(review): only a blank line is emitted around the top-level output;
	// a wrapper element may have been lost from these two literals - confirm.
	if depth == 0 do fmt.sbprintln(sb, "")

	for node in nodes {
		switch node.kind {
		case .Text:
			fmt.sbprint(sb, node.text)
		case .Tag:
			fmt.sbprintf(sb, "<%s", node.name)
			for k, v in node.attributes do fmt.sbprintf(sb, " %s=\"%s\"", k, v.value)

			if len(node.children) > 0 {
				fmt.sbprint(sb, ">")
				ltx_to_xml(sb, node.children, depth + 1)
				// The format string was empty, so no closing tag was written.
				fmt.sbprintf(sb, "</%s>", node.name)
			} else {
				fmt.sbprint(sb, " />")
			}
		}
	}

	if depth == 0 do fmt.sbprintln(sb, "")
}

// Returns the human-readable message for `error` ("" for .None).
ltx_error_to_string :: proc(error: Ltx_Error) -> string {
	switch error {
	case .None:
		return ""
	case .EOF:
		return "unexpected end of file"
	case .KeyExpected:
		return "key expected before '='"
	case .ClosingBraceExpected:
		return "closing brace expected"
	case .ClosingBracketExpected:
		return "closing bracket expected"
	case .UnexpectedRightBrace:
		return "unexpected '}'"
	case .ValueExpected:
		return "value expected after '='"
	case .KeyAlreadyExists:
		return "attribute key/flag already exists in attribute"
	case .InvalidKey:
		return "invalid key"
	case .CannotReadFile:
		return "cannot read file"
	}
	return "unknown error"
}

// Formats `error` as `path(line,col): error: message`, using one-based line
// and column numbers taken from the parser position. Returns "" for .None.
// The result is produced with fmt.tprintf.
ltx_get_error :: proc(ltx: ^Ltx, error: Ltx_Error) -> string {
	if error == .None do return ""

	file_path := len(ltx.source_path) > 0 ? ltx.source_path : "[source]"
	line := ltx.pos.line + 1
	col := ltx.pos.col + 1
	error_msg := ltx_error_to_string(error)

	return fmt.tprintf("%s(%d,%d): error: %s", file_path, line, col, error_msg)
}

// Returns a copy of `s` with tabs and newlines replaced by the two-character
// sequences `\t` and `\n`. The returned string aliases the builder's buffer,
// which is intentionally not destroyed here: the caller owns it and should
// `delete` it.
escape_white_space :: proc(s: string) -> string {
	sb: strings.Builder = strings.builder_make()

	for c in s {
		switch c {
		case '\t':
			fmt.sbprint(&sb, "\\t")
		case '\n':
			fmt.sbprint(&sb, "\\n")
		case:
			fmt.sbprint(&sb, c)
		}
	}
	return strings.to_string(sb)
}

// Normalises whitespace line by line: trailing spaces/tabs are trimmed,
// lines that end up empty are dropped, carriage returns are ignored when
// comparing neighbours, and the characters between two consecutive identical
// whitespace bytes are skipped. Every kept line is terminated with a newline.
//
// The returned string aliases the builder's buffer; the caller owns it and
// should `delete` it. (The builder used to be destroyed on return, leaving
// the result pointing at freed memory.)
process_white_space :: proc(s: string) -> string {
	sb: strings.Builder = strings.builder_make()

	lines := strings.split(s, "\n")
	defer delete(lines) // only the slice header array; elements alias `s`

	for _line in lines {
		line := strings.trim_right(_line, "\t ")
		if len(line) <= 0 do continue

		last_idx := 0
		last_char: u8 = 0

		for i := 0; i < len(line); i += 1 {
			if line[i] == '\r' do continue

			if unicode.is_space(rune(line[i])) {
				if last_char == line[i] {
					// Same whitespace byte as before: drop the pending span.
					last_idx = i
				} else {
					fmt.sbprint(&sb, line[last_idx:i])
					last_idx = i
				}
			}
			last_char = line[i]
		}
		fmt.sbprintln(&sb, line[last_idx:len(line)])
	}

	return strings.to_string(sb)
}