本文是記錄專業課“程序語言理論與編譯技術”的部分筆記。
LECTURE 22(實現一個從AST到RISCV的編譯器)
一、問題分析
1、完整的編譯器(如LLVM)需先完成AST到IR的轉換,并進行代碼優化,再到匯編,如下圖:
本次實驗實現從AST到RISC-V(不考慮優化)的翻譯。對于匯編,我們可以訪問Compiler Explorer (godbolt.org)來熟悉一下,或者參看類似下圖的手冊(包括匯編指令和寄存器類型等內容):
對于本次實驗,我們用 x8 作為幀指針(fp) 來固定指向當前函數的棧楨基址,使用 a0 寄存器作為表達式計算結果的默認存放寄存器(函數返回值也約定放在 a0)。為了方便,我們避免寄存器分配,轉而使用棧來保存局部變量和臨時數據。如下圖:
2、首先考慮Binop的翻譯,我們參考下圖:
對于Int 常數類型,我們直接將常數值加載到寄存器,對應RISC-V 提供的?li(load immediate)偽指令,將一個立即數載入寄存器。而對于Binop,我們考慮處理三種情況:
? Num Binop Num E.g. 2 + 7
? Num Binop Num Binop Num E.g. 2 + 6 * 9
? (Num Binop Num) Binop (Num Binop Num) E.g. (2 + 6) * (8 + 7)
基本流程:計算左操作數 -> 保存左值 -> 計算右操作數 -> 運算合成結果。如此之后,我們可以使用兩種方案來保存:
? 將左值保存在臨時寄存器中(例如t0 ),再計算右值到另一個寄存器(例如t1 )
? 采用棧,在計算左值后將其壓棧保存,計算右值后再彈棧取出左值進行運算
3、然后考慮Let的翻譯:
這里我們需要增加對變量和作用域的支持,使編譯器可以處理let綁定和變量引用。我們需要考慮棧幀與變量地址,當進入一個新的let塊,需在棧上分配新的空間保存局部變量的值(調整棧指針fp的offset)。然后在編譯階段也需要一個映射(符號表)跟蹤當前作用域中每個變量名的棧偏移或寄存器位置(env)。
據此,我們簡化邏輯,使fp固定不動,變量x偏移可以按照進入let的次序決定。而固定幀具體而言,使在程序一開始一次性分配最大所需棧,然后fp = sp固定。
我們可以考慮一個流程:先編譯e1得到其值在a0,然后在棧上分配空間保存其值 (addi sp, sp, -8)。接著將變量名x映射到新空間的位置,例如offset += 8表示又用8字節,映射env新增 (“x”, of fset)。隨后編譯內部表達式e2,Var x按env找到偏移,完成后回退棧指針釋放x所占的空間 (addi sp, sp, 8 )。最后恢復符號表,彈出env中x的綁定。
4、然后考慮If的翻譯,我們參考下圖:
我們需要擴展編譯器支持Bool常量和If條件表達式,正確處理程序的控制流,為每個if表達式生成唯一的標簽并正確安插跳轉指令,且保證程序無論哪一種路徑都會結束。
具體的,思考同前面let/變量的交互:使用標簽而非固定地址。條件的各分支內部可以定義自己的變量且作用域僅限于分支內,實現需以遞歸的方式編譯分支表達式并各自管理環境。在分支中寄存器值通過a0返回結果,無殘留的臨時變量,無需專門清理分支的棧。
5、最后考慮Func和App的翻譯。擴展編譯器支持函數定義(Func)和函數調用(App),函數的閉包包含函數代碼和綁定環境的組合體。而編譯Func(x, body)會產生兩部分輸出:全局函數代碼(儲存待后續統一輸出),以及當前表達式計算時創建閉包的代碼。
回顧Riscv寄存器,caller負責傳遞參數(a0-a7寄存器)和保存臨時寄存器;callee可以自由使用臨時寄存器但需保存和恢復caller保存寄存器(如s0等)。我們在實現時僅考慮函數調用一個顯式參數,并需要注意傳遞參數值和環境指針。
閉包的環境結構(僅提供一種思路):分配一塊內存,大小足夠存放一個指針加上所有自由變量的值;將函數的代碼地址寫入這塊內存的開頭;將該函數的自由變量當前環境中的值寫入內存的后續字段;將指向這塊內存的指針作為閉包值放入某寄存器。
注意,Func需要一個唯一的新函數標簽(類似if的標簽)。對函數體進行的編譯,與main形式上相同(env -> local_env + closure_env)。在main函數體換Func編譯結果為上一步的閉包分配代碼。
而App調用有多種方式:jalr、call等。以jalr為例:將a0(閉包指針)保存到t0,a0放參數;使用ld t1 , 0(t 0)加載閉包偏移為0處內容(代碼地址)到t1;jalr ra, t 1進行跳轉調用。
其余包括free、優化等問題不在本節內容之內。
二、前置代碼
1、lib/ast.ml
type binop = | Add| Sub| Mul| Div| Leqtype expr =| Int of int| Var of string| Bool of bool| Binop of binop * expr * expr| Let of string * expr * expr| If of expr * expr * expr| Func of string * expr| App of expr * expr
2、lib/lexer.mll
{open Parser
}rule read = parse | [' ' '\t' '\n'] { read lexbuf }| '+' { PLUS }| '-' { MINUS }| '*' { TIMES }| '/' { DIV }| '(' { LPAREN }| ')' { RPAREN }| "<=" { LEQ }| "true" { TRUE }| "false" { FALSE }| "let" { LET }| "=" { EQUALS }| "in" { IN }| "if" { IF }| "then" { THEN }| "else" { ELSE }| "->" { ARROW }| "fun" { FUNC }| ['0'-'9']+ as num { INT (int_of_string num) }| ['a'-'z' 'A'-'Z']+ as id { ID id }| eof { EOF }| _ { failwith "Invalid character" }
3、lib/parser.mly
%{open Ast(** [make_apply e [e1; e2; ...]] makes the application [e e1 e2 ...]). Requires: the list argument is non-empty. *)
let rec make_apply e = function| [] -> failwith "precondition violated"| [e'] -> App (e, e')| h :: ((_ :: _) as t) -> make_apply (App (e, h)) t
%}%token <int> INT
%token <string> ID
%token PLUS MINUS TIMES DIV EOF
%token LPAREN RPAREN
%token LEQ
%token TRUE FALSE
%token LET EQUALS IN
%token IF THEN ELSE
%token FUNC ARROW%nonassoc IN
%nonassoc ELSE
%left LEQ
%left PLUS MINUS
%left TIMES DIV%start main
%type <Ast.expr> main
%%main:expr EOF { $1 }
;expr:| simpl_expr { $1 }| simpl_expr simpl_expr+ { make_apply $1 $2 }
;simpl_expr:| INT { Int $1 }| ID { Var $1 }| TRUE { Bool true }| FALSE { Bool false}| simpl_expr LEQ simpl_expr { Binop (Leq, $1, $3) }| simpl_expr TIMES simpl_expr { Binop (Mul, $1, $3) }| simpl_expr DIV simpl_expr { Binop (Div, $1, $3) }| simpl_expr PLUS simpl_expr { Binop (Add, $1, $3) }| simpl_expr MINUS simpl_expr { Binop (Sub, $1, $3) }| LET ID EQUALS simpl_expr IN simpl_expr { Let ($2, $4, $6) }| FUNC ID ARROW expr { Func ($2, $4) }| IF simpl_expr THEN simpl_expr ELSE simpl_expr { If ($2, $4, $6) }| LPAREN expr RPAREN { $2 }
;
4、lib/dune
(library(name Simpl_riscv)(modules parser lexer ast))(ocamllex lexer)
(menhir (modules parser))
5、bin/dune
(executable(public_name Simpl_riscv)(name main)(modules main)(libraries Simpl_riscv)(flags (:standard -w -32-27-26-39-8-37)))
6、bin/main.ml的部分代碼
open Simpl_riscv
open Astlet rec string_of_expr (e : expr) : string = match e with| Int n -> Printf.sprintf "Int %d" n| Var id -> Printf.sprintf "Var %s" id| Bool b -> let b_str = match b with | true -> "true"| false -> "false"inPrintf.sprintf "Bool %s" b_str| Binop (binop, e1, e2) ->let binop_str = match binop with | Add -> "Add"| Mul -> "Mul"| Sub -> "Sub"| Div -> "Div"| Leq -> "Leq"inPrintf.sprintf "Binop (%s, %s, %s)" binop_str (string_of_expr e1) (string_of_expr e2)| Let (var, e1, e2) -> Printf.sprintf "Let (%s, %s, %s)" var (string_of_expr e1) (string_of_expr e2)| If (e1, e2, e3) -> Printf.sprintf "If (%s, %s, %s)" (string_of_expr e1) (string_of_expr e2) (string_of_expr e3)| Func (var, e) -> Printf.sprintf "Func (%s, %s)" var (string_of_expr e)| App (e1, e2) -> Printf.sprintf "App (%s, %s)" (string_of_expr e1) (string_of_expr e2)let parse s : expr =let lexbuf = Lexing.from_string s inlet ast = Parser.main Lexer.read lexbuf inast(* 全局標簽計數器,用于生成唯一標簽 *)
let label_count = ref 0
let fresh_label prefix = incr label_count;Printf.sprintf "%s_%d" prefix !label_count(* 全局列表:保存所有生成的函數代碼,最終附加在程序末尾 *)
let functions : string list ref = ref [](* 簡單的自由變量分析(不去重,僅適用于教學示例) *)
let rec free_vars expr bound = match expr with| Int _ | Bool _ -> []| Var x -> if List.mem x bound then [] else [x]| Binop (_, e1, e2) -> free_vars e1 bound @ free_vars e2 bound| Let (x, e1, e2) -> free_vars e1 bound @ free_vars e2 (x :: bound)| If (cond, e_then, e_else) ->free_vars cond bound @ free_vars e_then bound @ free_vars e_else bound| Func (x, body) -> free_vars body (x :: bound)| App (e1, e2) -> free_vars e1 bound @ free_vars e2 bound(*compile_expr env cur_offset exprenv: (variable, offset) 的關聯列表,其中 offset 是相對于 fp 的偏移(單位:字節)cur_offset: 當前已經分配的 let 變量字節數(每個變量占 8 字節)返回的匯編代碼保證計算結果存放在寄存器 a0 中
*)
let rec compiler_expr (env : (string * int) list) (cur_offset : int) (expr : expr) : string = match expr with| Int n -> Printf.sprintf "\tli a0, %d\n" n| Bool b ->if b then "\tli a0, 1\n" else "\tli a0, 0\n"| Var x ->(trylet offset = List.assoc x env inPrintf.sprintf "\tld a0, -%d(fp)\n" offsetwith Not_found ->failwith ("Unbound variable: " ^ x))
(*——————————————*)and compile_expr_func (local_env : (string * int) list) (closure_env : (string * int) list) (cur_offset : int) (expr : expr) : string =match expr with
(*——————————————*)let compiler_program (e : expr) : string =let body_code = compiler_expr [] 0 e inlet prologue = ".text\n\.global main\n\main:\n\\taddi sp, sp, -64\n\\tmv fp, sp\n"inlet epilogue = "\\tmv sp, fp\n\\taddi sp, sp, 64\n\\tret\n"inlet func_code = String.concat "\n" !functions inprologue ^ body_code ^ epilogue ^ "\n" ^ func_codelet () =let filename = "test/simpl_test4.in" in(* let filename = "test/simpl_test2.in" in *)let in_channel = open_in filename inlet file_content = really_input_string in_channel (in_channel_length in_channel) inclose_in in_channel;(* let res = interp file_content inPrintf.printf "Result of interpreting %s:\n%s\n\n" filename res;let res = interp_big file_content inPrintf.printf "Result of interpreting %s with big-step model:\n%s\n\n" filename res; *)let ast = parse file_content in Printf.printf "AST: %s\n" (string_of_expr ast);let output_file = Sys.argv.(1) inlet oc = open_out output_file inlet asm_code = compiler_program ast inoutput_string oc asm_code;close_out oc;Printf.printf "Generated RISC-V code saved to: %s\n" output_file
三、具體實現
1、bin/main.ml的compiler_expr函數
let rec compiler_expr (env : (string * int) list) (cur_offset : int) (expr : expr) : string = match expr with| Int n -> Printf.sprintf "\tli a0, %d\n" n| Bool b ->if b then "\tli a0, 1\n" else "\tli a0, 0\n"| Var x ->(trylet offset = List.assoc x env inPrintf.sprintf "\tld a0, -%d(fp)\n" offsetwith Not_found ->failwith ("Unbound variable: " ^ x))| Binop (op, e1, e2) ->let code1 = compiler_expr env cur_offset e1 in let push_left = "\taddi sp, sp, -8\n\tsd a0, 0(sp)\n" inlet code2 = compiler_expr env cur_offset e2 inlet pop_left = "\tld t0, 0(sp)\n\taddi sp, sp, 8\n" inlet op_code = match op with| Add -> "\tadd a0, t0, a0\n"| Sub -> "\tsub a0, t0, a0\n"| Mul -> "\tmul a0, t0, a0\n"| Div -> "\tdiv a0, t0, a0\n"| Leq -> "Not implemented"incode1 ^ push_left ^ code2 ^ pop_left ^ op_code| Let (x, e1, e2) ->let code1 = compiler_expr env cur_offset e1 inlet new_offset = cur_offset + 8 inlet alloc = Printf.sprintf "\taddi sp, sp, -8\n\tsd a0, -%d(fp)\n" new_offset inlet env' = (x, new_offset) :: env inlet code2 = compiler_expr env' new_offset e2 inlet free = "\taddi sp, sp, 8\n" incode1 ^ alloc ^ code2 ^ free| If (cond, e_then, e_else) ->let label_else = fresh_label "Lelse" inlet label_end = fresh_label "Lend" inlet code_cond = compiler_expr env cur_offset cond inlet code_then = compiler_expr env cur_offset e_then inlet code_else = compiler_expr env cur_offset e_else incode_cond ^ Printf.sprintf "\tbeq a0, x0, %s\n" label_else ^code_then ^Printf.sprintf "\tj %s\n" label_end ^Printf.sprintf "%s:\n" label_else ^code_else ^Printf.sprintf "%s:\n" label_end| Func (x, body) ->let fvs = free_vars body [x] inlet num_free = List.length fvs inlet func_id = fresh_label "func" inlet local_env = [(x, 8)] inlet closure_env = List.mapi (fun i v -> (v, 8 * i)) fvs inlet func_body_code = compile_expr_func local_env closure_env 0 body inlet func_prologue = Printf.sprintf "%s:\n\taddi sp, sp, -16\n\tsd ra, 8(sp)\n\tsd fp, 0(sp)\n\tmv fp, sp\n" func_idinlet func_epilogue = "\tld ra, 8(sp)\n\tld fp, 0(sp)\n\taddi sp, sp, 16\n\tret\n"inlet func_code = func_prologue ^ func_body_code ^ func_epilogue infunctions := !functions @ [func_code];let closure_size = 8 * (1 + num_free) inlet alloc_code = Printf.sprintf "\tli a0, %d\n\tjal ra, malloc\n" closure_size inlet move_closure = "\tmv t0, a0\n" inlet store_code_ptr = Printf.sprintf "\tla t1, %s\n\tsd t1, 0(t0)\n" func_id inlet store_free_vars = List.mapi (fun i v ->let outer_offset =try List.assoc v env with Not_found -> failwith ("Unbound free var: " ^ v)inPrintf.sprintf "\tld t1, -%d(fp)\n\tsd t1, %d(t0)\n" outer_offset (8 * (i + 1))) fvs |> String.concat ""inlet ret_code = "\tmv a0, t0\n" inalloc_code ^ move_closure ^ store_code_ptr ^ store_free_vars ^ ret_code| App (e1, e2) ->let code_f = compiler_expr env cur_offset e1 inlet save_closure = "\tmv t0, a0\n" inlet code_arg = compiler_expr env cur_offset e2 inlet load_env = "\taddi a1, t0, 8\n" inlet load_code_ptr = "\tld t1, 0(t0)\n" inlet call = "\tjalr ra, 0(t1)\n" incode_f ^ save_closure ^ code_arg ^ load_env ^ load_code_ptr ^ call
2、bin/main.ml的compile_expr_func函數
and compile_expr_func (local_env : (string * int) list) (closure_env : (string * int) list) (cur_offset : int) (expr : expr) : string =match expr with| Int n -> Printf.sprintf "\tli a0, %d\n" n| Bool b ->if b then "\tli a0, 1\n" else "\tli a0, 0\n"| Var x ->if List.mem_assoc x local_env thenPrintf.sprintf "\tld a0, -%d(fp)\n" (List.assoc x local_env)else if List.mem_assoc x closure_env thenPrintf.sprintf "\tld a0, %d(a1)\n" (List.assoc x closure_env)elsefailwith ("Unbound variable in function: " ^ x)| Binop (op, e1, e2) ->let code1 = compile_expr_func local_env closure_env cur_offset e1 inlet push_left = "\taddi sp, sp, -8\n\tsd a0, 0(sp)\n" inlet code2 = compile_expr_func local_env closure_env cur_offset e2 inlet pop_left = "\tld t0, 0, 0(sp)\n\taddi sp, sp, 8\n" inlet op_code = match op with| Add -> "\tadd a0, t0, a0\n"| Sub -> "\tsub a0, t0, a0\n"| Mul -> "\tmul a0, t0, a0\n"| Div -> "\tdiv a0, t0, a0\n"| Leq -> "Not implemented"incode1 ^ push_left ^ code2 ^ pop_left ^ op_code| If _ -> failwith "Not implemented"| Func _ | App _ -> failwith "Nested functions not supported in function bodies"
?