diff --git a/src/backend/arch/x86_64.rs b/src/backend/arch/x86_64.rs index 820bc10..6f72963 100644 --- a/src/backend/arch/x86_64.rs +++ b/src/backend/arch/x86_64.rs @@ -1,4 +1,10 @@ -use crate::io::CompilerMsg; +use crate::{ + backend::{ + program::{Addr, SymTable}, + symbol::Symbol, + }, + io::CompilerMsg, +}; pub struct Asm { pub instrs: Vec, @@ -7,6 +13,7 @@ pub struct Asm { pub enum Instr { Mov { dst: RegMode, src: RegImm }, Int { code: u8 }, + Lea { dst: RegMode, sym: Symbol }, } pub enum RegImm { @@ -22,22 +29,17 @@ pub struct RegMode { high: bool, } -impl Asm { - pub fn compile(&self, out: &mut Vec) -> Result<(), CompilerMsg> { - for instr in &self.instrs { - instr.compile(out)?; - } - Ok(()) - } -} - -impl Instr { - pub fn compile(&self, out: &mut Vec) -> Result<(), CompilerMsg> { +impl super::super::program::Instr for Instr { + fn encode( + &self, + data: &mut Vec, + syms: &mut SymTable, + ) -> Result, CompilerMsg> { match self { Instr::Mov { dst, src } => { let width = dst.width; if width == BitWidth::B16 { - out.push(0x66); + data.push(0x66); } let dst8 = dst.gt8(); let b64 = width == BitWidth::B64; @@ -47,31 +49,58 @@ impl Instr { } else { false }; + // special 64-bit / register 4-7 indicator if dst8 || src8 || b64 || (dst.gt4() && !dst.high) { - out.push(0x40 | dst8 as u8 | ((b64 as u8) << 3) | ((src8 as u8) << 2)); + data.push(0x40 | dst8 as u8 | ((b64 as u8) << 3) | ((src8 as u8) << 2)); } match src { RegImm::Reg(src) => { if dst.width != src.width { return Err("src and dst are not the same size".into()); } - out.push(0x88 | !b8 as u8); + data.push(0x88 | !b8 as u8); let modrm = 0b11_000_000 | (src.base() << 3) | dst.base(); - out.push(modrm); + data.push(modrm); } &RegImm::Imm(imm) => { if imm > width.max() { return Err("immediate cannot fit in register".into()); } - out.push(0xb0 | ((!b8 as u8) << 3) | dst.base()); - out.extend(&imm.to_le_bytes()[..width.bytes()]); + data.push(0xb0 | ((!b8 as u8) << 3) | dst.base()); + data.extend(&imm.to_le_bytes()[..width.bytes()]); } } } - Instr::Int { code } => out.extend([0xcd, *code]), + Instr::Int { code } => data.extend([0xcd, *code]), + Instr::Lea { dst, sym } => { + data.extend([ + 0x48 | ((dst.gt8() as u8) << 2), + 0x8d, + 0x05 | (dst.base() << 3), + ]); + let Some(addr) = syms.get(*sym) else { + let pos = data.len(); + data.extend([0; 4]); + return Ok(Some((*sym, pos))); + }; + data.extend(addr_offset(data.len(), addr)); + } } - Ok(()) + Ok(None) } + fn insert_sym(&self, data: &mut Vec, pos: usize, addr: Addr) { + match self { + Self::Lea { .. } => data[pos..pos + 4].copy_from_slice(&addr_offset(pos, addr)), + _ => panic!("unkown symbol insertion"), + } + } +} + +/// assumes the next instruction is directly after +fn addr_offset(pos: usize, addr: Addr) -> [u8; 4] { + let pos = (pos + 4) as i32; + let offset = addr.val() as i32 - pos; + offset.to_le_bytes() } impl RegMode { @@ -169,10 +198,21 @@ impl BitWidth { } } -pub fn mov(dst: RegMode, src: impl Into) -> Instr { - Instr::Mov { - dst, - src: src.into(), +pub mod instr { + use super::*; + pub fn mov(dst: RegMode, src: impl Into) -> Instr { + Instr::Mov { + dst, + src: src.into(), + } + } + + pub fn lea(dst: RegMode, sym: Symbol) -> Instr { + Instr::Lea { dst, sym } + } + + pub fn int(code: u8) -> Instr { + Instr::Int { code } } } @@ -190,13 +230,16 @@ impl From for RegImm { #[cfg(test)] mod test { + use crate::backend::program::Instr as _; + use super::*; + use instr::*; use reg::*; fn eq(expected: impl AsRef<[u8]>, got: Instr) { let expected = expected.as_ref(); let mut res = Vec::new(); - if let Err(e) = got.compile(&mut res) { + if let Err(e) = got.encode(&mut res, &mut SymTable::new(0)) { panic!("expected {expected:x?}, failed to compile: {}", e.msg); } assert_eq!(expected, &res[..], "expected {expected:x?}, got {res:x?}"); diff --git a/src/backend/elf.rs b/src/backend/elf.rs index f8675d2..2d85d99 100644 --- a/src/backend/elf.rs +++ b/src/backend/elf.rs @@ -52,14 +52,14 @@ pub struct SectionHeader { pub enum Arch { X86_64, - RISCV, + Riscv, } impl Arch { pub fn machine(&self) -> u16 { match self { Arch::X86_64 => 0x3e, - Arch::RISCV => 0xf3, + Arch::Riscv => 0xf3, } } } diff --git a/src/backend/elf_sections.rs b/src/backend/elf_sections.rs new file mode 100644 index 0000000..6ab360c --- /dev/null +++ b/src/backend/elf_sections.rs @@ -0,0 +1,145 @@ +use crate::backend::Addr; + +#[repr(C)] +pub struct ELF64Header { + magic: u32, + class: u8, + endianness: u8, + ei_version: u8, + os_abi: u8, + os_abi_ver: u8, + pad: [u8; 7], + ty: u16, + machine: u16, + e_version: u32, + entry: u64, + program_header_offset: u64, + section_header_offset: u64, + flags: u32, + header_size: u16, + program_header_entry_size: u16, + program_header_num: u16, + section_header_entry_size: u16, + section_header_num: u16, + section_header_str_idx: u16, +} + +#[repr(C)] +pub struct ProgramHeader { + ty: u32, + flags: u32, + offset: u64, + vaddr: u64, + paddr: u64, + filesz: u64, + memsz: u64, + align: u64, +} + +#[repr(C)] +pub struct SectionHeader { + name_idx: u32, + ty: u32, + flags: u64, + addr: u64, + offset: u64, + size: u64, + link: u32, + info: u32, + addr_align: u64, + entry_size: u64, +} + +pub enum Arch { + X86_64, + Riscv, +} + +impl Arch { + pub fn machine(&self) -> u16 { + match self { + Arch::X86_64 => 0x3e, + Arch::Riscv => 0xf3, + } + } +} + +// this is currently specialized for riscv64; obviously add params later +pub fn create(program: &[u8], start_offset: Addr) -> Vec { + let addr_start = 0x400000; + let page_size = 0x1000; + // I don't know if I have to add addr_start here, idk how it maps the memory + let program_size = std::mem::size_of_val(program) as u64; + let program_header = ProgramHeader { + ty: 0x1, // LOAD + flags: 0b101, // executable, readable + offset: 0x0, + vaddr: addr_start, + paddr: 0x0, + filesz: program_size, + memsz: program_size, + align: page_size, + }; + let header_len = (size_of::() + size_of::()) as u64; + let program_pos = header_len; + let header = ELF64Header { + magic: 0x7f_45_4c_46u32.swap_bytes(), + class: 0x2, // 64 bit + endianness: 0x1, // little endian + ei_version: 0x1, + os_abi: 0x0, // system-v + os_abi_ver: 0x0, + pad: [0x0; 7], + ty: 0x2, // executable + machine: Arch::X86_64.machine(), + e_version: 0x1, + entry: addr_start + program_pos + start_offset.val(), + program_header_offset: size_of::() as u64, + section_header_offset: program_pos + program.len() as u64, + // C ABI (16 bit instruction align) + double precision floats + flags: 0x1 | 0x4, + header_size: size_of::() as u16, + program_header_entry_size: size_of::() as u16, + program_header_num: 0x1, + section_header_entry_size: size_of::() as u16, + section_header_num: 0x2, + section_header_str_idx: 0x0, + }; + let section_null = SectionHeader { + name_idx: 0, + ty: 0, + flags: 0, + addr: 0, + offset: 0, + size: 0, + link: 0, + info: 0, + addr_align: 0, + entry_size: 0, + }; + let section = SectionHeader { + name_idx: 0, + ty: 0x1, // program data + flags: (1 << 1) | (1 << 2), // alloc & exec + addr: addr_start, + offset: program_pos, + size: program.len() as u64, + link: 0, + info: 0, + addr_align: 1, + entry_size: 0, + }; + let mut bytes: Vec = Vec::new(); + unsafe { + bytes.extend(as_u8_slice(&header)); + bytes.extend(as_u8_slice(&program_header)); + bytes.extend(program); + bytes.extend(as_u8_slice(§ion_null)); + bytes.extend(as_u8_slice(§ion)); + } + bytes +} + +unsafe fn as_u8_slice(p: &T) -> &[u8] { + unsafe { core::slice::from_raw_parts((p as *const T) as *const u8, size_of::()) } +} diff --git a/src/backend/mod.rs b/src/backend/mod.rs index 7dc72c8..affa101 100644 --- a/src/backend/mod.rs +++ b/src/backend/mod.rs @@ -1,17 +1,13 @@ pub mod arch; pub mod elf; +mod program; +mod symbol; mod test; + pub use arch::*; pub use test::*; -#[derive(Debug, Clone, Copy, PartialEq)] -pub struct Addr(u64); -impl Addr { - const NONE: Self = Self(!0); - pub fn val(&self) -> u64 { - self.0 - } -} +use program::*; pub enum Asm { X86_64(x86_64::Asm), diff --git a/src/backend/program.rs b/src/backend/program.rs new file mode 100644 index 0000000..5c36af9 --- /dev/null +++ b/src/backend/program.rs @@ -0,0 +1,111 @@ +use crate::{ + backend::{elf, symbol::Symbol}, + io::CompilerMsg, +}; +use std::collections::HashMap; + +pub struct LinkedProgram { + pub code: Vec, + pub start: Option, +} + +pub struct UnlinkedProgram { + pub fns: Vec>, + pub ro_data: Vec<(Vec, Symbol)>, + pub sym_count: usize, + pub start: Option, +} + +pub struct UnlinkedFunction { + pub instrs: Vec, + pub sym: Symbol, + // locations within the function for loops & such / goto + pub locations: HashMap, +} + +impl UnlinkedProgram { + pub fn link(self) -> Result { + let mut data = Vec::new(); + let mut sym_table = SymTable::new(self.sym_count); + let mut missing = HashMap::>::new(); + for (val, id) in self.ro_data { + sym_table.insert(id, Addr(data.len() as u64)); + data.extend(val); + } + data.resize(data.len() + (4 - data.len() % 4), 0); + for f in self.fns { + let addr = Addr(data.len() as u64); + let mut added = vec![(f.sym, addr)]; + sym_table.insert(f.sym, addr); + for (i, instr) in f.instrs.into_iter().enumerate() { + let i_pos = Addr(data.len() as u64); + if let Some(sym) = f.locations.get(&i) { + sym_table.insert(*sym, i_pos); + added.push((*sym, i_pos)); + } + if let Some((sym, marker)) = instr.encode(&mut data, &mut sym_table)? { + let info = (instr, marker); + if let Some(vec) = missing.get_mut(&sym) { + vec.push(info); + } else { + missing.insert(sym, vec![info]); + } + } + } + for (sym, addr) in added { + if let Some(vec) = missing.remove(&sym) { + for (i, marker) in vec { + i.insert_sym(&mut data, marker, addr); + } + } + } + } + assert!(missing.is_empty()); + Ok(LinkedProgram { + code: data, + start: self + .start + .map(|s| sym_table.get(s).expect("start symbol doesn't exist")), + }) + } +} + +pub trait Instr { + fn encode( + &self, + data: &mut Vec, + syms: &mut SymTable, + ) -> Result, CompilerMsg>; + fn insert_sym(&self, data: &mut Vec, pos: usize, addr: Addr); +} + +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct Addr(u64); +impl Addr { + pub const ZERO: Self = Self(!0); + pub fn val(&self) -> u64 { + self.0 + } +} + +pub struct SymTable(Vec); +impl SymTable { + pub fn new(len: usize) -> Self { + Self(vec![Addr::ZERO; len]) + } + pub fn insert(&mut self, sym: Symbol, addr: Addr) { + self.0[sym.val()] = addr; + } + pub fn get(&self, sym: Symbol) -> Option { + match self.0[sym.val()] { + Addr::ZERO => None, + addr => Some(addr), + } + } +} + +impl LinkedProgram { + pub fn to_elf(&self) -> Vec { + elf::create(&self.code, self.start.expect("no start")) + } +} diff --git a/src/backend/symbol.rs b/src/backend/symbol.rs new file mode 100644 index 0000000..9471861 --- /dev/null +++ b/src/backend/symbol.rs @@ -0,0 +1,147 @@ +use std::collections::HashMap; + +#[derive(Clone, Copy, Hash, PartialEq, Eq)] +pub struct Symbol(usize); + +impl Symbol { + pub fn val(&self) -> usize { + self.0 + } + pub fn raw(val: usize) -> Self { + Self(val) + } +} + +/// intentionally does not have copy or clone; +/// this should only be consumed once +pub struct WritableSymbol(Symbol); + +impl std::ops::Deref for WritableSymbol { + type Target = Symbol; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +// pub struct SymbolSpace { +// ro_data: Vec<(Symbol, Vec)>, +// fns: Vec<(Symbol, IRLFunction)>, +// len: usize, +// labels: Vec>, +// } +// +// pub struct SymbolSpaceBuilder { +// symbols: usize, +// unwritten_fns: Vec<(WritableSymbol, FnID)>, +// fn_map: HashMap, +// data_map: HashMap, +// ro_data: Vec<(Symbol, Vec)>, +// fns: Vec<(Symbol, IRLFunction)>, +// labels: Vec>, +// } +// +// impl SymbolSpace { +// pub fn ro_data(&self) -> &[(Symbol, Vec)] { +// &self.ro_data +// } +// pub fn fns(&self) -> &[(Symbol, IRLFunction)] { +// &self.fns +// } +// pub fn labels(&self) -> &[Option] { +// &self.labels +// } +// pub fn len(&self) -> usize { +// self.len +// } +// } +// +// impl SymbolSpaceBuilder { +// pub fn new() -> Self { +// Self { +// symbols: 0, +// unwritten_fns: Vec::new(), +// fn_map: HashMap::new(), +// data_map: HashMap::new(), +// ro_data: Vec::new(), +// fns: Vec::new(), +// labels: Vec::new(), +// } +// } +// pub fn with_entries(entries: &[FnID]) -> SymbolSpaceBuilder { +// let mut s = Self::new(); +// for e in entries { +// s.func(*e); +// } +// s +// } +// pub fn pop_fn(&mut self) -> Option<(WritableSymbol, FnID)> { +// self.unwritten_fns.pop() +// } +// pub fn anon_ro_data(&mut self, data: &[u8], label: Option) -> Symbol { +// let sym = self.reserve(); +// self.write_ro_data(sym, data.to_vec(), label) +// } +// pub fn ro_data(&mut self, id: DataID, data: &[u8], label: Option<&str>) -> Symbol { +// match self.data_map.get(&id) { +// Some(s) => *s, +// None => { +// let sym = self.reserve(); +// self.data_map.insert(id, *sym); +// self.write_ro_data(sym, data.to_vec(), label.map(|l| l.to_string())) +// } +// } +// } +// pub fn func(&mut self, id: FnID) -> Symbol { +// match self.fn_map.get(&id) { +// Some(s) => *s, +// None => { +// let wsym = self.reserve(); +// let sym = *wsym; +// self.unwritten_fns.push((wsym, id)); +// self.fn_map.insert(id, sym); +// sym +// } +// } +// } +// pub fn write_ro_data( +// &mut self, +// sym: WritableSymbol, +// data: Vec, +// name: Option, +// ) -> Symbol { +// self.ro_data.push((*sym, data)); +// self.labels[sym.0.0] = name; +// *sym +// } +// pub fn write_fn( +// &mut self, +// sym: WritableSymbol, +// func: IRLFunction, +// name: Option, +// ) -> Symbol { +// self.fns.push((*sym, func)); +// self.labels[sym.0.0] = name; +// *sym +// } +// pub fn reserve(&mut self) -> WritableSymbol { +// let val = self.symbols; +// self.symbols += 1; +// self.labels.push(None); +// WritableSymbol(Symbol(val)) +// } +// pub fn len(&self) -> usize { +// self.symbols +// } +// pub fn finish(self) -> Option { +// if self.unwritten_fns.is_empty() { +// Some(SymbolSpace { +// len: self.symbols, +// fns: self.fns, +// ro_data: self.ro_data, +// labels: self.labels, +// }) +// } else { +// None +// } +// } +// } diff --git a/src/backend/test.rs b/src/backend/test.rs index 339fabb..6ebdeda 100644 --- a/src/backend/test.rs +++ b/src/backend/test.rs @@ -1,16 +1,35 @@ use crate::backend::{ - Addr, elf, - x86_64::{Asm, Instr, mov, reg::*}, + program::{UnlinkedFunction, UnlinkedProgram}, + symbol::Symbol, + x86_64::{instr::*, reg::*}, }; use std::{fs::OpenOptions, io::Write, os::unix::fs::OpenOptionsExt, process::Command}; pub fn test_x86_64() { - let asm = Asm { - instrs: vec![mov(eax, 1), mov(ebx, 39), Instr::Int { code: 0x80 }], + let s = b"Hello world!\n"; + let program = UnlinkedProgram { + fns: vec![UnlinkedFunction { + instrs: vec![ + mov(eax, 4), + mov(ebx, 1), + lea(ecx, Symbol::raw(1)), + mov(edx, s.len() as u64), + int(0x80), + mov(eax, 1), + mov(ebx, 39), + int(0x80), + ], + sym: Symbol::raw(0), + locations: Default::default(), + }], + ro_data: vec![(s.to_vec(), Symbol::raw(1))], + sym_count: 3, + start: Some(Symbol::raw(0)), }; - let mut out = Vec::new(); - asm.compile(&mut out).expect("failed to compile"); - let binary = elf::create(&out, Addr(0)); + let Ok(linked) = program.link() else { + panic!("failed to link"); + }; + let binary = linked.to_elf(); let path = "./x86_64_test"; let mut file = OpenOptions::new() .create(true) diff --git a/src/parser_ir/mod.rs b/src/parser_ir/mod.rs index 3cb5c70..88c6792 100644 --- a/src/parser_ir/mod.rs +++ b/src/parser_ir/mod.rs @@ -9,6 +9,8 @@ use crate::{ parser::{self, ExprTy, Ident, Node, parse_file}, }; +const EXTENSION: &str = ".lang"; + pub fn parse_program(path: impl AsRef, output: &mut CompilerOutput) -> Option { let path = path.as_ref(); let mut imports = Imports::default(); @@ -16,7 +18,7 @@ pub fn parse_program(path: impl AsRef, output: &mut CompilerOutput) -> Opt imports.add(path.file_stem().unwrap().to_str().unwrap()); while let Some(next) = imports.new.pop() { imports.done.insert(next.clone()); - let path = dir.join(next + ".lang"); + let path = dir.join(next + EXTENSION); println!("=== {path:?}"); let body = parse_file(path, output)?; print!("{}", body.new_dsp()); diff --git a/x86_64_test b/x86_64_test index 6efef5a..0174fb8 100755 Binary files a/x86_64_test and b/x86_64_test differ