From 380a0f977aa8c179398f3a427968cbd5f1fe7881 Mon Sep 17 00:00:00 2001 From: Shadow Cat Date: Wed, 3 Jun 2026 01:50:43 -0400 Subject: [PATCH] x86_64 compiler + elf output (can compile code that returns exit code) --- [ | 24 --- src/backend/arch/x86_64.rs | 270 +++++++++++++++++++++++++++++---- src/backend/elf.rs | 119 +++++++++++++++ src/backend/mod.rs | 17 +++ src/backend/test.rs | 31 ++++ src/io/mod.rs | 1 + src/ir/structs/mod.rs | 5 + src/main.rs | 1 + src/parser/nodes/asm/mod.rs | 7 +- src/parser/nodes/asm/x86_64.rs | 23 +-- x86_64_test | Bin 0 -> 132 bytes 11 files changed, 422 insertions(+), 76 deletions(-) delete mode 100644 [ create mode 100644 src/backend/elf.rs create mode 100644 src/backend/test.rs create mode 100755 x86_64_test diff --git a/[ b/[ deleted file mode 100644 index 183df37..0000000 --- a/[ +++ /dev/null @@ -1,24 +0,0 @@ -mod namespace; -pub use namespace::*; - -use super::Id; - -pub struct Fn { - pub body: Body, -} - -pub struct Body { - pub statements: Vec, -} - -pub struct Statement { - ty: StatementTy, -} - -pub enum StatementTy { - Define, - Assign, - Call { target: VarId, args: VarId }, -} - -pub type VarId = usize; diff --git a/src/backend/arch/x86_64.rs b/src/backend/arch/x86_64.rs index 0680178..820bc10 100644 --- a/src/backend/arch/x86_64.rs +++ b/src/backend/arch/x86_64.rs @@ -1,32 +1,120 @@ +use crate::io::CompilerMsg; + +pub struct Asm { + pub instrs: Vec, +} + +pub enum Instr { + Mov { dst: RegMode, src: RegImm }, + Int { code: u8 }, +} + +pub enum RegImm { + Reg(RegMode), + Imm(u64), +} + pub struct Reg(u8); pub struct RegMode { reg: Reg, - mode: BitMode, + width: BitWidth, + high: bool, +} + +impl Asm { + pub fn compile(&self, out: &mut Vec) -> Result<(), CompilerMsg> { + for instr in &self.instrs { + instr.compile(out)?; + } + Ok(()) + } +} + +impl Instr { + pub fn compile(&self, out: &mut Vec) -> Result<(), CompilerMsg> { + match self { + Instr::Mov { dst, src } => { + let width = dst.width; + if width == BitWidth::B16 { + out.push(0x66); + } + let dst8 = dst.gt8(); + let b64 = width == BitWidth::B64; + let b8 = width == BitWidth::B8; + let src8 = if let RegImm::Reg(src) = src { + src.gt8() + } else { + false + }; + if dst8 || src8 || b64 || (dst.gt4() && !dst.high) { + out.push(0x40 | dst8 as u8 | ((b64 as u8) << 3) | ((src8 as u8) << 2)); + } + match src { + RegImm::Reg(src) => { + if dst.width != src.width { + return Err("src and dst are not the same size".into()); + } + out.push(0x88 | !b8 as u8); + let modrm = 0b11_000_000 | (src.base() << 3) | dst.base(); + out.push(modrm); + } + &RegImm::Imm(imm) => { + if imm > width.max() { + return Err("immediate cannot fit in register".into()); + } + out.push(0xb0 | ((!b8 as u8) << 3) | dst.base()); + out.extend(&imm.to_le_bytes()[..width.bytes()]); + } + } + } + Instr::Int { code } => out.extend([0xcd, *code]), + } + Ok(()) + } +} + +impl RegMode { + pub fn base(&self) -> u8 { + self.reg.0 & 0b111 + } + /// checks if register is not one of the first 8 (0-7) + pub fn gt8(&self) -> bool { + self.reg.0 >= 0b1000 + } + pub fn gt4(&self) -> bool { + self.reg.0 >= 0b0100 + } } macro_rules! def_regs { - ($($val:literal $reg:ident: $B64:literal $B32:literal $B16:literal $B8:literal $($B16H:literal)?,)*) => { - impl Reg { + ($($val:literal : $B64:ident $B32:ident $B16:ident $B8:ident $($B8H:ident=$hval:expr)?,)*) => { + #[allow(non_upper_case_globals)] + pub mod reg { + use super::{RegMode, BitWidth, Reg}; $( - pub const $reg: u8 = $val; + pub const $B64: RegMode = RegMode { reg: Reg($val), width: BitWidth::B64, high: false }; + pub const $B32: RegMode = RegMode { reg: Reg($val), width: BitWidth::B32, high: false }; + pub const $B16: RegMode = RegMode { reg: Reg($val), width: BitWidth::B16, high: false }; + pub const $B8 : RegMode = RegMode { reg: Reg($val), width: BitWidth::B8, high: false }; + $( + pub const $B8H: RegMode = RegMode { reg: $hval.reg, width: BitWidth::B8, high: true }; + )? )* } impl RegMode { pub fn parse(s: &str) -> Option { - let (reg, mode) = match s.to_lowercase().as_str() { + Some(match s.to_lowercase().as_str() { $( - $B64 => ($val, BitMode::B64), - $B32 => ($val, BitMode::B32), - $B16 => ($val, BitMode::B16), - $B8 => ($val, BitMode::B8), - $($B16H => ($val, BitMode::B16H),)? + stringify!($B64) => reg::$B64, + stringify!($B32) => reg::$B32, + stringify!($B16) => reg::$B16, + stringify!($B8 ) => reg::$B8, + $( + stringify!($B8H) => reg::$B8H, + )? )* _ => return None, - }; - Some(RegMode { - reg: Reg(reg), - mode, }) } } @@ -34,30 +122,148 @@ macro_rules! def_regs { } def_regs! { - 0b0000 A : "rax" "eax" "ax" "al" "ah", - 0b0001 C : "rcx" "ecx" "cx" "cl" "ch", - 0b0010 D : "rdx" "edx" "dx" "dl" "dh", - 0b0011 B : "rbx" "ebx" "bx" "bl" "bh", + 0b0000 : rax eax ax al ah=spl, + 0b0001 : rcx ecx cx cl ch=bpl, + 0b0010 : rdx edx dx dl dh=sil, + 0b0011 : rbx ebx bx bl bh=dil, - 0b0100 SP: "rsp" "esp" "sp" "spl", - 0b0101 BP: "rbp" "ebp" "bp" "sbl", - 0b0110 SI: "rsi" "esi" "si" "sil", - 0b0111 DI: "rdi" "edi" "di" "dil", + 0b0100 : rsp esp sp spl, + 0b0101 : rbp ebp bp bpl, + 0b0110 : rsi esi si sil, + 0b0111 : rdi edi di dil, - 0b1000 R8 : "r8" "r8d" "r8w" "r8b", - 0b1001 R9 : "r9" "r9d" "r9w" "r9b", - 0b1010 R10: "r10" "r10d" "r10w" "r10b", - 0b1011 R11: "r11" "r11d" "r11w" "r11b", - 0b1100 R12: "r12" "r12d" "r12w" "r12b", - 0b1101 R13: "r13" "r13d" "r13w" "r13b", - 0b1110 R14: "r14" "r14d" "r14w" "r14b", - 0b1111 R15: "r15" "r15d" "r15w" "r15b", + 0b1000 : r8 r8d r8w r8b, + 0b1001 : r9 r9d r9w r9b, + 0b1010 : r10 r10d r10w r10b, + 0b1011 : r11 r11d r11w r11b, + 0b1100 : r12 r12d r12w r12b, + 0b1101 : r13 r13d r13w r13b, + 0b1110 : r14 r14d r14w r14b, + 0b1111 : r15 r15d r15w r15b, } -pub enum BitMode { +#[derive(Clone, Copy, PartialEq)] +pub enum BitWidth { B64, B32, B16, - B16H, B8, } + +impl BitWidth { + pub const fn max(&self) -> u64 { + match self { + Self::B64 => u64::MAX, + Self::B32 => u32::MAX as u64, + Self::B16 => u16::MAX as u64, + Self::B8 => u8::MAX as u64, + } + } + pub const fn bytes(&self) -> usize { + match self { + Self::B64 => 8, + Self::B32 => 4, + Self::B16 => 2, + Self::B8 => 1, + } + } +} + +pub fn mov(dst: RegMode, src: impl Into) -> Instr { + Instr::Mov { + dst, + src: src.into(), + } +} + +impl From for RegImm { + fn from(value: RegMode) -> Self { + Self::Reg(value) + } +} + +impl From for RegImm { + fn from(value: u64) -> Self { + Self::Imm(value) + } +} + +#[cfg(test)] +mod test { + use super::*; + use reg::*; + + fn eq(expected: impl AsRef<[u8]>, got: Instr) { + let expected = expected.as_ref(); + let mut res = Vec::new(); + if let Err(e) = got.compile(&mut res) { + panic!("expected {expected:x?}, failed to compile: {}", e.msg); + } + assert_eq!(expected, &res[..], "expected {expected:x?}, got {res:x?}"); + } + + #[test] + fn reg_reg() { + // used objdump on some nasm compiled assembly + eq([0x48, 0x89, 0xd8], mov(rax, rbx)); + eq([0x89, 0xd8], mov(eax, ebx)); + eq([0x66, 0x89, 0xd8], mov(ax, bx)); + eq([0x88, 0xd8], mov(al, bl)); + eq([0x88, 0xfc], mov(ah, bh)); + + eq([0x88, 0xf8], mov(al, bh)); + eq([0x88, 0xdc], mov(ah, bl)); + eq([0x40, 0x88, 0xe7], mov(dil, spl)); + + eq([0x4d, 0x89, 0xc8], mov(r8, r9)); + eq([0x45, 0x89, 0xc8], mov(r8d, r9d)); + eq([0x66, 0x45, 0x89, 0xc8], mov(r8w, r9w)); + eq([0x45, 0x88, 0xc8], mov(r8b, r9b)); + + eq([0x49, 0x89, 0xc0], mov(r8, rax)); + eq([0x4c, 0x89, 0xc0], mov(rax, r8)); + eq([0x4d, 0x89, 0xd1], mov(r9, r10)); + + eq([0x4d, 0x89, 0xe0], mov(r8, r12)); + } + + #[test] + fn reg_imm() { + eq( + [0x49, 0xbf, 0xf0, 0xde, 0xbc, 0x9a, 0x78, 0x56, 0x34, 0x12], + mov(r15, 0x123456789abcdef0), + ); + eq( + [0x49, 0xb8, 0xf0, 0xde, 0xbc, 0x9a, 0x78, 0x56, 0x34, 0x12], + mov(r8, 0x123456789abcdef0), + ); + eq( + [0x49, 0xb9, 0xf0, 0xde, 0xbc, 0x9a, 0x78, 0x56, 0x34, 0x12], + mov(r9, 0x123456789abcdef0), + ); + eq([0x41, 0xb9, 0x78, 0x56, 0x34, 0x12], mov(r9d, 0x12345678)); + eq([0x66, 0x41, 0xb9, 0x34, 0x12], mov(r9w, 0x1234)); + eq([0x41, 0xb1, 0x12], mov(r9b, 0x12)); + eq([0x41, 0xb0, 0x12], mov(r8b, 0x12)); + eq([0x41, 0xb7, 0x12], mov(r15b, 0x12)); + + eq( + [0x48, 0xb8, 0xf0, 0xde, 0xbc, 0x9a, 0x78, 0x56, 0x34, 0x12], + mov(rax, 0x123456789abcdef0), + ); + eq( + [0x48, 0xbb, 0xf0, 0xde, 0xbc, 0x9a, 0x78, 0x56, 0x34, 0x12], + mov(rbx, 0x123456789abcdef0), + ); + eq( + [0x48, 0xbf, 0xf0, 0xde, 0xbc, 0x9a, 0x78, 0x56, 0x34, 0x12], + mov(rdi, 0x123456789abcdef0), + ); + eq([0xbb, 0x78, 0x56, 0x34, 0x12], mov(ebx, 0x12345678)); + eq([0x66, 0xbb, 0x34, 0x12], mov(bx, 0x1234)); + eq([0xb3, 0x12], mov(bl, 0x12)); + eq([0xb7, 0x12], mov(bh, 0x12)); + eq([0xb4, 0x12], mov(ah, 0x12)); + eq([0x40, 0xb7, 0x12], mov(dil, 0x12)); + } +} diff --git a/src/backend/elf.rs b/src/backend/elf.rs new file mode 100644 index 0000000..f8675d2 --- /dev/null +++ b/src/backend/elf.rs @@ -0,0 +1,119 @@ +use crate::backend::Addr; + +#[repr(C)] +pub struct ELF64Header { + magic: u32, + class: u8, + endianness: u8, + ei_version: u8, + os_abi: u8, + os_abi_ver: u8, + pad: [u8; 7], + ty: u16, + machine: u16, + e_version: u32, + entry: u64, + program_header_offset: u64, + section_header_offset: u64, + flags: u32, + header_size: u16, + program_header_entry_size: u16, + program_header_num: u16, + section_header_entry_size: u16, + section_header_num: u16, + section_header_str_idx: u16, +} + +#[repr(C)] +pub struct ProgramHeader { + ty: u32, + flags: u32, + offset: u64, + vaddr: u64, + paddr: u64, + filesz: u64, + memsz: u64, + align: u64, +} + +#[repr(C)] +pub struct SectionHeader { + name_idx: u32, + ty: u32, + flags: u64, + addr: u64, + offset: u64, + size: u64, + link: u32, + info: u32, + addr_align: u64, + entry_size: u64, +} + +pub enum Arch { + X86_64, + RISCV, +} + +impl Arch { + pub fn machine(&self) -> u16 { + match self { + Arch::X86_64 => 0x3e, + Arch::RISCV => 0xf3, + } + } +} + +// this is currently specialized for riscv64; obviously add params later +pub fn create(program: &[u8], start_offset: Addr) -> Vec { + let addr_start = 0x400000; + let page_size = 0x1000; + // I don't know if I have to add addr_start here, idk how it maps the memory + let program_size = std::mem::size_of_val(program) as u64; + let program_header = ProgramHeader { + ty: 0x1, // LOAD + flags: 0b101, // executable, readable + offset: 0x0, + vaddr: addr_start, + paddr: 0x0, + filesz: program_size, + memsz: program_size, + align: page_size, + }; + let header_len = (size_of::() + size_of::()) as u64; + let program_pos = header_len; + let header = ELF64Header { + magic: 0x7f_45_4c_46u32.swap_bytes(), + class: 0x2, // 64 bit + endianness: 0x1, // little endian + ei_version: 0x1, + os_abi: 0x0, // system-v + os_abi_ver: 0x0, + pad: [0x0; 7], + ty: 0x2, // executable + machine: Arch::X86_64.machine(), + e_version: 0x1, + entry: addr_start + program_pos + start_offset.val(), + program_header_offset: size_of::() as u64, + section_header_offset: 0x0, + // C ABI (16 bit instruction align) + double precision floats + flags: 0x1 | 0x4, + header_size: size_of::() as u16, + program_header_entry_size: size_of::() as u16, + program_header_num: 0x1, + section_header_entry_size: size_of::() as u16, + section_header_num: 0x0, + section_header_str_idx: 0x0, + }; + let mut bytes: Vec = Vec::new(); + unsafe { + bytes.extend(as_u8_slice(&header)); + bytes.extend(as_u8_slice(&program_header)); + } + bytes.extend(program); + bytes +} + +unsafe fn as_u8_slice(p: &T) -> &[u8] { + unsafe { core::slice::from_raw_parts((p as *const T) as *const u8, size_of::()) } +} diff --git a/src/backend/mod.rs b/src/backend/mod.rs index 0374a92..7dc72c8 100644 --- a/src/backend/mod.rs +++ b/src/backend/mod.rs @@ -1 +1,18 @@ pub mod arch; +pub mod elf; +mod test; +pub use arch::*; +pub use test::*; + +#[derive(Debug, Clone, Copy, PartialEq)] +pub struct Addr(u64); +impl Addr { + const NONE: Self = Self(!0); + pub fn val(&self) -> u64 { + self.0 + } +} + +pub enum Asm { + X86_64(x86_64::Asm), +} diff --git a/src/backend/test.rs b/src/backend/test.rs new file mode 100644 index 0000000..339fabb --- /dev/null +++ b/src/backend/test.rs @@ -0,0 +1,31 @@ +use crate::backend::{ + Addr, elf, + x86_64::{Asm, Instr, mov, reg::*}, +}; +use std::{fs::OpenOptions, io::Write, os::unix::fs::OpenOptionsExt, process::Command}; + +pub fn test_x86_64() { + let asm = Asm { + instrs: vec![mov(eax, 1), mov(ebx, 39), Instr::Int { code: 0x80 }], + }; + let mut out = Vec::new(); + asm.compile(&mut out).expect("failed to compile"); + let binary = elf::create(&out, Addr(0)); + let path = "./x86_64_test"; + let mut file = OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .mode(0o750) + .open(path) + .expect("Failed to create file"); + file.write_all(&binary).expect("Failed to write to file"); + file.sync_all().expect("Failed to sync file"); + drop(file); + println!("running..."); + let mut proc = Command::new(path).spawn().expect("failed to run"); + let status = proc.wait().expect("failed to wait"); + if let Some(code) = status.code() { + std::process::exit(code); + } +} diff --git a/src/io/mod.rs b/src/io/mod.rs index 3f657e1..c8ff373 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -26,6 +26,7 @@ impl std::ops::DerefMut for Spanned { } } +#[derive(Debug)] pub struct CompilerMsg { pub spans: Vec, pub msg: String, diff --git a/src/ir/structs/mod.rs b/src/ir/structs/mod.rs index e5bb04e..9618385 100644 --- a/src/ir/structs/mod.rs +++ b/src/ir/structs/mod.rs @@ -1,6 +1,8 @@ mod namespace; pub use namespace::*; +use crate::backend::Asm; + use super::Id; pub struct Fn { @@ -19,6 +21,7 @@ pub enum StatementTy { Define { target: VarId, val: VarId }, Assign { target: VarId, val: VarId }, Call { target: VarId, args: Vec }, + Asm(Asm), } pub struct Var { @@ -29,6 +32,8 @@ pub struct Var { pub enum Type { Unsigned(u8), Signed(u8), + Array(TypeId), + Ptr(TypeId), Infer, } diff --git a/src/main.rs b/src/main.rs index f5fa2d2..a514ac0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,6 +7,7 @@ mod parser; mod parser_ir; fn main() { + return crate::backend::test_x86_64(); let mut args = std::env::args(); let Some(path) = args.nth(1) else { println!("file expected"); diff --git a/src/parser/nodes/asm/mod.rs b/src/parser/nodes/asm/mod.rs index 4202cef..c53a0e9 100644 --- a/src/parser/nodes/asm/mod.rs +++ b/src/parser/nodes/asm/mod.rs @@ -1,9 +1,12 @@ -use crate::parser::{Node, cursor::Token}; +use crate::{ + backend::arch::x86_64::Asm, + parser::{Node, cursor::Token}, +}; pub mod x86_64; pub enum AsmBlock { - X86_64(x86_64::Asm), + X86_64(Asm), } impl Node for AsmBlock { diff --git a/src/parser/nodes/asm/x86_64.rs b/src/parser/nodes/asm/x86_64.rs index f2b8d49..e9cfc7c 100644 --- a/src/parser/nodes/asm/x86_64.rs +++ b/src/parser/nodes/asm/x86_64.rs @@ -1,5 +1,5 @@ use crate::{ - backend::arch::x86_64::RegMode, + backend::arch::x86_64::*, io::{CompilerMsg, Span}, parser::{ Node, @@ -7,20 +7,6 @@ use crate::{ }, }; -pub struct Asm { - instrs: Vec, -} - -pub enum Instr { - Mov { dst: RegMode, src: RegImm }, - Int { code: u64 }, -} - -pub enum RegImm { - Reg(RegMode), - Imm(u64), -} - impl Node for Asm { fn parse(ctx: &mut crate::parser::ParseCtx) -> Result { let mut instrs = Vec::new(); @@ -38,7 +24,9 @@ impl Node for Asm { let Token::Lit(LitTy::Number(num)) = ctx.expect_next()? else { return Err("Expected an immediate".into()); }; - let code = parse_imm(&num, ctx.span)?; + let code = parse_imm(&num, ctx.span)? + .try_into() + .map_err(|_| CompilerMsg::from("Immediate must be a u8"))?; instrs.push(Instr::Int { code }); } _ => { @@ -65,8 +53,7 @@ pub fn parse_imm(mut s: &str, span: Span) -> Result { radix = 16; s = &s[2..]; } - u64::from_str_radix(s, radix) - .map_err(|_| CompilerMsg::from(("invalid immediate", span))) + u64::from_str_radix(s, radix).map_err(|_| CompilerMsg::from(("invalid immediate", span))) } pub fn parse_rmi(ctx: &mut crate::parser::ParseCtx) -> Result { diff --git a/x86_64_test b/x86_64_test new file mode 100755 index 0000000000000000000000000000000000000000..6efef5a86b4e8579c145d5e43beb99a46045b514 GIT binary patch literal 132 zcmb<-^>JfjWMqH=CI&kO5U+y40W1U|!Avm03M3sEEP!etG9a}eF$8cxFo7%{C=W(6 Q2tWjOfHdw_2NGu+0FccFr~m)} literal 0 HcmV?d00001