author | Thomas Bushnell <thomas@gnu.org> | 1997-02-25 21:28:37 +0000
committer | Thomas Bushnell <thomas@gnu.org> | 1997-02-25 21:28:37 +0000
commit | f07a4c844da9f0ecae5bbee1ab94be56505f26f7 (patch)
tree | 12b07c7e578fc1a5f53dbfde2632408491ff2a70 /vm
Initial source
Diffstat (limited to 'vm')
-rw-r--r-- | vm/memory_object.c | 1191
-rw-r--r-- | vm/memory_object.h | 43
-rw-r--r-- | vm/memory_object_default.cli | 28
-rw-r--r-- | vm/memory_object_user.cli | 28
-rw-r--r-- | vm/pmap.h | 267
-rw-r--r-- | vm/vm_debug.c | 499
-rw-r--r-- | vm/vm_external.c | 159
-rw-r--r-- | vm/vm_external.h | 89
-rw-r--r-- | vm/vm_fault.c | 2182
-rw-r--r-- | vm/vm_fault.h | 64
-rw-r--r-- | vm/vm_init.c | 84
-rw-r--r-- | vm/vm_kern.c | 1072
-rw-r--r-- | vm/vm_kern.h | 63
-rw-r--r-- | vm/vm_map.c | 5244
-rw-r--r-- | vm/vm_map.h | 448
-rw-r--r-- | vm/vm_object.c | 3090
-rw-r--r-- | vm/vm_object.h | 374
-rw-r--r-- | vm/vm_page.h | 322
-rw-r--r-- | vm/vm_pageout.c | 924
-rw-r--r-- | vm/vm_pageout.h | 46
-rw-r--r-- | vm/vm_resident.c | 1505
-rw-r--r-- | vm/vm_user.c | 397
-rw-r--r-- | vm/vm_user.h | 50
23 files changed, 18169 insertions, 0 deletions
diff --git a/vm/memory_object.c b/vm/memory_object.c new file mode 100644 index 00000000..a2b0bed8 --- /dev/null +++ b/vm/memory_object.c @@ -0,0 +1,1191 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/memory_object.c + * Author: Michael Wayne Young + * + * External memory management interface control functions. + */ + +/* + * Interface dependencies: + */ + +#include <mach/std_types.h> /* For pointer_t */ +#include <mach/mach_types.h> + +#include <mach/kern_return.h> +#include <vm/vm_object.h> +#include <mach/memory_object.h> +#include <mach/boolean.h> +#include <mach/vm_prot.h> +#include <mach/message.h> + +#include "memory_object_user.h" +#include "memory_object_default.h" + +/* + * Implementation dependencies: + */ +#include <vm/memory_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <vm/pmap.h> /* For copy_to_phys, pmap_clear_modify */ +#include <kern/thread.h> /* For current_thread() */ +#include <kern/host.h> +#include <vm/vm_kern.h> /* For kernel_map, vm_move */ +#include <vm/vm_map.h> /* For vm_map_pageable */ +#include <ipc/ipc_port.h> + +#include <norma_vm.h> +#include <norma_ipc.h> +#if NORMA_VM +#include <norma/xmm_server_rename.h> +#endif NORMA_VM +#include <mach_pagemap.h> +#if MACH_PAGEMAP +#include <vm/vm_external.h> +#endif MACH_PAGEMAP + +typedef int memory_object_lock_result_t; /* moved from below */ + + +ipc_port_t memory_manager_default = IP_NULL; +decl_simple_lock_data(,memory_manager_default_lock) + +/* + * Important note: + * All of these routines gain a reference to the + * object (first argument) as part of the automatic + * argument conversion. Explicit deallocation is necessary. + */ + +#if !NORMA_VM +/* + * If successful, destroys the map copy object. 
+ */ +kern_return_t memory_object_data_provided(object, offset, data, data_cnt, + lock_value) + vm_object_t object; + vm_offset_t offset; + pointer_t data; + unsigned int data_cnt; + vm_prot_t lock_value; +{ + return memory_object_data_supply(object, offset, (vm_map_copy_t) data, + data_cnt, lock_value, FALSE, IP_NULL, + 0); +} +#endif !NORMA_VM + + +kern_return_t memory_object_data_supply(object, offset, data_copy, data_cnt, + lock_value, precious, reply_to, reply_to_type) + register + vm_object_t object; + register + vm_offset_t offset; + vm_map_copy_t data_copy; + unsigned int data_cnt; + vm_prot_t lock_value; + boolean_t precious; + ipc_port_t reply_to; + mach_msg_type_name_t reply_to_type; +{ + kern_return_t result = KERN_SUCCESS; + vm_offset_t error_offset = 0; + register + vm_page_t m; + register + vm_page_t data_m; + vm_size_t original_length; + vm_offset_t original_offset; + vm_page_t *page_list; + boolean_t was_absent; + vm_map_copy_t orig_copy = data_copy; + + /* + * Look for bogus arguments + */ + + if (object == VM_OBJECT_NULL) { + return(KERN_INVALID_ARGUMENT); + } + + if (lock_value & ~VM_PROT_ALL) { + vm_object_deallocate(object); + return(KERN_INVALID_ARGUMENT); + } + + if ((data_cnt % PAGE_SIZE) != 0) { + vm_object_deallocate(object); + return(KERN_INVALID_ARGUMENT); + } + + /* + * Adjust the offset from the memory object to the offset + * within the vm_object. + */ + + original_length = data_cnt; + original_offset = offset; + + assert(data_copy->type == VM_MAP_COPY_PAGE_LIST); + page_list = &data_copy->cpy_page_list[0]; + + vm_object_lock(object); + vm_object_paging_begin(object); + offset -= object->paging_offset; + + /* + * Loop over copy stealing pages for pagein. + */ + + for (; data_cnt > 0 ; data_cnt -= PAGE_SIZE, offset += PAGE_SIZE) { + + assert(data_copy->cpy_npages > 0); + data_m = *page_list; + + if (data_m == VM_PAGE_NULL || data_m->tabled || + data_m->error || data_m->absent || data_m->fictitious) { + + panic("Data_supply: bad page"); + } + + /* + * Look up target page and check its state. + */ + +retry_lookup: + m = vm_page_lookup(object,offset); + if (m == VM_PAGE_NULL) { + was_absent = FALSE; + } + else { + if (m->absent && m->busy) { + + /* + * Page was requested. Free the busy + * page waiting for it. Insertion + * of new page happens below. + */ + + VM_PAGE_FREE(m); + was_absent = TRUE; + } + else { + + /* + * Have to wait for page that is busy and + * not absent. This is probably going to + * be an error, but go back and check. + */ + if (m->busy) { + PAGE_ASSERT_WAIT(m, FALSE); + vm_object_unlock(object); + thread_block((void (*)()) 0); + vm_object_lock(object); + goto retry_lookup; + } + + /* + * Page already present; error. + * This is an error if data is precious. + */ + result = KERN_MEMORY_PRESENT; + error_offset = offset + object->paging_offset; + + break; + } + } + + /* + * Ok to pagein page. Target object now has no page + * at offset. Set the page parameters, then drop + * in new page and set up pageout state. Object is + * still locked here. + * + * Must clear busy bit in page before inserting it. + * Ok to skip wakeup logic because nobody else + * can possibly know about this page. 
+ */ + + data_m->busy = FALSE; + data_m->dirty = FALSE; + pmap_clear_modify(data_m->phys_addr); + + data_m->page_lock = lock_value; + data_m->unlock_request = VM_PROT_NONE; + data_m->precious = precious; + + vm_page_lock_queues(); + vm_page_insert(data_m, object, offset); + + if (was_absent) + vm_page_activate(data_m); + else + vm_page_deactivate(data_m); + + vm_page_unlock_queues(); + + /* + * Null out this page list entry, and advance to next + * page. + */ + + *page_list++ = VM_PAGE_NULL; + + if (--(data_copy->cpy_npages) == 0 && + vm_map_copy_has_cont(data_copy)) { + vm_map_copy_t new_copy; + + vm_object_unlock(object); + + vm_map_copy_invoke_cont(data_copy, &new_copy, &result); + + if (result == KERN_SUCCESS) { + + /* + * Consume on success requires that + * we keep the original vm_map_copy + * around in case something fails. + * Free the old copy if it's not the original + */ + if (data_copy != orig_copy) { + vm_map_copy_discard(data_copy); + } + + if ((data_copy = new_copy) != VM_MAP_COPY_NULL) + page_list = &data_copy->cpy_page_list[0]; + + vm_object_lock(object); + } + else { + vm_object_lock(object); + error_offset = offset + object->paging_offset + + PAGE_SIZE; + break; + } + } + } + + /* + * Send reply if one was requested. + */ + vm_object_paging_end(object); + vm_object_unlock(object); + + if (vm_map_copy_has_cont(data_copy)) + vm_map_copy_abort_cont(data_copy); + + if (IP_VALID(reply_to)) { + memory_object_supply_completed( + reply_to, reply_to_type, + object->pager_request, + original_offset, + original_length, + result, + error_offset); + } + + vm_object_deallocate(object); + + /* + * Consume on success: The final data copy must be + * be discarded if it is not the original. The original + * gets discarded only if this routine succeeds. 
+ */ + if (data_copy != orig_copy) + vm_map_copy_discard(data_copy); + if (result == KERN_SUCCESS) + vm_map_copy_discard(orig_copy); + + + return(result); +} + +kern_return_t memory_object_data_error(object, offset, size, error_value) + vm_object_t object; + vm_offset_t offset; + vm_size_t size; + kern_return_t error_value; +{ + if (object == VM_OBJECT_NULL) + return(KERN_INVALID_ARGUMENT); + + if (size != round_page(size)) + return(KERN_INVALID_ARGUMENT); + +#ifdef lint + /* Error value is ignored at this time */ + error_value++; +#endif + + vm_object_lock(object); + offset -= object->paging_offset; + + while (size != 0) { + register vm_page_t m; + + m = vm_page_lookup(object, offset); + if ((m != VM_PAGE_NULL) && m->busy && m->absent) { + m->error = TRUE; + m->absent = FALSE; + vm_object_absent_release(object); + + PAGE_WAKEUP_DONE(m); + + vm_page_lock_queues(); + vm_page_activate(m); + vm_page_unlock_queues(); + } + + size -= PAGE_SIZE; + offset += PAGE_SIZE; + } + vm_object_unlock(object); + + vm_object_deallocate(object); + return(KERN_SUCCESS); +} + +kern_return_t memory_object_data_unavailable(object, offset, size) + vm_object_t object; + vm_offset_t offset; + vm_size_t size; +{ +#if MACH_PAGEMAP + vm_external_t existence_info = VM_EXTERNAL_NULL; +#endif MACH_PAGEMAP + + if (object == VM_OBJECT_NULL) + return(KERN_INVALID_ARGUMENT); + + if (size != round_page(size)) + return(KERN_INVALID_ARGUMENT); + +#if MACH_PAGEMAP + if ((offset == 0) && (size > VM_EXTERNAL_LARGE_SIZE) && + (object->existence_info == VM_EXTERNAL_NULL)) { + existence_info = vm_external_create(VM_EXTERNAL_SMALL_SIZE); + } +#endif MACH_PAGEMAP + + vm_object_lock(object); +#if MACH_PAGEMAP + if (existence_info != VM_EXTERNAL_NULL) { + object->existence_info = existence_info; + } + if ((offset == 0) && (size > VM_EXTERNAL_LARGE_SIZE)) { + vm_object_unlock(object); + vm_object_deallocate(object); + return(KERN_SUCCESS); + } +#endif MACH_PAGEMAP + offset -= object->paging_offset; + + while (size != 0) { + register vm_page_t m; + + /* + * We're looking for pages that are both busy and + * absent (waiting to be filled), converting them + * to just absent. + * + * Pages that are just busy can be ignored entirely. + */ + + m = vm_page_lookup(object, offset); + if ((m != VM_PAGE_NULL) && m->busy && m->absent) { + PAGE_WAKEUP_DONE(m); + + vm_page_lock_queues(); + vm_page_activate(m); + vm_page_unlock_queues(); + } + size -= PAGE_SIZE; + offset += PAGE_SIZE; + } + + vm_object_unlock(object); + + vm_object_deallocate(object); + return(KERN_SUCCESS); +} + +/* + * Routine: memory_object_lock_page + * + * Description: + * Perform the appropriate lock operations on the + * given page. See the description of + * "memory_object_lock_request" for the meanings + * of the arguments. + * + * Returns an indication that the operation + * completed, blocked, or that the page must + * be cleaned. + */ + +#define MEMORY_OBJECT_LOCK_RESULT_DONE 0 +#define MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK 1 +#define MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN 2 +#define MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN 3 + +memory_object_lock_result_t memory_object_lock_page(m, should_return, + should_flush, prot) + vm_page_t m; + memory_object_return_t should_return; + boolean_t should_flush; + vm_prot_t prot; +{ + /* + * Don't worry about pages for which the kernel + * does not have any data. 
+ */ + + if (m->absent) + return(MEMORY_OBJECT_LOCK_RESULT_DONE); + + /* + * If we cannot change access to the page, + * either because a mapping is in progress + * (busy page) or because a mapping has been + * wired, then give up. + */ + + if (m->busy) + return(MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK); + + assert(!m->fictitious); + + if (m->wire_count != 0) { + /* + * If no change would take place + * anyway, return successfully. + * + * No change means: + * Not flushing AND + * No change to page lock [2 checks] AND + * Don't need to send page to manager + * + * Don't need to send page to manager means: + * No clean or return request OR ( + * Page is not dirty [2 checks] AND ( + * Page is not precious OR + * No request to return precious pages )) + * + * Now isn't that straightforward and obvious ?? ;-) + * + * XXX This doesn't handle sending a copy of a wired + * XXX page to the pager, but that will require some + * XXX significant surgery. + */ + + if (!should_flush && + ((m->page_lock == prot) || (prot == VM_PROT_NO_CHANGE)) && + ((should_return == MEMORY_OBJECT_RETURN_NONE) || + (!m->dirty && !pmap_is_modified(m->phys_addr) && + (!m->precious || + should_return != MEMORY_OBJECT_RETURN_ALL)))) { + /* + * Restart page unlock requests, + * even though no change took place. + * [Memory managers may be expecting + * to see new requests.] + */ + m->unlock_request = VM_PROT_NONE; + PAGE_WAKEUP(m); + + return(MEMORY_OBJECT_LOCK_RESULT_DONE); + } + + return(MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK); + } + + /* + * If the page is to be flushed, allow + * that to be done as part of the protection. + */ + + if (should_flush) + prot = VM_PROT_ALL; + + /* + * Set the page lock. + * + * If we are decreasing permission, do it now; + * let the fault handler take care of increases + * (pmap_page_protect may not increase protection). + */ + + if (prot != VM_PROT_NO_CHANGE) { + if ((m->page_lock ^ prot) & prot) { + pmap_page_protect(m->phys_addr, VM_PROT_ALL & ~prot); + } + m->page_lock = prot; + + /* + * Restart any past unlock requests, even if no + * change resulted. If the manager explicitly + * requested no protection change, then it is assumed + * to be remembering past requests. + */ + + m->unlock_request = VM_PROT_NONE; + PAGE_WAKEUP(m); + } + + /* + * Handle cleaning. + */ + + if (should_return != MEMORY_OBJECT_RETURN_NONE) { + /* + * Check whether the page is dirty. If + * write permission has not been removed, + * this may have unpredictable results. + */ + + if (!m->dirty) + m->dirty = pmap_is_modified(m->phys_addr); + + if (m->dirty || (m->precious && + should_return == MEMORY_OBJECT_RETURN_ALL)) { + /* + * If we weren't planning + * to flush the page anyway, + * we may need to remove the + * page from the pageout + * system and from physical + * maps now. + */ + + vm_page_lock_queues(); + VM_PAGE_QUEUES_REMOVE(m); + vm_page_unlock_queues(); + + if (!should_flush) + pmap_page_protect(m->phys_addr, + VM_PROT_NONE); + + /* + * Cleaning a page will cause + * it to be flushed. + */ + + if (m->dirty) + return(MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN); + else + return(MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN); + } + } + + /* + * Handle flushing + */ + + if (should_flush) { + VM_PAGE_FREE(m); + } else { + extern boolean_t vm_page_deactivate_hint; + + /* + * XXX Make clean but not flush a paging hint, + * and deactivate the pages. This is a hack + * because it overloads flush/clean with + * implementation-dependent meaning. This only + * happens to pages that are already clean. 
+ */ + + if (vm_page_deactivate_hint && + (should_return != MEMORY_OBJECT_RETURN_NONE)) { + vm_page_lock_queues(); + vm_page_deactivate(m); + vm_page_unlock_queues(); + } + } + + return(MEMORY_OBJECT_LOCK_RESULT_DONE); +} + +/* + * Routine: memory_object_lock_request [user interface] + * + * Description: + * Control use of the data associated with the given + * memory object. For each page in the given range, + * perform the following operations, in order: + * 1) restrict access to the page (disallow + * forms specified by "prot"); + * 2) return data to the manager (if "should_return" + * is RETURN_DIRTY and the page is dirty, or + * "should_return" is RETURN_ALL and the page + * is either dirty or precious); and, + * 3) flush the cached copy (if "should_flush" + * is asserted). + * The set of pages is defined by a starting offset + * ("offset") and size ("size"). Only pages with the + * same page alignment as the starting offset are + * considered. + * + * A single acknowledgement is sent (to the "reply_to" + * port) when these actions are complete. If successful, + * the naked send right for reply_to is consumed. + */ + +kern_return_t +memory_object_lock_request(object, offset, size, + should_return, should_flush, prot, + reply_to, reply_to_type) + register vm_object_t object; + register vm_offset_t offset; + register vm_size_t size; + memory_object_return_t should_return; + boolean_t should_flush; + vm_prot_t prot; + ipc_port_t reply_to; + mach_msg_type_name_t reply_to_type; +{ + register vm_page_t m; + vm_offset_t original_offset = offset; + vm_size_t original_size = size; + vm_offset_t paging_offset = 0; + vm_object_t new_object = VM_OBJECT_NULL; + vm_offset_t new_offset = 0; + vm_offset_t last_offset = offset; + int page_lock_result; + int pageout_action = 0; /* '=0' to quiet lint */ + +#define DATA_WRITE_MAX 32 + vm_page_t holding_pages[DATA_WRITE_MAX]; + + /* + * Check for bogus arguments. + */ + if (object == VM_OBJECT_NULL || + ((prot & ~VM_PROT_ALL) != 0 && prot != VM_PROT_NO_CHANGE)) + return (KERN_INVALID_ARGUMENT); + + size = round_page(size); + + /* + * Lock the object, and acquire a paging reference to + * prevent the memory_object and control ports from + * being destroyed. + */ + + vm_object_lock(object); + vm_object_paging_begin(object); + offset -= object->paging_offset; + + /* + * To avoid blocking while scanning for pages, save + * dirty pages to be cleaned all at once. + * + * XXXO A similar strategy could be used to limit the + * number of times that a scan must be restarted for + * other reasons. Those pages that would require blocking + * could be temporarily collected in another list, or + * their offsets could be recorded in a small array. + */ + + /* + * XXX NOTE: May want to consider converting this to a page list + * XXX vm_map_copy interface. Need to understand object + * XXX coalescing implications before doing so. 
+ */ + +#define PAGEOUT_PAGES \ +MACRO_BEGIN \ + vm_map_copy_t copy; \ + register int i; \ + register vm_page_t hp; \ + \ + vm_object_unlock(object); \ + \ + (void) vm_map_copyin_object(new_object, 0, new_offset, ©); \ + \ + if (object->use_old_pageout) { \ + assert(pageout_action == MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN); \ + (void) memory_object_data_write( \ + object->pager, \ + object->pager_request, \ + paging_offset, \ + (pointer_t) copy, \ + new_offset); \ + } \ + else { \ + (void) memory_object_data_return( \ + object->pager, \ + object->pager_request, \ + paging_offset, \ + (pointer_t) copy, \ + new_offset, \ + (pageout_action == MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN), \ + !should_flush); \ + } \ + \ + vm_object_lock(object); \ + \ + for (i = 0; i < atop(new_offset); i++) { \ + hp = holding_pages[i]; \ + if (hp != VM_PAGE_NULL) \ + VM_PAGE_FREE(hp); \ + } \ + \ + new_object = VM_OBJECT_NULL; \ +MACRO_END + + for (; + size != 0; + size -= PAGE_SIZE, offset += PAGE_SIZE) + { + /* + * Limit the number of pages to be cleaned at once. + */ + if (new_object != VM_OBJECT_NULL && + new_offset >= PAGE_SIZE * DATA_WRITE_MAX) + { + PAGEOUT_PAGES; + } + + while ((m = vm_page_lookup(object, offset)) != VM_PAGE_NULL) { + switch ((page_lock_result = memory_object_lock_page(m, + should_return, + should_flush, + prot))) + { + case MEMORY_OBJECT_LOCK_RESULT_DONE: + /* + * End of a cluster of dirty pages. + */ + if (new_object != VM_OBJECT_NULL) { + PAGEOUT_PAGES; + continue; + } + break; + + case MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK: + /* + * Since it is necessary to block, + * clean any dirty pages now. + */ + if (new_object != VM_OBJECT_NULL) { + PAGEOUT_PAGES; + continue; + } + + PAGE_ASSERT_WAIT(m, FALSE); + vm_object_unlock(object); + thread_block((void (*)()) 0); + vm_object_lock(object); + continue; + + case MEMORY_OBJECT_LOCK_RESULT_MUST_CLEAN: + case MEMORY_OBJECT_LOCK_RESULT_MUST_RETURN: + /* + * The clean and return cases are similar. + * + * Mark the page busy since we unlock the + * object below. + */ + m->busy = TRUE; + + /* + * if this would form a discontiguous block, + * clean the old pages and start anew. + * + * NOTE: The first time through here, new_object + * is null, hiding the fact that pageout_action + * is not initialized. + */ + if (new_object != VM_OBJECT_NULL && + (last_offset != offset || + pageout_action != page_lock_result)) { + PAGEOUT_PAGES; + } + + vm_object_unlock(object); + + /* + * If we have not already allocated an object + * for a range of pages to be written, do so + * now. + */ + if (new_object == VM_OBJECT_NULL) { + new_object = vm_object_allocate(original_size); + new_offset = 0; + paging_offset = m->offset + + object->paging_offset; + pageout_action = page_lock_result; + } + + /* + * Move or copy the dirty page into the + * new object. + */ + m = vm_pageout_setup(m, + m->offset + object->paging_offset, + new_object, + new_offset, + should_flush); + + /* + * Save the holding page if there is one. + */ + holding_pages[atop(new_offset)] = m; + new_offset += PAGE_SIZE; + last_offset = offset + PAGE_SIZE; + + vm_object_lock(object); + break; + } + break; + } + } + + /* + * We have completed the scan for applicable pages. + * Clean any pages that have been saved. 
+ */ + if (new_object != VM_OBJECT_NULL) { + PAGEOUT_PAGES; + } + + if (IP_VALID(reply_to)) { + vm_object_unlock(object); + + /* consumes our naked send-once/send right for reply_to */ + (void) memory_object_lock_completed(reply_to, reply_to_type, + object->pager_request, original_offset, original_size); + + vm_object_lock(object); + } + + vm_object_paging_end(object); + vm_object_unlock(object); + vm_object_deallocate(object); + + return (KERN_SUCCESS); +} + +#if !NORMA_VM +/* + * Old version of memory_object_lock_request. + */ +kern_return_t +xxx_memory_object_lock_request(object, offset, size, + should_clean, should_flush, prot, + reply_to, reply_to_type) + register vm_object_t object; + register vm_offset_t offset; + register vm_size_t size; + boolean_t should_clean; + boolean_t should_flush; + vm_prot_t prot; + ipc_port_t reply_to; + mach_msg_type_name_t reply_to_type; +{ + register int should_return; + + if (should_clean) + should_return = MEMORY_OBJECT_RETURN_DIRTY; + else + should_return = MEMORY_OBJECT_RETURN_NONE; + + return(memory_object_lock_request(object,offset,size, + should_return, should_flush, prot, + reply_to, reply_to_type)); +} +#endif !NORMA_VM + +kern_return_t +memory_object_set_attributes_common(object, object_ready, may_cache, + copy_strategy, use_old_pageout) + vm_object_t object; + boolean_t object_ready; + boolean_t may_cache; + memory_object_copy_strategy_t copy_strategy; + boolean_t use_old_pageout; +{ + if (object == VM_OBJECT_NULL) + return(KERN_INVALID_ARGUMENT); + + /* + * Verify the attributes of importance + */ + + switch(copy_strategy) { + case MEMORY_OBJECT_COPY_NONE: + case MEMORY_OBJECT_COPY_CALL: + case MEMORY_OBJECT_COPY_DELAY: + case MEMORY_OBJECT_COPY_TEMPORARY: + break; + default: + vm_object_deallocate(object); + return(KERN_INVALID_ARGUMENT); + } + + if (object_ready) + object_ready = TRUE; + if (may_cache) + may_cache = TRUE; + + vm_object_lock(object); + + /* + * Wake up anyone waiting for the ready attribute + * to become asserted. + */ + + if (object_ready && !object->pager_ready) { + object->use_old_pageout = use_old_pageout; + vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY); + } + + /* + * Copy the attributes + */ + + object->can_persist = may_cache; + object->pager_ready = object_ready; + if (copy_strategy == MEMORY_OBJECT_COPY_TEMPORARY) { + object->temporary = TRUE; + } else { + object->copy_strategy = copy_strategy; + } + + vm_object_unlock(object); + + vm_object_deallocate(object); + + return(KERN_SUCCESS); +} + +#if !NORMA_VM + +/* + * XXX rpd claims that reply_to could be obviated in favor of a client + * XXX stub that made change_attributes an RPC. Need investigation. + */ + +kern_return_t memory_object_change_attributes(object, may_cache, + copy_strategy, reply_to, reply_to_type) + vm_object_t object; + boolean_t may_cache; + memory_object_copy_strategy_t copy_strategy; + ipc_port_t reply_to; + mach_msg_type_name_t reply_to_type; +{ + kern_return_t result; + + /* + * Do the work and throw away our object reference. It + * is important that the object reference be deallocated + * BEFORE sending the reply. The whole point of the reply + * is that it shows up after the terminate message that + * may be generated by setting the object uncacheable. + * + * XXX may_cache may become a tri-valued variable to handle + * XXX uncache if not in use. 
+ */ + result = memory_object_set_attributes_common(object, TRUE, + may_cache, copy_strategy, + FALSE); + + if (IP_VALID(reply_to)) { + + /* consumes our naked send-once/send right for reply_to */ + (void) memory_object_change_completed(reply_to, reply_to_type, + may_cache, copy_strategy); + + } + + return(result); +} + +kern_return_t +memory_object_set_attributes(object, object_ready, may_cache, copy_strategy) + vm_object_t object; + boolean_t object_ready; + boolean_t may_cache; + memory_object_copy_strategy_t copy_strategy; +{ + return memory_object_set_attributes_common(object, object_ready, + may_cache, copy_strategy, + TRUE); +} + +kern_return_t memory_object_ready(object, may_cache, copy_strategy) + vm_object_t object; + boolean_t may_cache; + memory_object_copy_strategy_t copy_strategy; +{ + return memory_object_set_attributes_common(object, TRUE, + may_cache, copy_strategy, + FALSE); +} +#endif !NORMA_VM + +kern_return_t memory_object_get_attributes(object, object_ready, + may_cache, copy_strategy) + vm_object_t object; + boolean_t *object_ready; + boolean_t *may_cache; + memory_object_copy_strategy_t *copy_strategy; +{ + if (object == VM_OBJECT_NULL) + return(KERN_INVALID_ARGUMENT); + + vm_object_lock(object); + *may_cache = object->can_persist; + *object_ready = object->pager_ready; + *copy_strategy = object->copy_strategy; + vm_object_unlock(object); + + vm_object_deallocate(object); + + return(KERN_SUCCESS); +} + +/* + * If successful, consumes the supplied naked send right. + */ +kern_return_t vm_set_default_memory_manager(host, default_manager) + host_t host; + ipc_port_t *default_manager; +{ + ipc_port_t current_manager; + ipc_port_t new_manager; + ipc_port_t returned_manager; + + if (host == HOST_NULL) + return(KERN_INVALID_HOST); + + new_manager = *default_manager; + simple_lock(&memory_manager_default_lock); + current_manager = memory_manager_default; + + if (new_manager == IP_NULL) { + /* + * Retrieve the current value. + */ + + returned_manager = ipc_port_copy_send(current_manager); + } else { + /* + * Retrieve the current value, + * and replace it with the supplied value. + * We consume the supplied naked send right. + */ + + returned_manager = current_manager; + memory_manager_default = new_manager; + + /* + * In case anyone's been waiting for a memory + * manager to be established, wake them up. + */ + + thread_wakeup((event_t) &memory_manager_default); + } + + simple_unlock(&memory_manager_default_lock); + + *default_manager = returned_manager; + return(KERN_SUCCESS); +} + +/* + * Routine: memory_manager_default_reference + * Purpose: + * Returns a naked send right for the default + * memory manager. The returned right is always + * valid (not IP_NULL or IP_DEAD). + */ + +ipc_port_t memory_manager_default_reference() +{ + ipc_port_t current_manager; + + simple_lock(&memory_manager_default_lock); + + while (current_manager = ipc_port_copy_send(memory_manager_default), + !IP_VALID(current_manager)) { + thread_sleep((event_t) &memory_manager_default, + simple_lock_addr(memory_manager_default_lock), + FALSE); + simple_lock(&memory_manager_default_lock); + } + + simple_unlock(&memory_manager_default_lock); + + return current_manager; +} + +/* + * Routine: memory_manager_default_port + * Purpose: + * Returns true if the receiver for the port + * is the default memory manager. + * + * This is a hack to let ds_read_done + * know when it should keep memory wired. 
+ */ + +boolean_t memory_manager_default_port(port) + ipc_port_t port; +{ + ipc_port_t current; + boolean_t result; + + simple_lock(&memory_manager_default_lock); + current = memory_manager_default; + if (IP_VALID(current)) { + /* + * There is no point in bothering to lock + * both ports, which would be painful to do. + * If the receive rights are moving around, + * we might be inaccurate. + */ + + result = port->ip_receiver == current->ip_receiver; + } else + result = FALSE; + simple_unlock(&memory_manager_default_lock); + + return result; +} + +void memory_manager_default_init() +{ + memory_manager_default = IP_NULL; + simple_lock_init(&memory_manager_default_lock); +} diff --git a/vm/memory_object.h b/vm/memory_object.h new file mode 100644 index 00000000..9afa0623 --- /dev/null +++ b/vm/memory_object.h @@ -0,0 +1,43 @@ +/* + * Mach Operating System + * Copyright (c) 1991 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie the + * rights to redistribute these changes. + */ + +#ifndef _VM_MEMORY_OBJECT_H_ +#define _VM_MEMORY_OBJECT_H_ + +#include <mach/boolean.h> + +/* + * We use "struct ipc_port *" instead of "ipc_port_t" + * to avoid include file circularities. + */ + +extern struct ipc_port *memory_manager_default_reference(); +extern boolean_t memory_manager_default_port(); +extern void memory_manager_default_init(); + +extern struct ipc_port *memory_manager_default; + +#endif _VM_MEMORY_OBJECT_H_ diff --git a/vm/memory_object_default.cli b/vm/memory_object_default.cli new file mode 100644 index 00000000..998a9864 --- /dev/null +++ b/vm/memory_object_default.cli @@ -0,0 +1,28 @@ +/* + * Copyright (c) 1994 The University of Utah and + * the Computer Systems Laboratory at the University of Utah (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software is hereby + * granted provided that (1) source code retains these copyright, permission, + * and disclaimer notices, and (2) redistributions including binaries + * reproduce the notices in supporting documentation, and (3) all advertising + * materials mentioning features or use of this software display the following + * acknowledgement: ``This product includes software developed by the + * Computer Systems Laboratory at the University of Utah.'' + * + * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS + * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF + * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * CSL requests users of this software to return to csl-dist@cs.utah.edu any + * improvements that they make and grant CSL redistribution rights. 
+ * + * Author: Bryan Ford, University of Utah CSL + */ +/* This is a client presentation file. */ + +#define KERNEL_USER 1 +#define SEQNOS 1 + +#include <mach/memory_object_default.defs> diff --git a/vm/memory_object_user.cli b/vm/memory_object_user.cli new file mode 100644 index 00000000..2bba41fc --- /dev/null +++ b/vm/memory_object_user.cli @@ -0,0 +1,28 @@ +/* + * Copyright (c) 1994 The University of Utah and + * the Computer Systems Laboratory at the University of Utah (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software is hereby + * granted provided that (1) source code retains these copyright, permission, + * and disclaimer notices, and (2) redistributions including binaries + * reproduce the notices in supporting documentation, and (3) all advertising + * materials mentioning features or use of this software display the following + * acknowledgement: ``This product includes software developed by the + * Computer Systems Laboratory at the University of Utah.'' + * + * THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF THIS SOFTWARE IN ITS "AS + * IS" CONDITION. THE UNIVERSITY OF UTAH AND CSL DISCLAIM ANY LIABILITY OF + * ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * CSL requests users of this software to return to csl-dist@cs.utah.edu any + * improvements that they make and grant CSL redistribution rights. + * + * Author: Bryan Ford, University of Utah CSL + */ +/* This is a client presentation file. */ + +#define KERNEL_USER 1 +#define SEQNOS 1 + +#include <mach/memory_object.defs> diff --git a/vm/pmap.h b/vm/pmap.h new file mode 100644 index 00000000..f9a949ed --- /dev/null +++ b/vm/pmap.h @@ -0,0 +1,267 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/pmap.h + * Author: Avadis Tevanian, Jr. + * Date: 1985 + * + * Machine address mapping definitions -- machine-independent + * section. [For machine-dependent section, see "machine/pmap.h".] + */ + +#ifndef _VM_PMAP_H_ +#define _VM_PMAP_H_ + +#include <machine/pmap.h> +#include <mach/machine/vm_types.h> +#include <mach/vm_prot.h> +#include <mach/boolean.h> + +/* + * The following is a description of the interface to the + * machine-dependent "physical map" data structure. The module + * must provide a "pmap_t" data type that represents the + * set of valid virtual-to-physical addresses for one user + * address space. 
[The kernel address space is represented + * by a distinguished "pmap_t".] The routines described manage + * this type, install and update virtual-to-physical mappings, + * and perform operations on physical addresses common to + * many address spaces. + */ + +/* + * Routines used for initialization. + * There is traditionally also a pmap_bootstrap, + * used very early by machine-dependent code, + * but it is not part of the interface. + */ + +extern vm_offset_t pmap_steal_memory(); /* During VM initialization, + * steal a chunk of memory. + */ +extern unsigned int pmap_free_pages(); /* During VM initialization, + * report remaining unused + * physical pages. + */ +extern void pmap_startup(); /* During VM initialization, + * use remaining physical pages + * to allocate page frames. + */ +extern void pmap_init(); /* Initialization, + * after kernel runs + * in virtual memory. + */ + +#ifndef MACHINE_PAGES +/* + * If machine/pmap.h defines MACHINE_PAGES, it must implement + * the above functions. The pmap module has complete control. + * Otherwise, it must implement + * pmap_free_pages + * pmap_virtual_space + * pmap_next_page + * pmap_init + * and vm/vm_resident.c implements pmap_steal_memory and pmap_startup + * using pmap_free_pages, pmap_next_page, pmap_virtual_space, + * and pmap_enter. pmap_free_pages may over-estimate the number + * of unused physical pages, and pmap_next_page may return FALSE + * to indicate that there are no more unused pages to return. + * However, for best performance pmap_free_pages should be accurate. + */ + +extern boolean_t pmap_next_page(); /* During VM initialization, + * return the next unused + * physical page. + */ +extern void pmap_virtual_space(); /* During VM initialization, + * report virtual space + * available for the kernel. + */ +#endif MACHINE_PAGES + +/* + * Routines to manage the physical map data structure. + */ + +/* Create a pmap_t. */ +pmap_t pmap_create(vm_size_t size); + +/* Return the kernel's pmap_t. */ +#ifndef pmap_kernel +extern pmap_t pmap_kernel(void); +#endif pmap_kernel + +/* Gain and release a reference. */ +extern void pmap_reference(pmap_t pmap); +extern void pmap_destroy(pmap_t pmap); + +/* Enter a mapping */ +extern void pmap_enter(pmap_t pmap, vm_offset_t va, vm_offset_t pa, + vm_prot_t prot, boolean_t wired); + + +/* + * Routines that operate on ranges of virtual addresses. + */ + +/* Remove mappings. */ +void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva); + +/* Change protections. */ +void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot); + +/* + * Routines to set up hardware state for physical maps to be used. + */ +extern void pmap_activate(); /* Prepare pmap_t to run + * on a given processor. + */ +extern void pmap_deactivate(); /* Release pmap_t from + * use on processor. + */ + + +/* + * Routines that operate on physical addresses. + */ + +/* Restrict access to page. */ +void pmap_page_protect(vm_offset_t pa, vm_prot_t prot); + +/* + * Routines to manage reference/modify bits based on + * physical addresses, simulating them if not provided + * by the hardware. 
+ */ + +/* Clear reference bit */ +void pmap_clear_reference(vm_offset_t pa); + +/* Return reference bit */ +#ifndef pmap_is_referenced +boolean_t pmap_is_referenced(vm_offset_t pa); +#endif pmap_is_referenced + +/* Clear modify bit */ +void pmap_clear_modify(vm_offset_t pa); + +/* Return modify bit */ +boolean_t pmap_is_modified(vm_offset_t pa); + + +/* + * Statistics routines + */ +extern void pmap_statistics(); /* Return statistics */ + +#ifndef pmap_resident_count +extern int pmap_resident_count(); +#endif pmap_resident_count + +/* + * Sundry required routines + */ +extern vm_offset_t pmap_extract(); /* Return a virtual-to-physical + * mapping, if possible. + */ + +extern boolean_t pmap_access(); /* Is virtual address valid? */ + +extern void pmap_collect(); /* Perform garbage + * collection, if any + */ + +extern void pmap_change_wiring(); /* Specify pageability */ + +#ifndef pmap_phys_address +extern vm_offset_t pmap_phys_address(); /* Transform address + * returned by device + * driver mapping function + * to physical address + * known to this module. + */ +#endif pmap_phys_address +#ifndef pmap_phys_to_frame +extern int pmap_phys_to_frame(); /* Inverse of + * pmap_phys_address, + * for use by device driver + * mapping function in + * machine-independent + * pseudo-devices. + */ +#endif pmap_phys_to_frame + +/* + * Optional routines + */ +#ifndef pmap_copy +extern void pmap_copy(); /* Copy range of + * mappings, if desired. + */ +#endif pmap_copy +#ifndef pmap_attribute +extern kern_return_t pmap_attribute(); /* Get/Set special + * memory attributes + */ +#endif pmap_attribute + +/* + * Routines defined as macros. + */ +#ifndef PMAP_ACTIVATE_USER +#define PMAP_ACTIVATE_USER(pmap, thread, cpu) { \ + if ((pmap) != kernel_pmap) \ + PMAP_ACTIVATE(pmap, thread, cpu); \ +} +#endif PMAP_ACTIVATE_USER + +#ifndef PMAP_DEACTIVATE_USER +#define PMAP_DEACTIVATE_USER(pmap, thread, cpu) { \ + if ((pmap) != kernel_pmap) \ + PMAP_DEACTIVATE(pmap, thread, cpu); \ +} +#endif PMAP_DEACTIVATE_USER + +#ifndef PMAP_ACTIVATE_KERNEL +#define PMAP_ACTIVATE_KERNEL(cpu) \ + PMAP_ACTIVATE(kernel_pmap, THREAD_NULL, cpu) +#endif PMAP_ACTIVATE_KERNEL + +#ifndef PMAP_DEACTIVATE_KERNEL +#define PMAP_DEACTIVATE_KERNEL(cpu) \ + PMAP_DEACTIVATE(kernel_pmap, THREAD_NULL, cpu) +#endif PMAP_DEACTIVATE_KERNEL + +/* + * Exported data structures + */ + +extern pmap_t kernel_pmap; /* The kernel's map */ + +#endif _VM_PMAP_H_ diff --git a/vm/vm_debug.c b/vm/vm_debug.c new file mode 100644 index 00000000..17c8c311 --- /dev/null +++ b/vm/vm_debug.c @@ -0,0 +1,499 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_debug.c. + * Author: Rich Draves + * Date: March, 1990 + * + * Exported kernel calls. See mach_debug/mach_debug.defs. + */ + +#include <mach_vm_debug.h> +#if MACH_VM_DEBUG + +#include <kern/thread.h> +#include <mach/kern_return.h> +#include <mach/machine/vm_types.h> +#include <mach/memory_object.h> +#include <mach/vm_prot.h> +#include <mach/vm_inherit.h> +#include <mach/vm_param.h> +#include <mach_debug/vm_info.h> +#include <mach_debug/hash_info.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_object.h> +#include <kern/task.h> +#include <kern/host.h> +#include <ipc/ipc_port.h> + + + +/* + * Routine: vm_object_real_name + * Purpose: + * Convert a VM object to a name port. + * Conditions: + * Takes object and port locks. + * Returns: + * A naked send right for the object's name port, + * or IP_NULL if the object or its name port is null. + */ + +ipc_port_t +vm_object_real_name(object) + vm_object_t object; +{ + ipc_port_t port = IP_NULL; + + if (object != VM_OBJECT_NULL) { + vm_object_lock(object); + if (object->pager_name != IP_NULL) + port = ipc_port_make_send(object->pager_name); + vm_object_unlock(object); + } + + return port; +} + +/* + * Routine: mach_vm_region_info [kernel call] + * Purpose: + * Retrieve information about a VM region, + * including info about the object chain. + * Conditions: + * Nothing locked. + * Returns: + * KERN_SUCCESS Retrieve region/object info. + * KERN_INVALID_TASK The map is null. + * KERN_NO_SPACE There is no entry at/after the address. 
+ */ + +kern_return_t +mach_vm_region_info(map, address, regionp, portp) + vm_map_t map; + vm_offset_t address; + vm_region_info_t *regionp; + ipc_port_t *portp; +{ + vm_map_t cmap; /* current map in traversal */ + vm_map_t nmap; /* next map to look at */ + vm_map_entry_t entry; /* entry in current map */ + vm_object_t object; + + if (map == VM_MAP_NULL) + return KERN_INVALID_TASK; + + /* find the entry containing (or following) the address */ + + vm_map_lock_read(map); + for (cmap = map;;) { + /* cmap is read-locked */ + + if (!vm_map_lookup_entry(cmap, address, &entry)) { + entry = entry->vme_next; + if (entry == vm_map_to_entry(cmap)) { + if (map == cmap) { + vm_map_unlock_read(cmap); + return KERN_NO_SPACE; + } + + /* back out to top-level & skip this submap */ + + address = vm_map_max(cmap); + vm_map_unlock_read(cmap); + vm_map_lock_read(map); + cmap = map; + continue; + } + } + + if (entry->is_sub_map) { + /* move down to the sub map */ + + nmap = entry->object.sub_map; + vm_map_lock_read(nmap); + vm_map_unlock_read(cmap); + cmap = nmap; + continue; + } else { + break; + } + /*NOTREACHED*/ + } + + + assert(entry->vme_start < entry->vme_end); + + regionp->vri_start = entry->vme_start; + regionp->vri_end = entry->vme_end; + + /* attributes from the real entry */ + + regionp->vri_protection = entry->protection; + regionp->vri_max_protection = entry->max_protection; + regionp->vri_inheritance = entry->inheritance; + regionp->vri_wired_count = entry->wired_count; + regionp->vri_user_wired_count = entry->user_wired_count; + + object = entry->object.vm_object; + *portp = vm_object_real_name(object); + regionp->vri_object = (vm_offset_t) object; + regionp->vri_offset = entry->offset; + regionp->vri_needs_copy = entry->needs_copy; + + regionp->vri_sharing = entry->is_shared; + + vm_map_unlock_read(cmap); + return KERN_SUCCESS; +} + +/* + * Routine: mach_vm_object_info [kernel call] + * Purpose: + * Retrieve information about a VM object. + * Conditions: + * Nothing locked. + * Returns: + * KERN_SUCCESS Retrieved object info. + * KERN_INVALID_ARGUMENT The object is null. + */ + +kern_return_t +mach_vm_object_info(object, infop, shadowp, copyp) + vm_object_t object; + vm_object_info_t *infop; + ipc_port_t *shadowp; + ipc_port_t *copyp; +{ + vm_object_info_t info; + vm_object_info_state_t state; + ipc_port_t shadow, copy; + + if (object == VM_OBJECT_NULL) + return KERN_INVALID_ARGUMENT; + + /* + * Because of lock-ordering/deadlock considerations, + * we can't use vm_object_real_name for the copy object. 
+ */ + + retry: + vm_object_lock(object); + copy = IP_NULL; + if (object->copy != VM_OBJECT_NULL) { + if (!vm_object_lock_try(object->copy)) { + vm_object_unlock(object); + simple_lock_pause(); /* wait a bit */ + goto retry; + } + + if (object->copy->pager_name != IP_NULL) + copy = ipc_port_make_send(object->copy->pager_name); + vm_object_unlock(object->copy); + } + shadow = vm_object_real_name(object->shadow); + + info.voi_object = (vm_offset_t) object; + info.voi_pagesize = PAGE_SIZE; + info.voi_size = object->size; + info.voi_ref_count = object->ref_count; + info.voi_resident_page_count = object->resident_page_count; + info.voi_absent_count = object->absent_count; + info.voi_copy = (vm_offset_t) object->copy; + info.voi_shadow = (vm_offset_t) object->shadow; + info.voi_shadow_offset = object->shadow_offset; + info.voi_paging_offset = object->paging_offset; + info.voi_copy_strategy = object->copy_strategy; + info.voi_last_alloc = object->last_alloc; + info.voi_paging_in_progress = object->paging_in_progress; + + state = 0; + if (object->pager_created) + state |= VOI_STATE_PAGER_CREATED; + if (object->pager_initialized) + state |= VOI_STATE_PAGER_INITIALIZED; + if (object->pager_ready) + state |= VOI_STATE_PAGER_READY; + if (object->can_persist) + state |= VOI_STATE_CAN_PERSIST; + if (object->internal) + state |= VOI_STATE_INTERNAL; + if (object->temporary) + state |= VOI_STATE_TEMPORARY; + if (object->alive) + state |= VOI_STATE_ALIVE; + if (object->lock_in_progress) + state |= VOI_STATE_LOCK_IN_PROGRESS; + if (object->lock_restart) + state |= VOI_STATE_LOCK_RESTART; + if (object->use_old_pageout) + state |= VOI_STATE_USE_OLD_PAGEOUT; + info.voi_state = state; + vm_object_unlock(object); + + *infop = info; + *shadowp = shadow; + *copyp = copy; + return KERN_SUCCESS; +} + +#define VPI_STATE_NODATA (VPI_STATE_BUSY|VPI_STATE_FICTITIOUS| \ + VPI_STATE_PRIVATE|VPI_STATE_ABSENT) + +/* + * Routine: mach_vm_object_pages [kernel call] + * Purpose: + * Retrieve information about the pages in a VM object. + * Conditions: + * Nothing locked. Obeys CountInOut protocol. + * Returns: + * KERN_SUCCESS Retrieved object info. + * KERN_INVALID_ARGUMENT The object is null. + * KERN_RESOURCE_SHORTAGE Couldn't allocate memory. 
+ */ + +kern_return_t +mach_vm_object_pages(object, pagesp, countp) + vm_object_t object; + vm_page_info_array_t *pagesp; + natural_t *countp; +{ + vm_size_t size; + vm_offset_t addr; + vm_page_info_t *pages; + unsigned int potential, actual, count; + vm_page_t p; + kern_return_t kr; + + if (object == VM_OBJECT_NULL) + return KERN_INVALID_ARGUMENT; + + /* start with in-line memory */ + + pages = *pagesp; + potential = *countp; + + for (size = 0;;) { + vm_object_lock(object); + actual = object->resident_page_count; + if (actual <= potential) + break; + vm_object_unlock(object); + + if (pages != *pagesp) + kmem_free(ipc_kernel_map, addr, size); + + size = round_page(actual * sizeof *pages); + kr = kmem_alloc(ipc_kernel_map, &addr, size); + if (kr != KERN_SUCCESS) + return kr; + + pages = (vm_page_info_t *) addr; + potential = size/sizeof *pages; + } + /* object is locked, we have enough wired memory */ + + count = 0; + queue_iterate(&object->memq, p, vm_page_t, listq) { + vm_page_info_t *info = &pages[count++]; + vm_page_info_state_t state = 0; + + info->vpi_offset = p->offset; + info->vpi_phys_addr = p->phys_addr; + info->vpi_wire_count = p->wire_count; + info->vpi_page_lock = p->page_lock; + info->vpi_unlock_request = p->unlock_request; + + if (p->busy) + state |= VPI_STATE_BUSY; + if (p->wanted) + state |= VPI_STATE_WANTED; + if (p->tabled) + state |= VPI_STATE_TABLED; + if (p->fictitious) + state |= VPI_STATE_FICTITIOUS; + if (p->private) + state |= VPI_STATE_PRIVATE; + if (p->absent) + state |= VPI_STATE_ABSENT; + if (p->error) + state |= VPI_STATE_ERROR; + if (p->dirty) + state |= VPI_STATE_DIRTY; + if (p->precious) + state |= VPI_STATE_PRECIOUS; + if (p->overwriting) + state |= VPI_STATE_OVERWRITING; + + if (((state & (VPI_STATE_NODATA|VPI_STATE_DIRTY)) == 0) && + pmap_is_modified(p->phys_addr)) { + state |= VPI_STATE_DIRTY; + p->dirty = TRUE; + } + + vm_page_lock_queues(); + if (p->inactive) + state |= VPI_STATE_INACTIVE; + if (p->active) + state |= VPI_STATE_ACTIVE; + if (p->laundry) + state |= VPI_STATE_LAUNDRY; + if (p->free) + state |= VPI_STATE_FREE; + if (p->reference) + state |= VPI_STATE_REFERENCE; + + if (((state & (VPI_STATE_NODATA|VPI_STATE_REFERENCE)) == 0) && + pmap_is_referenced(p->phys_addr)) { + state |= VPI_STATE_REFERENCE; + p->reference = TRUE; + } + vm_page_unlock_queues(); + + info->vpi_state = state; + } + + if (object->resident_page_count != count) + panic("mach_vm_object_pages"); + vm_object_unlock(object); + + if (pages == *pagesp) { + /* data fit in-line; nothing to deallocate */ + + *countp = actual; + } else if (actual == 0) { + kmem_free(ipc_kernel_map, addr, size); + + *countp = 0; + } else { + vm_size_t size_used, rsize_used; + vm_map_copy_t copy; + + /* kmem_alloc doesn't zero memory */ + + size_used = actual * sizeof *pages; + rsize_used = round_page(size_used); + + if (rsize_used != size) + kmem_free(ipc_kernel_map, + addr + rsize_used, size - rsize_used); + + if (size_used != rsize_used) + bzero((char *) (addr + size_used), + rsize_used - size_used); + + kr = vm_map_copyin(ipc_kernel_map, addr, rsize_used, + TRUE, ©); + assert(kr == KERN_SUCCESS); + + *pagesp = (vm_page_info_t *) copy; + *countp = actual; + } + + return KERN_SUCCESS; +} + +#endif MACH_VM_DEBUG + +/* + * Routine: host_virtual_physical_table_info + * Purpose: + * Return information about the VP table. + * Conditions: + * Nothing locked. Obeys CountInOut protocol. + * Returns: + * KERN_SUCCESS Returned information. + * KERN_INVALID_HOST The host is null. 
+ * KERN_RESOURCE_SHORTAGE Couldn't allocate memory. + */ + +kern_return_t +host_virtual_physical_table_info(host, infop, countp) + host_t host; + hash_info_bucket_array_t *infop; + natural_t *countp; +{ + vm_offset_t addr; + vm_size_t size = 0;/* '=0' to quiet gcc warnings */ + hash_info_bucket_t *info; + unsigned int potential, actual; + kern_return_t kr; + + if (host == HOST_NULL) + return KERN_INVALID_HOST; + + /* start with in-line data */ + + info = *infop; + potential = *countp; + + for (;;) { + actual = vm_page_info(info, potential); + if (actual <= potential) + break; + + /* allocate more memory */ + + if (info != *infop) + kmem_free(ipc_kernel_map, addr, size); + + size = round_page(actual * sizeof *info); + kr = kmem_alloc_pageable(ipc_kernel_map, &addr, size); + if (kr != KERN_SUCCESS) + return KERN_RESOURCE_SHORTAGE; + + info = (hash_info_bucket_t *) addr; + potential = size/sizeof *info; + } + + if (info == *infop) { + /* data fit in-line; nothing to deallocate */ + + *countp = actual; + } else if (actual == 0) { + kmem_free(ipc_kernel_map, addr, size); + + *countp = 0; + } else { + vm_map_copy_t copy; + vm_size_t used; + + used = round_page(actual * sizeof *info); + + if (used != size) + kmem_free(ipc_kernel_map, addr + used, size - used); + + kr = vm_map_copyin(ipc_kernel_map, addr, used, + TRUE, ©); + assert(kr == KERN_SUCCESS); + + *infop = (hash_info_bucket_t *) copy; + *countp = actual; + } + + return KERN_SUCCESS; +} diff --git a/vm/vm_external.c b/vm/vm_external.c new file mode 100644 index 00000000..da591375 --- /dev/null +++ b/vm/vm_external.c @@ -0,0 +1,159 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * This module maintains information about the presence of + * pages not in memory. Since an external memory object + * must maintain a complete knowledge of its contents, this + * information takes the form of hints. + */ + +#include <mach/boolean.h> +#include <kern/zalloc.h> +#include <vm/vm_external.h> +#include <mach/vm_param.h> +#include <kern/assert.h> + + + +boolean_t vm_external_unsafe = FALSE; + +zone_t vm_external_zone = ZONE_NULL; + +/* + * The implementation uses bit arrays to record whether + * a page has been written to external storage. For + * convenience, these bit arrays come in two sizes + * (measured in bytes). 
+ */ + +#define SMALL_SIZE (VM_EXTERNAL_SMALL_SIZE/8) +#define LARGE_SIZE (VM_EXTERNAL_LARGE_SIZE/8) + +zone_t vm_object_small_existence_map_zone; +zone_t vm_object_large_existence_map_zone; + + +vm_external_t vm_external_create(size) + vm_offset_t size; +{ + vm_external_t result; + vm_size_t bytes; + + if (vm_external_zone == ZONE_NULL) + return(VM_EXTERNAL_NULL); + + result = (vm_external_t) zalloc(vm_external_zone); + result->existence_map = (char *) 0; + + bytes = (atop(size) + 07) >> 3; + if (bytes <= SMALL_SIZE) { + result->existence_map = + (char *) zalloc(vm_object_small_existence_map_zone); + result->existence_size = SMALL_SIZE; + } else if (bytes <= LARGE_SIZE) { + result->existence_map = + (char *) zalloc(vm_object_large_existence_map_zone); + result->existence_size = LARGE_SIZE; + } + return(result); +} + +void vm_external_destroy(e) + vm_external_t e; +{ + if (e == VM_EXTERNAL_NULL) + return; + + if (e->existence_map != (char *) 0) { + if (e->existence_size <= SMALL_SIZE) { + zfree(vm_object_small_existence_map_zone, + (vm_offset_t) e->existence_map); + } else { + zfree(vm_object_large_existence_map_zone, + (vm_offset_t) e->existence_map); + } + } + zfree(vm_external_zone, (vm_offset_t) e); +} + +vm_external_state_t _vm_external_state_get(e, offset) + vm_external_t e; + vm_offset_t offset; +{ + unsigned + int bit, byte; + + if (vm_external_unsafe || + (e == VM_EXTERNAL_NULL) || + (e->existence_map == (char *) 0)) + return(VM_EXTERNAL_STATE_UNKNOWN); + + bit = atop(offset); + byte = bit >> 3; + if (byte >= e->existence_size) return (VM_EXTERNAL_STATE_UNKNOWN); + return( (e->existence_map[byte] & (1 << (bit & 07))) ? + VM_EXTERNAL_STATE_EXISTS : VM_EXTERNAL_STATE_ABSENT ); +} + +void vm_external_state_set(e, offset, state) + vm_external_t e; + vm_offset_t offset; + vm_external_state_t state; +{ + unsigned + int bit, byte; + + if ((e == VM_EXTERNAL_NULL) || (e->existence_map == (char *) 0)) + return; + + if (state != VM_EXTERNAL_STATE_EXISTS) + return; + + bit = atop(offset); + byte = bit >> 3; + if (byte >= e->existence_size) return; + e->existence_map[byte] |= (1 << (bit & 07)); +} + +void vm_external_module_initialize() +{ + vm_size_t size = (vm_size_t) sizeof(struct vm_external); + + vm_external_zone = zinit(size, 16*1024*size, size, + 0, "external page bitmaps"); + + vm_object_small_existence_map_zone = zinit(SMALL_SIZE, + round_page(LARGE_SIZE * SMALL_SIZE), + round_page(SMALL_SIZE), + ZONE_EXHAUSTIBLE, + "object small existence maps"); + + vm_object_large_existence_map_zone = zinit(LARGE_SIZE, + round_page(8 * LARGE_SIZE), + round_page(LARGE_SIZE), + ZONE_EXHAUSTIBLE, + "object large existence maps"); +} diff --git a/vm/vm_external.h b/vm/vm_external.h new file mode 100644 index 00000000..70ffd650 --- /dev/null +++ b/vm/vm_external.h @@ -0,0 +1,89 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ + +#ifndef _VM_VM_EXTERNAL_H_ +#define _VM_VM_EXTERNAL_H_ + +/* + * External page management hint technology + * + * The data structure exported by this module maintains + * a (potentially incomplete) map of the pages written + * to external storage for a range of virtual memory. + */ + +/* + * The data structure representing the state of pages + * on external storage. + */ + +typedef struct vm_external { + int existence_size; /* Size of the following bitmap */ + char *existence_map; /* A bitmap of pages that have + * been written to backing + * storage. + */ + int existence_count;/* Number of bits turned on in + * existence_map. + */ +} *vm_external_t; + +#define VM_EXTERNAL_NULL ((vm_external_t) 0) + +#define VM_EXTERNAL_SMALL_SIZE 128 +#define VM_EXTERNAL_LARGE_SIZE 8192 + +/* + * The states that may be recorded for a page of external storage. + */ + +typedef int vm_external_state_t; +#define VM_EXTERNAL_STATE_EXISTS 1 +#define VM_EXTERNAL_STATE_UNKNOWN 2 +#define VM_EXTERNAL_STATE_ABSENT 3 + + +/* + * Routines exported by this module. + */ + +extern void vm_external_module_initialize(); + /* Initialize the module */ + +extern vm_external_t vm_external_create(); /* Create a vm_external_t */ +extern void vm_external_destroy(); /* Destroy one */ + +extern void vm_external_state_set();/* Set state of a page. */ +#define vm_external_state_get(e,offset) (((e) != VM_EXTERNAL_NULL) ? \ + _vm_external_state_get(e, offset) : \ + VM_EXTERNAL_STATE_UNKNOWN) + /* Retrieve the state + * for a given page, if known. + */ +extern vm_external_state_t _vm_external_state_get(); + /* HIDDEN routine */ + +#endif _VM_VM_EXTERNAL_H_ diff --git a/vm/vm_fault.c b/vm/vm_fault.c new file mode 100644 index 00000000..e45687cd --- /dev/null +++ b/vm/vm_fault.c @@ -0,0 +1,2182 @@ +/* + * Mach Operating System + * Copyright (c) 1994,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm_fault.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * + * Page fault handling module. 
+ */ +#include <mach_pagemap.h> +#include <mach_kdb.h> +#include <mach_pcsample.h> + + +#include <vm/vm_fault.h> +#include <mach/kern_return.h> +#include <mach/message.h> /* for error codes */ +#include <kern/counters.h> +#include <kern/thread.h> +#include <kern/sched_prim.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/pmap.h> +#include <mach/vm_statistics.h> +#include <vm/vm_pageout.h> +#include <mach/vm_param.h> +#include <mach/memory_object.h> +#include "memory_object_user.h" + /* For memory_object_data_{request,unlock} */ +#include <kern/mach_param.h> +#include <kern/macro_help.h> +#include <kern/zalloc.h> + +#if MACH_PCSAMPLE +#include <kern/pc_sample.h> +#endif + + + +/* + * State needed by vm_fault_continue. + * This is a little hefty to drop directly + * into the thread structure. + */ +typedef struct vm_fault_state { + struct vm_map *vmf_map; + vm_offset_t vmf_vaddr; + vm_prot_t vmf_fault_type; + boolean_t vmf_change_wiring; + void (*vmf_continuation)(); + vm_map_version_t vmf_version; + boolean_t vmf_wired; + struct vm_object *vmf_object; + vm_offset_t vmf_offset; + vm_prot_t vmf_prot; + + boolean_t vmfp_backoff; + struct vm_object *vmfp_object; + vm_offset_t vmfp_offset; + struct vm_page *vmfp_first_m; + vm_prot_t vmfp_access; +} vm_fault_state_t; + +zone_t vm_fault_state_zone = 0; + +int vm_object_absent_max = 50; + +int vm_fault_debug = 0; + +boolean_t vm_fault_dirty_handling = FALSE; +boolean_t vm_fault_interruptible = TRUE; + +boolean_t software_reference_bits = TRUE; + +#if MACH_KDB +extern struct db_watchpoint *db_watchpoint_list; +#endif MACH_KDB + +/* + * Routine: vm_fault_init + * Purpose: + * Initialize our private data structures. + */ +void vm_fault_init() +{ + vm_fault_state_zone = zinit(sizeof(vm_fault_state_t), + THREAD_MAX * sizeof(vm_fault_state_t), + sizeof(vm_fault_state_t), + 0, "vm fault state"); +} + +/* + * Routine: vm_fault_cleanup + * Purpose: + * Clean up the result of vm_fault_page. + * Results: + * The paging reference for "object" is released. + * "object" is unlocked. + * If "top_page" is not null, "top_page" is + * freed and the paging reference for the object + * containing it is released. + * + * In/out conditions: + * "object" must be locked. + */ +void +vm_fault_cleanup(object, top_page) + register vm_object_t object; + register vm_page_t top_page; +{ + vm_object_paging_end(object); + vm_object_unlock(object); + + if (top_page != VM_PAGE_NULL) { + object = top_page->object; + vm_object_lock(object); + VM_PAGE_FREE(top_page); + vm_object_paging_end(object); + vm_object_unlock(object); + } +} + + +#if MACH_PCSAMPLE +/* + * Do PC sampling on current thread, assuming + * that it is the thread taking this page fault. + * + * Must check for THREAD_NULL, since faults + * can occur before threads are running. + */ + +#define vm_stat_sample(flavor) \ + MACRO_BEGIN \ + thread_t _thread_ = current_thread(); \ + \ + if (_thread_ != THREAD_NULL) \ + take_pc_sample_macro(_thread_, (flavor)); \ + MACRO_END + +#else +#define vm_stat_sample(x) +#endif /* MACH_PCSAMPLE */ + + + +/* + * Routine: vm_fault_page + * Purpose: + * Find the resident page for the virtual memory + * specified by the given virtual memory object + * and offset. + * Additional arguments: + * The required permissions for the page is given + * in "fault_type". Desired permissions are included + * in "protection". 
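A caller-side sketch may make that convention concrete: fault_type carries the access that must be granted, while the protection argument is in/out and can only shrink. This is a condensed, non-compilable sketch modeled on vm_fault() later in this file; map, vaddr, object, and offset are assumed to come from an earlier vm_map_lookup().

/* Sketch of the fault_type / protection convention (not compilable alone). */
vm_prot_t           fault_type = VM_PROT_READ;  /* access the thread actually needs */
vm_prot_t           prot = VM_PROT_READ | VM_PROT_WRITE;
                                                /* all the access the mapping allows
                                                 * (normally from vm_map_lookup) */
vm_page_t           result_page, top_page;
vm_fault_return_t   kr;

/* The object must be locked, referenced, and hold a paging reference. */
kr = vm_fault_page(object, offset, fault_type,
                   FALSE,               /* must_be_resident */
                   TRUE,                /* interruptible */
                   &prot, &result_page, &top_page,
                   FALSE, (void (*)()) 0);

/* On success, prot may have lost VM_PROT_WRITE (copy object in the way,
 * dirty-bit handling); the hardware mapping must honor the reduced set. */
if (kr == VM_FAULT_SUCCESS)
    PMAP_ENTER(map->pmap, vaddr, result_page, prot, FALSE);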
+ * + * If the desired page is known to be resident (for + * example, because it was previously wired down), asserting + * the "unwiring" parameter will speed the search. + * + * If the operation can be interrupted (by thread_abort + * or thread_terminate), then the "interruptible" + * parameter should be asserted. + * + * Results: + * The page containing the proper data is returned + * in "result_page". + * + * In/out conditions: + * The source object must be locked and referenced, + * and must donate one paging reference. The reference + * is not affected. The paging reference and lock are + * consumed. + * + * If the call succeeds, the object in which "result_page" + * resides is left locked and holding a paging reference. + * If this is not the original object, a busy page in the + * original object is returned in "top_page", to prevent other + * callers from pursuing this same data, along with a paging + * reference for the original object. The "top_page" should + * be destroyed when this guarantee is no longer required. + * The "result_page" is also left busy. It is not removed + * from the pageout queues. + */ +vm_fault_return_t vm_fault_page(first_object, first_offset, + fault_type, must_be_resident, interruptible, + protection, + result_page, top_page, + resume, continuation) + /* Arguments: */ + vm_object_t first_object; /* Object to begin search */ + vm_offset_t first_offset; /* Offset into object */ + vm_prot_t fault_type; /* What access is requested */ + boolean_t must_be_resident;/* Must page be resident? */ + boolean_t interruptible; /* May fault be interrupted? */ + /* Modifies in place: */ + vm_prot_t *protection; /* Protection for mapping */ + /* Returns: */ + vm_page_t *result_page; /* Page found, if successful */ + vm_page_t *top_page; /* Page in top object, if + * not result_page. + */ + /* More arguments: */ + boolean_t resume; /* We are restarting. */ + void (*continuation)(); /* Continuation for blocking. */ +{ + register + vm_page_t m; + register + vm_object_t object; + register + vm_offset_t offset; + vm_page_t first_m; + vm_object_t next_object; + vm_object_t copy_object; + boolean_t look_for_page; + vm_prot_t access_required; + +#ifdef CONTINUATIONS + if (resume) { + register vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + if (state->vmfp_backoff) + goto after_block_and_backoff; + + object = state->vmfp_object; + offset = state->vmfp_offset; + first_m = state->vmfp_first_m; + access_required = state->vmfp_access; + goto after_thread_block; + } +#else /* not CONTINUATIONS */ + assert(continuation == 0); + assert(!resume); +#endif /* not CONTINUATIONS */ + + vm_stat_sample(SAMPLED_PC_VM_FAULTS_ANY); + vm_stat.faults++; /* needs lock XXX */ + +/* + * Recovery actions + */ +#define RELEASE_PAGE(m) \ + MACRO_BEGIN \ + PAGE_WAKEUP_DONE(m); \ + vm_page_lock_queues(); \ + if (!m->active && !m->inactive) \ + vm_page_activate(m); \ + vm_page_unlock_queues(); \ + MACRO_END + + if (vm_fault_dirty_handling +#if MACH_KDB + /* + * If there are watchpoints set, then + * we don't want to give away write permission + * on a read fault. Make the task write fault, + * so that the watchpoint code notices the access. + */ + || db_watchpoint_list +#endif MACH_KDB + ) { + /* + * If we aren't asking for write permission, + * then don't give it away. We're using write + * faults to set the dirty bit. 
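The dirty-bit technique mentioned here can be modeled in a few lines: keep a page read-only until a write actually faults, and treat that fault as the moment the page becomes dirty. This is a hypothetical user-space model, not the kernel's code path.

/* Hypothetical model of dirty-bit tracking through write faults. */
#include <stdio.h>

struct demo_page {
    int dirty;          /* software dirty bit */
    int writable;       /* whether the hardware mapping allows writes */
};

static void demo_fault(struct demo_page *p, int write_fault)
{
    if (write_fault) {
        p->dirty = 1;       /* the write fault itself is the dirty signal */
        p->writable = 1;
    } else {
        p->writable = 0;    /* read fault: withhold write permission */
    }
}

int main(void)
{
    struct demo_page pg = { 0, 0 };

    demo_fault(&pg, 0);     /* read fault: clean, still read-only */
    demo_fault(&pg, 1);     /* later write: faults again, now dirty */
    printf("dirty=%d writable=%d\n", pg.dirty, pg.writable);
    return 0;
}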
+ */ + if (!(fault_type & VM_PROT_WRITE)) + *protection &= ~VM_PROT_WRITE; + } + + if (!vm_fault_interruptible) + interruptible = FALSE; + + /* + * INVARIANTS (through entire routine): + * + * 1) At all times, we must either have the object + * lock or a busy page in some object to prevent + * some other thread from trying to bring in + * the same page. + * + * Note that we cannot hold any locks during the + * pager access or when waiting for memory, so + * we use a busy page then. + * + * Note also that we aren't as concerned about more than + * one thread attempting to memory_object_data_unlock + * the same page at once, so we don't hold the page + * as busy then, but do record the highest unlock + * value so far. [Unlock requests may also be delivered + * out of order.] + * + * 2) To prevent another thread from racing us down the + * shadow chain and entering a new page in the top + * object before we do, we must keep a busy page in + * the top object while following the shadow chain. + * + * 3) We must increment paging_in_progress on any object + * for which we have a busy page, to prevent + * vm_object_collapse from removing the busy page + * without our noticing. + * + * 4) We leave busy pages on the pageout queues. + * If the pageout daemon comes across a busy page, + * it will remove the page from the pageout queues. + */ + + /* + * Search for the page at object/offset. + */ + + object = first_object; + offset = first_offset; + first_m = VM_PAGE_NULL; + access_required = fault_type; + + /* + * See whether this page is resident + */ + + while (TRUE) { + m = vm_page_lookup(object, offset); + if (m != VM_PAGE_NULL) { + /* + * If the page is being brought in, + * wait for it and then retry. + * + * A possible optimization: if the page + * is known to be resident, we can ignore + * pages that are absent (regardless of + * whether they're busy). + */ + + if (m->busy) { + kern_return_t wait_result; + + PAGE_ASSERT_WAIT(m, interruptible); + vm_object_unlock(object); +#ifdef CONTINUATIONS + if (continuation != (void (*)()) 0) { + register vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + /* + * Save variables in case + * thread_block discards + * our kernel stack. + */ + + state->vmfp_backoff = FALSE; + state->vmfp_object = object; + state->vmfp_offset = offset; + state->vmfp_first_m = first_m; + state->vmfp_access = + access_required; + state->vmf_prot = *protection; + + counter(c_vm_fault_page_block_busy_user++); + thread_block(continuation); + } else +#endif /* CONTINUATIONS */ + { + counter(c_vm_fault_page_block_busy_kernel++); + thread_block((void (*)()) 0); + } + after_thread_block: + wait_result = current_thread()->wait_result; + vm_object_lock(object); + if (wait_result != THREAD_AWAKENED) { + vm_fault_cleanup(object, first_m); + if (wait_result == THREAD_RESTART) + return(VM_FAULT_RETRY); + else + return(VM_FAULT_INTERRUPTED); + } + continue; + } + + /* + * If the page is in error, give up now. + */ + + if (m->error) { + VM_PAGE_FREE(m); + vm_fault_cleanup(object, first_m); + return(VM_FAULT_MEMORY_ERROR); + } + + /* + * If the page isn't busy, but is absent, + * then it was deemed "unavailable". + */ + + if (m->absent) { + /* + * Remove the non-existent page (unless it's + * in the top object) and move on down to the + * next object (if there is one). 
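The descent that follows is the heart of the routine: if a page is not resident in an object, the search moves to the object's shadow, adjusting the offset by shadow_offset, and falls back to zero fill in the top object when the chain bottoms out. A tiny user-space model of that walk (all names hypothetical):

/* Hypothetical model of a shadow-chain lookup. */
#include <stdio.h>

#define DEMO_PAGE   4096
#define DEMO_NPAGES 8

struct demo_object {
    struct demo_object *shadow;     /* backing object, or NULL */
    long shadow_offset;             /* where this object sits in its shadow */
    int resident[DEMO_NPAGES];      /* 1 if the page is resident here */
};

static const char *demo_lookup(struct demo_object *top, long offset)
{
    struct demo_object *obj;

    for (obj = top; obj != NULL; obj = obj->shadow) {
        if (obj->resident[offset / DEMO_PAGE])
            return (obj == top) ? "found in top object"
                                : "found in a shadow object";
        offset += obj->shadow_offset;   /* translate into the shadow */
    }
    return "no object has it: zero fill in top object";
}

int main(void)
{
    struct demo_object backing = { 0 };
    struct demo_object top = { 0 };

    top.shadow = &backing;
    top.resident[1] = 1;        /* the top object has page 1 */
    backing.resident[2] = 1;    /* only the backing object has page 2 */

    printf("page 1: %s\n", demo_lookup(&top, 1 * DEMO_PAGE));
    printf("page 2: %s\n", demo_lookup(&top, 2 * DEMO_PAGE));
    printf("page 3: %s\n", demo_lookup(&top, 3 * DEMO_PAGE));
    return 0;
}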
+ */ + + offset += object->shadow_offset; + access_required = VM_PROT_READ; + next_object = object->shadow; + if (next_object == VM_OBJECT_NULL) { + vm_page_t real_m; + + assert(!must_be_resident); + + /* + * Absent page at bottom of shadow + * chain; zero fill the page we left + * busy in the first object, and flush + * the absent page. But first we + * need to allocate a real page. + */ + + real_m = vm_page_grab(); + if (real_m == VM_PAGE_NULL) { + vm_fault_cleanup(object, first_m); + return(VM_FAULT_MEMORY_SHORTAGE); + } + + if (object != first_object) { + VM_PAGE_FREE(m); + vm_object_paging_end(object); + vm_object_unlock(object); + object = first_object; + offset = first_offset; + m = first_m; + first_m = VM_PAGE_NULL; + vm_object_lock(object); + } + + VM_PAGE_FREE(m); + assert(real_m->busy); + vm_page_lock_queues(); + vm_page_insert(real_m, object, offset); + vm_page_unlock_queues(); + m = real_m; + + /* + * Drop the lock while zero filling + * page. Then break because this + * is the page we wanted. Checking + * the page lock is a waste of time; + * this page was either absent or + * newly allocated -- in both cases + * it can't be page locked by a pager. + */ + vm_object_unlock(object); + + vm_page_zero_fill(m); + + vm_stat_sample(SAMPLED_PC_VM_ZFILL_FAULTS); + + vm_stat.zero_fill_count++; + vm_object_lock(object); + pmap_clear_modify(m->phys_addr); + break; + } else { + if (must_be_resident) { + vm_object_paging_end(object); + } else if (object != first_object) { + vm_object_paging_end(object); + VM_PAGE_FREE(m); + } else { + first_m = m; + m->absent = FALSE; + vm_object_absent_release(object); + m->busy = TRUE; + + vm_page_lock_queues(); + VM_PAGE_QUEUES_REMOVE(m); + vm_page_unlock_queues(); + } + vm_object_lock(next_object); + vm_object_unlock(object); + object = next_object; + vm_object_paging_begin(object); + continue; + } + } + + /* + * If the desired access to this page has + * been locked out, request that it be unlocked. + */ + + if (access_required & m->page_lock) { + if ((access_required & m->unlock_request) != access_required) { + vm_prot_t new_unlock_request; + kern_return_t rc; + + if (!object->pager_ready) { + vm_object_assert_wait(object, + VM_OBJECT_EVENT_PAGER_READY, + interruptible); + goto block_and_backoff; + } + + new_unlock_request = m->unlock_request = + (access_required | m->unlock_request); + vm_object_unlock(object); + if ((rc = memory_object_data_unlock( + object->pager, + object->pager_request, + offset + object->paging_offset, + PAGE_SIZE, + new_unlock_request)) + != KERN_SUCCESS) { + printf("vm_fault: memory_object_data_unlock failed\n"); + vm_object_lock(object); + vm_fault_cleanup(object, first_m); + return((rc == MACH_SEND_INTERRUPTED) ? + VM_FAULT_INTERRUPTED : + VM_FAULT_MEMORY_ERROR); + } + vm_object_lock(object); + continue; + } + + PAGE_ASSERT_WAIT(m, interruptible); + goto block_and_backoff; + } + + /* + * We mark the page busy and leave it on + * the pageout queues. If the pageout + * deamon comes across it, then it will + * remove the page. 
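The busy bit referred to throughout this loop is the per-page mutual-exclusion device. A minimal sketch of the two sides of that protocol, using the same macros that appear in this file (illustrative only, not a complete code path):

/* Waiting side: somebody else owns the page; sleep until they are done. */
if (m->busy) {
    PAGE_ASSERT_WAIT(m, interruptible);     /* mark the page wanted and wait */
    vm_object_unlock(object);
    thread_block((void (*)()) 0);
    vm_object_lock(object);
    /* ... look the page up again and retry ... */
}

/* Owning side: finished with the page; clear busy and wake any waiters. */
PAGE_WAKEUP_DONE(m);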
+ */ + + if (!software_reference_bits) { + vm_page_lock_queues(); + if (m->inactive) { + vm_stat_sample(SAMPLED_PC_VM_REACTIVATION_FAULTS); + vm_stat.reactivations++; + } + + VM_PAGE_QUEUES_REMOVE(m); + vm_page_unlock_queues(); + } + + assert(!m->busy); + m->busy = TRUE; + assert(!m->absent); + break; + } + + look_for_page = + (object->pager_created) +#if MACH_PAGEMAP + && (vm_external_state_get(object->existence_info, offset + object->paging_offset) != + VM_EXTERNAL_STATE_ABSENT) +#endif MACH_PAGEMAP + ; + + if ((look_for_page || (object == first_object)) + && !must_be_resident) { + /* + * Allocate a new page for this object/offset + * pair. + */ + + m = vm_page_grab_fictitious(); + if (m == VM_PAGE_NULL) { + vm_fault_cleanup(object, first_m); + return(VM_FAULT_FICTITIOUS_SHORTAGE); + } + + vm_page_lock_queues(); + vm_page_insert(m, object, offset); + vm_page_unlock_queues(); + } + + if (look_for_page && !must_be_resident) { + kern_return_t rc; + + /* + * If the memory manager is not ready, we + * cannot make requests. + */ + if (!object->pager_ready) { + vm_object_assert_wait(object, + VM_OBJECT_EVENT_PAGER_READY, + interruptible); + VM_PAGE_FREE(m); + goto block_and_backoff; + } + + if (object->internal) { + /* + * Requests to the default pager + * must reserve a real page in advance, + * because the pager's data-provided + * won't block for pages. + */ + + if (m->fictitious && !vm_page_convert(m)) { + VM_PAGE_FREE(m); + vm_fault_cleanup(object, first_m); + return(VM_FAULT_MEMORY_SHORTAGE); + } + } else if (object->absent_count > + vm_object_absent_max) { + /* + * If there are too many outstanding page + * requests pending on this object, we + * wait for them to be resolved now. + */ + + vm_object_absent_assert_wait(object, interruptible); + VM_PAGE_FREE(m); + goto block_and_backoff; + } + + /* + * Indicate that the page is waiting for data + * from the memory manager. + */ + + m->absent = TRUE; + object->absent_count++; + + /* + * We have a busy page, so we can + * release the object lock. + */ + vm_object_unlock(object); + + /* + * Call the memory manager to retrieve the data. + */ + + vm_stat.pageins++; + vm_stat_sample(SAMPLED_PC_VM_PAGEIN_FAULTS); + + if ((rc = memory_object_data_request(object->pager, + object->pager_request, + m->offset + object->paging_offset, + PAGE_SIZE, access_required)) != KERN_SUCCESS) { + if (rc != MACH_SEND_INTERRUPTED) + printf("%s(0x%x, 0x%x, 0x%x, 0x%x, 0x%x) failed, %d\n", + "memory_object_data_request", + object->pager, + object->pager_request, + m->offset + object->paging_offset, + PAGE_SIZE, access_required, rc); + /* + * Don't want to leave a busy page around, + * but the data request may have blocked, + * so check if it's still there and busy. + */ + vm_object_lock(object); + if (m == vm_page_lookup(object,offset) && + m->absent && m->busy) + VM_PAGE_FREE(m); + vm_fault_cleanup(object, first_m); + return((rc == MACH_SEND_INTERRUPTED) ? + VM_FAULT_INTERRUPTED : + VM_FAULT_MEMORY_ERROR); + } + + /* + * Retry with same object/offset, since new data may + * be in a different page (i.e., m is meaningless at + * this point). + */ + vm_object_lock(object); + continue; + } + + /* + * For the XP system, the only case in which we get here is if + * object has no pager (or unwiring). If the pager doesn't + * have the page this is handled in the m->absent case above + * (and if you change things here you should look above). + */ + if (object == first_object) + first_m = m; + else + { + assert(m == VM_PAGE_NULL); + } + + /* + * Move on to the next object. 
Lock the next + * object before unlocking the current one. + */ + access_required = VM_PROT_READ; + + offset += object->shadow_offset; + next_object = object->shadow; + if (next_object == VM_OBJECT_NULL) { + assert(!must_be_resident); + + /* + * If there's no object left, fill the page + * in the top object with zeros. But first we + * need to allocate a real page. + */ + + if (object != first_object) { + vm_object_paging_end(object); + vm_object_unlock(object); + + object = first_object; + offset = first_offset; + vm_object_lock(object); + } + + m = first_m; + assert(m->object == object); + first_m = VM_PAGE_NULL; + + if (m->fictitious && !vm_page_convert(m)) { + VM_PAGE_FREE(m); + vm_fault_cleanup(object, VM_PAGE_NULL); + return(VM_FAULT_MEMORY_SHORTAGE); + } + + vm_object_unlock(object); + vm_page_zero_fill(m); + vm_stat_sample(SAMPLED_PC_VM_ZFILL_FAULTS); + vm_stat.zero_fill_count++; + vm_object_lock(object); + pmap_clear_modify(m->phys_addr); + break; + } + else { + vm_object_lock(next_object); + if ((object != first_object) || must_be_resident) + vm_object_paging_end(object); + vm_object_unlock(object); + object = next_object; + vm_object_paging_begin(object); + } + } + + /* + * PAGE HAS BEEN FOUND. + * + * This page (m) is: + * busy, so that we can play with it; + * not absent, so that nobody else will fill it; + * possibly eligible for pageout; + * + * The top-level page (first_m) is: + * VM_PAGE_NULL if the page was found in the + * top-level object; + * busy, not absent, and ineligible for pageout. + * + * The current object (object) is locked. A paging + * reference is held for the current and top-level + * objects. + */ + +#if EXTRA_ASSERTIONS + assert(m->busy && !m->absent); + assert((first_m == VM_PAGE_NULL) || + (first_m->busy && !first_m->absent && + !first_m->active && !first_m->inactive)); +#endif EXTRA_ASSERTIONS + + /* + * If the page is being written, but isn't + * already owned by the top-level object, + * we have to copy it into a new page owned + * by the top-level object. + */ + + if (object != first_object) { + /* + * We only really need to copy if we + * want to write it. + */ + + if (fault_type & VM_PROT_WRITE) { + vm_page_t copy_m; + + assert(!must_be_resident); + + /* + * If we try to collapse first_object at this + * point, we may deadlock when we try to get + * the lock on an intermediate object (since we + * have the bottom object locked). We can't + * unlock the bottom object, because the page + * we found may move (by collapse) if we do. + * + * Instead, we first copy the page. Then, when + * we have no more use for the bottom object, + * we unlock it and try to collapse. + * + * Note that we copy the page even if we didn't + * need to... that's the breaks. + */ + + /* + * Allocate a page for the copy + */ + copy_m = vm_page_grab(); + if (copy_m == VM_PAGE_NULL) { + RELEASE_PAGE(m); + vm_fault_cleanup(object, first_m); + return(VM_FAULT_MEMORY_SHORTAGE); + } + + vm_object_unlock(object); + vm_page_copy(m, copy_m); + vm_object_lock(object); + + /* + * If another map is truly sharing this + * page with us, we have to flush all + * uses of the original page, since we + * can't distinguish those which want the + * original from those which need the + * new copy. + * + * XXXO If we know that only one map has + * access to this page, then we could + * avoid the pmap_page_protect() call. 
+ */ + + vm_page_lock_queues(); + vm_page_deactivate(m); + pmap_page_protect(m->phys_addr, VM_PROT_NONE); + vm_page_unlock_queues(); + + /* + * We no longer need the old page or object. + */ + + PAGE_WAKEUP_DONE(m); + vm_object_paging_end(object); + vm_object_unlock(object); + + vm_stat.cow_faults++; + vm_stat_sample(SAMPLED_PC_VM_COW_FAULTS); + object = first_object; + offset = first_offset; + + vm_object_lock(object); + VM_PAGE_FREE(first_m); + first_m = VM_PAGE_NULL; + assert(copy_m->busy); + vm_page_lock_queues(); + vm_page_insert(copy_m, object, offset); + vm_page_unlock_queues(); + m = copy_m; + + /* + * Now that we've gotten the copy out of the + * way, let's try to collapse the top object. + * But we have to play ugly games with + * paging_in_progress to do that... + */ + + vm_object_paging_end(object); + vm_object_collapse(object); + vm_object_paging_begin(object); + } + else { + *protection &= (~VM_PROT_WRITE); + } + } + + /* + * Now check whether the page needs to be pushed into the + * copy object. The use of asymmetric copy on write for + * shared temporary objects means that we may do two copies to + * satisfy the fault; one above to get the page from a + * shadowed object, and one here to push it into the copy. + */ + + while ((copy_object = first_object->copy) != VM_OBJECT_NULL) { + vm_offset_t copy_offset; + vm_page_t copy_m; + + /* + * If the page is being written, but hasn't been + * copied to the copy-object, we have to copy it there. + */ + + if ((fault_type & VM_PROT_WRITE) == 0) { + *protection &= ~VM_PROT_WRITE; + break; + } + + /* + * If the page was guaranteed to be resident, + * we must have already performed the copy. + */ + + if (must_be_resident) + break; + + /* + * Try to get the lock on the copy_object. + */ + if (!vm_object_lock_try(copy_object)) { + vm_object_unlock(object); + + simple_lock_pause(); /* wait a bit */ + + vm_object_lock(object); + continue; + } + + /* + * Make another reference to the copy-object, + * to keep it from disappearing during the + * copy. + */ + assert(copy_object->ref_count > 0); + copy_object->ref_count++; + + /* + * Does the page exist in the copy? + */ + copy_offset = first_offset - copy_object->shadow_offset; + copy_m = vm_page_lookup(copy_object, copy_offset); + if (copy_m != VM_PAGE_NULL) { + if (copy_m->busy) { + /* + * If the page is being brought + * in, wait for it and then retry. + */ + PAGE_ASSERT_WAIT(copy_m, interruptible); + RELEASE_PAGE(m); + copy_object->ref_count--; + assert(copy_object->ref_count > 0); + vm_object_unlock(copy_object); + goto block_and_backoff; + } + } + else { + /* + * Allocate a page for the copy + */ + copy_m = vm_page_alloc(copy_object, copy_offset); + if (copy_m == VM_PAGE_NULL) { + RELEASE_PAGE(m); + copy_object->ref_count--; + assert(copy_object->ref_count > 0); + vm_object_unlock(copy_object); + vm_fault_cleanup(object, first_m); + return(VM_FAULT_MEMORY_SHORTAGE); + } + + /* + * Must copy page into copy-object. + */ + + vm_page_copy(m, copy_m); + + /* + * If the old page was in use by any users + * of the copy-object, it must be removed + * from all pmaps. (We can't know which + * pmaps use it.) + */ + + vm_page_lock_queues(); + pmap_page_protect(m->phys_addr, VM_PROT_NONE); + copy_m->dirty = TRUE; + vm_page_unlock_queues(); + + /* + * If there's a pager, then immediately + * page out this page, using the "initialize" + * option. Else, we use the copy. 
+ */ + + if (!copy_object->pager_created) { + vm_page_lock_queues(); + vm_page_activate(copy_m); + vm_page_unlock_queues(); + PAGE_WAKEUP_DONE(copy_m); + } else { + /* + * The page is already ready for pageout: + * not on pageout queues and busy. + * Unlock everything except the + * copy_object itself. + */ + + vm_object_unlock(object); + + /* + * Write the page to the copy-object, + * flushing it from the kernel. + */ + + vm_pageout_page(copy_m, TRUE, TRUE); + + /* + * Since the pageout may have + * temporarily dropped the + * copy_object's lock, we + * check whether we'll have + * to deallocate the hard way. + */ + + if ((copy_object->shadow != object) || + (copy_object->ref_count == 1)) { + vm_object_unlock(copy_object); + vm_object_deallocate(copy_object); + vm_object_lock(object); + continue; + } + + /* + * Pick back up the old object's + * lock. [It is safe to do so, + * since it must be deeper in the + * object tree.] + */ + + vm_object_lock(object); + } + + /* + * Because we're pushing a page upward + * in the object tree, we must restart + * any faults that are waiting here. + * [Note that this is an expansion of + * PAGE_WAKEUP that uses the THREAD_RESTART + * wait result]. Can't turn off the page's + * busy bit because we're not done with it. + */ + + if (m->wanted) { + m->wanted = FALSE; + thread_wakeup_with_result((event_t) m, + THREAD_RESTART); + } + } + + /* + * The reference count on copy_object must be + * at least 2: one for our extra reference, + * and at least one from the outside world + * (we checked that when we last locked + * copy_object). + */ + copy_object->ref_count--; + assert(copy_object->ref_count > 0); + vm_object_unlock(copy_object); + + break; + } + + *result_page = m; + *top_page = first_m; + + /* + * If the page can be written, assume that it will be. + * [Earlier, we restrict the permission to allow write + * access only if the fault so required, so we don't + * mark read-only data as dirty.] + */ + + if (vm_fault_dirty_handling && (*protection & VM_PROT_WRITE)) + m->dirty = TRUE; + + return(VM_FAULT_SUCCESS); + + block_and_backoff: + vm_fault_cleanup(object, first_m); + +#ifdef CONTINUATIONS + if (continuation != (void (*)()) 0) { + register vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + /* + * Save variables in case we must restart. + */ + + state->vmfp_backoff = TRUE; + state->vmf_prot = *protection; + + counter(c_vm_fault_page_block_backoff_user++); + thread_block(continuation); + } else +#endif /* CONTINUATIONS */ + { + counter(c_vm_fault_page_block_backoff_kernel++); + thread_block((void (*)()) 0); + } + after_block_and_backoff: + if (current_thread()->wait_result == THREAD_AWAKENED) + return VM_FAULT_RETRY; + else + return VM_FAULT_INTERRUPTED; + +#undef RELEASE_PAGE +} + +/* + * Routine: vm_fault + * Purpose: + * Handle page faults, including pseudo-faults + * used to change the wiring status of pages. + * Returns: + * If an explicit (expression) continuation is supplied, + * then we call the continuation instead of returning. + * Implementation: + * Explicit continuations make this a little icky, + * because it hasn't been rewritten to embrace CPS. + * Instead, we have resume arguments for vm_fault and + * vm_fault_page, to let continue the fault computation. + * + * vm_fault and vm_fault_page save mucho state + * in the moral equivalent of a closure. The state + * structure is allocated when first entering vm_fault + * and deallocated when leaving vm_fault. 
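Condensed, the closure convention described here looks like the sketch below; it mirrors the code that follows rather than adding to it, with error paths and most of the saved fields omitted.

/* Entry, when the caller supplied a continuation: allocate the closure. */
char *state = (char *) zalloc(vm_fault_state_zone);
current_thread()->ith_other = state;

/* Before blocking: capture the locals that must survive a discarded stack. */
((vm_fault_state_t *) state)->vmf_map = map;
((vm_fault_state_t *) state)->vmf_vaddr = vaddr;
/* ... remaining vmf_ fields ... */
thread_block(vm_fault_continue);    /* may never return here */

/* vm_fault_continue() re-enters vm_fault() with resume == TRUE and pulls
 * the saved fields back out of current_thread()->ith_other. */

/* Exit path: free the closure and hand the result to the continuation. */
zfree(vm_fault_state_zone, (vm_offset_t) state);
(*continuation)(kr);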
+ */ + +#ifdef CONTINUATIONS +void +vm_fault_continue() +{ + register vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + (void) vm_fault(state->vmf_map, + state->vmf_vaddr, + state->vmf_fault_type, + state->vmf_change_wiring, + TRUE, state->vmf_continuation); + /*NOTREACHED*/ +} +#endif /* CONTINUATIONS */ + +kern_return_t vm_fault(map, vaddr, fault_type, change_wiring, + resume, continuation) + vm_map_t map; + vm_offset_t vaddr; + vm_prot_t fault_type; + boolean_t change_wiring; + boolean_t resume; + void (*continuation)(); +{ + vm_map_version_t version; /* Map version for verificiation */ + boolean_t wired; /* Should mapping be wired down? */ + vm_object_t object; /* Top-level object */ + vm_offset_t offset; /* Top-level offset */ + vm_prot_t prot; /* Protection for mapping */ + vm_object_t old_copy_object; /* Saved copy object */ + vm_page_t result_page; /* Result of vm_fault_page */ + vm_page_t top_page; /* Placeholder page */ + kern_return_t kr; + + register + vm_page_t m; /* Fast access to result_page */ + +#ifdef CONTINUATIONS + if (resume) { + register vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + /* + * Retrieve cached variables and + * continue vm_fault_page. + */ + + object = state->vmf_object; + if (object == VM_OBJECT_NULL) + goto RetryFault; + version = state->vmf_version; + wired = state->vmf_wired; + offset = state->vmf_offset; + prot = state->vmf_prot; + + kr = vm_fault_page(object, offset, fault_type, + (change_wiring && !wired), !change_wiring, + &prot, &result_page, &top_page, + TRUE, vm_fault_continue); + goto after_vm_fault_page; + } + + if (continuation != (void (*)()) 0) { + /* + * We will probably need to save state. + */ + + char * state; + + /* + * if this assignment stmt is written as + * 'active_threads[cpu_number()] = zalloc()', + * cpu_number may be evaluated before zalloc; + * if zalloc blocks, cpu_number will be wrong + */ + + state = (char *) zalloc(vm_fault_state_zone); + current_thread()->ith_other = state; + + } +#else /* not CONTINUATIONS */ + assert(continuation == 0); + assert(!resume); +#endif /* not CONTINUATIONS */ + + RetryFault: ; + + /* + * Find the backing store object and offset into + * it to begin the search. + */ + + if ((kr = vm_map_lookup(&map, vaddr, fault_type, &version, + &object, &offset, + &prot, &wired)) != KERN_SUCCESS) { + goto done; + } + + /* + * If the page is wired, we must fault for the current protection + * value, to avoid further faults. + */ + + if (wired) + fault_type = prot; + + /* + * Make a reference to this object to + * prevent its disposal while we are messing with + * it. Once we have the reference, the map is free + * to be diddled. Since objects reference their + * shadows (and copies), they will stay around as well. + */ + + assert(object->ref_count > 0); + object->ref_count++; + vm_object_paging_begin(object); + +#ifdef CONTINUATIONS + if (continuation != (void (*)()) 0) { + register vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + /* + * Save variables, in case vm_fault_page discards + * our kernel stack and we have to restart. 
+ */ + + state->vmf_map = map; + state->vmf_vaddr = vaddr; + state->vmf_fault_type = fault_type; + state->vmf_change_wiring = change_wiring; + state->vmf_continuation = continuation; + + state->vmf_version = version; + state->vmf_wired = wired; + state->vmf_object = object; + state->vmf_offset = offset; + state->vmf_prot = prot; + + kr = vm_fault_page(object, offset, fault_type, + (change_wiring && !wired), !change_wiring, + &prot, &result_page, &top_page, + FALSE, vm_fault_continue); + } else +#endif /* CONTINUATIONS */ + { + kr = vm_fault_page(object, offset, fault_type, + (change_wiring && !wired), !change_wiring, + &prot, &result_page, &top_page, + FALSE, (void (*)()) 0); + } + after_vm_fault_page: + + /* + * If we didn't succeed, lose the object reference immediately. + */ + + if (kr != VM_FAULT_SUCCESS) + vm_object_deallocate(object); + + /* + * See why we failed, and take corrective action. + */ + + switch (kr) { + case VM_FAULT_SUCCESS: + break; + case VM_FAULT_RETRY: + goto RetryFault; + case VM_FAULT_INTERRUPTED: + kr = KERN_SUCCESS; + goto done; + case VM_FAULT_MEMORY_SHORTAGE: +#ifdef CONTINUATIONS + if (continuation != (void (*)()) 0) { + register vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + /* + * Save variables in case VM_PAGE_WAIT + * discards our kernel stack. + */ + + state->vmf_map = map; + state->vmf_vaddr = vaddr; + state->vmf_fault_type = fault_type; + state->vmf_change_wiring = change_wiring; + state->vmf_continuation = continuation; + state->vmf_object = VM_OBJECT_NULL; + + VM_PAGE_WAIT(vm_fault_continue); + } else +#endif /* CONTINUATIONS */ + VM_PAGE_WAIT((void (*)()) 0); + goto RetryFault; + case VM_FAULT_FICTITIOUS_SHORTAGE: + vm_page_more_fictitious(); + goto RetryFault; + case VM_FAULT_MEMORY_ERROR: + kr = KERN_MEMORY_ERROR; + goto done; + } + + m = result_page; + + assert((change_wiring && !wired) ? + (top_page == VM_PAGE_NULL) : + ((top_page == VM_PAGE_NULL) == (m->object == object))); + + /* + * How to clean up the result of vm_fault_page. This + * happens whether the mapping is entered or not. + */ + +#define UNLOCK_AND_DEALLOCATE \ + MACRO_BEGIN \ + vm_fault_cleanup(m->object, top_page); \ + vm_object_deallocate(object); \ + MACRO_END + + /* + * What to do with the resulting page from vm_fault_page + * if it doesn't get entered into the physical map: + */ + +#define RELEASE_PAGE(m) \ + MACRO_BEGIN \ + PAGE_WAKEUP_DONE(m); \ + vm_page_lock_queues(); \ + if (!m->active && !m->inactive) \ + vm_page_activate(m); \ + vm_page_unlock_queues(); \ + MACRO_END + + /* + * We must verify that the maps have not changed + * since our last lookup. + */ + + old_copy_object = m->object->copy; + + vm_object_unlock(m->object); + while (!vm_map_verify(map, &version)) { + vm_object_t retry_object; + vm_offset_t retry_offset; + vm_prot_t retry_prot; + + /* + * To avoid trying to write_lock the map while another + * thread has it read_locked (in vm_map_pageable), we + * do not try for write permission. If the page is + * still writable, we will get write permission. If it + * is not, or has been marked needs_copy, we enter the + * mapping without write permission, and will merely + * take another fault. 
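Stripped of the surrounding locking, the version protocol used here reduces to the following sketch; the names match the code below, and the release-and-retry branch is elided.

/* Optimistic map-version check, condensed from the code below (sketch). */
while (!vm_map_verify(map, &version)) {
    /* The map changed while it was unlocked: look the address up again,
     * without insisting on write permission. */
    kr = vm_map_lookup(&map, vaddr, fault_type & ~VM_PROT_WRITE,
                       &version, &retry_object, &retry_offset,
                       &retry_prot, &wired);

    if (kr != KERN_SUCCESS ||
        retry_object != object || retry_offset != offset) {
        /* release the page and restart the whole fault */
    }

    prot &= retry_prot;     /* never gain access across the re-lookup */
}
/* ... enter the page with PMAP_ENTER, then vm_map_verify_done(map, &version). */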
+ */ + kr = vm_map_lookup(&map, vaddr, + fault_type & ~VM_PROT_WRITE, &version, + &retry_object, &retry_offset, &retry_prot, + &wired); + + if (kr != KERN_SUCCESS) { + vm_object_lock(m->object); + RELEASE_PAGE(m); + UNLOCK_AND_DEALLOCATE; + goto done; + } + + vm_object_unlock(retry_object); + vm_object_lock(m->object); + + if ((retry_object != object) || + (retry_offset != offset)) { + RELEASE_PAGE(m); + UNLOCK_AND_DEALLOCATE; + goto RetryFault; + } + + /* + * Check whether the protection has changed or the object + * has been copied while we left the map unlocked. + */ + prot &= retry_prot; + vm_object_unlock(m->object); + } + vm_object_lock(m->object); + + /* + * If the copy object changed while the top-level object + * was unlocked, then we must take away write permission. + */ + + if (m->object->copy != old_copy_object) + prot &= ~VM_PROT_WRITE; + + /* + * If we want to wire down this page, but no longer have + * adequate permissions, we must start all over. + */ + + if (wired && (prot != fault_type)) { + vm_map_verify_done(map, &version); + RELEASE_PAGE(m); + UNLOCK_AND_DEALLOCATE; + goto RetryFault; + } + + /* + * It's critically important that a wired-down page be faulted + * only once in each map for which it is wired. + */ + + vm_object_unlock(m->object); + + /* + * Put this page into the physical map. + * We had to do the unlock above because pmap_enter + * may cause other faults. The page may be on + * the pageout queues. If the pageout daemon comes + * across the page, it will remove it from the queues. + */ + + PMAP_ENTER(map->pmap, vaddr, m, prot, wired); + + /* + * If the page is not wired down and isn't already + * on a pageout queue, then put it where the + * pageout daemon can find it. + */ + vm_object_lock(m->object); + vm_page_lock_queues(); + if (change_wiring) { + if (wired) + vm_page_wire(m); + else + vm_page_unwire(m); + } else if (software_reference_bits) { + if (!m->active && !m->inactive) + vm_page_activate(m); + m->reference = TRUE; + } else { + vm_page_activate(m); + } + vm_page_unlock_queues(); + + /* + * Unlock everything, and return + */ + + vm_map_verify_done(map, &version); + PAGE_WAKEUP_DONE(m); + kr = KERN_SUCCESS; + UNLOCK_AND_DEALLOCATE; + +#undef UNLOCK_AND_DEALLOCATE +#undef RELEASE_PAGE + + done: +#ifdef CONTINUATIONS + if (continuation != (void (*)()) 0) { + register vm_fault_state_t *state = + (vm_fault_state_t *) current_thread()->ith_other; + + zfree(vm_fault_state_zone, (vm_offset_t) state); + (*continuation)(kr); + /*NOTREACHED*/ + } +#endif /* CONTINUATIONS */ + + return(kr); +} + +kern_return_t vm_fault_wire_fast(); + +/* + * vm_fault_wire: + * + * Wire down a range of virtual addresses in a map. + */ +void vm_fault_wire(map, entry) + vm_map_t map; + vm_map_entry_t entry; +{ + + register vm_offset_t va; + register pmap_t pmap; + register vm_offset_t end_addr = entry->vme_end; + + pmap = vm_map_pmap(map); + + /* + * Inform the physical mapping system that the + * range of addresses may not fault, so that + * page tables and such can be locked down as well. + */ + + pmap_pageable(pmap, entry->vme_start, end_addr, FALSE); + + /* + * We simulate a fault to get the page and enter it + * in the physical map. + */ + + for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { + if (vm_fault_wire_fast(map, va, entry) != KERN_SUCCESS) + (void) vm_fault(map, va, VM_PROT_NONE, TRUE, + FALSE, (void (*)()) 0); + } +} + +/* + * vm_fault_unwire: + * + * Unwire a range of virtual addresses in a map. 
+ */ +void vm_fault_unwire(map, entry) + vm_map_t map; + vm_map_entry_t entry; +{ + register vm_offset_t va; + register pmap_t pmap; + register vm_offset_t end_addr = entry->vme_end; + vm_object_t object; + + pmap = vm_map_pmap(map); + + object = (entry->is_sub_map) + ? VM_OBJECT_NULL : entry->object.vm_object; + + /* + * Since the pages are wired down, we must be able to + * get their mappings from the physical map system. + */ + + for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) { + pmap_change_wiring(pmap, va, FALSE); + + if (object == VM_OBJECT_NULL) { + vm_map_lock_set_recursive(map); + (void) vm_fault(map, va, VM_PROT_NONE, TRUE, + FALSE, (void (*)()) 0); + vm_map_lock_clear_recursive(map); + } else { + vm_prot_t prot; + vm_page_t result_page; + vm_page_t top_page; + vm_fault_return_t result; + + do { + prot = VM_PROT_NONE; + + vm_object_lock(object); + vm_object_paging_begin(object); + result = vm_fault_page(object, + entry->offset + + (va - entry->vme_start), + VM_PROT_NONE, TRUE, + FALSE, &prot, + &result_page, + &top_page, + FALSE, (void (*)()) 0); + } while (result == VM_FAULT_RETRY); + + if (result != VM_FAULT_SUCCESS) + panic("vm_fault_unwire: failure"); + + vm_page_lock_queues(); + vm_page_unwire(result_page); + vm_page_unlock_queues(); + PAGE_WAKEUP_DONE(result_page); + + vm_fault_cleanup(result_page->object, top_page); + } + } + + /* + * Inform the physical mapping system that the range + * of addresses may fault, so that page tables and + * such may be unwired themselves. + */ + + pmap_pageable(pmap, entry->vme_start, end_addr, TRUE); +} + +/* + * vm_fault_wire_fast: + * + * Handle common case of a wire down page fault at the given address. + * If successful, the page is inserted into the associated physical map. + * The map entry is passed in to avoid the overhead of a map lookup. + * + * NOTE: the given address should be truncated to the + * proper page address. + * + * KERN_SUCCESS is returned if the page fault is handled; otherwise, + * a standard error specifying why the fault is fatal is returned. + * + * The map in question must be referenced, and remains so. + * Caller has a read lock on the map. + * + * This is a stripped version of vm_fault() for wiring pages. Anything + * other than the common case will return KERN_FAILURE, and the caller + * is expected to call vm_fault(). + */ +kern_return_t vm_fault_wire_fast(map, va, entry) + vm_map_t map; + vm_offset_t va; + vm_map_entry_t entry; +{ + vm_object_t object; + vm_offset_t offset; + register vm_page_t m; + vm_prot_t prot; + + vm_stat.faults++; /* needs lock XXX */ +/* + * Recovery actions + */ + +#undef RELEASE_PAGE +#define RELEASE_PAGE(m) { \ + PAGE_WAKEUP_DONE(m); \ + vm_page_lock_queues(); \ + vm_page_unwire(m); \ + vm_page_unlock_queues(); \ +} + + +#undef UNLOCK_THINGS +#define UNLOCK_THINGS { \ + object->paging_in_progress--; \ + vm_object_unlock(object); \ +} + +#undef UNLOCK_AND_DEALLOCATE +#define UNLOCK_AND_DEALLOCATE { \ + UNLOCK_THINGS; \ + vm_object_deallocate(object); \ +} +/* + * Give up and have caller do things the hard way. + */ + +#define GIVE_UP { \ + UNLOCK_AND_DEALLOCATE; \ + return(KERN_FAILURE); \ +} + + + /* + * If this entry is not directly to a vm_object, bail out. + */ + if (entry->is_sub_map) + return(KERN_FAILURE); + + /* + * Find the backing store object and offset into it. 
+ */ + + object = entry->object.vm_object; + offset = (va - entry->vme_start) + entry->offset; + prot = entry->protection; + + /* + * Make a reference to this object to prevent its + * disposal while we are messing with it. + */ + + vm_object_lock(object); + assert(object->ref_count > 0); + object->ref_count++; + object->paging_in_progress++; + + /* + * INVARIANTS (through entire routine): + * + * 1) At all times, we must either have the object + * lock or a busy page in some object to prevent + * some other thread from trying to bring in + * the same page. + * + * 2) Once we have a busy page, we must remove it from + * the pageout queues, so that the pageout daemon + * will not grab it away. + * + */ + + /* + * Look for page in top-level object. If it's not there or + * there's something going on, give up. + */ + m = vm_page_lookup(object, offset); + if ((m == VM_PAGE_NULL) || (m->error) || + (m->busy) || (m->absent) || (prot & m->page_lock)) { + GIVE_UP; + } + + /* + * Wire the page down now. All bail outs beyond this + * point must unwire the page. + */ + + vm_page_lock_queues(); + vm_page_wire(m); + vm_page_unlock_queues(); + + /* + * Mark page busy for other threads. + */ + assert(!m->busy); + m->busy = TRUE; + assert(!m->absent); + + /* + * Give up if the page is being written and there's a copy object + */ + if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) { + RELEASE_PAGE(m); + GIVE_UP; + } + + /* + * Put this page into the physical map. + * We have to unlock the object because pmap_enter + * may cause other faults. + */ + vm_object_unlock(object); + + PMAP_ENTER(map->pmap, va, m, prot, TRUE); + + /* + * Must relock object so that paging_in_progress can be cleared. + */ + vm_object_lock(object); + + /* + * Unlock everything, and return + */ + + PAGE_WAKEUP_DONE(m); + UNLOCK_AND_DEALLOCATE; + + return(KERN_SUCCESS); + +} + +/* + * Routine: vm_fault_copy_cleanup + * Purpose: + * Release a page used by vm_fault_copy. + */ + +void vm_fault_copy_cleanup(page, top_page) + vm_page_t page; + vm_page_t top_page; +{ + vm_object_t object = page->object; + + vm_object_lock(object); + PAGE_WAKEUP_DONE(page); + vm_page_lock_queues(); + if (!page->active && !page->inactive) + vm_page_activate(page); + vm_page_unlock_queues(); + vm_fault_cleanup(object, top_page); +} + +/* + * Routine: vm_fault_copy + * + * Purpose: + * Copy pages from one virtual memory object to another -- + * neither the source nor destination pages need be resident. + * + * Before actually copying a page, the version associated with + * the destination address map wil be verified. + * + * In/out conditions: + * The caller must hold a reference, but not a lock, to + * each of the source and destination objects and to the + * destination map. + * + * Results: + * Returns KERN_SUCCESS if no errors were encountered in + * reading or writing the data. Returns KERN_INTERRUPTED if + * the operation was interrupted (only possible if the + * "interruptible" argument is asserted). Other return values + * indicate a permanent error in copying the data. + * + * The actual amount of data copied will be returned in the + * "copy_size" argument. In the event that the destination map + * verification failed, this amount may be less than the amount + * requested. 
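A caller-side sketch of that contract (illustrative only; `len' is a placeholder, and dst_version is assumed to have been obtained from an earlier lookup of the destination map):

/* Sketch: calling vm_fault_copy and handling a partial copy. */
vm_size_t       copy_size = round_page(len);    /* amount requested */
kern_return_t   kr;

kr = vm_fault_copy(src_object, src_offset, &copy_size,
                   dst_object, dst_offset, dst_map, &dst_version,
                   FALSE);                      /* not interruptible */

if (kr == KERN_SUCCESS && copy_size < round_page(len)) {
    /* Destination map verification failed part way: only copy_size
     * bytes were copied; re-verify the map and continue from there. */
}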
+ */ +kern_return_t vm_fault_copy( + src_object, + src_offset, + src_size, + dst_object, + dst_offset, + dst_map, + dst_version, + interruptible + ) + vm_object_t src_object; + vm_offset_t src_offset; + vm_size_t *src_size; /* INOUT */ + vm_object_t dst_object; + vm_offset_t dst_offset; + vm_map_t dst_map; + vm_map_version_t *dst_version; + boolean_t interruptible; +{ + vm_page_t result_page; + vm_prot_t prot; + + vm_page_t src_page; + vm_page_t src_top_page; + + vm_page_t dst_page; + vm_page_t dst_top_page; + + vm_size_t amount_done; + vm_object_t old_copy_object; + +#define RETURN(x) \ + MACRO_BEGIN \ + *src_size = amount_done; \ + MACRO_RETURN(x); \ + MACRO_END + + amount_done = 0; + do { /* while (amount_done != *src_size) */ + + RetrySourceFault: ; + + if (src_object == VM_OBJECT_NULL) { + /* + * No source object. We will just + * zero-fill the page in dst_object. + */ + + src_page = VM_PAGE_NULL; + } else { + prot = VM_PROT_READ; + + vm_object_lock(src_object); + vm_object_paging_begin(src_object); + + switch (vm_fault_page(src_object, src_offset, + VM_PROT_READ, FALSE, interruptible, + &prot, &result_page, &src_top_page, + FALSE, (void (*)()) 0)) { + + case VM_FAULT_SUCCESS: + break; + case VM_FAULT_RETRY: + goto RetrySourceFault; + case VM_FAULT_INTERRUPTED: + RETURN(MACH_SEND_INTERRUPTED); + case VM_FAULT_MEMORY_SHORTAGE: + VM_PAGE_WAIT((void (*)()) 0); + goto RetrySourceFault; + case VM_FAULT_FICTITIOUS_SHORTAGE: + vm_page_more_fictitious(); + goto RetrySourceFault; + case VM_FAULT_MEMORY_ERROR: + return(KERN_MEMORY_ERROR); + } + + src_page = result_page; + + assert((src_top_page == VM_PAGE_NULL) == + (src_page->object == src_object)); + + assert ((prot & VM_PROT_READ) != VM_PROT_NONE); + + vm_object_unlock(src_page->object); + } + + RetryDestinationFault: ; + + prot = VM_PROT_WRITE; + + vm_object_lock(dst_object); + vm_object_paging_begin(dst_object); + + switch (vm_fault_page(dst_object, dst_offset, VM_PROT_WRITE, + FALSE, FALSE /* interruptible */, + &prot, &result_page, &dst_top_page, + FALSE, (void (*)()) 0)) { + + case VM_FAULT_SUCCESS: + break; + case VM_FAULT_RETRY: + goto RetryDestinationFault; + case VM_FAULT_INTERRUPTED: + if (src_page != VM_PAGE_NULL) + vm_fault_copy_cleanup(src_page, + src_top_page); + RETURN(MACH_SEND_INTERRUPTED); + case VM_FAULT_MEMORY_SHORTAGE: + VM_PAGE_WAIT((void (*)()) 0); + goto RetryDestinationFault; + case VM_FAULT_FICTITIOUS_SHORTAGE: + vm_page_more_fictitious(); + goto RetryDestinationFault; + case VM_FAULT_MEMORY_ERROR: + if (src_page != VM_PAGE_NULL) + vm_fault_copy_cleanup(src_page, + src_top_page); + return(KERN_MEMORY_ERROR); + } + assert ((prot & VM_PROT_WRITE) != VM_PROT_NONE); + + dst_page = result_page; + + old_copy_object = dst_page->object->copy; + + vm_object_unlock(dst_page->object); + + if (!vm_map_verify(dst_map, dst_version)) { + + BailOut: ; + + if (src_page != VM_PAGE_NULL) + vm_fault_copy_cleanup(src_page, src_top_page); + vm_fault_copy_cleanup(dst_page, dst_top_page); + break; + } + + + vm_object_lock(dst_page->object); + if (dst_page->object->copy != old_copy_object) { + vm_object_unlock(dst_page->object); + vm_map_verify_done(dst_map, dst_version); + goto BailOut; + } + vm_object_unlock(dst_page->object); + + /* + * Copy the page, and note that it is dirty + * immediately. 
+ */ + + if (src_page == VM_PAGE_NULL) + vm_page_zero_fill(dst_page); + else + vm_page_copy(src_page, dst_page); + dst_page->dirty = TRUE; + + /* + * Unlock everything, and return + */ + + vm_map_verify_done(dst_map, dst_version); + + if (src_page != VM_PAGE_NULL) + vm_fault_copy_cleanup(src_page, src_top_page); + vm_fault_copy_cleanup(dst_page, dst_top_page); + + amount_done += PAGE_SIZE; + src_offset += PAGE_SIZE; + dst_offset += PAGE_SIZE; + + } while (amount_done != *src_size); + + RETURN(KERN_SUCCESS); +#undef RETURN + + /*NOTREACHED*/ +} + + + + + +#ifdef notdef + +/* + * Routine: vm_fault_page_overwrite + * + * Description: + * A form of vm_fault_page that assumes that the + * resulting page will be overwritten in its entirety, + * making it unnecessary to obtain the correct *contents* + * of the page. + * + * Implementation: + * XXX Untested. Also unused. Eventually, this technology + * could be used in vm_fault_copy() to advantage. + */ +vm_fault_return_t vm_fault_page_overwrite(dst_object, dst_offset, result_page) + register + vm_object_t dst_object; + vm_offset_t dst_offset; + vm_page_t *result_page; /* OUT */ +{ + register + vm_page_t dst_page; + +#define interruptible FALSE /* XXX */ + + while (TRUE) { + /* + * Look for a page at this offset + */ + + while ((dst_page = vm_page_lookup(dst_object, dst_offset)) + == VM_PAGE_NULL) { + /* + * No page, no problem... just allocate one. + */ + + dst_page = vm_page_alloc(dst_object, dst_offset); + if (dst_page == VM_PAGE_NULL) { + vm_object_unlock(dst_object); + VM_PAGE_WAIT((void (*)()) 0); + vm_object_lock(dst_object); + continue; + } + + /* + * Pretend that the memory manager + * write-protected the page. + * + * Note that we will be asking for write + * permission without asking for the data + * first. + */ + + dst_page->overwriting = TRUE; + dst_page->page_lock = VM_PROT_WRITE; + dst_page->absent = TRUE; + dst_object->absent_count++; + + break; + + /* + * When we bail out, we might have to throw + * away the page created here. + */ + +#define DISCARD_PAGE \ + MACRO_BEGIN \ + vm_object_lock(dst_object); \ + dst_page = vm_page_lookup(dst_object, dst_offset); \ + if ((dst_page != VM_PAGE_NULL) && dst_page->overwriting) \ + VM_PAGE_FREE(dst_page); \ + vm_object_unlock(dst_object); \ + MACRO_END + } + + /* + * If the page is write-protected... + */ + + if (dst_page->page_lock & VM_PROT_WRITE) { + /* + * ... and an unlock request hasn't been sent + */ + + if ( ! (dst_page->unlock_request & VM_PROT_WRITE)) { + vm_prot_t u; + kern_return_t rc; + + /* + * ... then send one now. + */ + + if (!dst_object->pager_ready) { + vm_object_assert_wait(dst_object, + VM_OBJECT_EVENT_PAGER_READY, + interruptible); + vm_object_unlock(dst_object); + thread_block((void (*)()) 0); + if (current_thread()->wait_result != + THREAD_AWAKENED) { + DISCARD_PAGE; + return(VM_FAULT_INTERRUPTED); + } + continue; + } + + u = dst_page->unlock_request |= VM_PROT_WRITE; + vm_object_unlock(dst_object); + + if ((rc = memory_object_data_unlock( + dst_object->pager, + dst_object->pager_request, + dst_offset + dst_object->paging_offset, + PAGE_SIZE, + u)) != KERN_SUCCESS) { + printf("vm_object_overwrite: memory_object_data_unlock failed\n"); + DISCARD_PAGE; + return((rc == MACH_SEND_INTERRUPTED) ? + VM_FAULT_INTERRUPTED : + VM_FAULT_MEMORY_ERROR); + } + vm_object_lock(dst_object); + continue; + } + + /* ... fall through to wait below */ + } else { + /* + * If the page isn't being used for other + * purposes, then we're done. + */ + if ( ! 
(dst_page->busy || dst_page->absent || dst_page->error) ) + break; + } + + PAGE_ASSERT_WAIT(dst_page, interruptible); + vm_object_unlock(dst_object); + thread_block((void (*)()) 0); + if (current_thread()->wait_result != THREAD_AWAKENED) { + DISCARD_PAGE; + return(VM_FAULT_INTERRUPTED); + } + } + + *result_page = dst_page; + return(VM_FAULT_SUCCESS); + +#undef interruptible +#undef DISCARD_PAGE +} + +#endif notdef diff --git a/vm/vm_fault.h b/vm/vm_fault.h new file mode 100644 index 00000000..eee39994 --- /dev/null +++ b/vm/vm_fault.h @@ -0,0 +1,64 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_fault.h + * + * Page fault handling module declarations. + */ + +#ifndef _VM_VM_FAULT_H_ +#define _VM_VM_FAULT_H_ + +#include <mach/kern_return.h> + +/* + * Page fault handling based on vm_object only. + */ + +typedef kern_return_t vm_fault_return_t; +#define VM_FAULT_SUCCESS 0 +#define VM_FAULT_RETRY 1 +#define VM_FAULT_INTERRUPTED 2 +#define VM_FAULT_MEMORY_SHORTAGE 3 +#define VM_FAULT_FICTITIOUS_SHORTAGE 4 +#define VM_FAULT_MEMORY_ERROR 5 + +extern void vm_fault_init(); +extern vm_fault_return_t vm_fault_page(); + +extern void vm_fault_cleanup(); +/* + * Page fault handling based on vm_map (or entries therein) + */ + +extern kern_return_t vm_fault(); +extern void vm_fault_wire(); +extern void vm_fault_unwire(); + +extern kern_return_t vm_fault_copy(); /* Copy pages from + * one object to another + */ +#endif _VM_VM_FAULT_H_ diff --git a/vm/vm_init.c b/vm/vm_init.c new file mode 100644 index 00000000..b76b11b6 --- /dev/null +++ b/vm/vm_init.c @@ -0,0 +1,84 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. 
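The vm_fault_return_t codes declared in vm_fault.h above are typically consumed by a retry loop like the one in vm_fault(); condensed, that handling looks like the sketch below (the INTERRUPTED disposition varies by caller).

/* Typical disposition of vm_fault_page() results (condensed sketch). */
switch (result) {
case VM_FAULT_SUCCESS:
    break;                          /* result_page is busy and ready to use */
case VM_FAULT_RETRY:
    goto Retry;                     /* state changed underneath us */
case VM_FAULT_MEMORY_SHORTAGE:
    VM_PAGE_WAIT((void (*)()) 0);   /* wait for a free page */
    goto Retry;
case VM_FAULT_FICTITIOUS_SHORTAGE:
    vm_page_more_fictitious();
    goto Retry;
case VM_FAULT_INTERRUPTED:
    /* caller-specific: vm_fault treats this as KERN_SUCCESS,
     * vm_fault_copy reports MACH_SEND_INTERRUPTED */
    break;
case VM_FAULT_MEMORY_ERROR:
    kr = KERN_MEMORY_ERROR;
    break;
}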
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_init.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Initialize the Virtual Memory subsystem. + */ + +#include <mach/machine/vm_types.h> +#include <kern/zalloc.h> +#include <kern/kalloc.h> +#include <vm/vm_object.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> +#include <vm/memory_object.h> + + + +/* + * vm_mem_bootstrap initializes the virtual memory system. + * This is done only by the first cpu up. + */ + +void vm_mem_bootstrap() +{ + vm_offset_t start, end; + + /* + * Initializes resident memory structures. + * From here on, all physical memory is accounted for, + * and we use only virtual addresses. + */ + + vm_page_bootstrap(&start, &end); + + /* + * Initialize other VM packages + */ + + zone_bootstrap(); + vm_object_bootstrap(); + vm_map_init(); + kmem_init(start, end); + pmap_init(); + zone_init(); + kalloc_init(); + vm_fault_init(); + vm_page_module_init(); + memory_manager_default_init(); +} + +void vm_mem_init() +{ + vm_object_init(); +} diff --git a/vm/vm_kern.c b/vm/vm_kern.c new file mode 100644 index 00000000..eb1e0795 --- /dev/null +++ b/vm/vm_kern.c @@ -0,0 +1,1072 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_kern.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Kernel memory management. + */ + +#include <mach/kern_return.h> +#include "vm_param.h" +#include <kern/assert.h> +#include <kern/lock.h> +#include <kern/thread.h> +#include <vm/vm_fault.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> + + + +/* + * Variables exported by this module. + */ + +vm_map_t kernel_map; +vm_map_t kernel_pageable_map; + +extern void kmem_alloc_pages(); +extern void kmem_remap_pages(); + +/* + * projected_buffer_allocate + * + * Allocate a wired-down buffer shared between kernel and user task. + * Fresh, zero-filled memory is allocated. 
+ * If persistence is false, this buffer can only be deallocated from + * user task using projected_buffer_deallocate, and deallocation + * from user task also deallocates the buffer from the kernel map. + * projected_buffer_collect is called from vm_map_deallocate to + * automatically deallocate projected buffers on task_deallocate. + * Sharing with more than one user task is achieved by using + * projected_buffer_map for the second and subsequent tasks. + * The user is precluded from manipulating the VM entry of this buffer + * (i.e. changing protection, inheritance or machine attributes). + */ + +kern_return_t +projected_buffer_allocate(map, size, persistence, kernel_p, + user_p, protection, inheritance) + vm_map_t map; + vm_size_t size; + int persistence; + vm_offset_t *kernel_p; + vm_offset_t *user_p; + vm_prot_t protection; + vm_inherit_t inheritance; /*Currently only VM_INHERIT_NONE supported*/ +{ + vm_object_t object; + vm_map_entry_t u_entry, k_entry; + vm_offset_t addr; + vm_size_t r_size; + kern_return_t kr; + + if (map == VM_MAP_NULL || map == kernel_map) + return(KERN_INVALID_ARGUMENT); + + /* + * Allocate a new object. + */ + + size = round_page(size); + object = vm_object_allocate(size); + + vm_map_lock(kernel_map); + kr = vm_map_find_entry(kernel_map, &addr, size, (vm_offset_t) 0, + VM_OBJECT_NULL, &k_entry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(kernel_map); + vm_object_deallocate(object); + return kr; + } + + k_entry->object.vm_object = object; + if (!persistence) + k_entry->projected_on = (vm_map_entry_t) -1; + /*Mark entry so as to automatically deallocate it when + last corresponding user entry is deallocated*/ + vm_map_unlock(kernel_map); + *kernel_p = addr; + + vm_map_lock(map); + kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0, + VM_OBJECT_NULL, &u_entry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(map); + vm_map_lock(kernel_map); + vm_map_entry_delete(kernel_map, k_entry); + vm_map_unlock(kernel_map); + vm_object_deallocate(object); + return kr; + } + + u_entry->object.vm_object = object; + vm_object_reference(object); + u_entry->projected_on = k_entry; + /*Creates coupling with kernel mapping of the buffer, and + also guarantees that user cannot directly manipulate + buffer VM entry*/ + u_entry->protection = protection; + u_entry->max_protection = protection; + u_entry->inheritance = inheritance; + vm_map_unlock(map); + *user_p = addr; + + /* + * Allocate wired-down memory in the object, + * and enter it in the kernel pmap. + */ + kmem_alloc_pages(object, 0, + *kernel_p, *kernel_p + size, + VM_PROT_READ | VM_PROT_WRITE); + bzero(*kernel_p, size); /*Zero fill*/ + + /* Set up physical mappings for user pmap */ + + pmap_pageable(map->pmap, *user_p, *user_p + size, FALSE); + for (r_size = 0; r_size < size; r_size += PAGE_SIZE) { + addr = pmap_extract(kernel_pmap, *kernel_p + r_size); + pmap_enter(map->pmap, *user_p + r_size, addr, + protection, TRUE); + } + + return(KERN_SUCCESS); +} + + +/* + * projected_buffer_map + * + * Map an area of kernel memory onto a task's address space. + * No new memory is allocated; the area must previously exist in the + * kernel memory map. 
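+ *
+ *	Illustrative usage (a sketch, not taken from the original
+ *	sources; "first_task_map", "second_task_map" and the 8K size
+ *	are hypothetical):
+ *
+ *		vm_offset_t	kaddr, uaddr1, uaddr2;
+ *		kern_return_t	kr;
+ *
+ *		kr = projected_buffer_allocate(first_task_map, 8192, 0,
+ *			&kaddr, &uaddr1,
+ *			VM_PROT_READ | VM_PROT_WRITE, VM_INHERIT_NONE);
+ *		if (kr == KERN_SUCCESS)
+ *			kr = projected_buffer_map(second_task_map, kaddr,
+ *				8192, &uaddr2, VM_PROT_READ,
+ *				VM_INHERIT_NONE);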
+ */ + +kern_return_t +projected_buffer_map(map, kernel_addr, size, user_p, protection, inheritance) + vm_map_t map; + vm_offset_t kernel_addr; + vm_size_t size; + vm_offset_t *user_p; + vm_prot_t protection; + vm_inherit_t inheritance; /*Currently only VM_INHERIT_NONE supported*/ +{ + vm_object_t object; + vm_map_entry_t u_entry, k_entry; + vm_offset_t physical_addr, user_addr; + vm_size_t r_size; + kern_return_t kr; + + /* + * Find entry in kernel map + */ + + size = round_page(size); + if (map == VM_MAP_NULL || map == kernel_map || + !vm_map_lookup_entry(kernel_map, kernel_addr, &k_entry) || + kernel_addr + size > k_entry->vme_end) + return(KERN_INVALID_ARGUMENT); + + + /* + * Create entry in user task + */ + + vm_map_lock(map); + kr = vm_map_find_entry(map, &user_addr, size, (vm_offset_t) 0, + VM_OBJECT_NULL, &u_entry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(map); + return kr; + } + + u_entry->object.vm_object = k_entry->object.vm_object; + vm_object_reference(k_entry->object.vm_object); + u_entry->offset = kernel_addr - k_entry->vme_start + k_entry->offset; + u_entry->projected_on = k_entry; + /*Creates coupling with kernel mapping of the buffer, and + also guarantees that user cannot directly manipulate + buffer VM entry*/ + u_entry->protection = protection; + u_entry->max_protection = protection; + u_entry->inheritance = inheritance; + u_entry->wired_count = k_entry->wired_count; + vm_map_unlock(map); + *user_p = user_addr; + + /* Set up physical mappings for user pmap */ + + pmap_pageable(map->pmap, user_addr, user_addr + size, + !k_entry->wired_count); + for (r_size = 0; r_size < size; r_size += PAGE_SIZE) { + physical_addr = pmap_extract(kernel_pmap, kernel_addr + r_size); + pmap_enter(map->pmap, user_addr + r_size, physical_addr, + protection, k_entry->wired_count); + } + + return(KERN_SUCCESS); +} + + +/* + * projected_buffer_deallocate + * + * Unmap projected buffer from task's address space. + * May also unmap buffer from kernel map, if buffer is not + * persistent and only the kernel reference remains. + */ + +kern_return_t +projected_buffer_deallocate(map, start, end) + vm_map_t map; + vm_offset_t start, end; +{ + vm_map_entry_t entry, k_entry; + + vm_map_lock(map); + if (map == VM_MAP_NULL || map == kernel_map || + !vm_map_lookup_entry(map, start, &entry) || + end > entry->vme_end || + /*Check corresponding kernel entry*/ + (k_entry = entry->projected_on) == 0) { + vm_map_unlock(map); + return(KERN_INVALID_ARGUMENT); + } + + /*Prepare for deallocation*/ + if (entry->vme_start < start) + _vm_map_clip_start(map, entry, start); + if (entry->vme_end > end) + _vm_map_clip_end(map, entry, end); + if (map->first_free == entry) /*Adjust first_free hint*/ + map->first_free = entry->vme_prev; + entry->projected_on = 0; /*Needed to allow deletion*/ + entry->wired_count = 0; /*Avoid unwire fault*/ + vm_map_entry_delete(map, entry); + vm_map_unlock(map); + + /*Check if the buffer is not persistent and only the + kernel mapping remains, and if so delete it*/ + vm_map_lock(kernel_map); + if (k_entry->projected_on == (vm_map_entry_t) -1 && + k_entry->object.vm_object->ref_count == 1) { + if (kernel_map->first_free == k_entry) + kernel_map->first_free = k_entry->vme_prev; + k_entry->projected_on = 0; /*Allow unwire fault*/ + vm_map_entry_delete(kernel_map, k_entry); + } + vm_map_unlock(kernel_map); + return(KERN_SUCCESS); +} + + +/* + * projected_buffer_collect + * + * Unmap all projected buffers from task's address space. 
+ */ + +kern_return_t +projected_buffer_collect(map) + vm_map_t map; +{ + vm_map_entry_t entry, next; + + if (map == VM_MAP_NULL || map == kernel_map) + return(KERN_INVALID_ARGUMENT); + + for (entry = vm_map_first_entry(map); + entry != vm_map_to_entry(map); + entry = next) { + next = entry->vme_next; + if (entry->projected_on != 0) + projected_buffer_deallocate(map, entry->vme_start, entry->vme_end); + } + return(KERN_SUCCESS); +} + + +/* + * projected_buffer_in_range + * + * Verifies whether a projected buffer exists in the address range + * given. + */ + +boolean_t +projected_buffer_in_range(map, start, end) + vm_map_t map; + vm_offset_t start, end; +{ + vm_map_entry_t entry; + + if (map == VM_MAP_NULL || map == kernel_map) + return(FALSE); + + /*Find first entry*/ + if (!vm_map_lookup_entry(map, start, &entry)) + entry = entry->vme_next; + + while (entry != vm_map_to_entry(map) && entry->projected_on == 0 && + entry->vme_start <= end) { + entry = entry->vme_next; + } + return(entry != vm_map_to_entry(map) && entry->vme_start <= end); +} + + +/* + * kmem_alloc: + * + * Allocate wired-down memory in the kernel's address map + * or a submap. The memory is not zero-filled. + */ + +kern_return_t +kmem_alloc(map, addrp, size) + vm_map_t map; + vm_offset_t *addrp; + vm_size_t size; +{ + vm_object_t object; + vm_map_entry_t entry; + vm_offset_t addr; + kern_return_t kr; + + /* + * Allocate a new object. We must do this before locking + * the map, lest we risk deadlock with the default pager: + * device_read_alloc uses kmem_alloc, + * which tries to allocate an object, + * which uses kmem_alloc_wired to get memory, + * which blocks for pages. + * then the default pager needs to read a block + * to process a memory_object_data_write, + * and device_read_alloc calls kmem_alloc + * and deadlocks on the map lock. + */ + + size = round_page(size); + object = vm_object_allocate(size); + + vm_map_lock(map); + kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0, + VM_OBJECT_NULL, &entry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(map); + vm_object_deallocate(object); + return kr; + } + + entry->object.vm_object = object; + entry->offset = 0; + + /* + * Since we have not given out this address yet, + * it is safe to unlock the map. + */ + vm_map_unlock(map); + + /* + * Allocate wired-down memory in the kernel_object, + * for this entry, and enter it in the kernel pmap. + */ + kmem_alloc_pages(object, 0, + addr, addr + size, + VM_PROT_DEFAULT); + + /* + * Return the memory, not zeroed. + */ + *addrp = addr; + return KERN_SUCCESS; +} + +/* + * kmem_realloc: + * + * Reallocate wired-down memory in the kernel's address map + * or a submap. Newly allocated pages are not zeroed. + * This can only be used on regions allocated with kmem_alloc. + * + * If successful, the pages in the old region are mapped twice. + * The old region is unchanged. Use kmem_free to get rid of it. + */ +kern_return_t kmem_realloc(map, oldaddr, oldsize, newaddrp, newsize) + vm_map_t map; + vm_offset_t oldaddr; + vm_size_t oldsize; + vm_offset_t *newaddrp; + vm_size_t newsize; +{ + vm_offset_t oldmin, oldmax; + vm_offset_t newaddr; + vm_object_t object; + vm_map_entry_t oldentry, newentry; + kern_return_t kr; + + oldmin = trunc_page(oldaddr); + oldmax = round_page(oldaddr + oldsize); + oldsize = oldmax - oldmin; + newsize = round_page(newsize); + + /* + * Find space for the new region. 
+ */ + + vm_map_lock(map); + kr = vm_map_find_entry(map, &newaddr, newsize, (vm_offset_t) 0, + VM_OBJECT_NULL, &newentry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(map); + return kr; + } + + /* + * Find the VM object backing the old region. + */ + + if (!vm_map_lookup_entry(map, oldmin, &oldentry)) + panic("kmem_realloc"); + object = oldentry->object.vm_object; + + /* + * Increase the size of the object and + * fill in the new region. + */ + + vm_object_reference(object); + vm_object_lock(object); + if (object->size != oldsize) + panic("kmem_realloc"); + object->size = newsize; + vm_object_unlock(object); + + newentry->object.vm_object = object; + newentry->offset = 0; + + /* + * Since we have not given out this address yet, + * it is safe to unlock the map. We are trusting + * that nobody will play with either region. + */ + + vm_map_unlock(map); + + /* + * Remap the pages in the old region and + * allocate more pages for the new region. + */ + + kmem_remap_pages(object, 0, + newaddr, newaddr + oldsize, + VM_PROT_DEFAULT); + kmem_alloc_pages(object, oldsize, + newaddr + oldsize, newaddr + newsize, + VM_PROT_DEFAULT); + + *newaddrp = newaddr; + return KERN_SUCCESS; +} + +/* + * kmem_alloc_wired: + * + * Allocate wired-down memory in the kernel's address map + * or a submap. The memory is not zero-filled. + * + * The memory is allocated in the kernel_object. + * It may not be copied with vm_map_copy, and + * it may not be reallocated with kmem_realloc. + */ + +kern_return_t +kmem_alloc_wired(map, addrp, size) + vm_map_t map; + vm_offset_t *addrp; + vm_size_t size; +{ + vm_map_entry_t entry; + vm_offset_t offset; + vm_offset_t addr; + kern_return_t kr; + + /* + * Use the kernel object for wired-down kernel pages. + * Assume that no region of the kernel object is + * referenced more than once. We want vm_map_find_entry + * to extend an existing entry if possible. + */ + + size = round_page(size); + vm_map_lock(map); + kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0, + kernel_object, &entry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(map); + return kr; + } + + /* + * Since we didn't know where the new region would + * start, we couldn't supply the correct offset into + * the kernel object. We only initialize the entry + * if we aren't extending an existing entry. + */ + + offset = addr - VM_MIN_KERNEL_ADDRESS; + + if (entry->object.vm_object == VM_OBJECT_NULL) { + vm_object_reference(kernel_object); + + entry->object.vm_object = kernel_object; + entry->offset = offset; + } + + /* + * Since we have not given out this address yet, + * it is safe to unlock the map. + */ + vm_map_unlock(map); + + /* + * Allocate wired-down memory in the kernel_object, + * for this entry, and enter it in the kernel pmap. + */ + kmem_alloc_pages(kernel_object, offset, + addr, addr + size, + VM_PROT_DEFAULT); + + /* + * Return the memory, not zeroed. + */ + *addrp = addr; + return KERN_SUCCESS; +} + +/* + * kmem_alloc_aligned: + * + * Like kmem_alloc_wired, except that the memory is aligned. + * The size should be a power-of-2. + */ + +kern_return_t +kmem_alloc_aligned(map, addrp, size) + vm_map_t map; + vm_offset_t *addrp; + vm_size_t size; +{ + vm_map_entry_t entry; + vm_offset_t offset; + vm_offset_t addr; + kern_return_t kr; + + if ((size & (size - 1)) != 0) + panic("kmem_alloc_aligned"); + + /* + * Use the kernel object for wired-down kernel pages. + * Assume that no region of the kernel object is + * referenced more than once. We want vm_map_find_entry + * to extend an existing entry if possible. 
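+	 *
+	 *	Alignment comes from passing (size - 1) as the mask to
+	 *	vm_map_find_entry, which rounds each candidate start
+	 *	address with ((start + mask) & ~mask).  For instance,
+	 *	for a size of 0x2000 the mask is 0x1fff, so a candidate
+	 *	start of 0x3000 is rounded up to 0x4000.  The
+	 *	(size & (size - 1)) check above rejects sizes that are
+	 *	not powers of two, since the mask trick only works for
+	 *	those.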
+ */ + + size = round_page(size); + vm_map_lock(map); + kr = vm_map_find_entry(map, &addr, size, size - 1, + kernel_object, &entry); + if (kr != KERN_SUCCESS) { + vm_map_unlock(map); + return kr; + } + + /* + * Since we didn't know where the new region would + * start, we couldn't supply the correct offset into + * the kernel object. We only initialize the entry + * if we aren't extending an existing entry. + */ + + offset = addr - VM_MIN_KERNEL_ADDRESS; + + if (entry->object.vm_object == VM_OBJECT_NULL) { + vm_object_reference(kernel_object); + + entry->object.vm_object = kernel_object; + entry->offset = offset; + } + + /* + * Since we have not given out this address yet, + * it is safe to unlock the map. + */ + vm_map_unlock(map); + + /* + * Allocate wired-down memory in the kernel_object, + * for this entry, and enter it in the kernel pmap. + */ + kmem_alloc_pages(kernel_object, offset, + addr, addr + size, + VM_PROT_DEFAULT); + + /* + * Return the memory, not zeroed. + */ + *addrp = addr; + return KERN_SUCCESS; +} + +/* + * kmem_alloc_pageable: + * + * Allocate pageable memory in the kernel's address map. + */ + +kern_return_t +kmem_alloc_pageable(map, addrp, size) + vm_map_t map; + vm_offset_t *addrp; + vm_size_t size; +{ + vm_offset_t addr; + kern_return_t kr; + + addr = vm_map_min(map); + kr = vm_map_enter(map, &addr, round_page(size), + (vm_offset_t) 0, TRUE, + VM_OBJECT_NULL, (vm_offset_t) 0, FALSE, + VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); + if (kr != KERN_SUCCESS) + return kr; + + *addrp = addr; + return KERN_SUCCESS; +} + +/* + * kmem_free: + * + * Release a region of kernel virtual memory allocated + * with kmem_alloc, kmem_alloc_wired, or kmem_alloc_pageable, + * and return the physical pages associated with that region. + */ + +void +kmem_free(map, addr, size) + vm_map_t map; + vm_offset_t addr; + vm_size_t size; +{ + kern_return_t kr; + + kr = vm_map_remove(map, trunc_page(addr), round_page(addr + size)); + if (kr != KERN_SUCCESS) + panic("kmem_free"); +} + +/* + * Allocate new wired pages in an object. + * The object is assumed to be mapped into the kernel map or + * a submap. + */ +void +kmem_alloc_pages(object, offset, start, end, protection) + register vm_object_t object; + register vm_offset_t offset; + register vm_offset_t start, end; + vm_prot_t protection; +{ + /* + * Mark the pmap region as not pageable. + */ + pmap_pageable(kernel_pmap, start, end, FALSE); + + while (start < end) { + register vm_page_t mem; + + vm_object_lock(object); + + /* + * Allocate a page + */ + while ((mem = vm_page_alloc(object, offset)) + == VM_PAGE_NULL) { + vm_object_unlock(object); + VM_PAGE_WAIT((void (*)()) 0); + vm_object_lock(object); + } + + /* + * Wire it down + */ + vm_page_lock_queues(); + vm_page_wire(mem); + vm_page_unlock_queues(); + vm_object_unlock(object); + + /* + * Enter it in the kernel pmap + */ + PMAP_ENTER(kernel_pmap, start, mem, + protection, TRUE); + + vm_object_lock(object); + PAGE_WAKEUP_DONE(mem); + vm_object_unlock(object); + + start += PAGE_SIZE; + offset += PAGE_SIZE; + } +} + +/* + * Remap wired pages in an object into a new region. + * The object is assumed to be mapped into the kernel map or + * a submap. + */ +void +kmem_remap_pages(object, offset, start, end, protection) + register vm_object_t object; + register vm_offset_t offset; + register vm_offset_t start, end; + vm_prot_t protection; +{ + /* + * Mark the pmap region as not pageable. 
+ */ + pmap_pageable(kernel_pmap, start, end, FALSE); + + while (start < end) { + register vm_page_t mem; + + vm_object_lock(object); + + /* + * Find a page + */ + if ((mem = vm_page_lookup(object, offset)) == VM_PAGE_NULL) + panic("kmem_remap_pages"); + + /* + * Wire it down (again) + */ + vm_page_lock_queues(); + vm_page_wire(mem); + vm_page_unlock_queues(); + vm_object_unlock(object); + + /* + * Enter it in the kernel pmap. The page isn't busy, + * but this shouldn't be a problem because it is wired. + */ + PMAP_ENTER(kernel_pmap, start, mem, + protection, TRUE); + + start += PAGE_SIZE; + offset += PAGE_SIZE; + } +} + +/* + * kmem_suballoc: + * + * Allocates a map to manage a subrange + * of the kernel virtual address space. + * + * Arguments are as follows: + * + * parent Map to take range from + * size Size of range to find + * min, max Returned endpoints of map + * pageable Can the region be paged + */ + +vm_map_t +kmem_suballoc(parent, min, max, size, pageable) + vm_map_t parent; + vm_offset_t *min, *max; + vm_size_t size; + boolean_t pageable; +{ + vm_map_t map; + vm_offset_t addr; + kern_return_t kr; + + size = round_page(size); + + /* + * Need reference on submap object because it is internal + * to the vm_system. vm_object_enter will never be called + * on it (usual source of reference for vm_map_enter). + */ + vm_object_reference(vm_submap_object); + + addr = (vm_offset_t) vm_map_min(parent); + kr = vm_map_enter(parent, &addr, size, + (vm_offset_t) 0, TRUE, + vm_submap_object, (vm_offset_t) 0, FALSE, + VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT); + if (kr != KERN_SUCCESS) + panic("kmem_suballoc"); + + pmap_reference(vm_map_pmap(parent)); + map = vm_map_create(vm_map_pmap(parent), addr, addr + size, pageable); + if (map == VM_MAP_NULL) + panic("kmem_suballoc"); + + kr = vm_map_submap(parent, addr, addr + size, map); + if (kr != KERN_SUCCESS) + panic("kmem_suballoc"); + + *min = addr; + *max = addr + size; + return map; +} + +/* + * kmem_init: + * + * Initialize the kernel's virtual memory map, taking + * into account all memory allocated up to this time. + */ +void kmem_init(start, end) + vm_offset_t start; + vm_offset_t end; +{ + kernel_map = vm_map_create(pmap_kernel(), + VM_MIN_KERNEL_ADDRESS, end, + FALSE); + + /* + * Reserve virtual memory allocated up to this time. + */ + + if (start != VM_MIN_KERNEL_ADDRESS) { + kern_return_t rc; + vm_offset_t addr = VM_MIN_KERNEL_ADDRESS; + rc = vm_map_enter(kernel_map, + &addr, start - VM_MIN_KERNEL_ADDRESS, + (vm_offset_t) 0, TRUE, + VM_OBJECT_NULL, (vm_offset_t) 0, FALSE, + VM_PROT_DEFAULT, VM_PROT_ALL, + VM_INHERIT_DEFAULT); + if (rc) + panic("%s:%d: vm_map_enter failed (%d)\n", rc); + } +} + +/* + * New and improved IO wiring support. + */ + +/* + * kmem_io_map_copyout: + * + * Establish temporary mapping in designated map for the memory + * passed in. Memory format must be a page_list vm_map_copy. + * Mapping is READ-ONLY. + */ + +kern_return_t +kmem_io_map_copyout(map, addr, alloc_addr, alloc_size, copy, min_size) + vm_map_t map; + vm_offset_t *addr; /* actual addr of data */ + vm_offset_t *alloc_addr; /* page aligned addr */ + vm_size_t *alloc_size; /* size allocated */ + vm_map_copy_t copy; + vm_size_t min_size; /* Do at least this much */ +{ + vm_offset_t myaddr, offset; + vm_size_t mysize, copy_size; + kern_return_t ret; + register + vm_page_t *page_list; + vm_map_copy_t new_copy; + register + int i; + + assert(copy->type == VM_MAP_COPY_PAGE_LIST); + assert(min_size != 0); + + /* + * Figure out the size in vm pages. 
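+	 *
+	 *	For example, with a hypothetical 4K page size, a
+	 *	copy->offset of 0x1200 and a min_size of 0x100:  the
+	 *	offset within the first page is 0x200, so min_size
+	 *	becomes 0x300 and rounds up to one page; if copy->size
+	 *	is 0x2600, mysize spans
+	 *	round_page(0x3800) - trunc_page(0x1200) = 0x3000,
+	 *	i.e. three pages.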
+	 */
+	min_size += copy->offset - trunc_page(copy->offset);
+	min_size = round_page(min_size);
+	mysize = round_page(copy->offset + copy->size) -
+		trunc_page(copy->offset);
+
+	/*
+	 *	If total size is larger than one page list and
+	 *	we don't have to do more than one page list, then
+	 *	only do one page list.
+	 *
+	 * XXX	Could be much smarter about this ... like trimming length
+	 * XXX	if we need more than one page list but not all of them.
+	 */
+
+	copy_size = ptoa(copy->cpy_npages);
+	if (mysize > copy_size && copy_size > min_size)
+		mysize = copy_size;
+
+	/*
+	 *	Allocate some address space in the map (must be kernel
+	 *	space).
+	 */
+	myaddr = vm_map_min(map);
+	ret = vm_map_enter(map, &myaddr, mysize,
+			   (vm_offset_t) 0, TRUE,
+			   VM_OBJECT_NULL, (vm_offset_t) 0, FALSE,
+			   VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
+
+	if (ret != KERN_SUCCESS)
+		return(ret);
+
+	/*
+	 *	Tell the pmap module that this will be wired, and
+	 *	enter the mappings.
+	 */
+	pmap_pageable(vm_map_pmap(map), myaddr, myaddr + mysize, TRUE);
+
+	*addr = myaddr + (copy->offset - trunc_page(copy->offset));
+	*alloc_addr = myaddr;
+	*alloc_size = mysize;
+
+	offset = myaddr;
+	page_list = &copy->cpy_page_list[0];
+	while (TRUE) {
+		for ( i = 0; i < copy->cpy_npages; i++, offset += PAGE_SIZE) {
+			PMAP_ENTER(vm_map_pmap(map), offset, *page_list,
+				   VM_PROT_READ, TRUE);
+			page_list++;
+		}
+
+		if (offset == (myaddr + mysize))
+			break;
+
+		/*
+		 *	Onward to the next page_list.  The extend_cont
+		 *	leaves the current page list's pages alone;
+		 *	they'll be cleaned up at discard.  Reset this
+		 *	copy's continuation to discard the next one.
+		 */
+		vm_map_copy_invoke_extend_cont(copy, &new_copy, &ret);
+
+		if (ret != KERN_SUCCESS) {
+			kmem_io_map_deallocate(map, myaddr, mysize);
+			return(ret);
+		}
+		copy->cpy_cont = vm_map_copy_discard_cont;
+		copy->cpy_cont_args = (char *) new_copy;
+		copy = new_copy;
+		page_list = &copy->cpy_page_list[0];
+	}
+
+	return(ret);
+}
+
+/*
+ *	kmem_io_map_deallocate:
+ *
+ *	Get rid of the mapping established by kmem_io_map_copyout.
+ *	Assumes that addr and size have been rounded to page boundaries.
+ *	(e.g., the alloc_addr and alloc_size returned by kmem_io_map_copyout)
+ */
+
+void
+kmem_io_map_deallocate(map, addr, size)
+	vm_map_t	map;
+	vm_offset_t	addr;
+	vm_size_t	size;
+{
+	/*
+	 *	Remove the mappings.  The pmap_remove is needed.
+	 */
+
+	pmap_remove(vm_map_pmap(map), addr, addr + size);
+	vm_map_remove(map, addr, addr + size);
+}
+
+/*
+ *	Routine:	copyinmap
+ *	Purpose:
+ *		Like copyin, except that fromaddr is an address
+ *		in the specified VM map.  This implementation
+ *		is incomplete; it handles the current user map
+ *		and the kernel map/submaps.
+ */
+
+int copyinmap(map, fromaddr, toaddr, length)
+	vm_map_t map;
+	char	*fromaddr, *toaddr;
+	int	length;
+{
+	if (vm_map_pmap(map) == kernel_pmap) {
+		/* assume a correct copy */
+		bcopy(fromaddr, toaddr, length);
+		return 0;
+	}
+
+	if (current_map() == map)
+		return copyin( fromaddr, toaddr, length);
+
+	return 1;
+}
+
+/*
+ *	Routine:	copyoutmap
+ *	Purpose:
+ *		Like copyout, except that toaddr is an address
+ *		in the specified VM map.  This implementation
+ *		is incomplete; it handles the current user map
+ *		and the kernel map/submaps.
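+ *
+ *		Both routines return 0 on success; a target map that
+ *		is neither the kernel map nor the current user map
+ *		yields 1.  Illustrative call (a sketch; "child_map",
+ *		"uaddr" and "val" are hypothetical):
+ *
+ *			if (copyoutmap(child_map, (char *) &val, uaddr,
+ *				       sizeof val) != 0)
+ *				goto copy_failed;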
+ */ + +int copyoutmap(map, fromaddr, toaddr, length) + vm_map_t map; + char *fromaddr, *toaddr; + int length; +{ + if (vm_map_pmap(map) == kernel_pmap) { + /* assume a correct copy */ + bcopy(fromaddr, toaddr, length); + return 0; + } + + if (current_map() == map) + return copyout(fromaddr, toaddr, length); + + return 1; +} diff --git a/vm/vm_kern.h b/vm/vm_kern.h new file mode 100644 index 00000000..8e00fcce --- /dev/null +++ b/vm/vm_kern.h @@ -0,0 +1,63 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_kern.h + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Kernel memory management definitions. + */ + +#ifndef _VM_VM_KERN_H_ +#define _VM_VM_KERN_H_ + +#include <mach/kern_return.h> +#include <vm/vm_map.h> + +extern kern_return_t projected_buffer_allocate(); +extern kern_return_t projected_buffer_deallocate(); +extern kern_return_t projected_buffer_map(); +extern kern_return_t projected_buffer_collect(); + +extern void kmem_init(); + +extern kern_return_t kmem_alloc(); +extern kern_return_t kmem_alloc_pageable(); +extern kern_return_t kmem_alloc_wired(); +extern kern_return_t kmem_alloc_aligned(); +extern kern_return_t kmem_realloc(); +extern void kmem_free(); + +extern vm_map_t kmem_suballoc(); + +extern kern_return_t kmem_io_map_copyout(); +extern void kmem_io_map_deallocate(); + +extern vm_map_t kernel_map; +extern vm_map_t kernel_pageable_map; +extern vm_map_t ipc_kernel_map; + +#endif _VM_VM_KERN_H_ diff --git a/vm/vm_map.c b/vm/vm_map.c new file mode 100644 index 00000000..c71b8580 --- /dev/null +++ b/vm/vm_map.c @@ -0,0 +1,5244 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. 
+ * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_map.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Virtual memory mapping module. + */ + +#include <norma_ipc.h> + +#include <mach/kern_return.h> +#include <mach/port.h> +#include <mach/vm_attributes.h> +#include <mach/vm_param.h> +#include <kern/assert.h> +#include <kern/zalloc.h> +#include <vm/vm_fault.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> +#include <ipc/ipc_port.h> + +/* + * Macros to copy a vm_map_entry. We must be careful to correctly + * manage the wired page count. vm_map_entry_copy() creates a new + * map entry to the same memory - the wired count in the new entry + * must be set to zero. vm_map_entry_copy_full() creates a new + * entry that is identical to the old entry. This preserves the + * wire count; it's used for map splitting and zone changing in + * vm_map_copyout. + */ +#define vm_map_entry_copy(NEW,OLD) \ +MACRO_BEGIN \ + *(NEW) = *(OLD); \ + (NEW)->is_shared = FALSE; \ + (NEW)->needs_wakeup = FALSE; \ + (NEW)->in_transition = FALSE; \ + (NEW)->wired_count = 0; \ + (NEW)->user_wired_count = 0; \ +MACRO_END + +#define vm_map_entry_copy_full(NEW,OLD) (*(NEW) = *(OLD)) + +/* + * Virtual memory maps provide for the mapping, protection, + * and sharing of virtual memory objects. In addition, + * this module provides for an efficient virtual copy of + * memory from one map to another. + * + * Synchronization is required prior to most operations. + * + * Maps consist of an ordered doubly-linked list of simple + * entries; a single hint is used to speed up lookups. + * + * Sharing maps have been deleted from this version of Mach. + * All shared objects are now mapped directly into the respective + * maps. This requires a change in the copy on write strategy; + * the asymmetric (delayed) strategy is used for shared temporary + * objects instead of the symmetric (shadow) strategy. This is + * selected by the (new) use_shared_copy bit in the object. See + * vm_object_copy_temporary in vm_object.c for details. All maps + * are now "top level" maps (either task map, kernel map or submap + * of the kernel map). + * + * Since portions of maps are specified by start/end addreses, + * which may not align with existing map entries, all + * routines merely "clip" entries to these start/end values. + * [That is, an entry is split into two, bordering at a + * start or end value.] Note that these clippings may not + * always be necessary (as the two resulting entries are then + * not changed); however, the clipping is done for convenience. + * No attempt is currently made to "glue back together" two + * abutting entries. + * + * The symmetric (shadow) copy strategy implements virtual copy + * by copying VM object references from one map to + * another, and then marking both regions as copy-on-write. + * It is important to note that only one writeable reference + * to a VM object region exists in any map when this strategy + * is used -- this means that shadow object creation can be + * delayed until a write operation occurs. 
The asymmetric (delayed) + * strategy allows multiple maps to have writeable references to + * the same region of a vm object, and hence cannot delay creating + * its copy objects. See vm_object_copy_temporary() in vm_object.c. + * Copying of permanent objects is completely different; see + * vm_object_copy_strategically() in vm_object.c. + */ + +zone_t vm_map_zone; /* zone for vm_map structures */ +zone_t vm_map_entry_zone; /* zone for vm_map_entry structures */ +zone_t vm_map_kentry_zone; /* zone for kernel entry structures */ +zone_t vm_map_copy_zone; /* zone for vm_map_copy structures */ + +boolean_t vm_map_lookup_entry(); /* forward declaration */ + +/* + * Placeholder object for submap operations. This object is dropped + * into the range by a call to vm_map_find, and removed when + * vm_map_submap creates the submap. + */ + +vm_object_t vm_submap_object; + +/* + * vm_map_init: + * + * Initialize the vm_map module. Must be called before + * any other vm_map routines. + * + * Map and entry structures are allocated from zones -- we must + * initialize those zones. + * + * There are three zones of interest: + * + * vm_map_zone: used to allocate maps. + * vm_map_entry_zone: used to allocate map entries. + * vm_map_kentry_zone: used to allocate map entries for the kernel. + * + * The kernel allocates map entries from a special zone that is initially + * "crammed" with memory. It would be difficult (perhaps impossible) for + * the kernel to allocate more memory to a entry zone when it became + * empty since the very act of allocating memory implies the creation + * of a new entry. + */ + +vm_offset_t kentry_data; +vm_size_t kentry_data_size; +int kentry_count = 256; /* to init kentry_data_size */ + +void vm_map_init() +{ + vm_map_zone = zinit((vm_size_t) sizeof(struct vm_map), 40*1024, + PAGE_SIZE, 0, "maps"); + vm_map_entry_zone = zinit((vm_size_t) sizeof(struct vm_map_entry), + 1024*1024, PAGE_SIZE*5, + 0, "non-kernel map entries"); + vm_map_kentry_zone = zinit((vm_size_t) sizeof(struct vm_map_entry), + kentry_data_size, kentry_data_size, + ZONE_FIXED /* XXX */, "kernel map entries"); + + vm_map_copy_zone = zinit((vm_size_t) sizeof(struct vm_map_copy), + 16*1024, PAGE_SIZE, 0, + "map copies"); + + /* + * Cram the kentry zone with initial data. + */ + zcram(vm_map_kentry_zone, kentry_data, kentry_data_size); + + /* + * Submap object is initialized by vm_object_init. + */ +} + +/* + * vm_map_create: + * + * Creates and returns a new empty VM map with + * the given physical map structure, and having + * the given lower and upper address bounds. 
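+ *
+ *	For example, kmem_init (vm/vm_kern.c) creates the kernel map
+ *	this way:
+ *
+ *		kernel_map = vm_map_create(pmap_kernel(),
+ *					   VM_MIN_KERNEL_ADDRESS, end,
+ *					   FALSE);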
+ */ +vm_map_t vm_map_create(pmap, min, max, pageable) + pmap_t pmap; + vm_offset_t min, max; + boolean_t pageable; +{ + register vm_map_t result; + + result = (vm_map_t) zalloc(vm_map_zone); + if (result == VM_MAP_NULL) + panic("vm_map_create"); + + vm_map_first_entry(result) = vm_map_to_entry(result); + vm_map_last_entry(result) = vm_map_to_entry(result); + result->hdr.nentries = 0; + result->hdr.entries_pageable = pageable; + + result->size = 0; + result->ref_count = 1; + result->pmap = pmap; + result->min_offset = min; + result->max_offset = max; + result->wiring_required = FALSE; + result->wait_for_space = FALSE; + result->first_free = vm_map_to_entry(result); + result->hint = vm_map_to_entry(result); + vm_map_lock_init(result); + simple_lock_init(&result->ref_lock); + simple_lock_init(&result->hint_lock); + + return(result); +} + +/* + * vm_map_entry_create: [ internal use only ] + * + * Allocates a VM map entry for insertion in the + * given map (or map copy). No fields are filled. + */ +#define vm_map_entry_create(map) \ + _vm_map_entry_create(&(map)->hdr) + +#define vm_map_copy_entry_create(copy) \ + _vm_map_entry_create(&(copy)->cpy_hdr) + +vm_map_entry_t _vm_map_entry_create(map_header) + register struct vm_map_header *map_header; +{ + register zone_t zone; + register vm_map_entry_t entry; + + if (map_header->entries_pageable) + zone = vm_map_entry_zone; + else + zone = vm_map_kentry_zone; + + entry = (vm_map_entry_t) zalloc(zone); + if (entry == VM_MAP_ENTRY_NULL) + panic("vm_map_entry_create"); + + return(entry); +} + +/* + * vm_map_entry_dispose: [ internal use only ] + * + * Inverse of vm_map_entry_create. + */ +#define vm_map_entry_dispose(map, entry) \ + _vm_map_entry_dispose(&(map)->hdr, (entry)) + +#define vm_map_copy_entry_dispose(map, entry) \ + _vm_map_entry_dispose(&(copy)->cpy_hdr, (entry)) + +void _vm_map_entry_dispose(map_header, entry) + register struct vm_map_header *map_header; + register vm_map_entry_t entry; +{ + register zone_t zone; + + if (map_header->entries_pageable) + zone = vm_map_entry_zone; + else + zone = vm_map_kentry_zone; + + zfree(zone, (vm_offset_t) entry); +} + +/* + * vm_map_entry_{un,}link: + * + * Insert/remove entries from maps (or map copies). + */ +#define vm_map_entry_link(map, after_where, entry) \ + _vm_map_entry_link(&(map)->hdr, after_where, entry) + +#define vm_map_copy_entry_link(copy, after_where, entry) \ + _vm_map_entry_link(&(copy)->cpy_hdr, after_where, entry) + +#define _vm_map_entry_link(hdr, after_where, entry) \ + MACRO_BEGIN \ + (hdr)->nentries++; \ + (entry)->vme_prev = (after_where); \ + (entry)->vme_next = (after_where)->vme_next; \ + (entry)->vme_prev->vme_next = \ + (entry)->vme_next->vme_prev = (entry); \ + MACRO_END + +#define vm_map_entry_unlink(map, entry) \ + _vm_map_entry_unlink(&(map)->hdr, entry) + +#define vm_map_copy_entry_unlink(copy, entry) \ + _vm_map_entry_unlink(&(copy)->cpy_hdr, entry) + +#define _vm_map_entry_unlink(hdr, entry) \ + MACRO_BEGIN \ + (hdr)->nentries--; \ + (entry)->vme_next->vme_prev = (entry)->vme_prev; \ + (entry)->vme_prev->vme_next = (entry)->vme_next; \ + MACRO_END + +/* + * vm_map_reference: + * + * Creates another valid reference to the given map. + * + */ +void vm_map_reference(map) + register vm_map_t map; +{ + if (map == VM_MAP_NULL) + return; + + simple_lock(&map->ref_lock); + map->ref_count++; + simple_unlock(&map->ref_lock); +} + +/* + * vm_map_deallocate: + * + * Removes a reference from the specified map, + * destroying it if no references remain. 
+ * The map should not be locked. + */ +void vm_map_deallocate(map) + register vm_map_t map; +{ + register int c; + + if (map == VM_MAP_NULL) + return; + + simple_lock(&map->ref_lock); + c = --map->ref_count; + simple_unlock(&map->ref_lock); + + if (c > 0) { + return; + } + + projected_buffer_collect(map); + (void) vm_map_delete(map, map->min_offset, map->max_offset); + + pmap_destroy(map->pmap); + + zfree(vm_map_zone, (vm_offset_t) map); +} + +/* + * SAVE_HINT: + * + * Saves the specified entry as the hint for + * future lookups. Performs necessary interlocks. + */ +#define SAVE_HINT(map,value) \ + simple_lock(&(map)->hint_lock); \ + (map)->hint = (value); \ + simple_unlock(&(map)->hint_lock); + +/* + * vm_map_lookup_entry: [ internal use only ] + * + * Finds the map entry containing (or + * immediately preceding) the specified address + * in the given map; the entry is returned + * in the "entry" parameter. The boolean + * result indicates whether the address is + * actually contained in the map. + */ +boolean_t vm_map_lookup_entry(map, address, entry) + register vm_map_t map; + register vm_offset_t address; + vm_map_entry_t *entry; /* OUT */ +{ + register vm_map_entry_t cur; + register vm_map_entry_t last; + + /* + * Start looking either from the head of the + * list, or from the hint. + */ + + simple_lock(&map->hint_lock); + cur = map->hint; + simple_unlock(&map->hint_lock); + + if (cur == vm_map_to_entry(map)) + cur = cur->vme_next; + + if (address >= cur->vme_start) { + /* + * Go from hint to end of list. + * + * But first, make a quick check to see if + * we are already looking at the entry we + * want (which is usually the case). + * Note also that we don't need to save the hint + * here... it is the same hint (unless we are + * at the header, in which case the hint didn't + * buy us anything anyway). + */ + last = vm_map_to_entry(map); + if ((cur != last) && (cur->vme_end > address)) { + *entry = cur; + return(TRUE); + } + } + else { + /* + * Go from start to hint, *inclusively* + */ + last = cur->vme_next; + cur = vm_map_first_entry(map); + } + + /* + * Search linearly + */ + + while (cur != last) { + if (cur->vme_end > address) { + if (address >= cur->vme_start) { + /* + * Save this lookup for future + * hints, and return + */ + + *entry = cur; + SAVE_HINT(map, cur); + return(TRUE); + } + break; + } + cur = cur->vme_next; + } + *entry = cur->vme_prev; + SAVE_HINT(map, *entry); + return(FALSE); +} + +/* + * Routine: invalid_user_access + * + * Verifies whether user access is valid. + */ + +boolean_t +invalid_user_access(map, start, end, prot) + vm_map_t map; + vm_offset_t start, end; + vm_prot_t prot; +{ + vm_map_entry_t entry; + + return (map == VM_MAP_NULL || map == kernel_map || + !vm_map_lookup_entry(map, start, &entry) || + entry->vme_end < end || + (prot & ~(entry->protection))); +} + + +/* + * Routine: vm_map_find_entry + * Purpose: + * Allocate a range in the specified virtual address map, + * returning the entry allocated for that range. + * Used by kmem_alloc, etc. Returns wired entries. + * + * The map must be locked. + * + * If an entry is allocated, the object/offset fields + * are initialized to zero. If an object is supplied, + * then an existing entry may be extended. 
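+ *
+ *	Typical use, abridged from kmem_alloc (vm/vm_kern.c):
+ *
+ *		vm_map_lock(map);
+ *		kr = vm_map_find_entry(map, &addr, size, (vm_offset_t) 0,
+ *				       VM_OBJECT_NULL, &entry);
+ *		if (kr != KERN_SUCCESS) {
+ *			vm_map_unlock(map);
+ *			return kr;
+ *		}
+ *		entry->object.vm_object = object;
+ *		entry->offset = 0;
+ *		vm_map_unlock(map);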
+ */ +kern_return_t vm_map_find_entry(map, address, size, mask, object, o_entry) + register vm_map_t map; + vm_offset_t *address; /* OUT */ + vm_size_t size; + vm_offset_t mask; + vm_object_t object; + vm_map_entry_t *o_entry; /* OUT */ +{ + register vm_map_entry_t entry, new_entry; + register vm_offset_t start; + register vm_offset_t end; + + /* + * Look for the first possible address; + * if there's already something at this + * address, we have to start after it. + */ + + if ((entry = map->first_free) == vm_map_to_entry(map)) + start = map->min_offset; + else + start = entry->vme_end; + + /* + * In any case, the "entry" always precedes + * the proposed new region throughout the loop: + */ + + while (TRUE) { + register vm_map_entry_t next; + + /* + * Find the end of the proposed new region. + * Be sure we didn't go beyond the end, or + * wrap around the address. + */ + + if (((start + mask) & ~mask) < start) + return(KERN_NO_SPACE); + start = ((start + mask) & ~mask); + end = start + size; + + if ((end > map->max_offset) || (end < start)) + return(KERN_NO_SPACE); + + /* + * If there are no more entries, we must win. + */ + + next = entry->vme_next; + if (next == vm_map_to_entry(map)) + break; + + /* + * If there is another entry, it must be + * after the end of the potential new region. + */ + + if (next->vme_start >= end) + break; + + /* + * Didn't fit -- move to the next entry. + */ + + entry = next; + start = entry->vme_end; + } + + /* + * At this point, + * "start" and "end" should define the endpoints of the + * available new range, and + * "entry" should refer to the region before the new + * range, and + * + * the map should be locked. + */ + + *address = start; + + /* + * See whether we can avoid creating a new entry by + * extending one of our neighbors. [So far, we only attempt to + * extend from below.] + */ + + if ((object != VM_OBJECT_NULL) && + (entry != vm_map_to_entry(map)) && + (entry->vme_end == start) && + (!entry->is_shared) && + (!entry->is_sub_map) && + (entry->object.vm_object == object) && + (entry->needs_copy == FALSE) && + (entry->inheritance == VM_INHERIT_DEFAULT) && + (entry->protection == VM_PROT_DEFAULT) && + (entry->max_protection == VM_PROT_ALL) && + (entry->wired_count == 1) && + (entry->user_wired_count == 0) && + (entry->projected_on == 0)) { + /* + * Because this is a special case, + * we don't need to use vm_object_coalesce. 
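+		 *	The checks above guarantee that the neighboring
+		 *	entry ends exactly at "start", maps the same
+		 *	object with default protection and inheritance,
+		 *	is wired exactly once, and is neither shared nor
+		 *	a submap, so it can simply be extended to cover
+		 *	the new range.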
+ */ + + entry->vme_end = end; + new_entry = entry; + } else { + new_entry = vm_map_entry_create(map); + + new_entry->vme_start = start; + new_entry->vme_end = end; + + new_entry->is_shared = FALSE; + new_entry->is_sub_map = FALSE; + new_entry->object.vm_object = VM_OBJECT_NULL; + new_entry->offset = (vm_offset_t) 0; + + new_entry->needs_copy = FALSE; + + new_entry->inheritance = VM_INHERIT_DEFAULT; + new_entry->protection = VM_PROT_DEFAULT; + new_entry->max_protection = VM_PROT_ALL; + new_entry->wired_count = 1; + new_entry->user_wired_count = 0; + + new_entry->in_transition = FALSE; + new_entry->needs_wakeup = FALSE; + new_entry->projected_on = 0; + + /* + * Insert the new entry into the list + */ + + vm_map_entry_link(map, entry, new_entry); + } + + map->size += size; + + /* + * Update the free space hint and the lookup hint + */ + + map->first_free = new_entry; + SAVE_HINT(map, new_entry); + + *o_entry = new_entry; + return(KERN_SUCCESS); +} + +int vm_map_pmap_enter_print = FALSE; +int vm_map_pmap_enter_enable = FALSE; + +/* + * Routine: vm_map_pmap_enter + * + * Description: + * Force pages from the specified object to be entered into + * the pmap at the specified address if they are present. + * As soon as a page not found in the object the scan ends. + * + * Returns: + * Nothing. + * + * In/out conditions: + * The source map should not be locked on entry. + */ +void +vm_map_pmap_enter(map, addr, end_addr, object, offset, protection) + vm_map_t map; + register + vm_offset_t addr; + register + vm_offset_t end_addr; + register + vm_object_t object; + vm_offset_t offset; + vm_prot_t protection; +{ + while (addr < end_addr) { + register vm_page_t m; + + vm_object_lock(object); + vm_object_paging_begin(object); + + m = vm_page_lookup(object, offset); + if (m == VM_PAGE_NULL || m->absent) { + vm_object_paging_end(object); + vm_object_unlock(object); + return; + } + + if (vm_map_pmap_enter_print) { + printf("vm_map_pmap_enter:"); + printf("map: %x, addr: %x, object: %x, offset: %x\n", + map, addr, object, offset); + } + + m->busy = TRUE; + vm_object_unlock(object); + + PMAP_ENTER(map->pmap, addr, m, + protection, FALSE); + + vm_object_lock(object); + PAGE_WAKEUP_DONE(m); + vm_page_lock_queues(); + if (!m->active && !m->inactive) + vm_page_activate(m); + vm_page_unlock_queues(); + vm_object_paging_end(object); + vm_object_unlock(object); + + offset += PAGE_SIZE; + addr += PAGE_SIZE; + } +} + +/* + * Routine: vm_map_enter + * + * Description: + * Allocate a range in the specified virtual address map. + * The resulting range will refer to memory defined by + * the given memory object and offset into that object. + * + * Arguments are as defined in the vm_map call. + */ +kern_return_t vm_map_enter( + map, + address, size, mask, anywhere, + object, offset, needs_copy, + cur_protection, max_protection, inheritance) + register + vm_map_t map; + vm_offset_t *address; /* IN/OUT */ + vm_size_t size; + vm_offset_t mask; + boolean_t anywhere; + vm_object_t object; + vm_offset_t offset; + boolean_t needs_copy; + vm_prot_t cur_protection; + vm_prot_t max_protection; + vm_inherit_t inheritance; +{ + register vm_map_entry_t entry; + register vm_offset_t start; + register vm_offset_t end; + kern_return_t result = KERN_SUCCESS; + +#define RETURN(value) { result = value; goto BailOut; } + + StartAgain: ; + + start = *address; + + if (anywhere) { + vm_map_lock(map); + + /* + * Calculate the first possible address. 
+ */ + + if (start < map->min_offset) + start = map->min_offset; + if (start > map->max_offset) + RETURN(KERN_NO_SPACE); + + /* + * Look for the first possible address; + * if there's already something at this + * address, we have to start after it. + */ + + if (start == map->min_offset) { + if ((entry = map->first_free) != vm_map_to_entry(map)) + start = entry->vme_end; + } else { + vm_map_entry_t tmp_entry; + if (vm_map_lookup_entry(map, start, &tmp_entry)) + start = tmp_entry->vme_end; + entry = tmp_entry; + } + + /* + * In any case, the "entry" always precedes + * the proposed new region throughout the + * loop: + */ + + while (TRUE) { + register vm_map_entry_t next; + + /* + * Find the end of the proposed new region. + * Be sure we didn't go beyond the end, or + * wrap around the address. + */ + + if (((start + mask) & ~mask) < start) + return(KERN_NO_SPACE); + start = ((start + mask) & ~mask); + end = start + size; + + if ((end > map->max_offset) || (end < start)) { + if (map->wait_for_space) { + if (size <= (map->max_offset - + map->min_offset)) { + assert_wait((event_t) map, TRUE); + vm_map_unlock(map); + thread_block((void (*)()) 0); + goto StartAgain; + } + } + + RETURN(KERN_NO_SPACE); + } + + /* + * If there are no more entries, we must win. + */ + + next = entry->vme_next; + if (next == vm_map_to_entry(map)) + break; + + /* + * If there is another entry, it must be + * after the end of the potential new region. + */ + + if (next->vme_start >= end) + break; + + /* + * Didn't fit -- move to the next entry. + */ + + entry = next; + start = entry->vme_end; + } + *address = start; + } else { + vm_map_entry_t temp_entry; + + /* + * Verify that: + * the address doesn't itself violate + * the mask requirement. + */ + + if ((start & mask) != 0) + return(KERN_NO_SPACE); + + vm_map_lock(map); + + /* + * ... the address is within bounds + */ + + end = start + size; + + if ((start < map->min_offset) || + (end > map->max_offset) || + (start >= end)) { + RETURN(KERN_INVALID_ADDRESS); + } + + /* + * ... the starting address isn't allocated + */ + + if (vm_map_lookup_entry(map, start, &temp_entry)) + RETURN(KERN_NO_SPACE); + + entry = temp_entry; + + /* + * ... the next region doesn't overlap the + * end point. + */ + + if ((entry->vme_next != vm_map_to_entry(map)) && + (entry->vme_next->vme_start < end)) + RETURN(KERN_NO_SPACE); + } + + /* + * At this point, + * "start" and "end" should define the endpoints of the + * available new range, and + * "entry" should refer to the region before the new + * range, and + * + * the map should be locked. + */ + + /* + * See whether we can avoid creating a new entry (and object) by + * extending one of our neighbors. [So far, we only attempt to + * extend from below.] + */ + + if ((object == VM_OBJECT_NULL) && + (entry != vm_map_to_entry(map)) && + (entry->vme_end == start) && + (!entry->is_shared) && + (!entry->is_sub_map) && + (entry->inheritance == inheritance) && + (entry->protection == cur_protection) && + (entry->max_protection == max_protection) && + (entry->wired_count == 0) && /* implies user_wired_count == 0 */ + (entry->projected_on == 0)) { + if (vm_object_coalesce(entry->object.vm_object, + VM_OBJECT_NULL, + entry->offset, + (vm_offset_t) 0, + (vm_size_t)(entry->vme_end - entry->vme_start), + (vm_size_t)(end - entry->vme_end))) { + + /* + * Coalesced the two objects - can extend + * the previous map entry to include the + * new range. 
+ */ + map->size += (end - entry->vme_end); + entry->vme_end = end; + RETURN(KERN_SUCCESS); + } + } + + /* + * Create a new entry + */ + + /**/ { + register vm_map_entry_t new_entry; + + new_entry = vm_map_entry_create(map); + + new_entry->vme_start = start; + new_entry->vme_end = end; + + new_entry->is_shared = FALSE; + new_entry->is_sub_map = FALSE; + new_entry->object.vm_object = object; + new_entry->offset = offset; + + new_entry->needs_copy = needs_copy; + + new_entry->inheritance = inheritance; + new_entry->protection = cur_protection; + new_entry->max_protection = max_protection; + new_entry->wired_count = 0; + new_entry->user_wired_count = 0; + + new_entry->in_transition = FALSE; + new_entry->needs_wakeup = FALSE; + new_entry->projected_on = 0; + + /* + * Insert the new entry into the list + */ + + vm_map_entry_link(map, entry, new_entry); + map->size += size; + + /* + * Update the free space hint and the lookup hint + */ + + if ((map->first_free == entry) && + ((entry == vm_map_to_entry(map) ? map->min_offset : entry->vme_end) + >= new_entry->vme_start)) + map->first_free = new_entry; + + SAVE_HINT(map, new_entry); + + vm_map_unlock(map); + + if ((object != VM_OBJECT_NULL) && + (vm_map_pmap_enter_enable) && + (!anywhere) && + (!needs_copy) && + (size < (128*1024))) { + vm_map_pmap_enter(map, start, end, + object, offset, cur_protection); + } + + return(result); + /**/ } + + BailOut: ; + + vm_map_unlock(map); + return(result); + +#undef RETURN +} + +/* + * vm_map_clip_start: [ internal use only ] + * + * Asserts that the given entry begins at or after + * the specified address; if necessary, + * it splits the entry into two. + */ +void _vm_map_clip_start(); +#define vm_map_clip_start(map, entry, startaddr) \ + MACRO_BEGIN \ + if ((startaddr) > (entry)->vme_start) \ + _vm_map_clip_start(&(map)->hdr,(entry),(startaddr)); \ + MACRO_END + +void _vm_map_copy_clip_start(); +#define vm_map_copy_clip_start(copy, entry, startaddr) \ + MACRO_BEGIN \ + if ((startaddr) > (entry)->vme_start) \ + _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \ + MACRO_END + +/* + * This routine is called only when it is known that + * the entry must be split. + */ +void _vm_map_clip_start(map_header, entry, start) + register struct vm_map_header *map_header; + register vm_map_entry_t entry; + register vm_offset_t start; +{ + register vm_map_entry_t new_entry; + + /* + * Split off the front portion -- + * note that we must insert the new + * entry BEFORE this one, so that + * this entry has the specified starting + * address. + */ + + new_entry = _vm_map_entry_create(map_header); + vm_map_entry_copy_full(new_entry, entry); + + new_entry->vme_end = start; + entry->offset += (start - entry->vme_start); + entry->vme_start = start; + + _vm_map_entry_link(map_header, entry->vme_prev, new_entry); + + if (entry->is_sub_map) + vm_map_reference(new_entry->object.sub_map); + else + vm_object_reference(new_entry->object.vm_object); +} + +/* + * vm_map_clip_end: [ internal use only ] + * + * Asserts that the given entry ends at or before + * the specified address; if necessary, + * it splits the entry into two. 
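+ *
+ *	For example, clipping an entry that covers [0x1000, 0x5000)
+ *	at an end address of 0x3000 shrinks it to [0x1000, 0x3000)
+ *	and links a new entry for [0x3000, 0x5000) after it, with the
+ *	new entry's object offset advanced by 0x2000 and an extra
+ *	reference taken on the backing object (or submap).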
+ */ +void _vm_map_clip_end(); +#define vm_map_clip_end(map, entry, endaddr) \ + MACRO_BEGIN \ + if ((endaddr) < (entry)->vme_end) \ + _vm_map_clip_end(&(map)->hdr,(entry),(endaddr)); \ + MACRO_END + +void _vm_map_copy_clip_end(); +#define vm_map_copy_clip_end(copy, entry, endaddr) \ + MACRO_BEGIN \ + if ((endaddr) < (entry)->vme_end) \ + _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \ + MACRO_END + +/* + * This routine is called only when it is known that + * the entry must be split. + */ +void _vm_map_clip_end(map_header, entry, end) + register struct vm_map_header *map_header; + register vm_map_entry_t entry; + register vm_offset_t end; +{ + register vm_map_entry_t new_entry; + + /* + * Create a new entry and insert it + * AFTER the specified entry + */ + + new_entry = _vm_map_entry_create(map_header); + vm_map_entry_copy_full(new_entry, entry); + + new_entry->vme_start = entry->vme_end = end; + new_entry->offset += (end - entry->vme_start); + + _vm_map_entry_link(map_header, entry, new_entry); + + if (entry->is_sub_map) + vm_map_reference(new_entry->object.sub_map); + else + vm_object_reference(new_entry->object.vm_object); +} + +/* + * VM_MAP_RANGE_CHECK: [ internal use only ] + * + * Asserts that the starting and ending region + * addresses fall within the valid range of the map. + */ +#define VM_MAP_RANGE_CHECK(map, start, end) \ + { \ + if (start < vm_map_min(map)) \ + start = vm_map_min(map); \ + if (end > vm_map_max(map)) \ + end = vm_map_max(map); \ + if (start > end) \ + start = end; \ + } + +/* + * vm_map_submap: [ kernel use only ] + * + * Mark the given range as handled by a subordinate map. + * + * This range must have been created with vm_map_find using + * the vm_submap_object, and no other operations may have been + * performed on this range prior to calling vm_map_submap. + * + * Only a limited number of operations can be performed + * within this rage after calling vm_map_submap: + * vm_fault + * [Don't try vm_map_copyin!] + * + * To remove a submapping, one must first remove the + * range from the superior map, and then destroy the + * submap (if desired). [Better yet, don't try it.] + */ +kern_return_t vm_map_submap(map, start, end, submap) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; + vm_map_t submap; +{ + vm_map_entry_t entry; + register kern_return_t result = KERN_INVALID_ARGUMENT; + register vm_object_t object; + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (vm_map_lookup_entry(map, start, &entry)) { + vm_map_clip_start(map, entry, start); + } + else + entry = entry->vme_next; + + vm_map_clip_end(map, entry, end); + + if ((entry->vme_start == start) && (entry->vme_end == end) && + (!entry->is_sub_map) && + ((object = entry->object.vm_object) == vm_submap_object) && + (object->resident_page_count == 0) && + (object->copy == VM_OBJECT_NULL) && + (object->shadow == VM_OBJECT_NULL) && + (!object->pager_created)) { + entry->object.vm_object = VM_OBJECT_NULL; + vm_object_deallocate(object); + entry->is_sub_map = TRUE; + vm_map_reference(entry->object.sub_map = submap); + result = KERN_SUCCESS; + } + vm_map_unlock(map); + + return(result); +} + +/* + * vm_map_protect: + * + * Sets the protection of the specified address + * region in the target map. If "set_max" is + * specified, the maximum protection is to be set; + * otherwise, only the current protection is affected. 
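+ *
+ *	Illustrative call (a sketch; the address range is
+ *	hypothetical):
+ *
+ *		kr = vm_map_protect(map, start, start + size,
+ *				    VM_PROT_READ, FALSE);
+ *
+ *	makes the range read-only without touching its maximum
+ *	protection; passing TRUE for set_max also lowers the maximum
+ *	protection, and the current protection is then masked with
+ *	its old value.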
+ */ +kern_return_t vm_map_protect(map, start, end, new_prot, set_max) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; + register vm_prot_t new_prot; + register boolean_t set_max; +{ + register vm_map_entry_t current; + vm_map_entry_t entry; + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (vm_map_lookup_entry(map, start, &entry)) { + vm_map_clip_start(map, entry, start); + } + else + entry = entry->vme_next; + + /* + * Make a first pass to check for protection + * violations. + */ + + current = entry; + while ((current != vm_map_to_entry(map)) && + (current->vme_start < end)) { + + if (current->is_sub_map) { + vm_map_unlock(map); + return(KERN_INVALID_ARGUMENT); + } + if ((new_prot & (VM_PROT_NOTIFY | current->max_protection)) + != new_prot) { + vm_map_unlock(map); + return(KERN_PROTECTION_FAILURE); + } + + current = current->vme_next; + } + + /* + * Go back and fix up protections. + * [Note that clipping is not necessary the second time.] + */ + + current = entry; + + while ((current != vm_map_to_entry(map)) && + (current->vme_start < end)) { + + vm_prot_t old_prot; + + vm_map_clip_end(map, current, end); + + old_prot = current->protection; + if (set_max) + current->protection = + (current->max_protection = new_prot) & + old_prot; + else + current->protection = new_prot; + + /* + * Update physical map if necessary. + */ + + if (current->protection != old_prot) { + pmap_protect(map->pmap, current->vme_start, + current->vme_end, + current->protection); + } + current = current->vme_next; + } + + vm_map_unlock(map); + return(KERN_SUCCESS); +} + +/* + * vm_map_inherit: + * + * Sets the inheritance of the specified address + * range in the target map. Inheritance + * affects how the map will be shared with + * child maps at the time of vm_map_fork. + */ +kern_return_t vm_map_inherit(map, start, end, new_inheritance) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; + register vm_inherit_t new_inheritance; +{ + register vm_map_entry_t entry; + vm_map_entry_t temp_entry; + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (vm_map_lookup_entry(map, start, &temp_entry)) { + entry = temp_entry; + vm_map_clip_start(map, entry, start); + } + else + entry = temp_entry->vme_next; + + while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { + vm_map_clip_end(map, entry, end); + + entry->inheritance = new_inheritance; + + entry = entry->vme_next; + } + + vm_map_unlock(map); + return(KERN_SUCCESS); +} + +/* + * vm_map_pageable_common: + * + * Sets the pageability of the specified address + * range in the target map. Regions specified + * as not pageable require locked-down physical + * memory and physical page maps. access_type indicates + * types of accesses that must not generate page faults. + * This is checked against protection of memory being locked-down. + * access_type of VM_PROT_NONE makes memory pageable. + * + * The map must not be locked, but a reference + * must remain to the map throughout the call. + * + * Callers should use macros in vm/vm_map.h (i.e. vm_map_pageable, + * or vm_map_pageable_user); don't call vm_map_pageable directly. 
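+ *
+ *	A minimal sketch of the intended use through those wrappers
+ *	(illustrative only; the macro argument order is assumed to
+ *	match this routine minus the user_wire flag):
+ *
+ *		kr = vm_map_pageable(map, start, end,
+ *				     VM_PROT_READ|VM_PROT_WRITE);	[wire]
+ *		kr = vm_map_pageable(map, start, end, VM_PROT_NONE);	[unwire]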
+ */ +kern_return_t vm_map_pageable_common(map, start, end, access_type, user_wire) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; + register vm_prot_t access_type; + boolean_t user_wire; +{ + register vm_map_entry_t entry; + vm_map_entry_t start_entry; + + vm_map_lock(map); + + VM_MAP_RANGE_CHECK(map, start, end); + + if (vm_map_lookup_entry(map, start, &start_entry)) { + entry = start_entry; + /* + * vm_map_clip_start will be done later. + */ + } + else { + /* + * Start address is not in map; this is fatal. + */ + vm_map_unlock(map); + return(KERN_FAILURE); + } + + /* + * Actions are rather different for wiring and unwiring, + * so we have two separate cases. + */ + + if (access_type == VM_PROT_NONE) { + + vm_map_clip_start(map, entry, start); + + /* + * Unwiring. First ensure that the range to be + * unwired is really wired down. + */ + while ((entry != vm_map_to_entry(map)) && + (entry->vme_start < end)) { + + if ((entry->wired_count == 0) || + ((entry->vme_end < end) && + ((entry->vme_next == vm_map_to_entry(map)) || + (entry->vme_next->vme_start > entry->vme_end))) || + (user_wire && (entry->user_wired_count == 0))) { + vm_map_unlock(map); + return(KERN_INVALID_ARGUMENT); + } + entry = entry->vme_next; + } + + /* + * Now decrement the wiring count for each region. + * If a region becomes completely unwired, + * unwire its physical pages and mappings. + */ + entry = start_entry; + while ((entry != vm_map_to_entry(map)) && + (entry->vme_start < end)) { + vm_map_clip_end(map, entry, end); + + if (user_wire) { + if (--(entry->user_wired_count) == 0) + entry->wired_count--; + } + else { + entry->wired_count--; + } + + if (entry->wired_count == 0) + vm_fault_unwire(map, entry); + + entry = entry->vme_next; + } + } + + else { + /* + * Wiring. We must do this in two passes: + * + * 1. Holding the write lock, we create any shadow + * or zero-fill objects that need to be created. + * Then we clip each map entry to the region to be + * wired and increment its wiring count. We + * create objects before clipping the map entries + * to avoid object proliferation. + * + * 2. We downgrade to a read lock, and call + * vm_fault_wire to fault in the pages for any + * newly wired area (wired_count is 1). + * + * Downgrading to a read lock for vm_fault_wire avoids + * a possible deadlock with another thread that may have + * faulted on one of the pages to be wired (it would mark + * the page busy, blocking us, then in turn block on the + * map lock that we hold). Because of problems in the + * recursive lock package, we cannot upgrade to a write + * lock in vm_map_lookup. Thus, any actions that require + * the write lock must be done beforehand. Because we + * keep the read lock on the map, the copy-on-write + * status of the entries we modify here cannot change. + */ + + /* + * Pass 1. + */ + while ((entry != vm_map_to_entry(map)) && + (entry->vme_start < end)) { + vm_map_clip_end(map, entry, end); + + if (entry->wired_count == 0) { + + /* + * Perform actions of vm_map_lookup that need + * the write lock on the map: create a shadow + * object for a copy-on-write region, or an + * object for a zero-fill region. 
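+			 *
+			 *	(These steps must happen here, while the
+			 *	write lock is still held: pass 2 below runs
+			 *	with only a read lock, so vm_fault_wire
+			 *	cannot create objects on our behalf.)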
+ */ + if (entry->needs_copy && + ((entry->protection & VM_PROT_WRITE) != 0)) { + + vm_object_shadow(&entry->object.vm_object, + &entry->offset, + (vm_size_t)(entry->vme_end + - entry->vme_start)); + entry->needs_copy = FALSE; + } + if (entry->object.vm_object == VM_OBJECT_NULL) { + entry->object.vm_object = + vm_object_allocate( + (vm_size_t)(entry->vme_end + - entry->vme_start)); + entry->offset = (vm_offset_t)0; + } + } + vm_map_clip_start(map, entry, start); + vm_map_clip_end(map, entry, end); + + if (user_wire) { + if ((entry->user_wired_count)++ == 0) + entry->wired_count++; + } + else { + entry->wired_count++; + } + + /* + * Check for holes and protection mismatch. + * Holes: Next entry should be contiguous unless + * this is the end of the region. + * Protection: Access requested must be allowed. + */ + if (((entry->vme_end < end) && + ((entry->vme_next == vm_map_to_entry(map)) || + (entry->vme_next->vme_start > entry->vme_end))) || + ((entry->protection & access_type) != access_type)) { + /* + * Found a hole or protection problem. + * Object creation actions + * do not need to be undone, but the + * wired counts need to be restored. + */ + while ((entry != vm_map_to_entry(map)) && + (entry->vme_end > start)) { + if (user_wire) { + if (--(entry->user_wired_count) == 0) + entry->wired_count--; + } + else { + entry->wired_count--; + } + + entry = entry->vme_prev; + } + + vm_map_unlock(map); + return(KERN_FAILURE); + } + entry = entry->vme_next; + } + + /* + * Pass 2. + */ + + /* + * HACK HACK HACK HACK + * + * If we are wiring in the kernel map or a submap of it, + * unlock the map to avoid deadlocks. We trust that the + * kernel threads are well-behaved, and therefore will + * not do anything destructive to this region of the map + * while we have it unlocked. We cannot trust user threads + * to do the same. + * + * HACK HACK HACK HACK + */ + if (vm_map_pmap(map) == kernel_pmap) { + vm_map_unlock(map); /* trust me ... */ + } + else { + vm_map_lock_set_recursive(map); + vm_map_lock_write_to_read(map); + } + + entry = start_entry; + while (entry != vm_map_to_entry(map) && + entry->vme_start < end) { + /* + * Wiring cases: + * Kernel: wired == 1 && user_wired == 0 + * User: wired == 1 && user_wired == 1 + * + * Don't need to wire if either is > 1. wired = 0 && + * user_wired == 1 can't happen. + */ + + /* + * XXX This assumes that the faults always succeed. + */ + if ((entry->wired_count == 1) && + (entry->user_wired_count <= 1)) { + vm_fault_wire(map, entry); + } + entry = entry->vme_next; + } + + if (vm_map_pmap(map) == kernel_pmap) { + vm_map_lock(map); + } + else { + vm_map_lock_clear_recursive(map); + } + } + + vm_map_unlock(map); + + return(KERN_SUCCESS); +} + +/* + * vm_map_entry_delete: [ internal use only ] + * + * Deallocate the given entry from the target map. + */ +void vm_map_entry_delete(map, entry) + register vm_map_t map; + register vm_map_entry_t entry; +{ + register vm_offset_t s, e; + register vm_object_t object; + extern vm_object_t kernel_object; + + s = entry->vme_start; + e = entry->vme_end; + + /*Check if projected buffer*/ + if (map != kernel_map && entry->projected_on != 0) { + /*Check if projected kernel entry is persistent; + may only manipulate directly if it is*/ + if (entry->projected_on->projected_on == 0) + entry->wired_count = 0; /*Avoid unwire fault*/ + else + return; + } + + /* + * Get the object. Null objects cannot have pmap entries. 
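+	 *	(An entry with a null object has never been faulted on,
+	 *	so there is nothing to unwire or remove from the pmap.)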
+ */ + + if ((object = entry->object.vm_object) != VM_OBJECT_NULL) { + + /* + * Unwire before removing addresses from the pmap; + * otherwise, unwiring will put the entries back in + * the pmap. + */ + + if (entry->wired_count != 0) { + vm_fault_unwire(map, entry); + entry->wired_count = 0; + entry->user_wired_count = 0; + } + + /* + * If the object is shared, we must remove + * *all* references to this data, since we can't + * find all of the physical maps which are sharing + * it. + */ + + if (object == kernel_object) { + vm_object_lock(object); + vm_object_page_remove(object, entry->offset, + entry->offset + (e - s)); + vm_object_unlock(object); + } else if (entry->is_shared) { + vm_object_pmap_remove(object, + entry->offset, + entry->offset + (e - s)); + } + else { + pmap_remove(map->pmap, s, e); + } + } + + /* + * Deallocate the object only after removing all + * pmap entries pointing to its pages. + */ + + if (entry->is_sub_map) + vm_map_deallocate(entry->object.sub_map); + else + vm_object_deallocate(entry->object.vm_object); + + vm_map_entry_unlink(map, entry); + map->size -= e - s; + + vm_map_entry_dispose(map, entry); +} + +/* + * vm_map_delete: [ internal use only ] + * + * Deallocates the given address range from the target + * map. + */ + +kern_return_t vm_map_delete(map, start, end) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; +{ + vm_map_entry_t entry; + vm_map_entry_t first_entry; + + /* + * Find the start of the region, and clip it + */ + + if (!vm_map_lookup_entry(map, start, &first_entry)) + entry = first_entry->vme_next; + else { + entry = first_entry; +#if NORMA_IPC_xxx + /* + * XXX Had to disable this code because: + + _vm_map_delete(c0804b78,c2198000,c219a000,0,c219a000)+df + [vm/vm_map.c:2007] + _vm_map_remove(c0804b78,c2198000,c219a000,c0817834, + c081786c)+42 [vm/vm_map.c:2094] + _kmem_io_map_deallocate(c0804b78,c2198000,2000,c0817834, + c081786c)+43 [vm/vm_kern.c:818] + _device_write_dealloc(c081786c)+117 [device/ds_routines.c:814] + _ds_write_done(c081786c,0)+2e [device/ds_routines.c:848] + _io_done_thread_continue(c08150c0,c21d4e14,c21d4e30,c08150c0, + c080c114)+14 [device/ds_routines.c:1350] + + */ + if (start > entry->vme_start + && end == entry->vme_end + && ! entry->wired_count /* XXX ??? */ + && ! entry->is_shared + && ! entry->projected_on + && ! entry->is_sub_map) { + extern vm_object_t kernel_object; + register vm_object_t object = entry->object.vm_object; + + /* + * The region to be deleted lives at the end + * of this entry, and thus all we have to do is + * truncate the entry. + * + * This special case is necessary if we want + * coalescing to do us any good. + * + * XXX Do we have to adjust object size? + */ + if (object == kernel_object) { + vm_object_lock(object); + vm_object_page_remove(object, + entry->offset + start, + entry->offset + + (end - start)); + vm_object_unlock(object); + } else if (entry->is_shared) { + vm_object_pmap_remove(object, + entry->offset + start, + entry->offset + + (end - start)); + } else { + pmap_remove(map->pmap, start, end); + } + object->size -= (end - start); /* XXX */ + + entry->vme_end = start; + map->size -= (end - start); + + if (map->wait_for_space) { + thread_wakeup((event_t) map); + } + return KERN_SUCCESS; + } +#endif NORMA_IPC + vm_map_clip_start(map, entry, start); + + /* + * Fix the lookup hint now, rather than each + * time though the loop. 
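+		 *	(The hint is pointed at the predecessor of the
+		 *	region, since the entries that follow it are about
+		 *	to be deleted.)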
+ */ + + SAVE_HINT(map, entry->vme_prev); + } + + /* + * Save the free space hint + */ + + if (map->first_free->vme_start >= start) + map->first_free = entry->vme_prev; + + /* + * Step through all entries in this region + */ + + while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { + vm_map_entry_t next; + + vm_map_clip_end(map, entry, end); + + /* + * If the entry is in transition, we must wait + * for it to exit that state. It could be clipped + * while we leave the map unlocked. + */ + if(entry->in_transition) { + /* + * Say that we are waiting, and wait for entry. + */ + entry->needs_wakeup = TRUE; + vm_map_entry_wait(map, FALSE); + vm_map_lock(map); + + /* + * The entry could have been clipped or it + * may not exist anymore. look it up again. + */ + if(!vm_map_lookup_entry(map, start, &entry)) { + entry = entry->vme_next; + } + continue; + } + + next = entry->vme_next; + + vm_map_entry_delete(map, entry); + entry = next; + } + + if (map->wait_for_space) + thread_wakeup((event_t) map); + + return(KERN_SUCCESS); +} + +/* + * vm_map_remove: + * + * Remove the given address range from the target map. + * This is the exported form of vm_map_delete. + */ +kern_return_t vm_map_remove(map, start, end) + register vm_map_t map; + register vm_offset_t start; + register vm_offset_t end; +{ + register kern_return_t result; + + vm_map_lock(map); + VM_MAP_RANGE_CHECK(map, start, end); + result = vm_map_delete(map, start, end); + vm_map_unlock(map); + + return(result); +} + + +/* + * vm_map_copy_steal_pages: + * + * Steal all the pages from a vm_map_copy page_list by copying ones + * that have not already been stolen. + */ +void +vm_map_copy_steal_pages(copy) +vm_map_copy_t copy; +{ + register vm_page_t m, new_m; + register int i; + vm_object_t object; + + for (i = 0; i < copy->cpy_npages; i++) { + + /* + * If the page is not tabled, then it's already stolen. + */ + m = copy->cpy_page_list[i]; + if (!m->tabled) + continue; + + /* + * Page was not stolen, get a new + * one and do the copy now. + */ + while ((new_m = vm_page_grab()) == VM_PAGE_NULL) { + VM_PAGE_WAIT((void(*)()) 0); + } + + vm_page_copy(m, new_m); + + object = m->object; + vm_object_lock(object); + vm_page_lock_queues(); + if (!m->active && !m->inactive) + vm_page_activate(m); + vm_page_unlock_queues(); + PAGE_WAKEUP_DONE(m); + vm_object_paging_end(object); + vm_object_unlock(object); + + copy->cpy_page_list[i] = new_m; + } +} + +/* + * vm_map_copy_page_discard: + * + * Get rid of the pages in a page_list copy. If the pages are + * stolen, they are freed. If the pages are not stolen, they + * are unbusied, and associated state is cleaned up. + */ +void vm_map_copy_page_discard(copy) +vm_map_copy_t copy; +{ + while (copy->cpy_npages > 0) { + vm_page_t m; + + if((m = copy->cpy_page_list[--(copy->cpy_npages)]) != + VM_PAGE_NULL) { + + /* + * If it's not in the table, then it's + * a stolen page that goes back + * to the free list. Else it belongs + * to some object, and we hold a + * paging reference on that object. + */ + if (!m->tabled) { + VM_PAGE_FREE(m); + } + else { + vm_object_t object; + + object = m->object; + + vm_object_lock(object); + vm_page_lock_queues(); + if (!m->active && !m->inactive) + vm_page_activate(m); + vm_page_unlock_queues(); + + PAGE_WAKEUP_DONE(m); + vm_object_paging_end(object); + vm_object_unlock(object); + } + } + } +} + +/* + * Routine: vm_map_copy_discard + * + * Description: + * Dispose of a map copy object (returned by + * vm_map_copyin). 
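+ *
+ *	A typical use (illustrative only; "map", "addr", "len" and the
+ *	error test are hypothetical):
+ *
+ *		vm_map_copy_t	copy;
+ *		kern_return_t	kr;
+ *
+ *		kr = vm_map_copyin(map, addr, len, FALSE, &copy);
+ *		if (kr == KERN_SUCCESS && cannot_use_the_data)
+ *			vm_map_copy_discard(copy);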
+ */ +void +vm_map_copy_discard(copy) + vm_map_copy_t copy; +{ +free_next_copy: + if (copy == VM_MAP_COPY_NULL) + return; + + switch (copy->type) { + case VM_MAP_COPY_ENTRY_LIST: + while (vm_map_copy_first_entry(copy) != + vm_map_copy_to_entry(copy)) { + vm_map_entry_t entry = vm_map_copy_first_entry(copy); + + vm_map_copy_entry_unlink(copy, entry); + vm_object_deallocate(entry->object.vm_object); + vm_map_copy_entry_dispose(copy, entry); + } + break; + case VM_MAP_COPY_OBJECT: + vm_object_deallocate(copy->cpy_object); + break; + case VM_MAP_COPY_PAGE_LIST: + + /* + * To clean this up, we have to unbusy all the pages + * and release the paging references in their objects. + */ + if (copy->cpy_npages > 0) + vm_map_copy_page_discard(copy); + + /* + * If there's a continuation, abort it. The + * abort routine releases any storage. + */ + if (vm_map_copy_has_cont(copy)) { + + /* + * Special case: recognize + * vm_map_copy_discard_cont and optimize + * here to avoid tail recursion. + */ + if (copy->cpy_cont == vm_map_copy_discard_cont) { + register vm_map_copy_t new_copy; + + new_copy = (vm_map_copy_t) copy->cpy_cont_args; + zfree(vm_map_copy_zone, (vm_offset_t) copy); + copy = new_copy; + goto free_next_copy; + } + else { + vm_map_copy_abort_cont(copy); + } + } + + break; + } + zfree(vm_map_copy_zone, (vm_offset_t) copy); +} + +/* + * Routine: vm_map_copy_copy + * + * Description: + * Move the information in a map copy object to + * a new map copy object, leaving the old one + * empty. + * + * This is used by kernel routines that need + * to look at out-of-line data (in copyin form) + * before deciding whether to return SUCCESS. + * If the routine returns FAILURE, the original + * copy object will be deallocated; therefore, + * these routines must make a copy of the copy + * object and leave the original empty so that + * deallocation will not fail. + */ +vm_map_copy_t +vm_map_copy_copy(copy) + vm_map_copy_t copy; +{ + vm_map_copy_t new_copy; + + if (copy == VM_MAP_COPY_NULL) + return VM_MAP_COPY_NULL; + + /* + * Allocate a new copy object, and copy the information + * from the old one into it. + */ + + new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + *new_copy = *copy; + + if (copy->type == VM_MAP_COPY_ENTRY_LIST) { + /* + * The links in the entry chain must be + * changed to point to the new copy object. + */ + vm_map_copy_first_entry(copy)->vme_prev + = vm_map_copy_to_entry(new_copy); + vm_map_copy_last_entry(copy)->vme_next + = vm_map_copy_to_entry(new_copy); + } + + /* + * Change the old copy object into one that contains + * nothing to be deallocated. + */ + copy->type = VM_MAP_COPY_OBJECT; + copy->cpy_object = VM_OBJECT_NULL; + + /* + * Return the new object. + */ + return new_copy; +} + +/* + * Routine: vm_map_copy_discard_cont + * + * Description: + * A version of vm_map_copy_discard that can be called + * as a continuation from a vm_map_copy page list. + */ +kern_return_t vm_map_copy_discard_cont(cont_args, copy_result) +vm_map_copyin_args_t cont_args; +vm_map_copy_t *copy_result; /* OUT */ +{ + vm_map_copy_discard((vm_map_copy_t) cont_args); + if (copy_result != (vm_map_copy_t *)0) + *copy_result = VM_MAP_COPY_NULL; + return(KERN_SUCCESS); +} + +/* + * Routine: vm_map_copy_overwrite + * + * Description: + * Copy the memory described by the map copy + * object (copy; returned by vm_map_copyin) onto + * the specified destination region (dst_map, dst_addr). + * The destination must be writeable. 
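+ *
+ *	A minimal call sketch (illustrative only; "copy" is assumed to
+ *	come from an earlier vm_map_copyin of the same size):
+ *
+ *		kr = vm_map_copy_overwrite(dst_map, dst_addr, copy, FALSE);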
+ * + * Unlike vm_map_copyout, this routine actually + * writes over previously-mapped memory. If the + * previous mapping was to a permanent (user-supplied) + * memory object, it is preserved. + * + * The attributes (protection and inheritance) of the + * destination region are preserved. + * + * If successful, consumes the copy object. + * Otherwise, the caller is responsible for it. + * + * Implementation notes: + * To overwrite temporary virtual memory, it is + * sufficient to remove the previous mapping and insert + * the new copy. This replacement is done either on + * the whole region (if no permanent virtual memory + * objects are embedded in the destination region) or + * in individual map entries. + * + * To overwrite permanent virtual memory, it is + * necessary to copy each page, as the external + * memory management interface currently does not + * provide any optimizations. + * + * Once a page of permanent memory has been overwritten, + * it is impossible to interrupt this function; otherwise, + * the call would be neither atomic nor location-independent. + * The kernel-state portion of a user thread must be + * interruptible. + * + * It may be expensive to forward all requests that might + * overwrite permanent memory (vm_write, vm_copy) to + * uninterruptible kernel threads. This routine may be + * called by interruptible threads; however, success is + * not guaranteed -- if the request cannot be performed + * atomically and interruptibly, an error indication is + * returned. + */ +kern_return_t vm_map_copy_overwrite(dst_map, dst_addr, copy, interruptible) + vm_map_t dst_map; + vm_offset_t dst_addr; + vm_map_copy_t copy; + boolean_t interruptible; +{ + vm_size_t size; + vm_offset_t start; + vm_map_entry_t tmp_entry; + vm_map_entry_t entry; + + boolean_t contains_permanent_objects = FALSE; + + interruptible = FALSE; /* XXX */ + + /* + * Check for null copy object. + */ + + if (copy == VM_MAP_COPY_NULL) + return(KERN_SUCCESS); + + /* + * Only works for entry lists at the moment. Will + * support page lists LATER. + */ + +#if NORMA_IPC + vm_map_convert_from_page_list(copy); +#else + assert(copy->type == VM_MAP_COPY_ENTRY_LIST); +#endif + + /* + * Currently this routine only handles page-aligned + * regions. Eventually, it should handle misalignments + * by actually copying pages. + */ + + if (!page_aligned(copy->offset) || + !page_aligned(copy->size) || + !page_aligned(dst_addr)) + return(KERN_INVALID_ARGUMENT); + + size = copy->size; + + if (size == 0) { + vm_map_copy_discard(copy); + return(KERN_SUCCESS); + } + + /* + * Verify that the destination is all writeable + * initially. + */ +start_pass_1: + vm_map_lock(dst_map); + if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) { + vm_map_unlock(dst_map); + return(KERN_INVALID_ADDRESS); + } + vm_map_clip_start(dst_map, tmp_entry, dst_addr); + for (entry = tmp_entry;;) { + vm_size_t sub_size = (entry->vme_end - entry->vme_start); + vm_map_entry_t next = entry->vme_next; + + if ( ! (entry->protection & VM_PROT_WRITE)) { + vm_map_unlock(dst_map); + return(KERN_PROTECTION_FAILURE); + } + + /* + * If the entry is in transition, we must wait + * for it to exit that state. Anything could happen + * when we unlock the map, so start over. + */ + if (entry->in_transition) { + + /* + * Say that we are waiting, and wait for entry. 
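+			 *
+			 *	(needs_wakeup asks whoever clears
+			 *	in_transition to issue a wakeup; the wait
+			 *	releases the map lock, so the verification
+			 *	pass is restarted from the top.)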
+ */ + entry->needs_wakeup = TRUE; + vm_map_entry_wait(dst_map, FALSE); + + goto start_pass_1; + } + + if (size <= sub_size) + break; + + if ((next == vm_map_to_entry(dst_map)) || + (next->vme_start != entry->vme_end)) { + vm_map_unlock(dst_map); + return(KERN_INVALID_ADDRESS); + } + + + /* + * Check for permanent objects in the destination. + */ + + if ((entry->object.vm_object != VM_OBJECT_NULL) && + !entry->object.vm_object->temporary) + contains_permanent_objects = TRUE; + + size -= sub_size; + entry = next; + } + + /* + * If there are permanent objects in the destination, then + * the copy cannot be interrupted. + */ + + if (interruptible && contains_permanent_objects) + return(KERN_FAILURE); /* XXX */ + + /* + * XXXO If there are no permanent objects in the destination, + * XXXO and the source and destination map entry zones match, + * XXXO and the destination map entry is not shared, + * XXXO then the map entries can be deleted and replaced + * XXXO with those from the copy. The following code is the + * XXXO basic idea of what to do, but there are lots of annoying + * XXXO little details about getting protection and inheritance + * XXXO right. Should add protection, inheritance, and sharing checks + * XXXO to the above pass and make sure that no wiring is involved. + */ +/* + * if (!contains_permanent_objects && + * copy->cpy_hdr.entries_pageable == dst_map->hdr.entries_pageable) { + * + * * + * * Run over copy and adjust entries. Steal code + * * from vm_map_copyout() to do this. + * * + * + * tmp_entry = tmp_entry->vme_prev; + * vm_map_delete(dst_map, dst_addr, dst_addr + copy->size); + * vm_map_copy_insert(dst_map, tmp_entry, copy); + * + * vm_map_unlock(dst_map); + * vm_map_copy_discard(copy); + * } + */ + /* + * + * Make a second pass, overwriting the data + * At the beginning of each loop iteration, + * the next entry to be overwritten is "tmp_entry" + * (initially, the value returned from the lookup above), + * and the starting address expected in that entry + * is "start". + */ + + start = dst_addr; + + while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) { + vm_map_entry_t copy_entry = vm_map_copy_first_entry(copy); + vm_size_t copy_size = (copy_entry->vme_end - copy_entry->vme_start); + vm_object_t object; + + entry = tmp_entry; + size = (entry->vme_end - entry->vme_start); + /* + * Make sure that no holes popped up in the + * address map, and that the protection is + * still valid, in case the map was unlocked + * earlier. + */ + + if (entry->vme_start != start) { + vm_map_unlock(dst_map); + return(KERN_INVALID_ADDRESS); + } + assert(entry != vm_map_to_entry(dst_map)); + + /* + * Check protection again + */ + + if ( ! (entry->protection & VM_PROT_WRITE)) { + vm_map_unlock(dst_map); + return(KERN_PROTECTION_FAILURE); + } + + /* + * Adjust to source size first + */ + + if (copy_size < size) { + vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size); + size = copy_size; + } + + /* + * Adjust to destination size + */ + + if (size < copy_size) { + vm_map_copy_clip_end(copy, copy_entry, + copy_entry->vme_start + size); + copy_size = size; + } + + assert((entry->vme_end - entry->vme_start) == size); + assert((tmp_entry->vme_end - tmp_entry->vme_start) == size); + assert((copy_entry->vme_end - copy_entry->vme_start) == size); + + /* + * If the destination contains temporary unshared memory, + * we can perform the copy by throwing it away and + * installing the source data. 
+		 */
+
+		object = entry->object.vm_object;
+		if (!entry->is_shared &&
+		    ((object == VM_OBJECT_NULL) || object->temporary)) {
+			vm_object_t	old_object = entry->object.vm_object;
+			vm_offset_t	old_offset = entry->offset;
+
+			entry->object = copy_entry->object;
+			entry->offset = copy_entry->offset;
+			entry->needs_copy = copy_entry->needs_copy;
+			entry->wired_count = 0;
+			entry->user_wired_count = 0;
+
+			vm_map_copy_entry_unlink(copy, copy_entry);
+			vm_map_copy_entry_dispose(copy, copy_entry);
+
+			vm_object_pmap_protect(
+				old_object,
+				old_offset,
+				size,
+				dst_map->pmap,
+				tmp_entry->vme_start,
+				VM_PROT_NONE);
+
+			vm_object_deallocate(old_object);
+
+			/*
+			 *	Set up for the next iteration.  The map
+			 *	has not been unlocked, so the next
+			 *	address should be at the end of this
+			 *	entry, and the next map entry should be
+			 *	the one following it.
+			 */
+
+			start = tmp_entry->vme_end;
+			tmp_entry = tmp_entry->vme_next;
+		} else {
+			vm_map_version_t	version;
+			vm_object_t		dst_object = entry->object.vm_object;
+			vm_offset_t		dst_offset = entry->offset;
+			kern_return_t		r;
+
+			/*
+			 *	Take an object reference, and record
+			 *	the map version information so that the
+			 *	map can be safely unlocked.
+			 */
+
+			vm_object_reference(dst_object);
+
+			version.main_timestamp = dst_map->timestamp;
+
+			vm_map_unlock(dst_map);
+
+			/*
+			 *	Copy as much as possible in one pass
+			 */
+
+			copy_size = size;
+			r = vm_fault_copy(
+					copy_entry->object.vm_object,
+					copy_entry->offset,
+					&copy_size,
+					dst_object,
+					dst_offset,
+					dst_map,
+					&version,
+					FALSE /* XXX interruptible */ );
+
+			/*
+			 *	Release the object reference
+			 */
+
+			vm_object_deallocate(dst_object);
+
+			/*
+			 *	If a hard error occurred, return it now
+			 */
+
+			if (r != KERN_SUCCESS)
+				return(r);
+
+			if (copy_size != 0) {
+				/*
+				 *	Dispose of the copied region
+				 */
+
+				vm_map_copy_clip_end(copy, copy_entry,
+					copy_entry->vme_start + copy_size);
+				vm_map_copy_entry_unlink(copy, copy_entry);
+				vm_object_deallocate(copy_entry->object.vm_object);
+				vm_map_copy_entry_dispose(copy, copy_entry);
+			}
+
+			/*
+			 *	Pick up in the destination map where we left off.
+			 *
+			 *	Use the version information to avoid a lookup
+			 *	in the normal case.
+			 */
+
+			start += copy_size;
+			vm_map_lock(dst_map);
+			if ((version.main_timestamp + 1) == dst_map->timestamp) {
+				/* We can safely use saved tmp_entry value */
+
+				vm_map_clip_end(dst_map, tmp_entry, start);
+				tmp_entry = tmp_entry->vme_next;
+			} else {
+				/* Must do lookup of tmp_entry */
+
+				if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
+					vm_map_unlock(dst_map);
+					return(KERN_INVALID_ADDRESS);
+				}
+				vm_map_clip_start(dst_map, tmp_entry, start);
+			}
+		}
+
+	}
+	vm_map_unlock(dst_map);
+
+	/*
+	 *	Throw away the vm_map_copy object
+	 */
+	vm_map_copy_discard(copy);
+
+	return(KERN_SUCCESS);
+}
+
+/*
+ *	Macro:		vm_map_copy_insert
+ *
+ *	Description:
+ *		Link a copy chain ("copy") into a map at the
+ *		specified location (after "where").
+ *	Side effects:
+ *		The copy chain is destroyed.
+ *	Warning:
+ *		The arguments are evaluated multiple times.
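+ *		In particular, do not pass expressions with side
+ *		effects; something like
+ *
+ *			vm_map_copy_insert(map, entry->vme_next, copy)
+ *
+ *		is fine, but an argument such as "entry++" would be
+ *		evaluated more than once.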
+ */ +#define vm_map_copy_insert(map, where, copy) \ + MACRO_BEGIN \ + (((where)->vme_next)->vme_prev = vm_map_copy_last_entry(copy)) \ + ->vme_next = ((where)->vme_next); \ + ((where)->vme_next = vm_map_copy_first_entry(copy)) \ + ->vme_prev = (where); \ + (map)->hdr.nentries += (copy)->cpy_hdr.nentries; \ + zfree(vm_map_copy_zone, (vm_offset_t) copy); \ + MACRO_END + +/* + * Routine: vm_map_copyout + * + * Description: + * Copy out a copy chain ("copy") into newly-allocated + * space in the destination map. + * + * If successful, consumes the copy object. + * Otherwise, the caller is responsible for it. + */ +kern_return_t vm_map_copyout(dst_map, dst_addr, copy) + register + vm_map_t dst_map; + vm_offset_t *dst_addr; /* OUT */ + register + vm_map_copy_t copy; +{ + vm_size_t size; + vm_size_t adjustment; + vm_offset_t start; + vm_offset_t vm_copy_start; + vm_map_entry_t last; + register + vm_map_entry_t entry; + + /* + * Check for null copy object. + */ + + if (copy == VM_MAP_COPY_NULL) { + *dst_addr = 0; + return(KERN_SUCCESS); + } + + /* + * Check for special copy object, created + * by vm_map_copyin_object. + */ + + if (copy->type == VM_MAP_COPY_OBJECT) { + vm_object_t object = copy->cpy_object; + vm_size_t offset = copy->offset; + vm_size_t tmp_size = copy->size; + kern_return_t kr; + + *dst_addr = 0; + kr = vm_map_enter(dst_map, dst_addr, tmp_size, + (vm_offset_t) 0, TRUE, + object, offset, FALSE, + VM_PROT_DEFAULT, VM_PROT_ALL, + VM_INHERIT_DEFAULT); + if (kr != KERN_SUCCESS) + return(kr); + zfree(vm_map_copy_zone, (vm_offset_t) copy); + return(KERN_SUCCESS); + } + + if (copy->type == VM_MAP_COPY_PAGE_LIST) + return(vm_map_copyout_page_list(dst_map, dst_addr, copy)); + + /* + * Find space for the data + */ + + vm_copy_start = trunc_page(copy->offset); + size = round_page(copy->offset + copy->size) - vm_copy_start; + + StartAgain: ; + + vm_map_lock(dst_map); + start = ((last = dst_map->first_free) == vm_map_to_entry(dst_map)) ? + vm_map_min(dst_map) : last->vme_end; + + while (TRUE) { + vm_map_entry_t next = last->vme_next; + vm_offset_t end = start + size; + + if ((end > dst_map->max_offset) || (end < start)) { + if (dst_map->wait_for_space) { + if (size <= (dst_map->max_offset - dst_map->min_offset)) { + assert_wait((event_t) dst_map, TRUE); + vm_map_unlock(dst_map); + thread_block((void (*)()) 0); + goto StartAgain; + } + } + vm_map_unlock(dst_map); + return(KERN_NO_SPACE); + } + + if ((next == vm_map_to_entry(dst_map)) || + (next->vme_start >= end)) + break; + + last = next; + start = last->vme_end; + } + + /* + * Since we're going to just drop the map + * entries from the copy into the destination + * map, they must come from the same pool. + */ + + if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) { + /* + * Mismatches occur when dealing with the default + * pager. + */ + zone_t old_zone; + vm_map_entry_t next, new; + + /* + * Find the zone that the copies were allocated from + */ + old_zone = (copy->cpy_hdr.entries_pageable) + ? vm_map_entry_zone + : vm_map_kentry_zone; + entry = vm_map_copy_first_entry(copy); + + /* + * Reinitialize the copy so that vm_map_copy_entry_link + * will work. + */ + copy->cpy_hdr.nentries = 0; + copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable; + vm_map_copy_first_entry(copy) = + vm_map_copy_last_entry(copy) = + vm_map_copy_to_entry(copy); + + /* + * Copy each entry. 
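+		 *	(Each entry is re-created in the zone that matches
+		 *	the destination map, and the original, which came
+		 *	from the wrong zone, is freed.)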
+ */ + while (entry != vm_map_copy_to_entry(copy)) { + new = vm_map_copy_entry_create(copy); + vm_map_entry_copy_full(new, entry); + vm_map_copy_entry_link(copy, + vm_map_copy_last_entry(copy), + new); + next = entry->vme_next; + zfree(old_zone, (vm_offset_t) entry); + entry = next; + } + } + + /* + * Adjust the addresses in the copy chain, and + * reset the region attributes. + */ + + adjustment = start - vm_copy_start; + for (entry = vm_map_copy_first_entry(copy); + entry != vm_map_copy_to_entry(copy); + entry = entry->vme_next) { + entry->vme_start += adjustment; + entry->vme_end += adjustment; + + entry->inheritance = VM_INHERIT_DEFAULT; + entry->protection = VM_PROT_DEFAULT; + entry->max_protection = VM_PROT_ALL; + entry->projected_on = 0; + + /* + * If the entry is now wired, + * map the pages into the destination map. + */ + if (entry->wired_count != 0) { + register vm_offset_t va; + vm_offset_t offset; + register vm_object_t object; + + object = entry->object.vm_object; + offset = entry->offset; + va = entry->vme_start; + + pmap_pageable(dst_map->pmap, + entry->vme_start, + entry->vme_end, + TRUE); + + while (va < entry->vme_end) { + register vm_page_t m; + + /* + * Look up the page in the object. + * Assert that the page will be found in the + * top object: + * either + * the object was newly created by + * vm_object_copy_slowly, and has + * copies of all of the pages from + * the source object + * or + * the object was moved from the old + * map entry; because the old map + * entry was wired, all of the pages + * were in the top-level object. + * (XXX not true if we wire pages for + * reading) + */ + vm_object_lock(object); + vm_object_paging_begin(object); + + m = vm_page_lookup(object, offset); + if (m == VM_PAGE_NULL || m->wire_count == 0 || + m->absent) + panic("vm_map_copyout: wiring 0x%x", m); + + m->busy = TRUE; + vm_object_unlock(object); + + PMAP_ENTER(dst_map->pmap, va, m, + entry->protection, TRUE); + + vm_object_lock(object); + PAGE_WAKEUP_DONE(m); + /* the page is wired, so we don't have to activate */ + vm_object_paging_end(object); + vm_object_unlock(object); + + offset += PAGE_SIZE; + va += PAGE_SIZE; + } + } + + + } + + /* + * Correct the page alignment for the result + */ + + *dst_addr = start + (copy->offset - vm_copy_start); + + /* + * Update the hints and the map size + */ + + if (dst_map->first_free == last) + dst_map->first_free = vm_map_copy_last_entry(copy); + SAVE_HINT(dst_map, vm_map_copy_last_entry(copy)); + + dst_map->size += size; + + /* + * Link in the copy + */ + + vm_map_copy_insert(dst_map, last, copy); + + vm_map_unlock(dst_map); + + /* + * XXX If wiring_required, call vm_map_pageable + */ + + return(KERN_SUCCESS); +} + +/* + * + * vm_map_copyout_page_list: + * + * Version of vm_map_copyout() for page list vm map copies. + * + */ +kern_return_t vm_map_copyout_page_list(dst_map, dst_addr, copy) + register + vm_map_t dst_map; + vm_offset_t *dst_addr; /* OUT */ + register + vm_map_copy_t copy; +{ + vm_size_t size; + vm_offset_t start; + vm_offset_t end; + vm_offset_t offset; + vm_map_entry_t last; + register + vm_object_t object; + vm_page_t *page_list, m; + vm_map_entry_t entry; + vm_offset_t old_last_offset; + boolean_t cont_invoked, needs_wakeup = FALSE; + kern_return_t result = KERN_SUCCESS; + vm_map_copy_t orig_copy; + vm_offset_t dst_offset; + boolean_t must_wire; + + /* + * Make sure the pages are stolen, because we are + * going to put them in a new object. Assume that + * all pages are identical to first in this regard. 
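+	 *	(Checking only the first page is safe because a page
+	 *	list is either entirely tabled or entirely stolen;
+	 *	vm_map_copy_steal_pages then replaces any tabled pages
+	 *	with private copies.)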
+	 */
+
+	page_list = &copy->cpy_page_list[0];
+	if ((*page_list)->tabled)
+		vm_map_copy_steal_pages(copy);
+
+	/*
+	 *	Find space for the data
+	 */
+
+	size = round_page(copy->offset + copy->size) -
+		trunc_page(copy->offset);
+StartAgain:
+	vm_map_lock(dst_map);
+	must_wire = dst_map->wiring_required;
+
+	last = dst_map->first_free;
+	if (last == vm_map_to_entry(dst_map)) {
+		start = vm_map_min(dst_map);
+	} else {
+		start = last->vme_end;
+	}
+
+	while (TRUE) {
+		vm_map_entry_t next = last->vme_next;
+		end = start + size;
+
+		if ((end > dst_map->max_offset) || (end < start)) {
+			if (dst_map->wait_for_space) {
+				if (size <= (dst_map->max_offset -
+					     dst_map->min_offset)) {
+					assert_wait((event_t) dst_map, TRUE);
+					vm_map_unlock(dst_map);
+					thread_block((void (*)()) 0);
+					goto StartAgain;
+				}
+			}
+			vm_map_unlock(dst_map);
+			return(KERN_NO_SPACE);
+		}
+
+		if ((next == vm_map_to_entry(dst_map)) ||
+		    (next->vme_start >= end)) {
+			break;
+		}
+
+		last = next;
+		start = last->vme_end;
+	}
+
+	/*
+	 *	See whether we can avoid creating a new entry (and object) by
+	 *	extending one of our neighbors.  [So far, we only attempt to
+	 *	extend from below.]
+	 *
+	 *	The code path below here is a bit twisted.  If any of the
+	 *	extension checks fails, we branch to create_object.  If
+	 *	it all works, we fall out the bottom and goto insert_pages.
+	 */
+	if (last == vm_map_to_entry(dst_map) ||
+	    last->vme_end != start ||
+	    last->is_shared != FALSE ||
+	    last->is_sub_map != FALSE ||
+	    last->inheritance != VM_INHERIT_DEFAULT ||
+	    last->protection != VM_PROT_DEFAULT ||
+	    last->max_protection != VM_PROT_ALL ||
+	    (must_wire ? (last->wired_count != 1 ||
+		last->user_wired_count != 1) :
+		(last->wired_count != 0))) {
+		goto create_object;
+	}
+
+	/*
+	 *	If this entry needs an object, make one.
+	 */
+	if (last->object.vm_object == VM_OBJECT_NULL) {
+		object = vm_object_allocate(
+			(vm_size_t)(last->vme_end - last->vme_start + size));
+		last->object.vm_object = object;
+		last->offset = 0;
+		vm_object_lock(object);
+	}
+	else {
+	    vm_offset_t	prev_offset = last->offset;
+	    vm_size_t	prev_size = start - last->vme_start;
+	    vm_size_t	new_size;
+
+	    /*
+	     *	This is basically vm_object_coalesce.
+	     */
+
+	    object = last->object.vm_object;
+	    vm_object_lock(object);
+
+	    /*
+	     *	Try to collapse the object first
+	     */
+	    vm_object_collapse(object);
+
+	    /*
+	     *	Can't coalesce if pages not mapped to
+	     *	last may be in use anyway:
+	     *	. more than one reference
+	     *	. paged out
+	     *	. shadows another object
+	     *	. has a copy elsewhere
+	     *	. paging references (pages might be in page-list)
+	     */
+
+	    if ((object->ref_count > 1) ||
+		object->pager_created ||
+		(object->shadow != VM_OBJECT_NULL) ||
+		(object->copy != VM_OBJECT_NULL) ||
+		(object->paging_in_progress != 0)) {
+		    vm_object_unlock(object);
+		    goto create_object;
+	    }
+
+	    /*
+	     *	Extend the object if necessary.  Don't have to call
+	     *	vm_object_page_remove because the pages aren't mapped,
+	     *	and vm_page_replace will free up any old ones it encounters.
+	     */
+	    new_size = prev_offset + prev_size + size;
+	    if (new_size > object->size)
+		object->size = new_size;
+	}
+
+	/*
+	 *	Coalesced the two objects - can extend
+	 *	the previous map entry to include the
+	 *	new range.
+	 */
+	dst_map->size += size;
+	last->vme_end = end;
+
+	SAVE_HINT(dst_map, last);
+
+	goto insert_pages;
+
+create_object:
+
+	/*
+	 *	Create object
+	 */
+	object = vm_object_allocate(size);
+
+	/*
+	 *	Create entry
+	 */
+
+	entry = vm_map_entry_create(dst_map);
+
+	entry->object.vm_object = object;
+	entry->offset = 0;
+
+	entry->is_shared = FALSE;
+	entry->is_sub_map = FALSE;
+	entry->needs_copy = FALSE;
+
+	if (must_wire) {
+		entry->wired_count = 1;
+		entry->user_wired_count = 1;
+	} else {
+		entry->wired_count = 0;
+		entry->user_wired_count = 0;
+	}
+
+	entry->in_transition = TRUE;
+	entry->needs_wakeup = FALSE;
+
+	entry->vme_start = start;
+	entry->vme_end = start + size;
+
+	entry->inheritance = VM_INHERIT_DEFAULT;
+	entry->protection = VM_PROT_DEFAULT;
+	entry->max_protection = VM_PROT_ALL;
+	entry->projected_on = 0;
+
+	vm_object_lock(object);
+
+	/*
+	 *	Update the hints and the map size
+	 */
+	if (dst_map->first_free == last) {
+		dst_map->first_free = entry;
+	}
+	SAVE_HINT(dst_map, entry);
+	dst_map->size += size;
+
+	/*
+	 *	Link in the entry
+	 */
+	vm_map_entry_link(dst_map, last, entry);
+	last = entry;
+
+	/*
+	 *	Transfer pages into new object.
+	 *	Scan page list in vm_map_copy.
+	 */
+insert_pages:
+	dst_offset = copy->offset & PAGE_MASK;
+	cont_invoked = FALSE;
+	orig_copy = copy;
+	last->in_transition = TRUE;
+	old_last_offset = last->offset +
+		(start - last->vme_start);
+
+	vm_page_lock_queues();
+
+	for (offset = 0; offset < size; offset += PAGE_SIZE) {
+		m = *page_list;
+		assert(m && !m->tabled);
+
+		/*
+		 *	Must clear busy bit in page before inserting it.
+		 *	Ok to skip wakeup logic because nobody else
+		 *	can possibly know about this page.
+		 *	The page is dirty in its new object.
+		 */
+
+		assert(!m->wanted);
+
+		m->busy = FALSE;
+		m->dirty = TRUE;
+		vm_page_replace(m, object, old_last_offset + offset);
+		if (must_wire) {
+			vm_page_wire(m);
+			PMAP_ENTER(dst_map->pmap,
+				last->vme_start + m->offset - last->offset,
+				m, last->protection, TRUE);
+		} else {
+			vm_page_activate(m);
+		}
+
+		*page_list++ = VM_PAGE_NULL;
+		if (--(copy->cpy_npages) == 0 &&
+		    vm_map_copy_has_cont(copy)) {
+			vm_map_copy_t	new_copy;
+
+			/*
+			 *	Ok to unlock map because entry is
+			 *	marked in_transition.
+			 */
+			cont_invoked = TRUE;
+			vm_page_unlock_queues();
+			vm_object_unlock(object);
+			vm_map_unlock(dst_map);
+			vm_map_copy_invoke_cont(copy, &new_copy, &result);
+
+			if (result == KERN_SUCCESS) {
+
+				/*
+				 *	If we got back a copy with real pages,
+				 *	steal them now.  Either all of the
+				 *	pages in the list are tabled or none
+				 *	of them are; mixtures are not possible.
+				 *
+				 *	Save original copy for consume on
+				 *	success logic at end of routine.
+				 */
+				if (copy != orig_copy)
+					vm_map_copy_discard(copy);
+
+				if ((copy = new_copy) != VM_MAP_COPY_NULL) {
+					page_list = &copy->cpy_page_list[0];
+					if ((*page_list)->tabled)
+						vm_map_copy_steal_pages(copy);
+				}
+			}
+			else {
+				/*
+				 *	Continuation failed.
+				 */
+				vm_map_lock(dst_map);
+				goto error;
+			}
+
+			vm_map_lock(dst_map);
+			vm_object_lock(object);
+			vm_page_lock_queues();
+		}
+	}
+
+	vm_page_unlock_queues();
+	vm_object_unlock(object);
+
+	*dst_addr = start + dst_offset;
+
+	/*
+	 *	Clear the in transition bits.  This is easy if we
+	 *	didn't have a continuation.
+	 */
+error:
+	if (!cont_invoked) {
+		/*
+		 *	We didn't unlock the map, so nobody could
+		 *	be waiting.
+ */ + last->in_transition = FALSE; + assert(!last->needs_wakeup); + needs_wakeup = FALSE; + } + else { + if (!vm_map_lookup_entry(dst_map, start, &entry)) + panic("vm_map_copyout_page_list: missing entry"); + + /* + * Clear transition bit for all constituent entries that + * were in the original entry. Also check for waiters. + */ + while((entry != vm_map_to_entry(dst_map)) && + (entry->vme_start < end)) { + assert(entry->in_transition); + entry->in_transition = FALSE; + if(entry->needs_wakeup) { + entry->needs_wakeup = FALSE; + needs_wakeup = TRUE; + } + entry = entry->vme_next; + } + } + + if (result != KERN_SUCCESS) + vm_map_delete(dst_map, start, end); + + vm_map_unlock(dst_map); + + if (needs_wakeup) + vm_map_entry_wakeup(dst_map); + + /* + * Consume on success logic. + */ + if (copy != orig_copy) { + zfree(vm_map_copy_zone, (vm_offset_t) copy); + } + if (result == KERN_SUCCESS) { + zfree(vm_map_copy_zone, (vm_offset_t) orig_copy); + } + + return(result); +} + +/* + * Routine: vm_map_copyin + * + * Description: + * Copy the specified region (src_addr, len) from the + * source address space (src_map), possibly removing + * the region from the source address space (src_destroy). + * + * Returns: + * A vm_map_copy_t object (copy_result), suitable for + * insertion into another address space (using vm_map_copyout), + * copying over another address space region (using + * vm_map_copy_overwrite). If the copy is unused, it + * should be destroyed (using vm_map_copy_discard). + * + * In/out conditions: + * The source map should not be locked on entry. + */ +kern_return_t vm_map_copyin(src_map, src_addr, len, src_destroy, copy_result) + vm_map_t src_map; + vm_offset_t src_addr; + vm_size_t len; + boolean_t src_destroy; + vm_map_copy_t *copy_result; /* OUT */ +{ + vm_map_entry_t tmp_entry; /* Result of last map lookup -- + * in multi-level lookup, this + * entry contains the actual + * vm_object/offset. + */ + + vm_offset_t src_start; /* Start of current entry -- + * where copy is taking place now + */ + vm_offset_t src_end; /* End of entire region to be + * copied */ + + register + vm_map_copy_t copy; /* Resulting copy */ + + /* + * Check for copies of zero bytes. + */ + + if (len == 0) { + *copy_result = VM_MAP_COPY_NULL; + return(KERN_SUCCESS); + } + + /* + * Compute start and end of region + */ + + src_start = trunc_page(src_addr); + src_end = round_page(src_addr + len); + + /* + * Check that the end address doesn't overflow + */ + + if (src_end <= src_start) + if ((src_end < src_start) || (src_start != 0)) + return(KERN_INVALID_ADDRESS); + + /* + * Allocate a header element for the list. + * + * Use the start and end in the header to + * remember the endpoints prior to rounding. + */ + + copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + vm_map_copy_first_entry(copy) = + vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy); + copy->type = VM_MAP_COPY_ENTRY_LIST; + copy->cpy_hdr.nentries = 0; + copy->cpy_hdr.entries_pageable = TRUE; + + copy->offset = src_addr; + copy->size = len; + +#define RETURN(x) \ + MACRO_BEGIN \ + vm_map_unlock(src_map); \ + vm_map_copy_discard(copy); \ + MACRO_RETURN(x); \ + MACRO_END + + /* + * Find the beginning of the region. + */ + + vm_map_lock(src_map); + + if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) + RETURN(KERN_INVALID_ADDRESS); + vm_map_clip_start(src_map, tmp_entry, src_start); + + /* + * Go through entries until we get to the end. 
+ */ + + while (TRUE) { + register + vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */ + vm_size_t src_size; /* Size of source + * map entry (in both + * maps) + */ + + register + vm_object_t src_object; /* Object to copy */ + vm_offset_t src_offset; + + boolean_t src_needs_copy; /* Should source map + * be made read-only + * for copy-on-write? + */ + + register + vm_map_entry_t new_entry; /* Map entry for copy */ + boolean_t new_entry_needs_copy; /* Will new entry be COW? */ + + boolean_t was_wired; /* Was source wired? */ + vm_map_version_t version; /* Version before locks + * dropped to make copy + */ + + /* + * Verify that the region can be read. + */ + + if (! (src_entry->protection & VM_PROT_READ)) + RETURN(KERN_PROTECTION_FAILURE); + + /* + * Clip against the endpoints of the entire region. + */ + + vm_map_clip_end(src_map, src_entry, src_end); + + src_size = src_entry->vme_end - src_start; + src_object = src_entry->object.vm_object; + src_offset = src_entry->offset; + was_wired = (src_entry->wired_count != 0); + + /* + * Create a new address map entry to + * hold the result. Fill in the fields from + * the appropriate source entries. + */ + + new_entry = vm_map_copy_entry_create(copy); + vm_map_entry_copy(new_entry, src_entry); + + /* + * Attempt non-blocking copy-on-write optimizations. + */ + + if (src_destroy && + (src_object == VM_OBJECT_NULL || + (src_object->temporary && !src_object->use_shared_copy))) + { + /* + * If we are destroying the source, and the object + * is temporary, and not shared writable, + * we can move the object reference + * from the source to the copy. The copy is + * copy-on-write only if the source is. + * We make another reference to the object, because + * destroying the source entry will deallocate it. + */ + vm_object_reference(src_object); + + /* + * Copy is always unwired. vm_map_copy_entry + * set its wired count to zero. + */ + + goto CopySuccessful; + } + + if (!was_wired && + vm_object_copy_temporary( + &new_entry->object.vm_object, + &new_entry->offset, + &src_needs_copy, + &new_entry_needs_copy)) { + + new_entry->needs_copy = new_entry_needs_copy; + + /* + * Handle copy-on-write obligations + */ + + if (src_needs_copy && !tmp_entry->needs_copy) { + vm_object_pmap_protect( + src_object, + src_offset, + src_size, + (src_entry->is_shared ? PMAP_NULL + : src_map->pmap), + src_entry->vme_start, + src_entry->protection & + ~VM_PROT_WRITE); + + tmp_entry->needs_copy = TRUE; + } + + /* + * The map has never been unlocked, so it's safe to + * move to the next entry rather than doing another + * lookup. + */ + + goto CopySuccessful; + } + + new_entry->needs_copy = FALSE; + + /* + * Take an object reference, so that we may + * release the map lock(s). + */ + + assert(src_object != VM_OBJECT_NULL); + vm_object_reference(src_object); + + /* + * Record the timestamp for later verification. + * Unlock the map. 
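+		 *
+		 *	(Re-locking the map below bumps the timestamp once,
+		 *	so "saved timestamp + 1 == current timestamp" means
+		 *	nobody else modified the map while it was unlocked.)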
+ */ + + version.main_timestamp = src_map->timestamp; + vm_map_unlock(src_map); + + /* + * Perform the copy + */ + + if (was_wired) { + vm_object_lock(src_object); + (void) vm_object_copy_slowly( + src_object, + src_offset, + src_size, + FALSE, + &new_entry->object.vm_object); + new_entry->offset = 0; + new_entry->needs_copy = FALSE; + } else { + kern_return_t result; + + result = vm_object_copy_strategically(src_object, + src_offset, + src_size, + &new_entry->object.vm_object, + &new_entry->offset, + &new_entry_needs_copy); + + new_entry->needs_copy = new_entry_needs_copy; + + + if (result != KERN_SUCCESS) { + vm_map_copy_entry_dispose(copy, new_entry); + + vm_map_lock(src_map); + RETURN(result); + } + + } + + /* + * Throw away the extra reference + */ + + vm_object_deallocate(src_object); + + /* + * Verify that the map has not substantially + * changed while the copy was being made. + */ + + vm_map_lock(src_map); /* Increments timestamp once! */ + + if ((version.main_timestamp + 1) == src_map->timestamp) + goto CopySuccessful; + + /* + * Simple version comparison failed. + * + * Retry the lookup and verify that the + * same object/offset are still present. + * + * [Note: a memory manager that colludes with + * the calling task can detect that we have + * cheated. While the map was unlocked, the + * mapping could have been changed and restored.] + */ + + if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) { + vm_map_copy_entry_dispose(copy, new_entry); + RETURN(KERN_INVALID_ADDRESS); + } + + src_entry = tmp_entry; + vm_map_clip_start(src_map, src_entry, src_start); + + if ((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) + goto VerificationFailed; + + if (src_entry->vme_end < new_entry->vme_end) + src_size = (new_entry->vme_end = src_entry->vme_end) - src_start; + + if ((src_entry->object.vm_object != src_object) || + (src_entry->offset != src_offset) ) { + + /* + * Verification failed. + * + * Start over with this top-level entry. + */ + + VerificationFailed: ; + + vm_object_deallocate(new_entry->object.vm_object); + vm_map_copy_entry_dispose(copy, new_entry); + tmp_entry = src_entry; + continue; + } + + /* + * Verification succeeded. + */ + + CopySuccessful: ; + + /* + * Link in the new copy entry. + */ + + vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), + new_entry); + + /* + * Determine whether the entire region + * has been copied. + */ + src_start = new_entry->vme_end; + if ((src_start >= src_end) && (src_end != 0)) + break; + + /* + * Verify that there are no gaps in the region + */ + + tmp_entry = src_entry->vme_next; + if (tmp_entry->vme_start != src_start) + RETURN(KERN_INVALID_ADDRESS); + } + + /* + * If the source should be destroyed, do it now, since the + * copy was successful. + */ + if (src_destroy) + (void) vm_map_delete(src_map, trunc_page(src_addr), src_end); + + vm_map_unlock(src_map); + + *copy_result = copy; + return(KERN_SUCCESS); + +#undef RETURN +} + +/* + * vm_map_copyin_object: + * + * Create a copy object from an object. + * Our caller donates an object reference. + */ + +kern_return_t vm_map_copyin_object(object, offset, size, copy_result) + vm_object_t object; + vm_offset_t offset; /* offset of region in object */ + vm_size_t size; /* size of region in object */ + vm_map_copy_t *copy_result; /* OUT */ +{ + vm_map_copy_t copy; /* Resulting copy */ + + /* + * We drop the object into a special copy object + * that contains the object directly. These copy objects + * are distinguished by entries_pageable == FALSE + * and null links. 
+ */ + + copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + vm_map_copy_first_entry(copy) = + vm_map_copy_last_entry(copy) = VM_MAP_ENTRY_NULL; + copy->type = VM_MAP_COPY_OBJECT; + copy->cpy_object = object; + copy->offset = offset; + copy->size = size; + + *copy_result = copy; + return(KERN_SUCCESS); +} + +/* + * vm_map_copyin_page_list_cont: + * + * Continuation routine for vm_map_copyin_page_list. + * + * If vm_map_copyin_page_list can't fit the entire vm range + * into a single page list object, it creates a continuation. + * When the target of the operation has used the pages in the + * initial page list, it invokes the continuation, which calls + * this routine. If an error happens, the continuation is aborted + * (abort arg to this routine is TRUE). To avoid deadlocks, the + * pages are discarded from the initial page list before invoking + * the continuation. + * + * NOTE: This is not the same sort of continuation used by + * the scheduler. + */ + +kern_return_t vm_map_copyin_page_list_cont(cont_args, copy_result) +vm_map_copyin_args_t cont_args; +vm_map_copy_t *copy_result; /* OUT */ +{ + kern_return_t result = 0; /* '=0' to quiet gcc warnings */ + register boolean_t do_abort, src_destroy, src_destroy_only; + + /* + * Check for cases that only require memory destruction. + */ + do_abort = (copy_result == (vm_map_copy_t *) 0); + src_destroy = (cont_args->destroy_len != (vm_size_t) 0); + src_destroy_only = (cont_args->src_len == (vm_size_t) 0); + + if (do_abort || src_destroy_only) { + if (src_destroy) + result = vm_map_remove(cont_args->map, + cont_args->destroy_addr, + cont_args->destroy_addr + cont_args->destroy_len); + if (!do_abort) + *copy_result = VM_MAP_COPY_NULL; + } + else { + result = vm_map_copyin_page_list(cont_args->map, + cont_args->src_addr, cont_args->src_len, src_destroy, + cont_args->steal_pages, copy_result, TRUE); + + if (src_destroy && !cont_args->steal_pages && + vm_map_copy_has_cont(*copy_result)) { + vm_map_copyin_args_t new_args; + /* + * Transfer old destroy info. + */ + new_args = (vm_map_copyin_args_t) + (*copy_result)->cpy_cont_args; + new_args->destroy_addr = cont_args->destroy_addr; + new_args->destroy_len = cont_args->destroy_len; + } + } + + vm_map_deallocate(cont_args->map); + kfree((vm_offset_t)cont_args, sizeof(vm_map_copyin_args_data_t)); + + return(result); +} + +/* + * vm_map_copyin_page_list: + * + * This is a variant of vm_map_copyin that copies in a list of pages. + * If steal_pages is TRUE, the pages are only in the returned list. + * If steal_pages is FALSE, the pages are busy and still in their + * objects. A continuation may be returned if not all the pages fit: + * the recipient of this copy_result must be prepared to deal with it. + */ + +kern_return_t vm_map_copyin_page_list(src_map, src_addr, len, src_destroy, + steal_pages, copy_result, is_cont) + vm_map_t src_map; + vm_offset_t src_addr; + vm_size_t len; + boolean_t src_destroy; + boolean_t steal_pages; + vm_map_copy_t *copy_result; /* OUT */ + boolean_t is_cont; +{ + vm_map_entry_t src_entry; + vm_page_t m; + vm_offset_t src_start; + vm_offset_t src_end; + vm_size_t src_size; + register + vm_object_t src_object; + register + vm_offset_t src_offset; + vm_offset_t src_last_offset; + register + vm_map_copy_t copy; /* Resulting copy */ + kern_return_t result = KERN_SUCCESS; + boolean_t need_map_lookup; + vm_map_copyin_args_t cont_args; + + /* + * If steal_pages is FALSE, this leaves busy pages in + * the object. 
A continuation must be used if src_destroy + * is true in this case (!steal_pages && src_destroy). + * + * XXX Still have a more general problem of what happens + * XXX if the same page occurs twice in a list. Deadlock + * XXX can happen if vm_fault_page was called. A + * XXX possible solution is to use a continuation if vm_fault_page + * XXX is called and we cross a map entry boundary. + */ + + /* + * Check for copies of zero bytes. + */ + + if (len == 0) { + *copy_result = VM_MAP_COPY_NULL; + return(KERN_SUCCESS); + } + + /* + * Compute start and end of region + */ + + src_start = trunc_page(src_addr); + src_end = round_page(src_addr + len); + + /* + * Check that the end address doesn't overflow + */ + + if (src_end <= src_start && (src_end < src_start || src_start != 0)) { + return KERN_INVALID_ADDRESS; + } + + /* + * Allocate a header element for the page list. + * + * Record original offset and size, as caller may not + * be page-aligned. + */ + + copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + copy->type = VM_MAP_COPY_PAGE_LIST; + copy->cpy_npages = 0; + copy->offset = src_addr; + copy->size = len; + copy->cpy_cont = ((kern_return_t (*)()) 0); + copy->cpy_cont_args = (char *) VM_MAP_COPYIN_ARGS_NULL; + + /* + * Find the beginning of the region. + */ + +do_map_lookup: + + vm_map_lock(src_map); + + if (!vm_map_lookup_entry(src_map, src_start, &src_entry)) { + result = KERN_INVALID_ADDRESS; + goto error; + } + need_map_lookup = FALSE; + + /* + * Go through entries until we get to the end. + */ + + while (TRUE) { + + if (! (src_entry->protection & VM_PROT_READ)) { + result = KERN_PROTECTION_FAILURE; + goto error; + } + + if (src_end > src_entry->vme_end) + src_size = src_entry->vme_end - src_start; + else + src_size = src_end - src_start; + + src_object = src_entry->object.vm_object; + src_offset = src_entry->offset + + (src_start - src_entry->vme_start); + + /* + * If src_object is NULL, allocate it now; + * we're going to fault on it shortly. + */ + if (src_object == VM_OBJECT_NULL) { + src_object = vm_object_allocate((vm_size_t) + src_entry->vme_end - + src_entry->vme_start); + src_entry->object.vm_object = src_object; + } + + /* + * Iterate over pages. Fault in ones that aren't present. + */ + src_last_offset = src_offset + src_size; + for (; (src_offset < src_last_offset && !need_map_lookup); + src_offset += PAGE_SIZE, src_start += PAGE_SIZE) { + + if (copy->cpy_npages == VM_MAP_COPY_PAGE_LIST_MAX) { +make_continuation: + /* + * At this point we have the max number of + * pages busy for this thread that we're + * willing to allow. Stop here and record + * arguments for the remainder. Note: + * this means that this routine isn't atomic, + * but that's the breaks. Note that only + * the first vm_map_copy_t that comes back + * from this routine has the right offset + * and size; those from continuations are + * page rounded, and short by the amount + * already done. + * + * Reset src_end so the src_destroy + * code at the bottom doesn't do + * something stupid. 
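+ * The continuation record filled in below carries the source
+ * map (with an extra reference), the address and length of the
+ * region still to be copied and, when src_destroy is set, the
+ * range whose deallocation is being deferred;
+ * vm_map_copyin_page_list_cont consumes it later.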
+ */ + + cont_args = (vm_map_copyin_args_t) + kalloc(sizeof(vm_map_copyin_args_data_t)); + cont_args->map = src_map; + vm_map_reference(src_map); + cont_args->src_addr = src_start; + cont_args->src_len = len - (src_start - src_addr); + if (src_destroy) { + cont_args->destroy_addr = cont_args->src_addr; + cont_args->destroy_len = cont_args->src_len; + } + else { + cont_args->destroy_addr = (vm_offset_t) 0; + cont_args->destroy_len = (vm_offset_t) 0; + } + cont_args->steal_pages = steal_pages; + + copy->cpy_cont_args = (char *) cont_args; + copy->cpy_cont = vm_map_copyin_page_list_cont; + + src_end = src_start; + vm_map_clip_end(src_map, src_entry, src_end); + break; + } + + /* + * Try to find the page of data. + */ + vm_object_lock(src_object); + vm_object_paging_begin(src_object); + if (((m = vm_page_lookup(src_object, src_offset)) != + VM_PAGE_NULL) && !m->busy && !m->fictitious && + !m->absent && !m->error) { + + /* + * This is the page. Mark it busy + * and keep the paging reference on + * the object whilst we do our thing. + */ + m->busy = TRUE; + + /* + * Also write-protect the page, so + * that the map`s owner cannot change + * the data. The busy bit will prevent + * faults on the page from succeeding + * until the copy is released; after + * that, the page can be re-entered + * as writable, since we didn`t alter + * the map entry. This scheme is a + * cheap copy-on-write. + * + * Don`t forget the protection and + * the page_lock value! + * + * If the source is being destroyed + * AND not shared writable, we don`t + * have to protect the page, since + * we will destroy the (only) + * writable mapping later. + */ + if (!src_destroy || + src_object->use_shared_copy) + { + pmap_page_protect(m->phys_addr, + src_entry->protection + & ~m->page_lock + & ~VM_PROT_WRITE); + } + + } + else { + vm_prot_t result_prot; + vm_page_t top_page; + kern_return_t kr; + + /* + * Have to fault the page in; must + * unlock the map to do so. While + * the map is unlocked, anything + * can happen, we must lookup the + * map entry before continuing. + */ + vm_map_unlock(src_map); + need_map_lookup = TRUE; +retry: + result_prot = VM_PROT_READ; + + kr = vm_fault_page(src_object, src_offset, + VM_PROT_READ, FALSE, FALSE, + &result_prot, &m, &top_page, + FALSE, (void (*)()) 0); + /* + * Cope with what happened. + */ + switch (kr) { + case VM_FAULT_SUCCESS: + break; + case VM_FAULT_INTERRUPTED: /* ??? */ + case VM_FAULT_RETRY: + vm_object_lock(src_object); + vm_object_paging_begin(src_object); + goto retry; + case VM_FAULT_MEMORY_SHORTAGE: + VM_PAGE_WAIT((void (*)()) 0); + vm_object_lock(src_object); + vm_object_paging_begin(src_object); + goto retry; + case VM_FAULT_FICTITIOUS_SHORTAGE: + vm_page_more_fictitious(); + vm_object_lock(src_object); + vm_object_paging_begin(src_object); + goto retry; + case VM_FAULT_MEMORY_ERROR: + /* + * Something broke. If this + * is a continuation, return + * a partial result if possible, + * else fail the whole thing. + * In the continuation case, the + * next continuation call will + * get this error if it persists. + */ + vm_map_lock(src_map); + if (is_cont && + copy->cpy_npages != 0) + goto make_continuation; + + result = KERN_MEMORY_ERROR; + goto error; + } + + if (top_page != VM_PAGE_NULL) { + vm_object_lock(src_object); + VM_PAGE_FREE(top_page); + vm_object_paging_end(src_object); + vm_object_unlock(src_object); + } + + /* + * We do not need to write-protect + * the page, since it cannot have + * been in the pmap (and we did not + * enter it above). 
The busy bit + * will protect the page from being + * entered as writable until it is + * unlocked. + */ + + } + + /* + * The page is busy, its object is locked, and + * we have a paging reference on it. Either + * the map is locked, or need_map_lookup is + * TRUE. + * + * Put the page in the page list. + */ + copy->cpy_page_list[copy->cpy_npages++] = m; + vm_object_unlock(m->object); + } + + /* + * DETERMINE whether the entire region + * has been copied. + */ + if (src_start >= src_end && src_end != 0) { + if (need_map_lookup) + vm_map_lock(src_map); + break; + } + + /* + * If need_map_lookup is TRUE, have to start over with + * another map lookup. Note that we dropped the map + * lock (to call vm_fault_page) above only in this case. + */ + if (need_map_lookup) + goto do_map_lookup; + + /* + * Verify that there are no gaps in the region + */ + + src_start = src_entry->vme_end; + src_entry = src_entry->vme_next; + if (src_entry->vme_start != src_start) { + result = KERN_INVALID_ADDRESS; + goto error; + } + } + + /* + * If steal_pages is true, make sure all + * pages in the copy are not in any object + * We try to remove them from the original + * object, but we may have to copy them. + * + * At this point every page in the list is busy + * and holds a paging reference to its object. + * When we're done stealing, every page is busy, + * and in no object (m->tabled == FALSE). + */ + src_start = trunc_page(src_addr); + if (steal_pages) { + register int i; + vm_offset_t unwire_end; + + unwire_end = src_start; + for (i = 0; i < copy->cpy_npages; i++) { + + /* + * Remove the page from its object if it + * can be stolen. It can be stolen if: + * + * (1) The source is being destroyed, + * the object is temporary, and + * not shared. + * (2) The page is not precious. + * + * The not shared check consists of two + * parts: (a) there are no objects that + * shadow this object. (b) it is not the + * object in any shared map entries (i.e., + * use_shared_copy is not set). + * + * The first check (a) means that we can't + * steal pages from objects that are not + * at the top of their shadow chains. This + * should not be a frequent occurrence. + * + * Stealing wired pages requires telling the + * pmap module to let go of them. + * + * NOTE: stealing clean pages from objects + * whose mappings survive requires a call to + * the pmap module. Maybe later. + */ + m = copy->cpy_page_list[i]; + src_object = m->object; + vm_object_lock(src_object); + + if (src_destroy && + src_object->temporary && + (!src_object->shadowed) && + (!src_object->use_shared_copy) && + !m->precious) { + vm_offset_t page_vaddr; + + page_vaddr = src_start + (i * PAGE_SIZE); + if (m->wire_count > 0) { + + assert(m->wire_count == 1); + /* + * In order to steal a wired + * page, we have to unwire it + * first. We do this inline + * here because we have the page. + * + * Step 1: Unwire the map entry. + * Also tell the pmap module + * that this piece of the + * pmap is pageable. + */ + vm_object_unlock(src_object); + if (page_vaddr >= unwire_end) { + if (!vm_map_lookup_entry(src_map, + page_vaddr, &src_entry)) + panic("vm_map_copyin_page_list: missing wired map entry"); + + vm_map_clip_start(src_map, src_entry, + page_vaddr); + vm_map_clip_end(src_map, src_entry, + src_start + src_size); + + assert(src_entry->wired_count > 0); + src_entry->wired_count = 0; + src_entry->user_wired_count = 0; + unwire_end = src_entry->vme_end; + pmap_pageable(vm_map_pmap(src_map), + page_vaddr, unwire_end, TRUE); + } + + /* + * Step 2: Unwire the page. 
+ * pmap_remove handles this for us. + */ + vm_object_lock(src_object); + } + + /* + * Don't need to remove the mapping; + * vm_map_delete will handle it. + * + * Steal the page. Setting the wire count + * to zero is vm_page_unwire without + * activating the page. + */ + vm_page_lock_queues(); + vm_page_remove(m); + if (m->wire_count > 0) { + m->wire_count = 0; + vm_page_wire_count--; + } else { + VM_PAGE_QUEUES_REMOVE(m); + } + vm_page_unlock_queues(); + } + else { + /* + * Have to copy this page. Have to + * unlock the map while copying, + * hence no further page stealing. + * Hence just copy all the pages. + * Unlock the map while copying; + * This means no further page stealing. + */ + vm_object_unlock(src_object); + vm_map_unlock(src_map); + + vm_map_copy_steal_pages(copy); + + vm_map_lock(src_map); + break; + } + + vm_object_paging_end(src_object); + vm_object_unlock(src_object); + } + + /* + * If the source should be destroyed, do it now, since the + * copy was successful. + */ + + if (src_destroy) { + (void) vm_map_delete(src_map, src_start, src_end); + } + } + else { + /* + * !steal_pages leaves busy pages in the map. + * This will cause src_destroy to hang. Use + * a continuation to prevent this. + */ + if (src_destroy && !vm_map_copy_has_cont(copy)) { + cont_args = (vm_map_copyin_args_t) + kalloc(sizeof(vm_map_copyin_args_data_t)); + vm_map_reference(src_map); + cont_args->map = src_map; + cont_args->src_addr = (vm_offset_t) 0; + cont_args->src_len = (vm_size_t) 0; + cont_args->destroy_addr = src_start; + cont_args->destroy_len = src_end - src_start; + cont_args->steal_pages = FALSE; + + copy->cpy_cont_args = (char *) cont_args; + copy->cpy_cont = vm_map_copyin_page_list_cont; + } + + } + + vm_map_unlock(src_map); + + *copy_result = copy; + return(result); + +error: + vm_map_unlock(src_map); + vm_map_copy_discard(copy); + return(result); +} + +/* + * vm_map_fork: + * + * Create and return a new map based on the old + * map, according to the inheritance values on the + * regions in that map. + * + * The source map must not be locked. + */ +vm_map_t vm_map_fork(old_map) + vm_map_t old_map; +{ + vm_map_t new_map; + register + vm_map_entry_t old_entry; + register + vm_map_entry_t new_entry; + pmap_t new_pmap = pmap_create((vm_size_t) 0); + vm_size_t new_size = 0; + vm_size_t entry_size; + register + vm_object_t object; + + vm_map_lock(old_map); + + new_map = vm_map_create(new_pmap, + old_map->min_offset, + old_map->max_offset, + old_map->hdr.entries_pageable); + + for ( + old_entry = vm_map_first_entry(old_map); + old_entry != vm_map_to_entry(old_map); + ) { + if (old_entry->is_sub_map) + panic("vm_map_fork: encountered a submap"); + + entry_size = (old_entry->vme_end - old_entry->vme_start); + + switch (old_entry->inheritance) { + case VM_INHERIT_NONE: + break; + + case VM_INHERIT_SHARE: + /* + * New sharing code. New map entry + * references original object. Temporary + * objects use asynchronous copy algorithm for + * future copies. First make sure we have + * the right object. If we need a shadow, + * or someone else already has one, then + * make a new shadow and share it. 
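+ * Concretely, the code below allocates a fresh object if the
+ * entry has none, makes a shadow when the entry still needs a
+ * copy, when the object is already shadowed, or when a
+ * temporary unshared object is larger than the entry, and then
+ * marks the (possibly new) object for shared (delayed)
+ * copy-on-write before taking the extra reference.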
+ */ + + object = old_entry->object.vm_object; + if (object == VM_OBJECT_NULL) { + object = vm_object_allocate( + (vm_size_t)(old_entry->vme_end - + old_entry->vme_start)); + old_entry->offset = 0; + old_entry->object.vm_object = object; + assert(!old_entry->needs_copy); + } + else if (old_entry->needs_copy || object->shadowed || + (object->temporary && !old_entry->is_shared && + object->size > (vm_size_t)(old_entry->vme_end - + old_entry->vme_start))) { + + assert(object->temporary); + assert(!(object->shadowed && old_entry->is_shared)); + vm_object_shadow( + &old_entry->object.vm_object, + &old_entry->offset, + (vm_size_t) (old_entry->vme_end - + old_entry->vme_start)); + + /* + * If we're making a shadow for other than + * copy on write reasons, then we have + * to remove write permission. + */ + + if (!old_entry->needs_copy && + (old_entry->protection & VM_PROT_WRITE)) { + pmap_protect(vm_map_pmap(old_map), + old_entry->vme_start, + old_entry->vme_end, + old_entry->protection & + ~VM_PROT_WRITE); + } + old_entry->needs_copy = FALSE; + object = old_entry->object.vm_object; + } + + /* + * Set use_shared_copy to indicate that + * object must use shared (delayed) copy-on + * write. This is ignored for permanent objects. + * Bump the reference count for the new entry + */ + + vm_object_lock(object); + object->use_shared_copy = TRUE; + object->ref_count++; + vm_object_unlock(object); + + if (old_entry->projected_on != 0) { + /* + * If entry is projected buffer, clone the + * entry exactly. + */ + + vm_map_entry_copy_full(new_entry, old_entry); + + } else { + /* + * Clone the entry, using object ref from above. + * Mark both entries as shared. + */ + + new_entry = vm_map_entry_create(new_map); + vm_map_entry_copy(new_entry, old_entry); + old_entry->is_shared = TRUE; + new_entry->is_shared = TRUE; + } + + /* + * Insert the entry into the new map -- we + * know we're inserting at the end of the new + * map. + */ + + vm_map_entry_link( + new_map, + vm_map_last_entry(new_map), + new_entry); + + /* + * Update the physical map + */ + + pmap_copy(new_map->pmap, old_map->pmap, + new_entry->vme_start, + entry_size, + old_entry->vme_start); + + new_size += entry_size; + break; + + case VM_INHERIT_COPY: + if (old_entry->wired_count == 0) { + boolean_t src_needs_copy; + boolean_t new_entry_needs_copy; + + new_entry = vm_map_entry_create(new_map); + vm_map_entry_copy(new_entry, old_entry); + + if (vm_object_copy_temporary( + &new_entry->object.vm_object, + &new_entry->offset, + &src_needs_copy, + &new_entry_needs_copy)) { + + /* + * Handle copy-on-write obligations + */ + + if (src_needs_copy && !old_entry->needs_copy) { + vm_object_pmap_protect( + old_entry->object.vm_object, + old_entry->offset, + entry_size, + (old_entry->is_shared ? + PMAP_NULL : + old_map->pmap), + old_entry->vme_start, + old_entry->protection & + ~VM_PROT_WRITE); + + old_entry->needs_copy = TRUE; + } + + new_entry->needs_copy = new_entry_needs_copy; + + /* + * Insert the entry at the end + * of the map. 
+ */ + + vm_map_entry_link(new_map, + vm_map_last_entry(new_map), + new_entry); + + + new_size += entry_size; + break; + } + + vm_map_entry_dispose(new_map, new_entry); + } + + /* INNER BLOCK (copy cannot be optimized) */ { + + vm_offset_t start = old_entry->vme_start; + vm_map_copy_t copy; + vm_map_entry_t last = vm_map_last_entry(new_map); + + vm_map_unlock(old_map); + if (vm_map_copyin(old_map, + start, + entry_size, + FALSE, + ©) + != KERN_SUCCESS) { + vm_map_lock(old_map); + if (!vm_map_lookup_entry(old_map, start, &last)) + last = last->vme_next; + old_entry = last; + /* + * For some error returns, want to + * skip to the next element. + */ + + continue; + } + + /* + * Insert the copy into the new map + */ + + vm_map_copy_insert(new_map, last, copy); + new_size += entry_size; + + /* + * Pick up the traversal at the end of + * the copied region. + */ + + vm_map_lock(old_map); + start += entry_size; + if (!vm_map_lookup_entry(old_map, start, &last)) + last = last->vme_next; + else + vm_map_clip_start(old_map, last, start); + old_entry = last; + + continue; + /* INNER BLOCK (copy cannot be optimized) */ } + } + old_entry = old_entry->vme_next; + } + + new_map->size = new_size; + vm_map_unlock(old_map); + + return(new_map); +} + +/* + * vm_map_lookup: + * + * Finds the VM object, offset, and + * protection for a given virtual address in the + * specified map, assuming a page fault of the + * type specified. + * + * Returns the (object, offset, protection) for + * this address, whether it is wired down, and whether + * this map has the only reference to the data in question. + * In order to later verify this lookup, a "version" + * is returned. + * + * The map should not be locked; it will not be + * locked on exit. In order to guarantee the + * existence of the returned object, it is returned + * locked. + * + * If a lookup is requested with "write protection" + * specified, the map may be changed to perform virtual + * copying operations, although the data referenced will + * remain the same. + */ +kern_return_t vm_map_lookup(var_map, vaddr, fault_type, out_version, + object, offset, out_prot, wired) + vm_map_t *var_map; /* IN/OUT */ + register vm_offset_t vaddr; + register vm_prot_t fault_type; + + vm_map_version_t *out_version; /* OUT */ + vm_object_t *object; /* OUT */ + vm_offset_t *offset; /* OUT */ + vm_prot_t *out_prot; /* OUT */ + boolean_t *wired; /* OUT */ +{ + register vm_map_entry_t entry; + register vm_map_t map = *var_map; + register vm_prot_t prot; + + RetryLookup: ; + + /* + * Lookup the faulting address. + */ + + vm_map_lock_read(map); + +#define RETURN(why) \ + { \ + vm_map_unlock_read(map); \ + return(why); \ + } + + /* + * If the map has an interesting hint, try it before calling + * full blown lookup routine. + */ + + simple_lock(&map->hint_lock); + entry = map->hint; + simple_unlock(&map->hint_lock); + + if ((entry == vm_map_to_entry(map)) || + (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) { + vm_map_entry_t tmp_entry; + + /* + * Entry was either not a valid hint, or the vaddr + * was not contained in the entry, so do a full lookup. + */ + if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) + RETURN(KERN_INVALID_ADDRESS); + + entry = tmp_entry; + } + + /* + * Handle submaps. + */ + + if (entry->is_sub_map) { + vm_map_t old_map = map; + + *var_map = map = entry->object.sub_map; + vm_map_unlock_read(old_map); + goto RetryLookup; + } + + /* + * Check whether this task is allowed to have + * this page. 
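+ * A fault is allowed only if every bit in fault_type is also
+ * present in the entry's protection.  A write fault on an
+ * entry carrying VM_PROT_NOTIFY is reported as
+ * KERN_WRITE_PROTECTION_FAILURE, distinguishing it from an
+ * ordinary KERN_PROTECTION_FAILURE.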
+ */ + + prot = entry->protection; + + if ((fault_type & (prot)) != fault_type) + if ((prot & VM_PROT_NOTIFY) && (fault_type & VM_PROT_WRITE)) { + RETURN(KERN_WRITE_PROTECTION_FAILURE); + } else { + RETURN(KERN_PROTECTION_FAILURE); + } + + /* + * If this page is not pageable, we have to get + * it for all possible accesses. + */ + + if (*wired = (entry->wired_count != 0)) + prot = fault_type = entry->protection; + + /* + * If the entry was copy-on-write, we either ... + */ + + if (entry->needs_copy) { + /* + * If we want to write the page, we may as well + * handle that now since we've got the map locked. + * + * If we don't need to write the page, we just + * demote the permissions allowed. + */ + + if (fault_type & VM_PROT_WRITE) { + /* + * Make a new object, and place it in the + * object chain. Note that no new references + * have appeared -- one just moved from the + * map to the new object. + */ + + if (vm_map_lock_read_to_write(map)) { + goto RetryLookup; + } + map->timestamp++; + + vm_object_shadow( + &entry->object.vm_object, + &entry->offset, + (vm_size_t) (entry->vme_end - entry->vme_start)); + + entry->needs_copy = FALSE; + + vm_map_lock_write_to_read(map); + } + else { + /* + * We're attempting to read a copy-on-write + * page -- don't allow writes. + */ + + prot &= (~VM_PROT_WRITE); + } + } + + /* + * Create an object if necessary. + */ + if (entry->object.vm_object == VM_OBJECT_NULL) { + + if (vm_map_lock_read_to_write(map)) { + goto RetryLookup; + } + + entry->object.vm_object = vm_object_allocate( + (vm_size_t)(entry->vme_end - entry->vme_start)); + entry->offset = 0; + vm_map_lock_write_to_read(map); + } + + /* + * Return the object/offset from this entry. If the entry + * was copy-on-write or empty, it has been fixed up. Also + * return the protection. + */ + + *offset = (vaddr - entry->vme_start) + entry->offset; + *object = entry->object.vm_object; + *out_prot = prot; + + /* + * Lock the object to prevent it from disappearing + */ + + vm_object_lock(*object); + + /* + * Save the version number and unlock the map. + */ + + out_version->main_timestamp = map->timestamp; + + RETURN(KERN_SUCCESS); + +#undef RETURN +} + +/* + * vm_map_verify: + * + * Verifies that the map in question has not changed + * since the given version. If successful, the map + * will not change until vm_map_verify_done() is called. + */ +boolean_t vm_map_verify(map, version) + register + vm_map_t map; + register + vm_map_version_t *version; /* REF */ +{ + boolean_t result; + + vm_map_lock_read(map); + result = (map->timestamp == version->main_timestamp); + + if (!result) + vm_map_unlock_read(map); + + return(result); +} + +/* + * vm_map_verify_done: + * + * Releases locks acquired by a vm_map_verify. + * + * This is now a macro in vm/vm_map.h. It does a + * vm_map_unlock_read on the map. + */ + +/* + * vm_region: + * + * User call to obtain information about a region in + * a task's address map. 
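+ * If no entry contains the given address, the next higher
+ * entry is reported instead; KERN_NO_SPACE is returned only
+ * when nothing lies above the address.  The returned address
+ * and size describe that whole entry, not just the page that
+ * was asked about.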
+ */ + +kern_return_t vm_region(map, address, size, + protection, max_protection, + inheritance, is_shared, + object_name, offset_in_object) + vm_map_t map; + vm_offset_t *address; /* IN/OUT */ + vm_size_t *size; /* OUT */ + vm_prot_t *protection; /* OUT */ + vm_prot_t *max_protection; /* OUT */ + vm_inherit_t *inheritance; /* OUT */ + boolean_t *is_shared; /* OUT */ + ipc_port_t *object_name; /* OUT */ + vm_offset_t *offset_in_object; /* OUT */ +{ + vm_map_entry_t tmp_entry; + register + vm_map_entry_t entry; + register + vm_offset_t tmp_offset; + vm_offset_t start; + + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + start = *address; + + vm_map_lock_read(map); + if (!vm_map_lookup_entry(map, start, &tmp_entry)) { + if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { + vm_map_unlock_read(map); + return(KERN_NO_SPACE); + } + } else { + entry = tmp_entry; + } + + start = entry->vme_start; + *protection = entry->protection; + *max_protection = entry->max_protection; + *inheritance = entry->inheritance; + *address = start; + *size = (entry->vme_end - start); + + tmp_offset = entry->offset; + + + if (entry->is_sub_map) { + *is_shared = FALSE; + *object_name = IP_NULL; + *offset_in_object = tmp_offset; + } else { + *is_shared = entry->is_shared; + *object_name = vm_object_name(entry->object.vm_object); + *offset_in_object = tmp_offset; + } + + vm_map_unlock_read(map); + + return(KERN_SUCCESS); +} + +/* + * Routine: vm_map_simplify + * + * Description: + * Attempt to simplify the map representation in + * the vicinity of the given starting address. + * Note: + * This routine is intended primarily to keep the + * kernel maps more compact -- they generally don't + * benefit from the "expand a map entry" technology + * at allocation time because the adjacent entry + * is often wired down. + */ +void vm_map_simplify(map, start) + vm_map_t map; + vm_offset_t start; +{ + vm_map_entry_t this_entry; + vm_map_entry_t prev_entry; + + vm_map_lock(map); + if ( + (vm_map_lookup_entry(map, start, &this_entry)) && + ((prev_entry = this_entry->vme_prev) != vm_map_to_entry(map)) && + + (prev_entry->vme_end == start) && + + (prev_entry->is_shared == FALSE) && + (prev_entry->is_sub_map == FALSE) && + + (this_entry->is_shared == FALSE) && + (this_entry->is_sub_map == FALSE) && + + (prev_entry->inheritance == this_entry->inheritance) && + (prev_entry->protection == this_entry->protection) && + (prev_entry->max_protection == this_entry->max_protection) && + (prev_entry->wired_count == this_entry->wired_count) && + (prev_entry->user_wired_count == this_entry->user_wired_count) && + + (prev_entry->needs_copy == this_entry->needs_copy) && + + (prev_entry->object.vm_object == this_entry->object.vm_object) && + ((prev_entry->offset + (prev_entry->vme_end - prev_entry->vme_start)) + == this_entry->offset) && + (prev_entry->projected_on == 0) && + (this_entry->projected_on == 0) + ) { + if (map->first_free == this_entry) + map->first_free = prev_entry; + + SAVE_HINT(map, prev_entry); + vm_map_entry_unlink(map, this_entry); + prev_entry->vme_end = this_entry->vme_end; + vm_object_deallocate(this_entry->object.vm_object); + vm_map_entry_dispose(map, this_entry); + } + vm_map_unlock(map); +} + + +/* + * Routine: vm_map_machine_attribute + * Purpose: + * Provide machine-specific attributes to mappings, + * such as cachability etc. for machines that provide + * them. NUMA architectures and machines with big/strange + * caches will use this. 
+ * Note: + * Responsibilities for locking and checking are handled here, + * everything else in the pmap module. If any non-volatile + * information must be kept, the pmap module should handle + * it itself. [This assumes that attributes do not + * need to be inherited, which seems ok to me] + */ +kern_return_t vm_map_machine_attribute(map, address, size, attribute, value) + vm_map_t map; + vm_offset_t address; + vm_size_t size; + vm_machine_attribute_t attribute; + vm_machine_attribute_val_t* value; /* IN/OUT */ +{ + kern_return_t ret; + + if (address < vm_map_min(map) || + (address + size) > vm_map_max(map)) + return KERN_INVALID_ARGUMENT; + + vm_map_lock(map); + + ret = pmap_attribute(map->pmap, address, size, attribute, value); + + vm_map_unlock(map); + + return ret; +} + +#include <mach_kdb.h> + + +#if MACH_KDB + +#define printf kdbprintf + +/* + * vm_map_print: [ debug ] + */ +void vm_map_print(map) + register vm_map_t map; +{ + register vm_map_entry_t entry; + extern int indent; + + iprintf("Task map 0x%X: pmap=0x%X,", + (vm_offset_t) map, (vm_offset_t) (map->pmap)); + printf("ref=%d,nentries=%d,", map->ref_count, map->hdr.nentries); + printf("version=%d\n", map->timestamp); + indent += 2; + for (entry = vm_map_first_entry(map); + entry != vm_map_to_entry(map); + entry = entry->vme_next) { + static char *inheritance_name[3] = { "share", "copy", "none"}; + + iprintf("map entry 0x%X: ", (vm_offset_t) entry); + printf("start=0x%X, end=0x%X, ", + (vm_offset_t) entry->vme_start, (vm_offset_t) entry->vme_end); + printf("prot=%X/%X/%s, ", + entry->protection, + entry->max_protection, + inheritance_name[entry->inheritance]); + if (entry->wired_count != 0) { + printf("wired("); + if (entry->user_wired_count != 0) + printf("u"); + if (entry->wired_count > + ((entry->user_wired_count == 0) ? 0 : 1)) + printf("k"); + printf(") "); + } + if (entry->in_transition) { + printf("in transition"); + if (entry->needs_wakeup) + printf("(wake request)"); + printf(", "); + } + if (entry->is_sub_map) { + printf("submap=0x%X, offset=0x%X\n", + (vm_offset_t) entry->object.sub_map, + (vm_offset_t) entry->offset); + } else { + printf("object=0x%X, offset=0x%X", + (vm_offset_t) entry->object.vm_object, + (vm_offset_t) entry->offset); + if (entry->is_shared) + printf(", shared"); + if (entry->needs_copy) + printf(", copy needed"); + printf("\n"); + + if ((entry->vme_prev == vm_map_to_entry(map)) || + (entry->vme_prev->object.vm_object != entry->object.vm_object)) { + indent += 2; + vm_object_print(entry->object.vm_object); + indent -= 2; + } + } + } + indent -= 2; +} + +/* + * Routine: vm_map_copy_print + * Purpose: + * Pretty-print a copy object for ddb. 
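+ * Output goes through the debugger printf (printf is
+ * redefined to kdbprintf above); for page-list copies the
+ * page count is clamped to [0, VM_MAP_COPY_PAGE_LIST_MAX]
+ * before the page array is printed.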
+ */ + +void vm_map_copy_print(copy) + vm_map_copy_t copy; +{ + extern int indent; + int i, npages; + + printf("copy object 0x%x\n", copy); + + indent += 2; + + iprintf("type=%d", copy->type); + switch (copy->type) { + case VM_MAP_COPY_ENTRY_LIST: + printf("[entry_list]"); + break; + + case VM_MAP_COPY_OBJECT: + printf("[object]"); + break; + + case VM_MAP_COPY_PAGE_LIST: + printf("[page_list]"); + break; + + default: + printf("[bad type]"); + break; + } + printf(", offset=0x%x", copy->offset); + printf(", size=0x%x\n", copy->size); + + switch (copy->type) { + case VM_MAP_COPY_ENTRY_LIST: + /* XXX add stuff here */ + break; + + case VM_MAP_COPY_OBJECT: + iprintf("object=0x%x\n", copy->cpy_object); + break; + + case VM_MAP_COPY_PAGE_LIST: + iprintf("npages=%d", copy->cpy_npages); + printf(", cont=%x", copy->cpy_cont); + printf(", cont_args=%x\n", copy->cpy_cont_args); + if (copy->cpy_npages < 0) { + npages = 0; + } else if (copy->cpy_npages > VM_MAP_COPY_PAGE_LIST_MAX) { + npages = VM_MAP_COPY_PAGE_LIST_MAX; + } else { + npages = copy->cpy_npages; + } + iprintf("copy->cpy_page_list[0..%d] = {", npages); + for (i = 0; i < npages - 1; i++) { + printf("0x%x, ", copy->cpy_page_list[i]); + } + if (npages > 0) { + printf("0x%x", copy->cpy_page_list[npages - 1]); + } + printf("}\n"); + break; + } + + indent -=2; +} +#endif MACH_KDB + +#if NORMA_IPC +/* + * This should one day be eliminated; + * we should always construct the right flavor of copy object + * the first time. Troublesome areas include vm_read, where vm_map_copyin + * is called without knowing whom the copy object is for. + * There are also situations where we do want a lazy data structure + * even if we are sending to a remote port... + */ + +/* + * Convert a copy to a page list. The copy argument is in/out + * because we probably have to allocate a new vm_map_copy structure. + * We take responsibility for discarding the old structure and + * use a continuation to do so. Postponing this discard ensures + * that the objects containing the pages we've marked busy will stick + * around. + */ +kern_return_t +vm_map_convert_to_page_list(caller_copy) + vm_map_copy_t *caller_copy; +{ + vm_map_entry_t entry, next_entry; + vm_offset_t va; + vm_offset_t offset; + vm_object_t object; + kern_return_t result; + vm_map_copy_t copy, new_copy; + int i, num_pages = 0; + + zone_t entry_zone; + + copy = *caller_copy; + + /* + * We may not have to do anything, + * or may not be able to do anything. + */ + if (copy == VM_MAP_COPY_NULL || copy->type == VM_MAP_COPY_PAGE_LIST) { + return KERN_SUCCESS; + } + if (copy->type == VM_MAP_COPY_OBJECT) { + return vm_map_convert_to_page_list_from_object(caller_copy); + } + if (copy->type != VM_MAP_COPY_ENTRY_LIST) { + panic("vm_map_convert_to_page_list: copy type %d!\n", + copy->type); + } + + /* + * Allocate the new copy. Set its continuation to + * discard the old one. + */ + new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + new_copy->type = VM_MAP_COPY_PAGE_LIST; + new_copy->cpy_npages = 0; + new_copy->offset = copy->offset; + new_copy->size = copy->size; + new_copy->cpy_cont = vm_map_copy_discard_cont; + new_copy->cpy_cont_args = (char *) copy; + + /* + * Iterate over entries. + */ + for (entry = vm_map_copy_first_entry(copy); + entry != vm_map_copy_to_entry(copy); + entry = entry->vme_next) { + + object = entry->object.vm_object; + offset = entry->offset; + /* + * Iterate over pages. 
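+ * One page is gathered per PAGE_SIZE step.  Unlike
+ * vm_map_copyin_page_list there is no continuation here, so
+ * overflowing VM_MAP_COPY_PAGE_LIST_MAX is a panic (see the
+ * comment below).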
+ */ + for (va = entry->vme_start; + va < entry->vme_end; + va += PAGE_SIZE, offset += PAGE_SIZE) { + + vm_page_t m; + + if (new_copy->cpy_npages == VM_MAP_COPY_PAGE_LIST_MAX) { + /* + * What a mess. We need a continuation + * to do the page list, but also one + * to discard the old copy. The right + * thing to do is probably to copy + * out the old copy into the kernel + * map (or some temporary task holding + * map if we're paranoid about large + * copies), and then copyin the page + * list that we really wanted with + * src_destroy. LATER. + */ + panic("vm_map_convert_to_page_list: num\n"); + } + + /* + * Try to find the page of data. + */ + vm_object_lock(object); + vm_object_paging_begin(object); + if (((m = vm_page_lookup(object, offset)) != + VM_PAGE_NULL) && !m->busy && !m->fictitious && + !m->absent && !m->error) { + + /* + * This is the page. Mark it busy + * and keep the paging reference on + * the object whilst we do our thing. + */ + m->busy = TRUE; + + /* + * Also write-protect the page, so + * that the map`s owner cannot change + * the data. The busy bit will prevent + * faults on the page from succeeding + * until the copy is released; after + * that, the page can be re-entered + * as writable, since we didn`t alter + * the map entry. This scheme is a + * cheap copy-on-write. + * + * Don`t forget the protection and + * the page_lock value! + */ + + pmap_page_protect(m->phys_addr, + entry->protection + & ~m->page_lock + & ~VM_PROT_WRITE); + + } + else { + vm_prot_t result_prot; + vm_page_t top_page; + kern_return_t kr; + +retry: + result_prot = VM_PROT_READ; + + kr = vm_fault_page(object, offset, + VM_PROT_READ, FALSE, FALSE, + &result_prot, &m, &top_page, + FALSE, (void (*)()) 0); + if (kr == VM_FAULT_MEMORY_SHORTAGE) { + VM_PAGE_WAIT((void (*)()) 0); + vm_object_lock(object); + vm_object_paging_begin(object); + goto retry; + } + if (kr != VM_FAULT_SUCCESS) { + /* XXX what about data_error? */ + vm_object_lock(object); + vm_object_paging_begin(object); + goto retry; + } + if (top_page != VM_PAGE_NULL) { + vm_object_lock(object); + VM_PAGE_FREE(top_page); + vm_object_paging_end(object); + vm_object_unlock(object); + } + } + assert(m); + m->busy = TRUE; + new_copy->cpy_page_list[new_copy->cpy_npages++] = m; + vm_object_unlock(object); + } + } + + *caller_copy = new_copy; + return KERN_SUCCESS; +} + +kern_return_t +vm_map_convert_to_page_list_from_object(caller_copy) + vm_map_copy_t *caller_copy; +{ + vm_object_t object; + vm_offset_t offset; + vm_map_copy_t copy, new_copy; + + copy = *caller_copy; + assert(copy->type == VM_MAP_COPY_OBJECT); + object = copy->cpy_object; + assert(object->size == round_page(object->size)); + + /* + * Allocate the new copy. Set its continuation to + * discard the old one. + */ + new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + new_copy->type = VM_MAP_COPY_PAGE_LIST; + new_copy->cpy_npages = 0; + new_copy->offset = copy->offset; + new_copy->size = copy->size; + new_copy->cpy_cont = vm_map_copy_discard_cont; + new_copy->cpy_cont_args = (char *) copy; + + /* + * XXX memory_object_lock_request can probably bust this + * XXX See continuation comment in previous routine for solution. + */ + assert(object->size <= VM_MAP_COPY_PAGE_LIST_MAX * PAGE_SIZE); + + for (offset = 0; offset < object->size; offset += PAGE_SIZE) { + vm_page_t m; + + /* + * Try to find the page of data. 
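+ * A resident page that is not busy, fictitious, absent or in
+ * error is simply marked busy; otherwise vm_fault_page is
+ * used to bring it in, retrying after a memory shortage.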
+ */ + vm_object_lock(object); + vm_object_paging_begin(object); + m = vm_page_lookup(object, offset); + if ((m != VM_PAGE_NULL) && !m->busy && !m->fictitious && + !m->absent && !m->error) { + + /* + * This is the page. Mark it busy + * and keep the paging reference on + * the object whilst we do our thing. + */ + m->busy = TRUE; + } + else { + vm_prot_t result_prot; + vm_page_t top_page; + kern_return_t kr; + +retry: + result_prot = VM_PROT_READ; + + kr = vm_fault_page(object, offset, + VM_PROT_READ, FALSE, FALSE, + &result_prot, &m, &top_page, + FALSE, (void (*)()) 0); + if (kr == VM_FAULT_MEMORY_SHORTAGE) { + VM_PAGE_WAIT((void (*)()) 0); + vm_object_lock(object); + vm_object_paging_begin(object); + goto retry; + } + if (kr != VM_FAULT_SUCCESS) { + /* XXX what about data_error? */ + vm_object_lock(object); + vm_object_paging_begin(object); + goto retry; + } + + if (top_page != VM_PAGE_NULL) { + vm_object_lock(object); + VM_PAGE_FREE(top_page); + vm_object_paging_end(object); + vm_object_unlock(object); + } + } + assert(m); + m->busy = TRUE; + new_copy->cpy_page_list[new_copy->cpy_npages++] = m; + vm_object_unlock(object); + } + + *caller_copy = new_copy; + return (KERN_SUCCESS); +} + +kern_return_t +vm_map_convert_from_page_list(copy) + vm_map_copy_t copy; +{ + vm_object_t object; + int i; + vm_map_entry_t new_entry; + vm_page_t *page_list; + + /* + * Check type of copy object. + */ + if (copy->type == VM_MAP_COPY_ENTRY_LIST) { + return KERN_SUCCESS; + } + if (copy->type == VM_MAP_COPY_OBJECT) { + printf("vm_map_convert_from_page_list: COPY_OBJECT?"); + return KERN_SUCCESS; + } + if (copy->type != VM_MAP_COPY_PAGE_LIST) { + panic("vm_map_convert_from_page_list 0x%x %d", + copy, + copy->type); + } + + /* + * Make sure the pages are loose. This may be + * a "Can't Happen", but just to be safe ... + */ + page_list = ©->cpy_page_list[0]; + if ((*page_list)->tabled) + vm_map_copy_steal_pages(copy); + + /* + * Create object, and stuff pages into it. + */ + object = vm_object_allocate(copy->cpy_npages); + for (i = 0; i < copy->cpy_npages; i++) { + register vm_page_t m = *page_list++; + vm_page_insert(m, object, i * PAGE_SIZE); + m->busy = FALSE; + m->dirty = TRUE; + vm_page_activate(m); + } + + /* + * XXX If this page list contained a continuation, then + * XXX we're screwed. The right thing to do is probably do + * XXX the copyout, and then copyin the entry list we really + * XXX wanted. + */ + if (vm_map_copy_has_cont(copy)) + panic("convert_from_page_list: continuation"); + + /* + * Change type of copy object + */ + vm_map_copy_first_entry(copy) = + vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy); + copy->type = VM_MAP_COPY_ENTRY_LIST; + copy->cpy_hdr.nentries = 0; + copy->cpy_hdr.entries_pageable = TRUE; + + /* + * Allocate and initialize an entry for object + */ + new_entry = vm_map_copy_entry_create(copy); + new_entry->vme_start = trunc_page(copy->offset); + new_entry->vme_end = round_page(copy->offset + copy->size); + new_entry->object.vm_object = object; + new_entry->offset = 0; + new_entry->is_shared = FALSE; + new_entry->is_sub_map = FALSE; + new_entry->needs_copy = FALSE; + new_entry->protection = VM_PROT_DEFAULT; + new_entry->max_protection = VM_PROT_ALL; + new_entry->inheritance = VM_INHERIT_DEFAULT; + new_entry->wired_count = 0; + new_entry->user_wired_count = 0; + new_entry->projected_on = 0; + + /* + * Insert entry into copy object, and return. 
+ */ + vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), new_entry); + return(KERN_SUCCESS); +} +#endif NORMA_IPC diff --git a/vm/vm_map.h b/vm/vm_map.h new file mode 100644 index 00000000..0bdb7d13 --- /dev/null +++ b/vm/vm_map.h @@ -0,0 +1,448 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_map.h + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Virtual memory map module definitions. + * + * Contributors: + * avie, dlb, mwyoung + */ + +#ifndef _VM_VM_MAP_H_ +#define _VM_VM_MAP_H_ + +#include <mach/kern_return.h> +#include <mach/boolean.h> +#include <mach/machine/vm_types.h> +#include <mach/vm_prot.h> +#include <mach/vm_inherit.h> +#include <vm/pmap.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <kern/lock.h> +#include <kern/macro_help.h> + +/* + * Types defined: + * + * vm_map_t the high-level address map data structure. + * vm_map_entry_t an entry in an address map. + * vm_map_version_t a timestamp of a map, for use with vm_map_lookup + * vm_map_copy_t represents memory copied from an address map, + * used for inter-map copy operations + */ + +/* + * Type: vm_map_object_t [internal use only] + * + * Description: + * The target of an address mapping, either a virtual + * memory object or a sub map (of the kernel map). + */ +typedef union vm_map_object { + struct vm_object *vm_object; /* object object */ + struct vm_map *sub_map; /* belongs to another map */ +} vm_map_object_t; + +/* + * Type: vm_map_entry_t [internal use only] + * + * Description: + * A single mapping within an address map. + * + * Implementation: + * Address map entries consist of start and end addresses, + * a VM object (or sub map) and offset into that object, + * and user-exported inheritance and protection information. + * Control information for virtual copy operations is also + * stored in the address map entry. 
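+ *		Entries are kept on a doubly-linked list sorted by
+ *		address; the vme_start/vme_end/vme_prev/vme_next
+ *		macros below hide the embedded vm_map_links structure.
+ *		The needs_copy, is_shared and wired_count fields carry
+ *		the copy-on-write and wiring state that the copy and
+ *		fork code manipulates.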
+ */ +struct vm_map_links { + struct vm_map_entry *prev; /* previous entry */ + struct vm_map_entry *next; /* next entry */ + vm_offset_t start; /* start address */ + vm_offset_t end; /* end address */ +}; + +struct vm_map_entry { + struct vm_map_links links; /* links to other entries */ +#define vme_prev links.prev +#define vme_next links.next +#define vme_start links.start +#define vme_end links.end + union vm_map_object object; /* object I point to */ + vm_offset_t offset; /* offset into object */ + unsigned int + /* boolean_t */ is_shared:1, /* region is shared */ + /* boolean_t */ is_sub_map:1, /* Is "object" a submap? */ + /* boolean_t */ in_transition:1, /* Entry being changed */ + /* boolean_t */ needs_wakeup:1, /* Waiters on in_transition */ + /* Only used when object is a vm_object: */ + /* boolean_t */ needs_copy:1; /* does object need to be copied */ + + /* Only in task maps: */ + vm_prot_t protection; /* protection code */ + vm_prot_t max_protection; /* maximum protection */ + vm_inherit_t inheritance; /* inheritance */ + unsigned short wired_count; /* can be paged if = 0 */ + unsigned short user_wired_count; /* for vm_wire */ + struct vm_map_entry *projected_on; /* 0 for normal map entry + or persistent kernel map projected buffer entry; + -1 for non-persistent kernel map projected buffer entry; + pointer to corresponding kernel map entry for user map + projected buffer entry */ +}; + +typedef struct vm_map_entry *vm_map_entry_t; + +#define VM_MAP_ENTRY_NULL ((vm_map_entry_t) 0) + +/* + * Type: struct vm_map_header + * + * Description: + * Header for a vm_map and a vm_map_copy. + */ +struct vm_map_header { + struct vm_map_links links; /* first, last, min, max */ + int nentries; /* Number of entries */ + boolean_t entries_pageable; + /* are map entries pageable? */ +}; + +/* + * Type: vm_map_t [exported; contents invisible] + * + * Description: + * An address map -- a directory relating valid + * regions of a task's address space to the corresponding + * virtual memory objects. + * + * Implementation: + * Maps are doubly-linked lists of map entries, sorted + * by address. One hint is used to start + * searches again from the last successful search, + * insertion, or removal. Another hint is used to + * quickly find free space. + */ +struct vm_map { + lock_data_t lock; /* Lock for map data */ + struct vm_map_header hdr; /* Map entry header */ +#define min_offset hdr.links.start /* start of range */ +#define max_offset hdr.links.end /* end of range */ + pmap_t pmap; /* Physical map */ + vm_size_t size; /* virtual size */ + int ref_count; /* Reference count */ + decl_simple_lock_data(, ref_lock) /* Lock for ref_count field */ + vm_map_entry_t hint; /* hint for quick lookups */ + decl_simple_lock_data(, hint_lock) /* lock for hint storage */ + vm_map_entry_t first_free; /* First free space hint */ + boolean_t wait_for_space; /* Should callers wait + for space? */ + boolean_t wiring_required;/* All memory wired? */ + unsigned int timestamp; /* Version number */ +}; +typedef struct vm_map *vm_map_t; + +#define VM_MAP_NULL ((vm_map_t) 0) + +#define vm_map_to_entry(map) ((struct vm_map_entry *) &(map)->hdr.links) +#define vm_map_first_entry(map) ((map)->hdr.links.next) +#define vm_map_last_entry(map) ((map)->hdr.links.prev) + +/* + * Type: vm_map_version_t [exported; contents invisible] + * + * Description: + * Map versions may be used to quickly validate a previous + * lookup operation. 
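+ *		(vm_map_lookup returns one; vm_map_verify simply
+ *		checks that the map's timestamp still matches it.)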
+ * + * Usage note: + * Because they are bulky objects, map versions are usually + * passed by reference. + * + * Implementation: + * Just a timestamp for the main map. + */ +typedef struct vm_map_version { + unsigned int main_timestamp; +} vm_map_version_t; + +/* + * Type: vm_map_copy_t [exported; contents invisible] + * + * Description: + * A map copy object represents a region of virtual memory + * that has been copied from an address map but is still + * in transit. + * + * A map copy object may only be used by a single thread + * at a time. + * + * Implementation: + * There are three formats for map copy objects. + * The first is very similar to the main + * address map in structure, and as a result, some + * of the internal maintenance functions/macros can + * be used with either address maps or map copy objects. + * + * The map copy object contains a header links + * entry onto which the other entries that represent + * the region are chained. + * + * The second format is a single vm object. This is used + * primarily in the pageout path. The third format is a + * list of vm pages. An optional continuation provides + * a hook to be called to obtain more of the memory, + * or perform other operations. The continuation takes 3 + * arguments, a saved arg buffer, a pointer to a new vm_map_copy + * (returned) and an abort flag (abort if TRUE). + */ + +#if iPSC386 || iPSC860 +#define VM_MAP_COPY_PAGE_LIST_MAX 64 +#else iPSC386 || iPSC860 +#define VM_MAP_COPY_PAGE_LIST_MAX 8 +#endif iPSC386 || iPSC860 + +typedef struct vm_map_copy { + int type; +#define VM_MAP_COPY_ENTRY_LIST 1 +#define VM_MAP_COPY_OBJECT 2 +#define VM_MAP_COPY_PAGE_LIST 3 + vm_offset_t offset; + vm_size_t size; + union { + struct vm_map_header hdr; /* ENTRY_LIST */ + struct { /* OBJECT */ + vm_object_t object; + } c_o; + struct { /* PAGE_LIST */ + vm_page_t page_list[VM_MAP_COPY_PAGE_LIST_MAX]; + int npages; + kern_return_t (*cont)(); + char *cont_args; + } c_p; + } c_u; +} *vm_map_copy_t; + +#define cpy_hdr c_u.hdr + +#define cpy_object c_u.c_o.object + +#define cpy_page_list c_u.c_p.page_list +#define cpy_npages c_u.c_p.npages +#define cpy_cont c_u.c_p.cont +#define cpy_cont_args c_u.c_p.cont_args + +#define VM_MAP_COPY_NULL ((vm_map_copy_t) 0) + +/* + * Useful macros for entry list copy objects + */ + +#define vm_map_copy_to_entry(copy) \ + ((struct vm_map_entry *) &(copy)->cpy_hdr.links) +#define vm_map_copy_first_entry(copy) \ + ((copy)->cpy_hdr.links.next) +#define vm_map_copy_last_entry(copy) \ + ((copy)->cpy_hdr.links.prev) + +/* + * Continuation macros for page list copy objects + */ + +#define vm_map_copy_invoke_cont(old_copy, new_copy, result) \ +MACRO_BEGIN \ + vm_map_copy_page_discard(old_copy); \ + *result = (*((old_copy)->cpy_cont))((old_copy)->cpy_cont_args, \ + new_copy); \ + (old_copy)->cpy_cont = (kern_return_t (*)()) 0; \ +MACRO_END + +#define vm_map_copy_invoke_extend_cont(old_copy, new_copy, result) \ +MACRO_BEGIN \ + *result = (*((old_copy)->cpy_cont))((old_copy)->cpy_cont_args, \ + new_copy); \ + (old_copy)->cpy_cont = (kern_return_t (*)()) 0; \ +MACRO_END + +#define vm_map_copy_abort_cont(old_copy) \ +MACRO_BEGIN \ + vm_map_copy_page_discard(old_copy); \ + (*((old_copy)->cpy_cont))((old_copy)->cpy_cont_args, \ + (vm_map_copy_t *) 0); \ + (old_copy)->cpy_cont = (kern_return_t (*)()) 0; \ + (old_copy)->cpy_cont_args = (char *) 0; \ +MACRO_END + +#define vm_map_copy_has_cont(copy) \ + (((copy)->cpy_cont) != (kern_return_t (*)()) 0) + +/* + * Continuation structures for vm_map_copyin_page_list. 
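+ *	src_addr/src_len give the part of the region not yet
+ *	copied; destroy_addr/destroy_len give a source range whose
+ *	deallocation has been deferred until the copy is consumed.
+ *	A zero src_len marks a destroy-only continuation.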
+ */ + +typedef struct { + vm_map_t map; + vm_offset_t src_addr; + vm_size_t src_len; + vm_offset_t destroy_addr; + vm_size_t destroy_len; + boolean_t steal_pages; +} vm_map_copyin_args_data_t, *vm_map_copyin_args_t; + +#define VM_MAP_COPYIN_ARGS_NULL ((vm_map_copyin_args_t) 0) + +/* + * Macros: vm_map_lock, etc. [internal use only] + * Description: + * Perform locking on the data portion of a map. + */ + +#define vm_map_lock_init(map) \ +MACRO_BEGIN \ + lock_init(&(map)->lock, TRUE); \ + (map)->timestamp = 0; \ +MACRO_END + +#define vm_map_lock(map) \ +MACRO_BEGIN \ + lock_write(&(map)->lock); \ + (map)->timestamp++; \ +MACRO_END + +#define vm_map_unlock(map) lock_write_done(&(map)->lock) +#define vm_map_lock_read(map) lock_read(&(map)->lock) +#define vm_map_unlock_read(map) lock_read_done(&(map)->lock) +#define vm_map_lock_write_to_read(map) \ + lock_write_to_read(&(map)->lock) +#define vm_map_lock_read_to_write(map) \ + (lock_read_to_write(&(map)->lock) || (((map)->timestamp++), 0)) +#define vm_map_lock_set_recursive(map) \ + lock_set_recursive(&(map)->lock) +#define vm_map_lock_clear_recursive(map) \ + lock_clear_recursive(&(map)->lock) + +/* + * Exported procedures that operate on vm_map_t. + */ + +extern vm_offset_t kentry_data; +extern vm_offset_t kentry_data_size; +extern int kentry_count; +extern void vm_map_init(); /* Initialize the module */ + +extern vm_map_t vm_map_create(); /* Create an empty map */ +extern vm_map_t vm_map_fork(); /* Create a map in the image + * of an existing map */ + +extern void vm_map_reference(); /* Gain a reference to + * an existing map */ +extern void vm_map_deallocate(); /* Lose a reference */ + +extern kern_return_t vm_map_enter(); /* Enter a mapping */ +extern kern_return_t vm_map_find_entry(); /* Enter a mapping primitive */ +extern kern_return_t vm_map_remove(); /* Deallocate a region */ +extern kern_return_t vm_map_protect(); /* Change protection */ +extern kern_return_t vm_map_inherit(); /* Change inheritance */ + +extern void vm_map_print(); /* Debugging: print a map */ + +extern kern_return_t vm_map_lookup(); /* Look up an address */ +extern boolean_t vm_map_verify(); /* Verify that a previous + * lookup is still valid */ +/* vm_map_verify_done is now a macro -- see below */ +extern kern_return_t vm_map_copyin(); /* Make a copy of a region */ +extern kern_return_t vm_map_copyin_page_list();/* Make a copy of a region + * using a page list copy */ +extern kern_return_t vm_map_copyout(); /* Place a copy into a map */ +extern kern_return_t vm_map_copy_overwrite();/* Overwrite existing memory + * with a copy */ +extern void vm_map_copy_discard(); /* Discard a copy without + * using it */ +extern kern_return_t vm_map_copy_discard_cont();/* Page list continuation + * version of previous */ + +extern kern_return_t vm_map_machine_attribute(); + /* Add or remove machine- + dependent attributes from + map regions */ + +/* + * Functions implemented as macros + */ +#define vm_map_min(map) ((map)->min_offset) + /* Lowest valid address in + * a map */ + +#define vm_map_max(map) ((map)->max_offset) + /* Highest valid address */ + +#define vm_map_pmap(map) ((map)->pmap) + /* Physical map associated + * with this address map */ + +#define vm_map_verify_done(map, version) (vm_map_unlock_read(map)) + /* Operation that required + * a verified lookup is + * now complete */ +/* + * Pageability functions. Includes macro to preserve old interface. 
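+ *	Both wrappers expand to vm_map_pageable_common and differ
+ *	only in the final boolean argument: FALSE from the old
+ *	vm_map_pageable interface, TRUE from vm_map_pageable_user.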
+ */ +extern kern_return_t vm_map_pageable_common(); + +#define vm_map_pageable(map, s, e, access) \ + vm_map_pageable_common(map, s, e, access, FALSE) + +#define vm_map_pageable_user(map, s, e, access) \ + vm_map_pageable_common(map, s, e, access, TRUE) + +/* + * Submap object. Must be used to create memory to be put + * in a submap by vm_map_submap. + */ +extern vm_object_t vm_submap_object; + +/* + * Wait and wakeup macros for in_transition map entries. + */ +#define vm_map_entry_wait(map, interruptible) \ + MACRO_BEGIN \ + assert_wait((event_t)&(map)->hdr, interruptible); \ + vm_map_unlock(map); \ + thread_block((void (*)()) 0); \ + MACRO_END + +#define vm_map_entry_wakeup(map) thread_wakeup((event_t)&(map)->hdr) + +#endif _VM_VM_MAP_H_ diff --git a/vm/vm_object.c b/vm/vm_object.c new file mode 100644 index 00000000..5186ee6c --- /dev/null +++ b/vm/vm_object.c @@ -0,0 +1,3090 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_object.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * + * Virtual memory object module. + */ + +#include <norma_vm.h> +#include <mach_pagemap.h> + +#if NORMA_VM +#include <norma/xmm_server_rename.h> +#endif /* NORMA_VM */ + +#include <mach/memory_object.h> +#include "memory_object_default.h" +#include "memory_object_user.h" +#include "vm_param.h" +#include <ipc/ipc_port.h> +#include <ipc/ipc_space.h> +#include <kern/assert.h> +#include <kern/lock.h> +#include <kern/queue.h> +#include <kern/xpr.h> +#include <kern/zalloc.h> +#include <vm/memory_object.h> +#include <vm/vm_fault.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> + + +void memory_object_release( + ipc_port_t pager, + pager_request_t pager_request, + ipc_port_t pager_name); /* forward */ + +void vm_object_deactivate_pages(vm_object_t); + +/* + * Virtual memory objects maintain the actual data + * associated with allocated virtual memory. A given + * page of memory exists within exactly one object. + * + * An object is only deallocated when all "references" + * are given up. Only one "reference" to a given + * region of an object should be writeable. + * + * Associated with each object is a list of all resident + * memory pages belonging to that object; this list is + * maintained by the "vm_page" module, but locked by the object's + * lock. 
+ * + * Each object also records the memory object port + * that is used by the kernel to request and write + * back data (the memory object port, field "pager"), + * and the ports provided to the memory manager, the server that + * manages that data, to return data and control its + * use (the memory object control port, field "pager_request") + * and for naming (the memory object name port, field "pager_name"). + * + * Virtual memory objects are allocated to provide + * zero-filled memory (vm_allocate) or map a user-defined + * memory object into a virtual address space (vm_map). + * + * Virtual memory objects that refer to a user-defined + * memory object are called "permanent", because all changes + * made in virtual memory are reflected back to the + * memory manager, which may then store it permanently. + * Other virtual memory objects are called "temporary", + * meaning that changes need be written back only when + * necessary to reclaim pages, and that storage associated + * with the object can be discarded once it is no longer + * mapped. + * + * A permanent memory object may be mapped into more + * than one virtual address space. Moreover, two threads + * may attempt to make the first mapping of a memory + * object concurrently. Only one thread is allowed to + * complete this mapping; all others wait for the + * "pager_initialized" field is asserted, indicating + * that the first thread has initialized all of the + * necessary fields in the virtual memory object structure. + * + * The kernel relies on a *default memory manager* to + * provide backing storage for the zero-filled virtual + * memory objects. The memory object ports associated + * with these temporary virtual memory objects are only + * generated and passed to the default memory manager + * when it becomes necessary. Virtual memory objects + * that depend on the default memory manager are called + * "internal". The "pager_created" field is provided to + * indicate whether these ports have ever been allocated. + * + * The kernel may also create virtual memory objects to + * hold changed pages after a copy-on-write operation. + * In this case, the virtual memory object (and its + * backing storage -- its memory object) only contain + * those pages that have been changed. The "shadow" + * field refers to the virtual memory object that contains + * the remainder of the contents. The "shadow_offset" + * field indicates where in the "shadow" these contents begin. + * The "copy" field refers to a virtual memory object + * to which changed pages must be copied before changing + * this object, in order to implement another form + * of copy-on-write optimization. + * + * The virtual memory object structure also records + * the attributes associated with its memory object. + * The "pager_ready", "can_persist" and "copy_strategy" + * fields represent those attributes. The "cached_list" + * field is used in the implementation of the persistence + * attribute. + * + * ZZZ Continue this comment. + */ + +zone_t vm_object_zone; /* vm backing store zone */ + +/* + * All wired-down kernel memory belongs to a single virtual + * memory object (kernel_object) to avoid wasting data structures. + */ +vm_object_t kernel_object; + +/* + * Virtual memory objects that are not referenced by + * any address maps, but that are allowed to persist + * (an attribute specified by the associated memory manager), + * are kept in a queue (vm_object_cached_list). 
+ * + * When an object from this queue is referenced again, + * for example to make another address space mapping, + * it must be removed from the queue. That is, the + * queue contains *only* objects with zero references. + * + * The kernel may choose to terminate objects from this + * queue in order to reclaim storage. The current policy + * is to permit a fixed maximum number of unreferenced + * objects (vm_object_cached_max). + * + * A simple lock (accessed by routines + * vm_object_cache_{lock,lock_try,unlock}) governs the + * object cache. It must be held when objects are + * added to or removed from the cache (in vm_object_terminate). + * The routines that acquire a reference to a virtual + * memory object based on one of the memory object ports + * must also lock the cache. + * + * Ideally, the object cache should be more isolated + * from the reference mechanism, so that the lock need + * not be held to make simple references. + */ +queue_head_t vm_object_cached_list; +int vm_object_cached_count; +int vm_object_cached_max = 100; /* may be patched*/ + +decl_simple_lock_data(,vm_object_cached_lock_data) + +#define vm_object_cache_lock() \ + simple_lock(&vm_object_cached_lock_data) +#define vm_object_cache_lock_try() \ + simple_lock_try(&vm_object_cached_lock_data) +#define vm_object_cache_unlock() \ + simple_unlock(&vm_object_cached_lock_data) + +/* + * Virtual memory objects are initialized from + * a template (see vm_object_allocate). + * + * When adding a new field to the virtual memory + * object structure, be sure to add initialization + * (see vm_object_init). + */ +vm_object_t vm_object_template; + +/* + * vm_object_allocate: + * + * Returns a new object with the given size. + */ + +vm_object_t _vm_object_allocate( + vm_size_t size) +{ + register vm_object_t object; + + object = (vm_object_t) zalloc(vm_object_zone); + + *object = *vm_object_template; + queue_init(&object->memq); + vm_object_lock_init(object); + object->size = size; + + return object; +} + +vm_object_t vm_object_allocate( + vm_size_t size) +{ + register vm_object_t object; + register ipc_port_t port; + + object = _vm_object_allocate(size); +#if !NORMA_VM + port = ipc_port_alloc_kernel(); + if (port == IP_NULL) + panic("vm_object_allocate"); + object->pager_name = port; + ipc_kobject_set(port, (ipc_kobject_t) object, IKOT_PAGING_NAME); +#endif /* !NORMA_VM */ + + return object; +} + +/* + * vm_object_bootstrap: + * + * Initialize the VM objects module. 
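The cache of unreferenced-but-persistent objects described above can be modelled compactly in user space. In this sketch the single `cache_lock` stands in for both the per-object lock and `vm_object_cache_lock`, the `obj_deref`/`obj_destroy` names are invented, and the kernel's extra complication of waiting for an in-progress pager initialization is omitted.

```c
#include <pthread.h>
#include <stdlib.h>

#define CACHE_MAX 4                     /* toy analog of vm_object_cached_max */

struct obj {
    int         ref_count;
    int         can_persist;            /* may linger with zero references */
    struct obj *cache_next;             /* link while sitting in the cache */
};

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj     *cache_head, *cache_tail;
static int             cache_count;

static void obj_destroy(struct obj *o) { free(o); }

/*
 * Drop a reference.  An object that may persist is parked on the cache
 * instead of being destroyed, and the cache is then trimmed back to
 * CACHE_MAX by destroying its oldest entries.
 */
static void obj_deref(struct obj *o)
{
    pthread_mutex_lock(&cache_lock);
    if (--o->ref_count > 0) {
        pthread_mutex_unlock(&cache_lock);
        return;
    }
    if (!o->can_persist) {
        pthread_mutex_unlock(&cache_lock);
        obj_destroy(o);
        return;
    }
    o->cache_next = NULL;                       /* append at the tail */
    if (cache_tail != NULL) cache_tail->cache_next = o;
    else                    cache_head = o;
    cache_tail = o;
    cache_count++;

    while (cache_count > CACHE_MAX) {           /* evict the oldest entries */
        struct obj *victim = cache_head;
        cache_head = victim->cache_next;
        if (cache_head == NULL)
            cache_tail = NULL;
        cache_count--;
        pthread_mutex_unlock(&cache_lock);
        obj_destroy(victim);                    /* destroy outside the lock */
        pthread_mutex_lock(&cache_lock);
    }
    pthread_mutex_unlock(&cache_lock);
}
```

The point of the design is that a persistable object keeps its pages resident while unreferenced, so a later mapping of the same memory object can be satisfied without going back to the memory manager.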
+ */ +void vm_object_bootstrap(void) +{ + vm_object_zone = zinit((vm_size_t) sizeof(struct vm_object), + round_page(512*1024), + round_page(12*1024), + 0, "objects"); + + queue_init(&vm_object_cached_list); + simple_lock_init(&vm_object_cached_lock_data); + + /* + * Fill in a template object, for quick initialization + */ + + vm_object_template = (vm_object_t) zalloc(vm_object_zone); + bzero((char *) vm_object_template, sizeof *vm_object_template); + + vm_object_template->ref_count = 1; + vm_object_template->size = 0; + vm_object_template->resident_page_count = 0; + vm_object_template->copy = VM_OBJECT_NULL; + vm_object_template->shadow = VM_OBJECT_NULL; + vm_object_template->shadow_offset = (vm_offset_t) 0; + + vm_object_template->pager = IP_NULL; + vm_object_template->paging_offset = 0; + vm_object_template->pager_request = PAGER_REQUEST_NULL; + vm_object_template->pager_name = IP_NULL; + + vm_object_template->pager_created = FALSE; + vm_object_template->pager_initialized = FALSE; + vm_object_template->pager_ready = FALSE; + + vm_object_template->copy_strategy = MEMORY_OBJECT_COPY_NONE; + /* ignored if temporary, will be reset before + * permanent object becomes ready */ + vm_object_template->use_shared_copy = FALSE; + vm_object_template->shadowed = FALSE; + + vm_object_template->absent_count = 0; + vm_object_template->all_wanted = 0; /* all bits FALSE */ + + vm_object_template->paging_in_progress = 0; + vm_object_template->can_persist = FALSE; + vm_object_template->internal = TRUE; + vm_object_template->temporary = TRUE; + vm_object_template->alive = TRUE; + vm_object_template->lock_in_progress = FALSE; + vm_object_template->lock_restart = FALSE; + vm_object_template->use_old_pageout = TRUE; /* XXX change later */ + vm_object_template->last_alloc = (vm_offset_t) 0; + +#if MACH_PAGEMAP + vm_object_template->existence_info = VM_EXTERNAL_NULL; +#endif /* MACH_PAGEMAP */ + + /* + * Initialize the "kernel object" + */ + + kernel_object = _vm_object_allocate( + VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS); + + /* + * Initialize the "submap object". Make it as large as the + * kernel object so that no limit is imposed on submap sizes. + */ + + vm_submap_object = _vm_object_allocate( + VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS); + +#if MACH_PAGEMAP + vm_external_module_initialize(); +#endif /* MACH_PAGEMAP */ +} + +void vm_object_init(void) +{ +#if !NORMA_VM + /* + * Finish initializing the kernel object. + * The submap object doesn't need a name port. + */ + + kernel_object->pager_name = ipc_port_alloc_kernel(); + ipc_kobject_set(kernel_object->pager_name, + (ipc_kobject_t) kernel_object, + IKOT_PAGING_NAME); +#endif /* !NORMA_VM */ +} + +/* + * vm_object_reference: + * + * Gets another reference to the given object. + */ +void vm_object_reference( + register vm_object_t object) +{ + if (object == VM_OBJECT_NULL) + return; + + vm_object_lock(object); + assert(object->ref_count > 0); + object->ref_count++; + vm_object_unlock(object); +} + +/* + * vm_object_deallocate: + * + * Release a reference to the specified object, + * gained either through a vm_object_allocate + * or a vm_object_reference call. When all references + * are gone, storage associated with this object + * may be relinquished. + * + * No object may be locked. + */ +void vm_object_deallocate( + register vm_object_t object) +{ + vm_object_t temp; + + while (object != VM_OBJECT_NULL) { + + /* + * The cache holds a reference (uncounted) to + * the object; we must lock it before removing + * the object. 
+ */ + + vm_object_cache_lock(); + + /* + * Lose the reference + */ + vm_object_lock(object); + if (--(object->ref_count) > 0) { + + /* + * If there are still references, then + * we are done. + */ + vm_object_unlock(object); + vm_object_cache_unlock(); + return; + } + + /* + * See whether this object can persist. If so, enter + * it in the cache, then deactivate all of its + * pages. + */ + if (object->can_persist) { + boolean_t overflow; + + /* + * Enter the object onto the queue + * of "cached" objects. Remember whether + * we've caused the queue to overflow, + * as a hint. + */ + + queue_enter(&vm_object_cached_list, object, + vm_object_t, cached_list); + overflow = (++vm_object_cached_count > vm_object_cached_max); + vm_object_cache_unlock(); + + vm_object_deactivate_pages(object); + vm_object_unlock(object); + + /* + * If we didn't overflow, or if the queue has + * been reduced back to below the specified + * minimum, then quit. + */ + if (!overflow) + return; + + while (TRUE) { + vm_object_cache_lock(); + if (vm_object_cached_count <= + vm_object_cached_max) { + vm_object_cache_unlock(); + return; + } + + /* + * If we must trim down the queue, take + * the first object, and proceed to + * terminate it instead of the original + * object. Have to wait for pager init. + * if it's in progress. + */ + object= (vm_object_t) + queue_first(&vm_object_cached_list); + vm_object_lock(object); + + if (!(object->pager_created && + !object->pager_initialized)) { + + /* + * Ok to terminate, hang on to lock. + */ + break; + } + + vm_object_assert_wait(object, + VM_OBJECT_EVENT_INITIALIZED, FALSE); + vm_object_unlock(object); + vm_object_cache_unlock(); + thread_block((void (*)()) 0); + + /* + * Continue loop to check if cache still + * needs to be trimmed. + */ + } + + /* + * Actually remove object from cache. + */ + + queue_remove(&vm_object_cached_list, object, + vm_object_t, cached_list); + vm_object_cached_count--; + + assert(object->ref_count == 0); + } + else { + if (object->pager_created && + !object->pager_initialized) { + + /* + * Have to wait for initialization. + * Put reference back and retry + * when it's initialized. + */ + object->ref_count++; + vm_object_assert_wait(object, + VM_OBJECT_EVENT_INITIALIZED, FALSE); + vm_object_unlock(object); + vm_object_cache_unlock(); + thread_block((void (*)()) 0); + continue; + } + } + + /* + * Take the reference to the shadow object + * out of the object to be destroyed. + */ + + temp = object->shadow; + + /* + * Destroy the object; the cache lock will + * be released in the process. + */ + + vm_object_terminate(object); + + /* + * Deallocate the reference to the shadow + * by continuing the loop with that object + * in place of the original. + */ + + object = temp; + } +} + +boolean_t vm_object_terminate_remove_all = FALSE; + +/* + * Routine: vm_object_terminate + * Purpose: + * Free all resources associated with a vm_object. + * In/out conditions: + * Upon entry, the object and the cache must be locked, + * and the object must have no references. + * + * The shadow object reference is left alone. + * + * Upon exit, the cache will be unlocked, and the + * object will cease to exist. + */ +void vm_object_terminate( + register vm_object_t object) +{ + register vm_page_t p; + vm_object_t shadow_object; + + /* + * Make sure the object isn't already being terminated + */ + + assert(object->alive); + object->alive = FALSE; + + /* + * Make sure no one can look us up now. 
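Note how `vm_object_deallocate` avoids recursion: after terminating an object it simply continues its loop with the shadow reference it inherited. A tiny sketch of that pattern, with invented `node`/`node_deref` names and plain `free` standing in for `vm_object_terminate`:

```c
#include <stdlib.h>

struct node {
    int          ref_count;
    struct node *shadow;        /* reference this node holds on the next one */
};

/*
 * Release a reference.  When the last one goes away, free the node and
 * continue with its shadow in a loop rather than recursing, so that a
 * long shadow chain cannot eat an unbounded amount of stack.
 */
static void node_deref(struct node *n)
{
    while (n != NULL) {
        if (--n->ref_count > 0)
            return;
        struct node *shadow = n->shadow;   /* take the shadow ref out    */
        free(n);                           /* "terminate" this node      */
        n = shadow;                        /* ...then drop that ref too  */
    }
}
```

A recursive formulation would consume one stack frame per link of a possibly long shadow chain, which is not acceptable on a kernel stack.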
+ */ + + vm_object_remove(object); + vm_object_cache_unlock(); + + /* + * Detach the object from its shadow if we are the shadow's + * copy. + */ + if ((shadow_object = object->shadow) != VM_OBJECT_NULL) { + vm_object_lock(shadow_object); + assert((shadow_object->copy == object) || + (shadow_object->copy == VM_OBJECT_NULL)); + shadow_object->copy = VM_OBJECT_NULL; + vm_object_unlock(shadow_object); + } + + /* + * The pageout daemon might be playing with our pages. + * Now that the object is dead, it won't touch any more + * pages, but some pages might already be on their way out. + * Hence, we wait until the active paging activities have ceased. + */ + + vm_object_paging_wait(object, FALSE); + + /* + * Clean or free the pages, as appropriate. + * It is possible for us to find busy/absent pages, + * if some faults on this object were aborted. + */ + + if ((object->temporary) || (object->pager == IP_NULL)) { + while (!queue_empty(&object->memq)) { + p = (vm_page_t) queue_first(&object->memq); + + VM_PAGE_CHECK(p); + + if (p->busy && !p->absent) + panic("vm_object_terminate.2 0x%x 0x%x", + object, p); + + VM_PAGE_FREE(p); + } + } else while (!queue_empty(&object->memq)) { + p = (vm_page_t) queue_first(&object->memq); + + VM_PAGE_CHECK(p); + + if (p->busy && !p->absent) + panic("vm_object_terminate.3 0x%x 0x%x", object, p); + + vm_page_lock_queues(); + VM_PAGE_QUEUES_REMOVE(p); + vm_page_unlock_queues(); + + if (p->absent || p->private) { + + /* + * For private pages, VM_PAGE_FREE just + * leaves the page structure around for + * its owner to clean up. For absent + * pages, the structure is returned to + * the appropriate pool. + */ + + goto free_page; + } + + if (p->fictitious) + panic("vm_object_terminate.4 0x%x 0x%x", object, p); + + if (!p->dirty) + p->dirty = pmap_is_modified(p->phys_addr); + + if (p->dirty || p->precious) { + p->busy = TRUE; + vm_pageout_page(p, FALSE, TRUE); /* flush page */ + } else { + free_page: + VM_PAGE_FREE(p); + } + } + + assert(object->ref_count == 0); + assert(object->paging_in_progress == 0); + + /* + * Throw away port rights... note that they may + * already have been thrown away (by vm_object_destroy + * or memory_object_destroy). + * + * Instead of destroying the control and name ports, + * we send all rights off to the memory manager instead, + * using memory_object_terminate. + */ + + vm_object_unlock(object); + + if (object->pager != IP_NULL) { + /* consumes our rights for pager, pager_request, pager_name */ + memory_object_release(object->pager, + object->pager_request, + object->pager_name); + } else if (object->pager_name != IP_NULL) { + /* consumes our right for pager_name */ +#if NORMA_VM + ipc_port_release_send(object->pager_name); +#else /* NORMA_VM */ + ipc_port_dealloc_kernel(object->pager_name); +#endif /* NORMA_VM */ + } + +#if MACH_PAGEMAP + vm_external_destroy(object->existence_info); +#endif /* MACH_PAGEMAP */ + + /* + * Free the space for the object. + */ + + zfree(vm_object_zone, (vm_offset_t) object); +} + +/* + * Routine: vm_object_pager_wakeup + * Purpose: Wake up anyone waiting for IKOT_PAGER_TERMINATING + */ + +void +vm_object_pager_wakeup( + ipc_port_t pager) +{ + boolean_t someone_waiting; + + /* + * If anyone was waiting for the memory_object_terminate + * to be queued, wake them up now. 
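The two page loops in `vm_object_terminate` boil down to a small disposition rule. The sketch below restates it with invented names (`tpage`, `terminate_disposition`); in the kernel, `PAGE_FLUSH` corresponds to `vm_pageout_page(p, FALSE, TRUE)` and `PAGE_FREE` to `VM_PAGE_FREE(p)`.

```c
struct tpage {
    int absent;     /* placeholder with no data yet                   */
    int private;    /* page structure owned elsewhere; just unlink it */
    int dirty;      /* modified since last written back               */
    int precious;   /* manager wants the data back even when clean    */
};

enum disposition { PAGE_FREE, PAGE_FLUSH };

/*
 * Objects that are temporary or have no pager simply free everything;
 * otherwise absent and private pages are freed, and a page is pushed
 * back to the memory manager only if it is dirty or precious.
 */
static enum disposition
terminate_disposition(const struct tpage *p, int temporary, int has_pager)
{
    if (temporary || !has_pager)
        return PAGE_FREE;
    if (p->absent || p->private)
        return PAGE_FREE;
    return (p->dirty || p->precious) ? PAGE_FLUSH : PAGE_FREE;
}
```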
+ */ + vm_object_cache_lock(); + assert(ip_kotype(pager) == IKOT_PAGER_TERMINATING); + someone_waiting = (pager->ip_kobject != IKO_NULL); + if (ip_active(pager)) + ipc_kobject_set(pager, IKO_NULL, IKOT_NONE); + vm_object_cache_unlock(); + if (someone_waiting) { + thread_wakeup((event_t) pager); + } +} + +/* + * Routine: memory_object_release + * Purpose: Terminate the pager and release port rights, + * just like memory_object_terminate, except + * that we wake up anyone blocked in vm_object_enter + * waiting for termination message to be queued + * before calling memory_object_init. + */ +void memory_object_release( + ipc_port_t pager, + pager_request_t pager_request, + ipc_port_t pager_name) +{ + + /* + * Keep a reference to pager port; + * the terminate might otherwise release all references. + */ + ip_reference(pager); + + /* + * Terminate the pager. + */ + (void) memory_object_terminate(pager, pager_request, pager_name); + + /* + * Wakeup anyone waiting for this terminate + */ + vm_object_pager_wakeup(pager); + + /* + * Release reference to pager port. + */ + ip_release(pager); +} + +/* + * Routine: vm_object_abort_activity [internal use only] + * Purpose: + * Abort paging requests pending on this object. + * In/out conditions: + * The object is locked on entry and exit. + */ +void vm_object_abort_activity( + vm_object_t object) +{ + register + vm_page_t p; + vm_page_t next; + + /* + * Abort all activity that would be waiting + * for a result on this memory object. + * + * We could also choose to destroy all pages + * that we have in memory for this object, but + * we don't. + */ + + p = (vm_page_t) queue_first(&object->memq); + while (!queue_end(&object->memq, (queue_entry_t) p)) { + next = (vm_page_t) queue_next(&p->listq); + + /* + * If it's being paged in, destroy it. + * If an unlock has been requested, start it again. + */ + + if (p->busy && p->absent) { + VM_PAGE_FREE(p); + } + else { + if (p->unlock_request != VM_PROT_NONE) + p->unlock_request = VM_PROT_NONE; + PAGE_WAKEUP(p); + } + + p = next; + } + + /* + * Wake up threads waiting for the memory object to + * become ready. + */ + + object->pager_ready = TRUE; + vm_object_wakeup(object, VM_OBJECT_EVENT_PAGER_READY); +} + +/* + * Routine: memory_object_destroy [user interface] + * Purpose: + * Shut down a memory object, despite the + * presence of address map (or other) references + * to the vm_object. + * Note: + * This routine may be called either from the user interface, + * or from port destruction handling (via vm_object_destroy). + */ +kern_return_t memory_object_destroy( + register + vm_object_t object, + kern_return_t reason) +{ + ipc_port_t old_object, old_name; + pager_request_t old_control; + +#ifdef lint + reason++; +#endif /* lint */ + + if (object == VM_OBJECT_NULL) + return KERN_SUCCESS; + + /* + * Remove the port associations immediately. + * + * This will prevent the memory manager from further + * meddling. [If it wanted to flush data or make + * other changes, it should have done so before performing + * the destroy call.] + */ + + vm_object_cache_lock(); + vm_object_lock(object); + vm_object_remove(object); + object->can_persist = FALSE; + vm_object_cache_unlock(); + + /* + * Rip out the ports from the vm_object now... this + * will prevent new memory_object calls from succeeding. 
+ */ + + old_object = object->pager; + object->pager = IP_NULL; + + old_control = object->pager_request; + object->pager_request = PAGER_REQUEST_NULL; + + old_name = object->pager_name; + object->pager_name = IP_NULL; + + + /* + * Wait for existing paging activity (that might + * have the old ports) to subside. + */ + + vm_object_paging_wait(object, FALSE); + vm_object_unlock(object); + + /* + * Shut down the ports now. + * + * [Paging operations may be proceeding concurrently -- + * they'll get the null values established above.] + */ + + if (old_object != IP_NULL) { + /* consumes our rights for object, control, name */ + memory_object_release(old_object, old_control, + old_name); + } else if (old_name != IP_NULL) { + /* consumes our right for name */ +#if NORMA_VM + ipc_port_release_send(object->pager_name); +#else /* NORMA_VM */ + ipc_port_dealloc_kernel(object->pager_name); +#endif /* NORMA_VM */ + } + + /* + * Lose the reference that was donated for this routine + */ + + vm_object_deallocate(object); + + return KERN_SUCCESS; +} + +/* + * vm_object_deactivate_pages + * + * Deactivate all pages in the specified object. (Keep its pages + * in memory even though it is no longer referenced.) + * + * The object must be locked. + */ +void vm_object_deactivate_pages( + register vm_object_t object) +{ + register vm_page_t p; + + queue_iterate(&object->memq, p, vm_page_t, listq) { + vm_page_lock_queues(); + if (!p->busy) + vm_page_deactivate(p); + vm_page_unlock_queues(); + } +} + + +/* + * Routine: vm_object_pmap_protect + * + * Purpose: + * Reduces the permission for all physical + * pages in the specified object range. + * + * If removing write permission only, it is + * sufficient to protect only the pages in + * the top-level object; only those pages may + * have write permission. + * + * If removing all access, we must follow the + * shadow chain from the top-level object to + * remove access to all pages in shadowed objects. + * + * The object must *not* be locked. The object must + * be temporary/internal. + * + * If pmap is not NULL, this routine assumes that + * the only mappings for the pages are in that + * pmap. + */ +boolean_t vm_object_pmap_protect_by_page = FALSE; + +void vm_object_pmap_protect( + register vm_object_t object, + register vm_offset_t offset, + vm_offset_t size, + pmap_t pmap, + vm_offset_t pmap_start, + vm_prot_t prot) +{ + if (object == VM_OBJECT_NULL) + return; + + vm_object_lock(object); + + assert(object->temporary && object->internal); + + while (TRUE) { + if (object->resident_page_count > atop(size) / 2 && + pmap != PMAP_NULL) { + vm_object_unlock(object); + pmap_protect(pmap, pmap_start, pmap_start + size, prot); + return; + } + + { + register vm_page_t p; + register vm_offset_t end; + + end = offset + size; + + queue_iterate(&object->memq, p, vm_page_t, listq) { + if (!p->fictitious && + (offset <= p->offset) && + (p->offset < end)) { + if ((pmap == PMAP_NULL) || + vm_object_pmap_protect_by_page) { + pmap_page_protect(p->phys_addr, + prot & ~p->page_lock); + } else { + vm_offset_t start = + pmap_start + + (p->offset - offset); + + pmap_protect(pmap, + start, + start + PAGE_SIZE, + prot); + } + } + } + } + + if (prot == VM_PROT_NONE) { + /* + * Must follow shadow chain to remove access + * to pages in shadowed objects. 
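`vm_object_pmap_protect` chooses between one bulk `pmap_protect` call and a per-page walk of the resident pages. A small stand-alone model of that decision follows; the names are invented and `atop` is redefined locally for a 4 KB page size.

```c
#include <stdio.h>

#define PAGE_SHIFT 12
#define atop(x)    ((unsigned long)(x) >> PAGE_SHIFT)   /* bytes -> pages */

/*
 * Decision mirrored from vm_object_pmap_protect(): if the single pmap
 * mapping these pages is known and more than half of the pages in the
 * range are resident, one bulk pmap_protect() over the whole range beats
 * a per-page pmap_page_protect() walk; otherwise walk the resident pages.
 */
static int use_bulk_protect(unsigned long resident_page_count,
                            unsigned long size_bytes,
                            int have_single_pmap)
{
    return have_single_pmap && resident_page_count > atop(size_bytes) / 2;
}

int main(void)
{
    /* 256 KB range (64 pages), 40 resident, single known pmap: go bulk */
    printf("%s\n", use_bulk_protect(40, 256 * 1024, 1) ? "bulk" : "per-page");
    /* same range, only 10 pages resident: touch them one at a time */
    printf("%s\n", use_bulk_protect(10, 256 * 1024, 1) ? "bulk" : "per-page");
    return 0;
}
```

Walking resident pages wins when the object is sparse or when several pmaps may map the pages; the bulk call wins when most of the range is resident in the one pmap the caller knows about.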
+ */ + register vm_object_t next_object; + + next_object = object->shadow; + if (next_object != VM_OBJECT_NULL) { + offset += object->shadow_offset; + vm_object_lock(next_object); + vm_object_unlock(object); + object = next_object; + } + else { + /* + * End of chain - we are done. + */ + break; + } + } + else { + /* + * Pages in shadowed objects may never have + * write permission - we may stop here. + */ + break; + } + } + + vm_object_unlock(object); +} + +/* + * vm_object_pmap_remove: + * + * Removes all physical pages in the specified + * object range from all physical maps. + * + * The object must *not* be locked. + */ +void vm_object_pmap_remove( + register vm_object_t object, + register vm_offset_t start, + register vm_offset_t end) +{ + register vm_page_t p; + + if (object == VM_OBJECT_NULL) + return; + + vm_object_lock(object); + queue_iterate(&object->memq, p, vm_page_t, listq) { + if (!p->fictitious && + (start <= p->offset) && + (p->offset < end)) + pmap_page_protect(p->phys_addr, VM_PROT_NONE); + } + vm_object_unlock(object); +} + +/* + * Routine: vm_object_copy_slowly + * + * Description: + * Copy the specified range of the source + * virtual memory object without using + * protection-based optimizations (such + * as copy-on-write). The pages in the + * region are actually copied. + * + * In/out conditions: + * The caller must hold a reference and a lock + * for the source virtual memory object. The source + * object will be returned *unlocked*. + * + * Results: + * If the copy is completed successfully, KERN_SUCCESS is + * returned. If the caller asserted the interruptible + * argument, and an interruption occurred while waiting + * for a user-generated event, MACH_SEND_INTERRUPTED is + * returned. Other values may be returned to indicate + * hard errors during the copy operation. + * + * A new virtual memory object is returned in a + * parameter (_result_object). The contents of this + * new object, starting at a zero offset, are a copy + * of the source memory region. In the event of + * an error, this parameter will contain the value + * VM_OBJECT_NULL. + */ +kern_return_t vm_object_copy_slowly( + register + vm_object_t src_object, + vm_offset_t src_offset, + vm_size_t size, + boolean_t interruptible, + vm_object_t *_result_object) /* OUT */ +{ + vm_object_t new_object; + vm_offset_t new_offset; + + if (size == 0) { + vm_object_unlock(src_object); + *_result_object = VM_OBJECT_NULL; + return KERN_INVALID_ARGUMENT; + } + + /* + * Prevent destruction of the source object while we copy. + */ + + assert(src_object->ref_count > 0); + src_object->ref_count++; + vm_object_unlock(src_object); + + /* + * Create a new object to hold the copied pages. + * A few notes: + * We fill the new object starting at offset 0, + * regardless of the input offset. + * We don't bother to lock the new object within + * this routine, since we have the only reference. + */ + + new_object = vm_object_allocate(size); + new_offset = 0; + + assert(size == trunc_page(size)); /* Will the loop terminate? 
*/ + + for ( ; + size != 0 ; + src_offset += PAGE_SIZE, new_offset += PAGE_SIZE, size -= PAGE_SIZE + ) { + vm_page_t new_page; + vm_fault_return_t result; + + while ((new_page = vm_page_alloc(new_object, new_offset)) + == VM_PAGE_NULL) { + VM_PAGE_WAIT((void (*)()) 0); + } + + do { + vm_prot_t prot = VM_PROT_READ; + vm_page_t _result_page; + vm_page_t top_page; + register + vm_page_t result_page; + + vm_object_lock(src_object); + src_object->paging_in_progress++; + + result = vm_fault_page(src_object, src_offset, + VM_PROT_READ, FALSE, interruptible, + &prot, &_result_page, &top_page, + FALSE, (void (*)()) 0); + + switch(result) { + case VM_FAULT_SUCCESS: + result_page = _result_page; + + /* + * We don't need to hold the object + * lock -- the busy page will be enough. + * [We don't care about picking up any + * new modifications.] + * + * Copy the page to the new object. + * + * POLICY DECISION: + * If result_page is clean, + * we could steal it instead + * of copying. + */ + + vm_object_unlock(result_page->object); + vm_page_copy(result_page, new_page); + + /* + * Let go of both pages (make them + * not busy, perform wakeup, activate). + */ + + new_page->busy = FALSE; + new_page->dirty = TRUE; + vm_object_lock(result_page->object); + PAGE_WAKEUP_DONE(result_page); + + vm_page_lock_queues(); + if (!result_page->active && + !result_page->inactive) + vm_page_activate(result_page); + vm_page_activate(new_page); + vm_page_unlock_queues(); + + /* + * Release paging references and + * top-level placeholder page, if any. + */ + + vm_fault_cleanup(result_page->object, + top_page); + + break; + + case VM_FAULT_RETRY: + break; + + case VM_FAULT_MEMORY_SHORTAGE: + VM_PAGE_WAIT((void (*)()) 0); + break; + + case VM_FAULT_FICTITIOUS_SHORTAGE: + vm_page_more_fictitious(); + break; + + case VM_FAULT_INTERRUPTED: + vm_page_free(new_page); + vm_object_deallocate(new_object); + vm_object_deallocate(src_object); + *_result_object = VM_OBJECT_NULL; + return MACH_SEND_INTERRUPTED; + + case VM_FAULT_MEMORY_ERROR: + /* + * A policy choice: + * (a) ignore pages that we can't + * copy + * (b) return the null object if + * any page fails [chosen] + */ + + vm_page_free(new_page); + vm_object_deallocate(new_object); + vm_object_deallocate(src_object); + *_result_object = VM_OBJECT_NULL; + return KERN_MEMORY_ERROR; + } + } while (result != VM_FAULT_SUCCESS); + } + + /* + * Lose the extra reference, and return our object. + */ + + vm_object_deallocate(src_object); + *_result_object = new_object; + return KERN_SUCCESS; +} + +/* + * Routine: vm_object_copy_temporary + * + * Purpose: + * Copy the specified range of the source virtual + * memory object, if it can be done without blocking. + * + * Results: + * If the copy is successful, the copy is returned in + * the arguments; otherwise, the arguments are not + * affected. + * + * In/out conditions: + * The object should be unlocked on entry and exit. + */ + +vm_object_t vm_object_copy_delayed(); /* forward declaration */ + +boolean_t vm_object_copy_temporary( + vm_object_t *_object, /* INOUT */ + vm_offset_t *_offset, /* INOUT */ + boolean_t *_src_needs_copy, /* OUT */ + boolean_t *_dst_needs_copy) /* OUT */ +{ + vm_object_t object = *_object; + +#ifdef lint + ++*_offset; +#endif /* lint */ + + if (object == VM_OBJECT_NULL) { + *_src_needs_copy = FALSE; + *_dst_needs_copy = FALSE; + return TRUE; + } + + /* + * If the object is temporary, we can perform + * a symmetric copy-on-write without asking. 
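The inner `do { ... } while (result != VM_FAULT_SUCCESS)` loop above is a standard pattern for consuming `vm_fault_page` results. Here is a self-contained sketch of that shape; `fake_fault` and `wait_for_memory` are invented stand-ins for `vm_fault_page` and `VM_PAGE_WAIT`, and the fictitious-page-shortage case is folded away to keep it short.

```c
#include <stdio.h>

enum fault_result {                    /* rough analog of vm_fault_return_t  */
    FAULT_SUCCESS,
    FAULT_RETRY,                       /* transient: just try again          */
    FAULT_MEMORY_SHORTAGE,             /* wait for free pages, then retry    */
    FAULT_INTERRUPTED,                 /* caller asked for interruptibility  */
    FAULT_MEMORY_ERROR                 /* hard error: abort the whole copy   */
};

/* toy stand-in for vm_fault_page(): succeed on the third attempt */
static enum fault_result fake_fault(unsigned long offset, int attempt)
{
    (void)offset;
    return attempt < 2 ? FAULT_RETRY : FAULT_SUCCESS;
}

static void wait_for_memory(void) { /* VM_PAGE_WAIT() analog; no-op here */ }

/*
 * Keep faulting the source page in until it is available to copy, and
 * translate the two fatal outcomes into error codes for the caller.
 */
static int copy_one_page(unsigned long offset)
{
    for (int attempt = 0; ; attempt++) {
        switch (fake_fault(offset, attempt)) {
        case FAULT_SUCCESS:
            /* ...copy the page, mark the copy dirty, wake waiters... */
            return 0;
        case FAULT_RETRY:
            break;                     /* loop and fault again */
        case FAULT_MEMORY_SHORTAGE:
            wait_for_memory();
            break;
        case FAULT_INTERRUPTED:
            return -1;                 /* caller sees MACH_SEND_INTERRUPTED */
        case FAULT_MEMORY_ERROR:
            return -2;                 /* caller sees KERN_MEMORY_ERROR     */
        }
    }
}

int main(void)
{
    printf("copy_one_page -> %d\n", copy_one_page(0));
    return 0;
}
```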
+ */ + + vm_object_lock(object); + if (object->temporary) { + + /* + * Shared objects use delayed copy + */ + if (object->use_shared_copy) { + + /* + * Asymmetric copy strategy. Destination + * must be copied (to allow copy object reuse). + * Source is unaffected. + */ + vm_object_unlock(object); + object = vm_object_copy_delayed(object); + *_object = object; + *_src_needs_copy = FALSE; + *_dst_needs_copy = TRUE; + return TRUE; + } + + /* + * Make another reference to the object. + * + * Leave object/offset unchanged. + */ + + assert(object->ref_count > 0); + object->ref_count++; + object->shadowed = TRUE; + vm_object_unlock(object); + + /* + * Both source and destination must make + * shadows, and the source must be made + * read-only if not already. + */ + + *_src_needs_copy = TRUE; + *_dst_needs_copy = TRUE; + return TRUE; + } + + if (object->pager_ready && + (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY)) { + /* XXX Do something intelligent (see temporary code above) */ + } + vm_object_unlock(object); + + return FALSE; +} + +/* + * Routine: vm_object_copy_call [internal] + * + * Description: + * Copy the specified (src_offset, size) portion + * of the source object (src_object), using the + * user-managed copy algorithm. + * + * In/out conditions: + * The source object must be locked on entry. It + * will be *unlocked* on exit. + * + * Results: + * If the copy is successful, KERN_SUCCESS is returned. + * This routine is interruptible; if a wait for + * a user-generated event is interrupted, MACH_SEND_INTERRUPTED + * is returned. Other return values indicate hard errors + * in creating the user-managed memory object for the copy. + * + * A new object that represents the copied virtual + * memory is returned in a parameter (*_result_object). + * If the return value indicates an error, this parameter + * is not valid. + */ +kern_return_t vm_object_copy_call( + vm_object_t src_object, + vm_offset_t src_offset, + vm_size_t size, + vm_object_t *_result_object) /* OUT */ +{ + vm_offset_t src_end = src_offset + size; + ipc_port_t new_memory_object; + vm_object_t new_object; + vm_page_t p; + + /* + * Set the backing object for the new + * temporary object. + */ + + assert(src_object->ref_count > 0); + src_object->ref_count++; + vm_object_paging_begin(src_object); + vm_object_unlock(src_object); + + /* + * Create a memory object port to be associated + * with this new vm_object. + * + * Since the kernel has the only rights to this + * port, we need not hold the cache lock. + * + * Since we have the only object reference, we + * need not be worried about collapse operations. + * + */ + + new_memory_object = ipc_port_alloc_kernel(); + if (new_memory_object == IP_NULL) { + panic("vm_object_copy_call: allocate memory object port"); + /* XXX Shouldn't panic here. */ + } + + /* we hold a naked receive right for new_memory_object */ + (void) ipc_port_make_send(new_memory_object); + /* now we also hold a naked send right for new_memory_object */ + + /* + * Let the memory manager know that a copy operation + * is in progress. Note that we're using the old + * memory object's ports (for which we're holding + * a paging reference)... the memory manager cannot + * yet affect the new memory object. 
+ */ + + (void) memory_object_copy(src_object->pager, + src_object->pager_request, + src_offset, size, + new_memory_object); + /* no longer hold the naked receive right for new_memory_object */ + + vm_object_lock(src_object); + vm_object_paging_end(src_object); + + /* + * Remove write access from all of the pages of + * the old memory object that we can. + */ + + queue_iterate(&src_object->memq, p, vm_page_t, listq) { + if (!p->fictitious && + (src_offset <= p->offset) && + (p->offset < src_end) && + !(p->page_lock & VM_PROT_WRITE)) { + p->page_lock |= VM_PROT_WRITE; + pmap_page_protect(p->phys_addr, VM_PROT_ALL & ~p->page_lock); + } + } + + vm_object_unlock(src_object); + + /* + * Initialize the rest of the paging stuff + */ + + new_object = vm_object_enter(new_memory_object, size, FALSE); + new_object->shadow = src_object; + new_object->shadow_offset = src_offset; + + /* + * Drop the reference for new_memory_object taken above. + */ + + ipc_port_release_send(new_memory_object); + /* no longer hold the naked send right for new_memory_object */ + + *_result_object = new_object; + return KERN_SUCCESS; +} + +/* + * Routine: vm_object_copy_delayed [internal] + * + * Description: + * Copy the specified virtual memory object, using + * the asymmetric copy-on-write algorithm. + * + * In/out conditions: + * The object must be unlocked on entry. + * + * This routine will not block waiting for user-generated + * events. It is not interruptible. + */ +vm_object_t vm_object_copy_delayed( + vm_object_t src_object) +{ + vm_object_t new_copy; + vm_object_t old_copy; + vm_page_t p; + + /* + * The user-level memory manager wants to see + * all of the changes to this object, but it + * has promised not to make any changes on its own. + * + * Perform an asymmetric copy-on-write, as follows: + * Create a new object, called a "copy object" + * to hold pages modified by the new mapping + * (i.e., the copy, not the original mapping). + * Record the original object as the backing + * object for the copy object. If the + * original mapping does not change a page, + * it may be used read-only by the copy. + * Record the copy object in the original + * object. When the original mapping causes + * a page to be modified, it must be copied + * to a new page that is "pushed" to the + * copy object. + * Mark the new mapping (the copy object) + * copy-on-write. This makes the copy + * object itself read-only, allowing it + * to be reused if the original mapping + * makes no changes, and simplifying the + * synchronization required in the "push" + * operation described above. + * + * The copy-on-write is said to be assymetric because + * the original object is *not* marked copy-on-write. + * A copied page is pushed to the copy object, regardless + * which party attempted to modify the page. + * + * Repeated asymmetric copy operations may be done. + * If the original object has not been changed since + * the last copy, its copy object can be reused. + * Otherwise, a new copy object can be inserted + * between the original object and its previous + * copy object. Since any copy object is read-only, + * this cannot affect the contents of the previous copy + * object. + * + * Note that a copy object is higher in the object + * tree than the original object; therefore, use of + * the copy object recorded in the original object + * must be done carefully, to avoid deadlock. + */ + + /* + * Allocate a new copy object before locking, even + * though we may not need it later. 
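The long comment above describes the asymmetric copy-on-write scheme implemented by `vm_object_copy_delayed`. The following user-space toy (invented `cow_obj`, `original_write`, and `cow_read`, with 16-byte "pages") shows the essential invariant: a write to the original pushes the old contents into the copy object, so the copy continues to see the data as it was at copy time, while the original itself is never marked copy-on-write.

```c
#include <stdio.h>
#include <string.h>

#define NPG  4
#define PGSZ 16                        /* toy "page" of 16 bytes */

struct cow_obj {
    char pages[NPG][PGSZ];
    int  present[NPG];                 /* does this object hold the page? */
    struct cow_obj *backing;           /* copy object's backing = original */
};

/* Read through the copy: use a pushed page if present, else the original. */
static const char *cow_read(const struct cow_obj *o, int pg)
{
    return o->present[pg] ? o->pages[pg] : o->backing->pages[pg];
}

/*
 * Write to the ORIGINAL object: before modifying a page, push the old
 * contents into the copy object if it has not already received them.
 * Writes to the copy object itself would simply be made in place.
 */
static void original_write(struct cow_obj *orig, struct cow_obj *copy,
                           int pg, const char *data)
{
    if (copy != NULL && !copy->present[pg]) {
        memcpy(copy->pages[pg], orig->pages[pg], PGSZ);
        copy->present[pg] = 1;
    }
    memcpy(orig->pages[pg], data, PGSZ);
}

int main(void)
{
    struct cow_obj orig = {0}, copy = {0};
    copy.backing = &orig;
    memcpy(orig.pages[0], "original data..", PGSZ);

    original_write(&orig, &copy, 0, "modified data..!");
    printf("copy sees:     %.16s\n", cow_read(&copy, 0));   /* original data  */
    printf("original sees: %.16s\n", orig.pages[0]);        /* modified data  */
    return 0;
}
```

Because the copy object only ever receives pushed pages and is itself read-only, it can be reused by a later copy if the original has not changed in the meantime, which is exactly the `old_copy` reuse test in the code that follows.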
+ */ + + new_copy = vm_object_allocate(src_object->size); + + vm_object_lock(src_object); + + /* + * See whether we can reuse the result of a previous + * copy operation. + */ + Retry: + old_copy = src_object->copy; + if (old_copy != VM_OBJECT_NULL) { + /* + * Try to get the locks (out of order) + */ + if (!vm_object_lock_try(old_copy)) { + vm_object_unlock(src_object); + + simple_lock_pause(); /* wait a bit */ + + vm_object_lock(src_object); + goto Retry; + } + + /* + * Determine whether the old copy object has + * been modified. + */ + + if (old_copy->resident_page_count == 0 && + !old_copy->pager_created) { + /* + * It has not been modified. + * + * Return another reference to + * the existing copy-object. + */ + assert(old_copy->ref_count > 0); + old_copy->ref_count++; + vm_object_unlock(old_copy); + vm_object_unlock(src_object); + + vm_object_deallocate(new_copy); + + return old_copy; + } + + /* + * The copy-object is always made large enough to + * completely shadow the original object, since + * it may have several users who want to shadow + * the original object at different points. + */ + + assert((old_copy->shadow == src_object) && + (old_copy->shadow_offset == (vm_offset_t) 0)); + + /* + * Make the old copy-object shadow the new one. + * It will receive no more pages from the original + * object. + */ + + src_object->ref_count--; /* remove ref. from old_copy */ + assert(src_object->ref_count > 0); + old_copy->shadow = new_copy; + assert(new_copy->ref_count > 0); + new_copy->ref_count++; + vm_object_unlock(old_copy); /* done with old_copy */ + } + + /* + * Point the new copy at the existing object. + */ + + new_copy->shadow = src_object; + new_copy->shadow_offset = 0; + new_copy->shadowed = TRUE; /* caller must set needs_copy */ + assert(src_object->ref_count > 0); + src_object->ref_count++; + src_object->copy = new_copy; + + /* + * Mark all pages of the existing object copy-on-write. + * This object may have a shadow chain below it, but + * those pages will already be marked copy-on-write. + */ + + queue_iterate(&src_object->memq, p, vm_page_t, listq) { + if (!p->fictitious) + pmap_page_protect(p->phys_addr, + (VM_PROT_ALL & ~VM_PROT_WRITE & + ~p->page_lock)); + } + + vm_object_unlock(src_object); + + return new_copy; +} + +/* + * Routine: vm_object_copy_strategically + * + * Purpose: + * Perform a copy according to the source object's + * declared strategy. This operation may block, + * and may be interrupted. + */ +kern_return_t vm_object_copy_strategically( + register + vm_object_t src_object, + vm_offset_t src_offset, + vm_size_t size, + vm_object_t *dst_object, /* OUT */ + vm_offset_t *dst_offset, /* OUT */ + boolean_t *dst_needs_copy) /* OUT */ +{ + kern_return_t result = KERN_SUCCESS; /* to quiet gcc warnings */ + boolean_t interruptible = TRUE; /* XXX */ + + assert(src_object != VM_OBJECT_NULL); + + vm_object_lock(src_object); + + /* XXX assert(!src_object->temporary); JSB FIXME */ + + /* + * The copy strategy is only valid if the memory manager + * is "ready". + */ + + while (!src_object->pager_ready) { + vm_object_wait( src_object, + VM_OBJECT_EVENT_PAGER_READY, + interruptible); + if (interruptible && + (current_thread()->wait_result != THREAD_AWAKENED)) { + *dst_object = VM_OBJECT_NULL; + *dst_offset = 0; + *dst_needs_copy = FALSE; + return MACH_SEND_INTERRUPTED; + } + vm_object_lock(src_object); + } + + /* + * The object may be temporary (even though it is external). + * If so, do a symmetric copy. 
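The `Retry:` sequence above takes two locks out of their usual order by try-locking the second and backing off on failure. A pthreads sketch of the same discipline, assuming invented names (`lock_pair_out_of_order`), with `sched_yield` standing in for `simple_lock_pause`:

```c
#include <pthread.h>
#include <sched.h>

/*
 * Take mutex `a` and then `b` even though the established order is
 * b-before-a: only try-lock b, and on failure release a, yield, and
 * start over, just as the Retry: path releases the source object lock
 * when it cannot immediately get the old copy object's lock.
 */
static void lock_pair_out_of_order(pthread_mutex_t *a, pthread_mutex_t *b)
{
    for (;;) {
        pthread_mutex_lock(a);
        if (pthread_mutex_trylock(b) == 0)
            return;                 /* both held; caller unlocks b, then a  */
        pthread_mutex_unlock(a);    /* let b's holder make progress         */
        sched_yield();              /* analog of simple_lock_pause()        */
    }
}
```

Giving up the lock already held is what prevents the classic two-lock deadlock when another thread acquires the same pair in the opposite order.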
+ */ + + if (src_object->temporary) { + /* + * XXX + * This does not count as intelligent! + * This buys us the object->temporary optimizations, + * but we aren't using a symmetric copy, + * which may confuse the vm code. The correct thing + * to do here is to figure out what to call to get + * a temporary shadowing set up. + */ + src_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; + } + + /* + * The object is permanent. Use the appropriate copy strategy. + */ + + switch (src_object->copy_strategy) { + case MEMORY_OBJECT_COPY_NONE: + if ((result = vm_object_copy_slowly( + src_object, + src_offset, + size, + interruptible, + dst_object)) + == KERN_SUCCESS) { + *dst_offset = 0; + *dst_needs_copy = FALSE; + } + break; + + case MEMORY_OBJECT_COPY_CALL: + if ((result = vm_object_copy_call( + src_object, + src_offset, + size, + dst_object)) + == KERN_SUCCESS) { + *dst_offset = 0; + *dst_needs_copy = FALSE; + } + break; + + case MEMORY_OBJECT_COPY_DELAY: + vm_object_unlock(src_object); + *dst_object = vm_object_copy_delayed(src_object); + *dst_offset = src_offset; + *dst_needs_copy = TRUE; + + result = KERN_SUCCESS; + break; + } + + return result; +} + +/* + * vm_object_shadow: + * + * Create a new object which is backed by the + * specified existing object range. The source + * object reference is deallocated. + * + * The new object and offset into that object + * are returned in the source parameters. + */ + +void vm_object_shadow( + vm_object_t *object, /* IN/OUT */ + vm_offset_t *offset, /* IN/OUT */ + vm_size_t length) +{ + register vm_object_t source; + register vm_object_t result; + + source = *object; + + /* + * Allocate a new object with the given length + */ + + if ((result = vm_object_allocate(length)) == VM_OBJECT_NULL) + panic("vm_object_shadow: no object for shadowing"); + + /* + * The new object shadows the source object, adding + * a reference to it. Our caller changes his reference + * to point to the new object, removing a reference to + * the source object. Net result: no change of reference + * count. + */ + result->shadow = source; + + /* + * Store the offset into the source object, + * and fix up the offset into the new object. + */ + + result->shadow_offset = *offset; + + /* + * Return the new things + */ + + *offset = 0; + *object = result; +} + +/* + * The relationship between vm_object structures and + * the memory_object ports requires careful synchronization. + * + * All associations are created by vm_object_enter. All three + * port fields are filled in, as follows: + * pager: the memory_object port itself, supplied by + * the user requesting a mapping (or the kernel, + * when initializing internal objects); the + * kernel simulates holding send rights by keeping + * a port reference; + * pager_request: + * pager_name: + * the memory object control and name ports, + * created by the kernel; the kernel holds + * receive (and ownership) rights to these + * ports, but no other references. + * All of the ports are referenced by their global names. + * + * When initialization is complete, the "initialized" field + * is asserted. Other mappings using a particular memory object, + * and any references to the vm_object gained through the + * port association must wait for this initialization to occur. + * + * In order to allow the memory manager to set attributes before + * requests (notably virtual copy operations, but also data or + * unlock requests) are made, a "ready" attribute is made available. + * Only the memory manager may affect the value of this attribute. 
+ * Its value does not affect critical kernel functions, such as + * internal object initialization or destruction. [Furthermore, + * memory objects created by the kernel are assumed to be ready + * immediately; the default memory manager need not explicitly + * set the "ready" attribute.] + * + * [Both the "initialized" and "ready" attribute wait conditions + * use the "pager" field as the wait event.] + * + * The port associations can be broken down by any of the + * following routines: + * vm_object_terminate: + * No references to the vm_object remain, and + * the object cannot (or will not) be cached. + * This is the normal case, and is done even + * though one of the other cases has already been + * done. + * vm_object_destroy: + * The memory_object port has been destroyed, + * meaning that the kernel cannot flush dirty + * pages or request new data or unlock existing + * data. + * memory_object_destroy: + * The memory manager has requested that the + * kernel relinquish rights to the memory object + * port. [The memory manager may not want to + * destroy the port, but may wish to refuse or + * tear down existing memory mappings.] + * Each routine that breaks an association must break all of + * them at once. At some later time, that routine must clear + * the vm_object port fields and release the port rights. + * [Furthermore, each routine must cope with the simultaneous + * or previous operations of the others.] + * + * In addition to the lock on the object, the vm_object_cache_lock + * governs the port associations. References gained through the + * port association require use of the cache lock. + * + * Because the port fields may be cleared spontaneously, they + * cannot be used to determine whether a memory object has + * ever been associated with a particular vm_object. [This + * knowledge is important to the shadow object mechanism.] + * For this reason, an additional "created" attribute is + * provided. + * + * During various paging operations, the port values found in the + * vm_object must be valid. To prevent these port rights from being + * released, and to prevent the port associations from changing + * (other than being removed, i.e., made null), routines may use + * the vm_object_paging_begin/end routines [actually, macros]. + * The implementation uses the "paging_in_progress" and "wanted" fields. + * [Operations that alter the validity of the port values include the + * termination routines and vm_object_collapse.] 
+ */ + +vm_object_t vm_object_lookup( + ipc_port_t port) +{ + vm_object_t object = VM_OBJECT_NULL; + + if (IP_VALID(port)) { + ip_lock(port); + if (ip_active(port) && +#if NORMA_VM + (ip_kotype(port) == IKOT_PAGER)) { +#else /* NORMA_VM */ + (ip_kotype(port) == IKOT_PAGING_REQUEST)) { +#endif /* NORMA_VM */ + vm_object_cache_lock(); + object = (vm_object_t) port->ip_kobject; + vm_object_lock(object); + + assert(object->alive); + + if (object->ref_count == 0) { + queue_remove(&vm_object_cached_list, object, + vm_object_t, cached_list); + vm_object_cached_count--; + } + + object->ref_count++; + vm_object_unlock(object); + vm_object_cache_unlock(); + } + ip_unlock(port); + } + + return object; +} + +vm_object_t vm_object_lookup_name( + ipc_port_t port) +{ + vm_object_t object = VM_OBJECT_NULL; + + if (IP_VALID(port)) { + ip_lock(port); + if (ip_active(port) && + (ip_kotype(port) == IKOT_PAGING_NAME)) { + vm_object_cache_lock(); + object = (vm_object_t) port->ip_kobject; + vm_object_lock(object); + + assert(object->alive); + + if (object->ref_count == 0) { + queue_remove(&vm_object_cached_list, object, + vm_object_t, cached_list); + vm_object_cached_count--; + } + + object->ref_count++; + vm_object_unlock(object); + vm_object_cache_unlock(); + } + ip_unlock(port); + } + + return object; +} + +void vm_object_destroy( + ipc_port_t pager) +{ + vm_object_t object; + pager_request_t old_request; + ipc_port_t old_name; + + /* + * Perform essentially the same operations as in vm_object_lookup, + * except that this time we look up based on the memory_object + * port, not the control port. + */ + vm_object_cache_lock(); + if (ip_kotype(pager) != IKOT_PAGER) { + vm_object_cache_unlock(); + return; + } + + object = (vm_object_t) pager->ip_kobject; + vm_object_lock(object); + if (object->ref_count == 0) { + queue_remove(&vm_object_cached_list, object, + vm_object_t, cached_list); + vm_object_cached_count--; + } + object->ref_count++; + + object->can_persist = FALSE; + + assert(object->pager == pager); + + /* + * Remove the port associations. + * + * Note that the memory_object itself is dead, so + * we don't bother with it. + */ + + object->pager = IP_NULL; + vm_object_remove(object); + + old_request = object->pager_request; + object->pager_request = PAGER_REQUEST_NULL; + + old_name = object->pager_name; + object->pager_name = IP_NULL; + + vm_object_unlock(object); + vm_object_cache_unlock(); + + /* + * Clean up the port references. Note that there's no + * point in trying the memory_object_terminate call + * because the memory_object itself is dead. + */ + + ipc_port_release_send(pager); +#if !NORMA_VM + if (old_request != IP_NULL) + ipc_port_dealloc_kernel(old_request); +#endif /* !NORMA_VM */ + if (old_name != IP_NULL) +#if NORMA_VM + ipc_port_release_send(old_name); +#else /* NORMA_VM */ + ipc_port_dealloc_kernel(old_name); +#endif /* NORMA_VM */ + + /* + * Restart pending page requests + */ + + vm_object_abort_activity(object); + + /* + * Lose the object reference. + */ + + vm_object_deallocate(object); +} + +boolean_t vm_object_accept_old_init_protocol = FALSE; + +/* + * Routine: vm_object_enter + * Purpose: + * Find a VM object corresponding to the given + * pager; if no such object exists, create one, + * and initialize the pager. 
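Both lookup routines above must "revive" an object that is sitting in the cache with zero references before handing out a new reference. A compact model of just that step, with an invented `cobj` structure and list; locking and the port translation itself are omitted here.

```c
#include <stddef.h>

struct cobj {
    int          ref_count;
    struct cobj *cache_prev, *cache_next;   /* links used while ref_count == 0 */
};

static struct cobj *cache_head;             /* unreferenced, cached objects */

static void cache_unlink(struct cobj *o)
{
    if (o->cache_prev) o->cache_prev->cache_next = o->cache_next;
    else               cache_head = o->cache_next;
    if (o->cache_next) o->cache_next->cache_prev = o->cache_prev;
    o->cache_prev = o->cache_next = NULL;
}

/*
 * Once the port has been translated to an object, an object found with
 * no references must be pulled off the cached list before the new
 * reference is counted, mirroring the ref_count == 0 test above.
 */
static struct cobj *lookup_revive(struct cobj *o)
{
    if (o == NULL)
        return NULL;
    if (o->ref_count == 0)
        cache_unlink(o);
    o->ref_count++;
    return o;
}
```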
+ */ +vm_object_t vm_object_enter( + ipc_port_t pager, + vm_size_t size, + boolean_t internal) +{ + register + vm_object_t object; + vm_object_t new_object; + boolean_t must_init; + ipc_kobject_type_t po; + +restart: + if (!IP_VALID(pager)) + return vm_object_allocate(size); + + new_object = VM_OBJECT_NULL; + must_init = FALSE; + + /* + * Look for an object associated with this port. + */ + + vm_object_cache_lock(); + for (;;) { + po = ip_kotype(pager); + + /* + * If a previous object is being terminated, + * we must wait for the termination message + * to be queued. + * + * We set kobject to a non-null value to let the + * terminator know that someone is waiting. + * Among the possibilities is that the port + * could die while we're waiting. Must restart + * instead of continuing the loop. + */ + + if (po == IKOT_PAGER_TERMINATING) { + pager->ip_kobject = (ipc_kobject_t) pager; + assert_wait((event_t) pager, FALSE); + vm_object_cache_unlock(); + thread_block((void (*)()) 0); + goto restart; + } + + /* + * Bail if there is already a kobject associated + * with the pager port. + */ + if (po != IKOT_NONE) { + break; + } + + /* + * We must unlock to create a new object; + * if we do so, we must try the lookup again. + */ + + if (new_object == VM_OBJECT_NULL) { + vm_object_cache_unlock(); + new_object = vm_object_allocate(size); + vm_object_cache_lock(); + } else { + /* + * Lookup failed twice, and we have something + * to insert; set the object. + */ + + ipc_kobject_set(pager, + (ipc_kobject_t) new_object, + IKOT_PAGER); + new_object = VM_OBJECT_NULL; + must_init = TRUE; + } + } + + if (internal) + must_init = TRUE; + + /* + * It's only good if it's a VM object! + */ + + object = (po == IKOT_PAGER) ? (vm_object_t) pager->ip_kobject + : VM_OBJECT_NULL; + + if ((object != VM_OBJECT_NULL) && !must_init) { + vm_object_lock(object); + if (object->ref_count == 0) { + queue_remove(&vm_object_cached_list, object, + vm_object_t, cached_list); + vm_object_cached_count--; + } + object->ref_count++; + vm_object_unlock(object); + + vm_stat.hits++; + } + assert((object == VM_OBJECT_NULL) || (object->ref_count > 0) || + ((object->paging_in_progress != 0) && internal)); + + vm_stat.lookups++; + + vm_object_cache_unlock(); + + /* + * If we raced to create a vm_object but lost, let's + * throw away ours. + */ + + if (new_object != VM_OBJECT_NULL) + vm_object_deallocate(new_object); + + if (object == VM_OBJECT_NULL) + return(object); + + if (must_init) { + /* + * Copy the naked send right we were given. + */ + + pager = ipc_port_copy_send(pager); + if (!IP_VALID(pager)) + panic("vm_object_enter: port died"); /* XXX */ + + object->pager_created = TRUE; + object->pager = pager; + +#if NORMA_VM + + /* + * Let the xmm system know that we want to use the pager. + * + * Name port will be provided by the xmm system + * when set_attributes_common is called. + */ + + object->internal = internal; + object->pager_ready = internal; + if (internal) { + assert(object->temporary); + } else { + object->temporary = FALSE; + } + object->pager_name = IP_NULL; + + (void) xmm_memory_object_init(object); +#else /* NORMA_VM */ + + /* + * Allocate request port. + */ + + object->pager_request = ipc_port_alloc_kernel(); + if (object->pager_request == IP_NULL) + panic("vm_object_enter: pager request alloc"); + + ipc_kobject_set(object->pager_request, + (ipc_kobject_t) object, + IKOT_PAGING_REQUEST); + + /* + * Let the pager know we're using it. 
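The loop above allocates a speculative object with the cache lock dropped and then re-checks whether another thread won the race, discarding the speculative allocation if so. The same shape, as a self-contained pthreads sketch with an invented key table; `find_or_create` is not a Mach routine.

```c
#include <pthread.h>
#include <stdlib.h>

struct entry {
    int           key;
    struct entry *next;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct entry   *table_head;

static struct entry *table_find(int key)
{
    for (struct entry *e = table_head; e != NULL; e = e->next)
        if (e->key == key)
            return e;
    return NULL;
}

/*
 * Allocation may block, so it is done with the lock dropped; after
 * re-taking the lock we look again, and if another thread has inserted
 * an entry in the meantime the speculative allocation is thrown away,
 * just as vm_object_enter() deallocates its losing new_object.
 */
static struct entry *find_or_create(int key)
{
    struct entry *fresh = NULL;

    pthread_mutex_lock(&table_lock);
    for (;;) {
        struct entry *e = table_find(key);
        if (e != NULL) {
            pthread_mutex_unlock(&table_lock);
            free(fresh);                     /* lost the race, or never raced */
            return e;
        }
        if (fresh != NULL) {                 /* second failed lookup: insert ours */
            fresh->key = key;
            fresh->next = table_head;
            table_head = fresh;
            pthread_mutex_unlock(&table_lock);
            return fresh;
        }
        pthread_mutex_unlock(&table_lock);
        fresh = calloc(1, sizeof *fresh);    /* may block: do it unlocked */
        if (fresh == NULL)
            return NULL;
        pthread_mutex_lock(&table_lock);
    }
}
```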
+ */ + + if (internal) { + /* acquire a naked send right for the DMM */ + ipc_port_t DMM = memory_manager_default_reference(); + + /* mark the object internal */ + object->internal = TRUE; + assert(object->temporary); + + /* default-pager objects are ready immediately */ + object->pager_ready = TRUE; + + /* consumes the naked send right for DMM */ + (void) memory_object_create(DMM, + pager, + object->size, + object->pager_request, + object->pager_name, + PAGE_SIZE); + } else { + /* the object is external and not temporary */ + object->internal = FALSE; + object->temporary = FALSE; + + /* user pager objects are not ready until marked so */ + object->pager_ready = FALSE; + + (void) memory_object_init(pager, + object->pager_request, + object->pager_name, + PAGE_SIZE); + + } +#endif /* NORMA_VM */ + + vm_object_lock(object); + object->pager_initialized = TRUE; + + if (vm_object_accept_old_init_protocol) + object->pager_ready = TRUE; + + vm_object_wakeup(object, VM_OBJECT_EVENT_INITIALIZED); + } else { + vm_object_lock(object); + } + /* + * [At this point, the object must be locked] + */ + + /* + * Wait for the work above to be done by the first + * thread to map this object. + */ + + while (!object->pager_initialized) { + vm_object_wait( object, + VM_OBJECT_EVENT_INITIALIZED, + FALSE); + vm_object_lock(object); + } + vm_object_unlock(object); + + return object; +} + +/* + * Routine: vm_object_pager_create + * Purpose: + * Create a memory object for an internal object. + * In/out conditions: + * The object is locked on entry and exit; + * it may be unlocked within this call. + * Limitations: + * Only one thread may be performing a + * vm_object_pager_create on an object at + * a time. Presumably, only the pageout + * daemon will be using this routine. + */ +void vm_object_pager_create( + register + vm_object_t object) +{ + ipc_port_t pager; + + if (object->pager_created) { + /* + * Someone else got to it first... + * wait for them to finish initializing + */ + + while (!object->pager_initialized) { + vm_object_wait( object, + VM_OBJECT_EVENT_PAGER_READY, + FALSE); + vm_object_lock(object); + } + return; + } + + /* + * Indicate that a memory object has been assigned + * before dropping the lock, to prevent a race. + */ + + object->pager_created = TRUE; + + /* + * Prevent collapse or termination by + * holding a paging reference + */ + + vm_object_paging_begin(object); + vm_object_unlock(object); + +#if MACH_PAGEMAP + object->existence_info = vm_external_create( + object->size + + object->paging_offset); + assert((object->size + object->paging_offset) >= + object->size); +#endif /* MACH_PAGEMAP */ + + /* + * Create the pager, and associate with it + * this object. + * + * Note that we only make the port association + * so that vm_object_enter can properly look up + * the object to complete the initialization... + * we do not expect any user to ever map this + * object. + * + * Since the kernel has the only rights to the + * port, it's safe to install the association + * without holding the cache lock. + */ + + pager = ipc_port_alloc_kernel(); + if (pager == IP_NULL) + panic("vm_object_pager_create: allocate pager port"); + + (void) ipc_port_make_send(pager); + ipc_kobject_set(pager, (ipc_kobject_t) object, IKOT_PAGER); + + /* + * Initialize the rest of the paging stuff + */ + + if (vm_object_enter(pager, object->size, TRUE) != object) + panic("vm_object_pager_create: mismatch"); + + /* + * Drop the naked send right taken above. 
+ */ + + ipc_port_release_send(pager); + + /* + * Release the paging reference + */ + + vm_object_lock(object); + vm_object_paging_end(object); +} + +/* + * Routine: vm_object_remove + * Purpose: + * Eliminate the pager/object association + * for this pager. + * Conditions: + * The object cache must be locked. + */ +void vm_object_remove( + vm_object_t object) +{ + ipc_port_t port; + + if ((port = object->pager) != IP_NULL) { + if (ip_kotype(port) == IKOT_PAGER) + ipc_kobject_set(port, IKO_NULL, + IKOT_PAGER_TERMINATING); + else if (ip_kotype(port) != IKOT_NONE) + panic("vm_object_remove: bad object port"); + } +#if !NORMA_VM + if ((port = object->pager_request) != IP_NULL) { + if (ip_kotype(port) == IKOT_PAGING_REQUEST) + ipc_kobject_set(port, IKO_NULL, IKOT_NONE); + else if (ip_kotype(port) != IKOT_NONE) + panic("vm_object_remove: bad request port"); + } + if ((port = object->pager_name) != IP_NULL) { + if (ip_kotype(port) == IKOT_PAGING_NAME) + ipc_kobject_set(port, IKO_NULL, IKOT_NONE); + else if (ip_kotype(port) != IKOT_NONE) + panic("vm_object_remove: bad name port"); + } +#endif /* !NORMA_VM */ +} + +/* + * Global variables for vm_object_collapse(): + * + * Counts for normal collapses and bypasses. + * Debugging variables, to watch or disable collapse. + */ +long object_collapses = 0; +long object_bypasses = 0; + +int vm_object_collapse_debug = 0; +boolean_t vm_object_collapse_allowed = TRUE; +boolean_t vm_object_collapse_bypass_allowed = TRUE; + +/* + * vm_object_collapse: + * + * Collapse an object with the object backing it. + * Pages in the backing object are moved into the + * parent, and the backing object is deallocated. + * + * Requires that the object be locked and the page + * queues be unlocked. May unlock/relock the object, + * so the caller should hold a reference for the object. + */ +void vm_object_collapse( + register vm_object_t object) +{ + register vm_object_t backing_object; + register vm_offset_t backing_offset; + register vm_size_t size; + register vm_offset_t new_offset; + register vm_page_t p, pp; + ipc_port_t old_name_port; + + if (!vm_object_collapse_allowed) + return; + + while (TRUE) { + /* + * Verify that the conditions are right for collapse: + * + * The object exists and no pages in it are currently + * being paged out (or have ever been paged out). + * + * This check is probably overkill -- if a memory + * object has not been created, the fault handler + * shouldn't release the object lock while paging + * is in progress or absent pages exist. + */ + if (object == VM_OBJECT_NULL || + object->pager_created || + object->paging_in_progress != 0 || + object->absent_count != 0) + return; + + /* + * There is a backing object, and + */ + + if ((backing_object = object->shadow) == VM_OBJECT_NULL) + return; + + vm_object_lock(backing_object); + /* + * ... + * The backing object is not read_only, + * and no pages in the backing object are + * currently being paged out. + * The backing object is internal. + * + * XXX It may be sufficient for the backing + * XXX object to be temporary. + */ + + if (!backing_object->internal || + backing_object->paging_in_progress != 0) { + vm_object_unlock(backing_object); + return; + } + + /* + * The backing object can't be a copy-object: + * the shadow_offset for the copy-object must stay + * as 0. 
Furthermore (for the 'we have all the + * pages' case), if we bypass backing_object and + * just shadow the next object in the chain, old + * pages from that object would then have to be copied + * BOTH into the (former) backing_object and into the + * parent object. + */ + if (backing_object->shadow != VM_OBJECT_NULL && + backing_object->shadow->copy != VM_OBJECT_NULL) { + vm_object_unlock(backing_object); + return; + } + + /* + * We know that we can either collapse the backing + * object (if the parent is the only reference to + * it) or (perhaps) remove the parent's reference + * to it. + */ + + backing_offset = object->shadow_offset; + size = object->size; + + /* + * If there is exactly one reference to the backing + * object, we can collapse it into the parent. + */ + + if (backing_object->ref_count == 1) { + if (!vm_object_cache_lock_try()) { + vm_object_unlock(backing_object); + return; + } + + /* + * We can collapse the backing object. + * + * Move all in-memory pages from backing_object + * to the parent. Pages that have been paged out + * will be overwritten by any of the parent's + * pages that shadow them. + */ + + while (!queue_empty(&backing_object->memq)) { + + p = (vm_page_t) + queue_first(&backing_object->memq); + + new_offset = (p->offset - backing_offset); + + assert(!p->busy || p->absent); + + /* + * If the parent has a page here, or if + * this page falls outside the parent, + * dispose of it. + * + * Otherwise, move it as planned. + */ + + if (p->offset < backing_offset || + new_offset >= size) { + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + } else { + pp = vm_page_lookup(object, new_offset); + if (pp != VM_PAGE_NULL && !pp->absent) { + /* + * Parent object has a real page. + * Throw away the backing object's + * page. + */ + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + } + else { + if (pp != VM_PAGE_NULL) { + /* + * Parent has an absent page... + * it's not being paged in, so + * it must really be missing from + * the parent. + * + * Throw out the absent page... + * any faults looking for that + * page will restart with the new + * one. + */ + + /* + * This should never happen -- the + * parent cannot have ever had an + * external memory object, and thus + * cannot have absent pages. + */ + panic("vm_object_collapse: bad case"); + + vm_page_lock_queues(); + vm_page_free(pp); + vm_page_unlock_queues(); + + /* + * Fall through to move the backing + * object's page up. + */ + } + /* + * Parent now has no page. + * Move the backing object's page up. + */ + vm_page_rename(p, object, new_offset); + } + } + } + + /* + * Move the pager from backing_object to object. + * + * XXX We're only using part of the paging space + * for keeps now... we ought to discard the + * unused portion. + */ + + switch (vm_object_collapse_debug) { + case 0: + break; + case 1: + if ((backing_object->pager == IP_NULL) && + (backing_object->pager_request == + PAGER_REQUEST_NULL)) + break; + /* Fall through to... 
*/ + + default: + printf("vm_object_collapse: %#x (pager %#x, request %#x) up to %#x\n", + backing_object, backing_object->pager, backing_object->pager_request, + object); + if (vm_object_collapse_debug > 2) + Debugger("vm_object_collapse"); + } + + object->pager = backing_object->pager; + if (object->pager != IP_NULL) + ipc_kobject_set(object->pager, + (ipc_kobject_t) object, + IKOT_PAGER); + object->pager_initialized = backing_object->pager_initialized; + object->pager_ready = backing_object->pager_ready; + object->pager_created = backing_object->pager_created; + + object->pager_request = backing_object->pager_request; +#if NORMA_VM + old_name_port = object->pager_name; + object->pager_name = backing_object->pager_name; +#else /* NORMA_VM */ + if (object->pager_request != IP_NULL) + ipc_kobject_set(object->pager_request, + (ipc_kobject_t) object, + IKOT_PAGING_REQUEST); + old_name_port = object->pager_name; + if (old_name_port != IP_NULL) + ipc_kobject_set(old_name_port, + IKO_NULL, IKOT_NONE); + object->pager_name = backing_object->pager_name; + if (object->pager_name != IP_NULL) + ipc_kobject_set(object->pager_name, + (ipc_kobject_t) object, + IKOT_PAGING_NAME); +#endif /* NORMA_VM */ + + vm_object_cache_unlock(); + + /* + * If there is no pager, leave paging-offset alone. + */ + if (object->pager != IP_NULL) + object->paging_offset = + backing_object->paging_offset + + backing_offset; + +#if MACH_PAGEMAP + assert(object->existence_info == VM_EXTERNAL_NULL); + object->existence_info = backing_object->existence_info; +#endif /* MACH_PAGEMAP */ + + /* + * Object now shadows whatever backing_object did. + * Note that the reference to backing_object->shadow + * moves from within backing_object to within object. + */ + + object->shadow = backing_object->shadow; + object->shadow_offset += backing_object->shadow_offset; + if (object->shadow != VM_OBJECT_NULL && + object->shadow->copy != VM_OBJECT_NULL) { + panic("vm_object_collapse: we collapsed a copy-object!"); + } + /* + * Discard backing_object. + * + * Since the backing object has no pages, no + * pager left, and no object references within it, + * all that is necessary is to dispose of it. + */ + + assert( + (backing_object->ref_count == 1) && + (backing_object->resident_page_count == 0) && + (backing_object->paging_in_progress == 0) + ); + + assert(backing_object->alive); + backing_object->alive = FALSE; + vm_object_unlock(backing_object); + + vm_object_unlock(object); + if (old_name_port != IP_NULL) +#if NORMA_VM + ipc_port_release_send(old_name_port); +#else /* NORMA_VM */ + ipc_port_dealloc_kernel(old_name_port); +#endif /* NORMA_VM */ + zfree(vm_object_zone, (vm_offset_t) backing_object); + vm_object_lock(object); + + object_collapses++; + } + else { + if (!vm_object_collapse_bypass_allowed) { + vm_object_unlock(backing_object); + return; + } + + /* + * If all of the pages in the backing object are + * shadowed by the parent object, the parent + * object no longer has to shadow the backing + * object; it can shadow the next one in the + * chain. + * + * The backing object must not be paged out - we'd + * have to check all of the paged-out pages, as + * well. + */ + + if (backing_object->pager_created) { + vm_object_unlock(backing_object); + return; + } + + /* + * Should have a check for a 'small' number + * of pages here. 
+ */ + + queue_iterate(&backing_object->memq, p, + vm_page_t, listq) + { + new_offset = (p->offset - backing_offset); + + /* + * If the parent has a page here, or if + * this page falls outside the parent, + * keep going. + * + * Otherwise, the backing_object must be + * left in the chain. + */ + + if (p->offset >= backing_offset && + new_offset <= size && + (pp = vm_page_lookup(object, new_offset)) + == VM_PAGE_NULL) { + /* + * Page still needed. + * Can't go any further. + */ + vm_object_unlock(backing_object); + return; + } + } + + /* + * Make the parent shadow the next object + * in the chain. Deallocating backing_object + * will not remove it, since its reference + * count is at least 2. + */ + + vm_object_reference(object->shadow = backing_object->shadow); + object->shadow_offset += backing_object->shadow_offset; + + /* + * Backing object might have had a copy pointer + * to us. If it did, clear it. + */ + if (backing_object->copy == object) + backing_object->copy = VM_OBJECT_NULL; + + /* + * Drop the reference count on backing_object. + * Since its ref_count was at least 2, it + * will not vanish; so we don't need to call + * vm_object_deallocate. + */ + backing_object->ref_count--; + assert(backing_object->ref_count > 0); + vm_object_unlock(backing_object); + + object_bypasses ++; + + } + + /* + * Try again with this object's new backing object. + */ + } +} + +/* + * Routine: vm_object_page_remove: [internal] + * Purpose: + * Removes all physical pages in the specified + * object range from the object's list of pages. + * + * In/out conditions: + * The object must be locked. + */ +unsigned int vm_object_page_remove_lookup = 0; +unsigned int vm_object_page_remove_iterate = 0; + +void vm_object_page_remove( + register vm_object_t object, + register vm_offset_t start, + register vm_offset_t end) +{ + register vm_page_t p, next; + + /* + * One and two page removals are most popular. + * The factor of 16 here is somewhat arbitrary. + * It balances vm_object_lookup vs iteration. + */ + + if (atop(end - start) < (unsigned)object->resident_page_count/16) { + vm_object_page_remove_lookup++; + + for (; start < end; start += PAGE_SIZE) { + p = vm_page_lookup(object, start); + if (p != VM_PAGE_NULL) { + if (!p->fictitious) + pmap_page_protect(p->phys_addr, + VM_PROT_NONE); + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + } + } + } else { + vm_object_page_remove_iterate++; + + p = (vm_page_t) queue_first(&object->memq); + while (!queue_end(&object->memq, (queue_entry_t) p)) { + next = (vm_page_t) queue_next(&p->listq); + if ((start <= p->offset) && (p->offset < end)) { + if (!p->fictitious) + pmap_page_protect(p->phys_addr, + VM_PROT_NONE); + vm_page_lock_queues(); + vm_page_free(p); + vm_page_unlock_queues(); + } + p = next; + } + } +} + +/* + * Routine: vm_object_coalesce + * Function: Coalesces two objects backing up adjoining + * regions of memory into a single object. + * + * returns TRUE if objects were combined. + * + * NOTE: Only works at the moment if the second object is NULL - + * if it's not, which object do we lock first? + * + * Parameters: + * prev_object First object to coalesce + * prev_offset Offset into prev_object + * next_object Second object into coalesce + * next_offset Offset into next_object + * + * prev_size Size of reference to prev_object + * next_size Size of reference to next_object + * + * Conditions: + * The object must *not* be locked. 
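+ *
+ *		As a rough illustration: if prev_object already backs the
+ *		range just below a new allocation of next_size bytes, and
+ *		next_object is VM_OBJECT_NULL, a successful coalesce simply
+ *		extends prev_object (growing prev_object->size) so that the
+ *		new range can be mapped by the same object instead of
+ *		creating a fresh one.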
+ */ + +boolean_t vm_object_coalesce( + register vm_object_t prev_object, + vm_object_t next_object, + vm_offset_t prev_offset, + vm_offset_t next_offset, + vm_size_t prev_size, + vm_size_t next_size) +{ + vm_size_t newsize; + +#ifdef lint + next_offset++; +#endif /* lint */ + + if (next_object != VM_OBJECT_NULL) { + return FALSE; + } + + if (prev_object == VM_OBJECT_NULL) { + return TRUE; + } + + vm_object_lock(prev_object); + + /* + * Try to collapse the object first + */ + vm_object_collapse(prev_object); + + /* + * Can't coalesce if pages not mapped to + * prev_entry may be in use anyway: + * . more than one reference + * . paged out + * . shadows another object + * . has a copy elsewhere + * . paging references (pages might be in page-list) + */ + + if ((prev_object->ref_count > 1) || + prev_object->pager_created || + (prev_object->shadow != VM_OBJECT_NULL) || + (prev_object->copy != VM_OBJECT_NULL) || + (prev_object->paging_in_progress != 0)) { + vm_object_unlock(prev_object); + return FALSE; + } + + /* + * Remove any pages that may still be in the object from + * a previous deallocation. + */ + + vm_object_page_remove(prev_object, + prev_offset + prev_size, + prev_offset + prev_size + next_size); + + /* + * Extend the object if necessary. + */ + newsize = prev_offset + prev_size + next_size; + if (newsize > prev_object->size) + prev_object->size = newsize; + + vm_object_unlock(prev_object); + return TRUE; +} + +vm_object_t vm_object_request_object( + ipc_port_t p) +{ + return vm_object_lookup(p); +} + +/* + * Routine: vm_object_name + * Purpose: + * Returns a naked send right to the "name" port associated + * with this object. + */ +ipc_port_t vm_object_name( + vm_object_t object) +{ + ipc_port_t p; + + if (object == VM_OBJECT_NULL) + return IP_NULL; + + vm_object_lock(object); + + while (object->shadow != VM_OBJECT_NULL) { + vm_object_t new_object = object->shadow; + vm_object_lock(new_object); + vm_object_unlock(object); + object = new_object; + } + + p = object->pager_name; + if (p != IP_NULL) +#if NORMA_VM + p = ipc_port_copy_send(p); +#else /* NORMA_VM */ + p = ipc_port_make_send(p); +#endif /* NORMA_VM */ + vm_object_unlock(object); + + return p; +} + +/* + * Attach a set of physical pages to an object, so that they can + * be mapped by mapping the object. Typically used to map IO memory. + * + * The mapping function and its private data are used to obtain the + * physical addresses for each page to be mapped. 
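+ *
+ *	As an illustration only (io_map_fn and io_base are hypothetical
+ *	names, not part of this interface): a driver mapping a contiguous
+ *	device region might supply a map_fn that adds the page offset to
+ *	a base physical address held in a vm_offset_t:
+ *
+ *		vm_offset_t
+ *		io_map_fn(void *data, vm_offset_t offset)
+ *		{
+ *			return *(vm_offset_t *) data + offset;
+ *		}
+ *
+ *	and then call:
+ *
+ *		vm_object_page_map(object, 0, size, io_map_fn, &io_base);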
+ */ +void +vm_object_page_map( + vm_object_t object, + vm_offset_t offset, + vm_size_t size, + vm_offset_t (*map_fn)(void *, vm_offset_t), + void * map_fn_data) /* private to map_fn */ +{ + int num_pages; + int i; + vm_page_t m; + vm_page_t old_page; + vm_offset_t addr; + + num_pages = atop(size); + + for (i = 0; i < num_pages; i++, offset += PAGE_SIZE) { + + addr = (*map_fn)(map_fn_data, offset); + + while ((m = vm_page_grab_fictitious()) == VM_PAGE_NULL) + vm_page_more_fictitious(); + + vm_object_lock(object); + if ((old_page = vm_page_lookup(object, offset)) + != VM_PAGE_NULL) + { + vm_page_lock_queues(); + vm_page_free(old_page); + vm_page_unlock_queues(); + } + + vm_page_init(m, addr); + m->private = TRUE; /* don`t free page */ + m->wire_count = 1; + vm_page_lock_queues(); + vm_page_insert(m, object, offset); + vm_page_unlock_queues(); + + PAGE_WAKEUP_DONE(m); + vm_object_unlock(object); + } +} + +#include <mach_kdb.h> + + +#if MACH_KDB +#define printf kdbprintf + +boolean_t vm_object_print_pages = FALSE; + +/* + * vm_object_print: [ debug ] + */ +void vm_object_print( + vm_object_t object) +{ + register vm_page_t p; + extern indent; + + register int count; + + if (object == VM_OBJECT_NULL) + return; + + iprintf("Object 0x%X: size=0x%X", + (vm_offset_t) object, (vm_offset_t) object->size); + printf(", %d references, %d resident pages,", object->ref_count, + object->resident_page_count); + printf(" %d absent pages,", object->absent_count); + printf(" %d paging ops\n", object->paging_in_progress); + indent += 2; + iprintf("memory object=0x%X (offset=0x%X),", + (vm_offset_t) object->pager, (vm_offset_t) object->paging_offset); + printf("control=0x%X, name=0x%X\n", + (vm_offset_t) object->pager_request, (vm_offset_t) object->pager_name); + iprintf("%s%s", + object->pager_ready ? " ready" : "", + object->pager_created ? " created" : ""); + printf("%s,%s ", + object->pager_initialized ? "" : "uninitialized", + object->temporary ? "temporary" : "permanent"); + printf("%s%s,", + object->internal ? "internal" : "external", + object->can_persist ? " cacheable" : ""); + printf("copy_strategy=%d\n", (vm_offset_t)object->copy_strategy); + iprintf("shadow=0x%X (offset=0x%X),", + (vm_offset_t) object->shadow, (vm_offset_t) object->shadow_offset); + printf("copy=0x%X\n", (vm_offset_t) object->copy); + + indent += 2; + + if (vm_object_print_pages) { + count = 0; + p = (vm_page_t) queue_first(&object->memq); + while (!queue_end(&object->memq, (queue_entry_t) p)) { + if (count == 0) iprintf("memory:="); + else if (count == 4) {printf("\n"); iprintf(" ..."); count = 0;} + else printf(","); + count++; + + printf("(off=0x%X,page=0x%X)", p->offset, (vm_offset_t) p); + p = (vm_page_t) queue_next(&p->listq); + } + if (count != 0) + printf("\n"); + } + indent -= 4; +} + +#endif /* MACH_KDB */ diff --git a/vm/vm_object.h b/vm/vm_object.h new file mode 100644 index 00000000..d3d050a0 --- /dev/null +++ b/vm/vm_object.h @@ -0,0 +1,374 @@ +/* + * Mach Operating System + * Copyright (c) 1993-1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. 
CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm_object.h + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Virtual memory object module definitions. + */ + +#ifndef _VM_VM_OBJECT_H_ +#define _VM_VM_OBJECT_H_ + +#include <mach_pagemap.h> +#include <norma_vm.h> + +#include <mach/kern_return.h> +#include <mach/boolean.h> +#include <mach/memory_object.h> +#include <mach/port.h> +#include <mach/vm_prot.h> +#include <mach/machine/vm_types.h> +#include <kern/queue.h> +#include <kern/lock.h> +#include <kern/assert.h> +#include <kern/macro_help.h> +#include <vm/pmap.h> + +#if MACH_PAGEMAP +#include <vm/vm_external.h> +#endif /* MACH_PAGEMAP */ + +#if NORMA_VM +typedef struct xmm_obj * pager_request_t; +#else /* NORMA_VM */ +typedef struct ipc_port * pager_request_t; +#endif /* NORMA_VM */ +#define PAGER_REQUEST_NULL ((pager_request_t) 0) + +/* + * Types defined: + * + * vm_object_t Virtual memory object. + * + * We use "struct ipc_port *" instead of "ipc_port_t" + * to avoid include file circularities. + */ + +struct vm_object { + queue_chain_t memq; /* Resident memory */ + decl_simple_lock_data(, Lock) /* Synchronization */ +#if VM_OBJECT_DEBUG + thread_t LockHolder; /* Thread holding Lock */ +#endif VM_OBJECT_DEBUG + vm_size_t size; /* Object size (only valid + * if internal) + */ + + short ref_count; /* Number of references */ + short resident_page_count; + /* number of resident pages */ + + struct vm_object *copy; /* Object that should receive + * a copy of my changed pages + */ + struct vm_object *shadow; /* My shadow */ + vm_offset_t shadow_offset; /* Offset into shadow */ + + struct ipc_port *pager; /* Where to get data */ + vm_offset_t paging_offset; /* Offset into memory object */ + pager_request_t pager_request; /* Where data comes back */ + struct ipc_port *pager_name; /* How to identify region */ + + memory_object_copy_strategy_t + copy_strategy; /* How to handle data copy */ + + unsigned int + absent_count; /* The number of pages that + * have been requested but + * not filled. That is, the + * number of pages for which + * the "absent" attribute is + * asserted. + */ + + unsigned int /* boolean_t array */ + all_wanted; /* Bit array of "want to be + * awakened" notations. See + * VM_OBJECT_EVENT_* items + * below + */ + + unsigned int + paging_in_progress:16, + /* The memory object ports are + * being used (e.g., for pagein + * or pageout) -- don't change any + * of these fields (i.e., don't + * collapse, destroy or terminate) + */ + /* boolean_t */ pager_created:1,/* Has pager ever been created? */ + /* boolean_t */ pager_initialized:1,/* Are fields ready to use? */ + /* boolean_t */ pager_ready:1, /* Will manager take requests? */ + + /* boolean_t */ can_persist:1, /* The kernel may keep the data + * for this object (and rights to + * the memory object) after all + * address map references are + * deallocated? 
+ */ + /* boolean_t */ internal:1, /* Created by the kernel (and + * therefore, managed by the + * default memory manger) + */ + /* boolean_t */ temporary:1, /* Permanent objects may be changed + * externally by the memory manager, + * and changes made in memory must + * be reflected back to the memory + * manager. Temporary objects lack + * both of these characteristics. + */ + /* boolean_t */ alive:1, /* Not yet terminated (debug) */ + /* boolean_t */ lock_in_progress : 1, + /* Is a multi-page lock + * request in progress? + */ + /* boolean_t */ lock_restart : 1, + /* Should lock request in + * progress restart search? + */ + /* boolean_t */ use_old_pageout : 1, + /* Use old pageout primitives? + */ + /* boolean_t */ use_shared_copy : 1,/* Use shared (i.e., + * delayed) copy on write */ + /* boolean_t */ shadowed: 1; /* Shadow may exist */ + + queue_chain_t cached_list; /* Attachment point for the list + * of objects cached as a result + * of their can_persist value + */ + vm_offset_t last_alloc; /* last allocation offset */ +#if MACH_PAGEMAP + vm_external_t existence_info; +#endif /* MACH_PAGEMAP */ +}; + +typedef struct vm_object *vm_object_t; +#define VM_OBJECT_NULL ((vm_object_t) 0) + +extern +vm_object_t kernel_object; /* the single kernel object */ + +/* + * Declare procedures that operate on VM objects. + */ + +extern void vm_object_bootstrap(void); +extern void vm_object_init(void); +extern void vm_object_terminate(vm_object_t); +extern vm_object_t vm_object_allocate(vm_size_t); +extern void vm_object_reference(vm_object_t); +extern void vm_object_deallocate(vm_object_t); +extern void vm_object_pmap_protect( + vm_object_t object, + vm_offset_t offset, + vm_size_t size, + pmap_t pmap, + vm_offset_t pmap_start, + vm_prot_t prot); +extern void vm_object_pmap_remove( + vm_object_t object, + vm_offset_t start, + vm_offset_t end); +extern void vm_object_page_remove( + vm_object_t object, + vm_offset_t start, + vm_offset_t end); +extern void vm_object_shadow( + vm_object_t *object, /* in/out */ + vm_offset_t *offset, /* in/out */ + vm_size_t length); +extern void vm_object_collapse(vm_object_t); +extern vm_object_t vm_object_lookup(struct ipc_port *); +extern vm_object_t vm_object_lookup_name(struct ipc_port *); +extern struct ipc_port *vm_object_name(vm_object_t); +extern void vm_object_remove(vm_object_t); + +extern boolean_t vm_object_copy_temporary( + vm_object_t *_object, /* in/out */ + vm_offset_t *_offset, /* in/out */ + boolean_t *_src_needs_copy, /* out */ + boolean_t *_dst_needs_copy); /* out */ +extern kern_return_t vm_object_copy_strategically( + vm_object_t src_object, + vm_offset_t src_offset, + vm_size_t size, + vm_object_t *dst_object, /* out */ + vm_offset_t *dst_offset, /* out */ + boolean_t *dst_needs_copy); /* out */ +extern kern_return_t vm_object_copy_slowly( + vm_object_t src_object, + vm_offset_t src_offset, + vm_size_t size, + boolean_t interruptible, + vm_object_t *_result_object); /* out */ + +extern vm_object_t vm_object_enter( + struct ipc_port *pager, + vm_size_t size, + boolean_t internal); +extern void vm_object_pager_create( + vm_object_t object); +extern void vm_object_destroy( + struct ipc_port *pager); + +extern void vm_object_page_map( + vm_object_t, + vm_offset_t, + vm_size_t, + vm_offset_t (*)(void *, vm_offset_t), + void *); + +extern void vm_object_print(vm_object_t); + +extern vm_object_t vm_object_request_object(struct ipc_port *); + +/* + * Event waiting handling + */ + +#define VM_OBJECT_EVENT_INITIALIZED 0 +#define 
VM_OBJECT_EVENT_PAGER_READY 1 +#define VM_OBJECT_EVENT_PAGING_IN_PROGRESS 2 +#define VM_OBJECT_EVENT_ABSENT_COUNT 3 +#define VM_OBJECT_EVENT_LOCK_IN_PROGRESS 4 + +#define vm_object_wait(object, event, interruptible) \ + MACRO_BEGIN \ + (object)->all_wanted |= 1 << (event); \ + vm_object_sleep(((vm_offset_t) object) + (event), \ + (object), \ + (interruptible)); \ + MACRO_END + +#define vm_object_assert_wait(object, event, interruptible) \ + MACRO_BEGIN \ + (object)->all_wanted |= 1 << (event); \ + assert_wait((event_t)(((vm_offset_t) object) + (event)), (interruptible)); \ + MACRO_END + +#define vm_object_wakeup(object, event) \ + MACRO_BEGIN \ + if ((object)->all_wanted & (1 << (event))) \ + thread_wakeup((event_t)(((vm_offset_t) object) + (event))); \ + (object)->all_wanted &= ~(1 << (event)); \ + MACRO_END + +/* + * Routines implemented as macros + */ + +#define vm_object_paging_begin(object) \ + ((object)->paging_in_progress++) + +#define vm_object_paging_end(object) \ + MACRO_BEGIN \ + assert((object)->paging_in_progress != 0); \ + if (--(object)->paging_in_progress == 0) { \ + vm_object_wakeup(object, \ + VM_OBJECT_EVENT_PAGING_IN_PROGRESS); \ + } \ + MACRO_END + +#define vm_object_paging_wait(object, interruptible) \ + MACRO_BEGIN \ + while ((object)->paging_in_progress != 0) { \ + vm_object_wait( (object), \ + VM_OBJECT_EVENT_PAGING_IN_PROGRESS, \ + (interruptible)); \ + vm_object_lock(object); \ + \ + /*XXX if ((interruptible) && */ \ + /*XXX (current_thread()->wait_result != THREAD_AWAKENED))*/ \ + /*XXX break; */ \ + } \ + MACRO_END + +#define vm_object_absent_assert_wait(object, interruptible) \ + MACRO_BEGIN \ + vm_object_assert_wait( (object), \ + VM_OBJECT_EVENT_ABSENT_COUNT, \ + (interruptible)); \ + MACRO_END + + +#define vm_object_absent_release(object) \ + MACRO_BEGIN \ + (object)->absent_count--; \ + vm_object_wakeup((object), \ + VM_OBJECT_EVENT_ABSENT_COUNT); \ + MACRO_END + +/* + * Object locking macros (with and without debugging) + */ + +#if VM_OBJECT_DEBUG +#define vm_object_lock_init(object) \ +MACRO_BEGIN \ + simple_lock_init(&(object)->Lock); \ + (object)->LockHolder = 0; \ +MACRO_END +#define vm_object_lock(object) \ +MACRO_BEGIN \ + simple_lock(&(object)->Lock); \ + (object)->LockHolder = current_thread(); \ +MACRO_END +#define vm_object_unlock(object) \ +MACRO_BEGIN \ + if ((object)->LockHolder != current_thread()) \ + panic("vm_object_unlock 0x%x", (object)); \ + (object)->LockHolder = 0; \ + simple_unlock(&(object)->Lock); \ +MACRO_END +#define vm_object_lock_try(object) \ + (simple_lock_try(&(object)->Lock) \ + ? 
( ((object)->LockHolder = current_thread()) , TRUE) \ + : FALSE) +#define vm_object_sleep(event, object, interruptible) \ +MACRO_BEGIN \ + if ((object)->LockHolder != current_thread()) \ + panic("vm_object_sleep %#x", (object)); \ + (object)->LockHolder = 0; \ + thread_sleep((event_t)(event), simple_lock_addr((object)->Lock), \ + (interruptible)); \ +MACRO_END +#define vm_object_lock_taken(object) \ + ((object)->LockHolder == current_thread()) +#else /* VM_OBJECT_DEBUG */ +#define vm_object_lock_init(object) simple_lock_init(&(object)->Lock) +#define vm_object_lock(object) simple_lock(&(object)->Lock) +#define vm_object_unlock(object) simple_unlock(&(object)->Lock) +#define vm_object_lock_try(object) simple_lock_try(&(object)->Lock) +#define vm_object_sleep(event, object, interruptible) \ + thread_sleep((event_t)(event), simple_lock_addr((object)->Lock), \ + (interruptible)) +#define vm_object_lock_taken(object) simple_lock_taken(&(object)->Lock) +#endif /* VM_OBJECT_DEBUG */ + +#endif /* _VM_VM_OBJECT_H_ */ diff --git a/vm/vm_page.h b/vm/vm_page.h new file mode 100644 index 00000000..f7fa80a3 --- /dev/null +++ b/vm/vm_page.h @@ -0,0 +1,322 @@ +/* + * Mach Operating System + * Copyright (c) 1993-1988 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_page.h + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * Resident memory system definitions. + */ + +#ifndef _VM_VM_PAGE_H_ +#define _VM_VM_PAGE_H_ + +#include <mach_vm_debug.h> + +#include <mach/boolean.h> +#include <mach/vm_prot.h> +#include <mach/vm_param.h> +#include <vm/vm_object.h> +#include <kern/queue.h> +#include <kern/lock.h> +#include <kern/zalloc.h> + +#include <kern/macro_help.h> +#include <kern/sched_prim.h> /* definitions of wait/wakeup */ + +#if MACH_VM_DEBUG +#include <mach_debug/hash_info.h> +#endif + +/* + * Management of resident (logical) pages. + * + * A small structure is kept for each resident + * page, indexed by page number. Each structure + * is an element of several lists: + * + * A hash table bucket used to quickly + * perform object/offset lookups + * + * A list of all pages for a given object, + * so they can be quickly deactivated at + * time of deallocation. + * + * An ordered list of pages due for pageout. + * + * In addition, the structure contains the object + * and offset to which this page belongs (for pageout), + * and sundry status bits. + * + * Fields in this structure are locked either by the lock on the + * object that the page belongs to (O) or by the lock on the page + * queues (P). 
[Some fields require that both locks be held to + * change that field; holding either lock is sufficient to read.] + */ + +struct vm_page { + queue_chain_t pageq; /* queue info for FIFO + * queue or free list (P) */ + queue_chain_t listq; /* all pages in same object (O) */ + struct vm_page *next; /* VP bucket link (O) */ + + vm_object_t object; /* which object am I in (O,P) */ + vm_offset_t offset; /* offset into that object (O,P) */ + + unsigned int wire_count:16, /* how many wired down maps use me? + (O&P) */ + /* boolean_t */ inactive:1, /* page is in inactive list (P) */ + active:1, /* page is in active list (P) */ + laundry:1, /* page is being cleaned now (P)*/ + free:1, /* page is on free list (P) */ + reference:1, /* page has been used (P) */ + :0; /* (force to 'long' boundary) */ +#ifdef ns32000 + int pad; /* extra space for ns32000 bit ops */ +#endif /* ns32000 */ + + unsigned int + /* boolean_t */ busy:1, /* page is in transit (O) */ + wanted:1, /* someone is waiting for page (O) */ + tabled:1, /* page is in VP table (O) */ + fictitious:1, /* Physical page doesn't exist (O) */ + private:1, /* Page should not be returned to + * the free list (O) */ + absent:1, /* Data has been requested, but is + * not yet available (O) */ + error:1, /* Data manager was unable to provide + * data due to error (O) */ + dirty:1, /* Page must be cleaned (O) */ + precious:1, /* Page is precious; data must be + * returned even if clean (O) */ + overwriting:1, /* Request to unlock has been made + * without having data. (O) + * [See vm_object_overwrite] */ + :0; + + vm_offset_t phys_addr; /* Physical address of page, passed + * to pmap_enter (read-only) */ + vm_prot_t page_lock; /* Uses prohibited by data manager (O) */ + vm_prot_t unlock_request; /* Outstanding unlock request (O) */ +}; + +typedef struct vm_page *vm_page_t; + +#define VM_PAGE_NULL ((vm_page_t) 0) + +/* + * For debugging, this macro can be defined to perform + * some useful check on a page structure. + */ + +#define VM_PAGE_CHECK(mem) + +/* + * Each pageable resident page falls into one of three lists: + * + * free + * Available for allocation now. + * inactive + * Not referenced in any map, but still has an + * object/offset-page mapping, and may be dirty. + * This is the list of pages that should be + * paged out next. + * active + * A list of pages which have been placed in + * at least one physical map. This list is + * ordered, in LRU-like fashion. + */ + +extern +vm_page_t vm_page_queue_free; /* memory free queue */ +extern +vm_page_t vm_page_queue_fictitious; /* fictitious free queue */ +extern +queue_head_t vm_page_queue_active; /* active memory queue */ +extern +queue_head_t vm_page_queue_inactive; /* inactive memory queue */ + +extern +vm_offset_t first_phys_addr; /* physical address for first_page */ +extern +vm_offset_t last_phys_addr; /* physical address for last_page */ + +extern +int vm_page_free_count; /* How many pages are free? */ +extern +int vm_page_fictitious_count;/* How many fictitious pages are free? */ +extern +int vm_page_active_count; /* How many pages are active? */ +extern +int vm_page_inactive_count; /* How many pages are inactive? */ +extern +int vm_page_wire_count; /* How many pages are wired? */ +extern +int vm_page_free_target; /* How many do we want free? */ +extern +int vm_page_free_min; /* When to wakeup pageout */ +extern +int vm_page_inactive_target;/* How many do we want inactive? 
*/ +extern +int vm_page_free_reserved; /* How many pages reserved to do pageout */ +extern +int vm_page_laundry_count; /* How many pages being laundered? */ + +decl_simple_lock_data(extern,vm_page_queue_lock)/* lock on active and inactive + page queues */ +decl_simple_lock_data(extern,vm_page_queue_free_lock) + /* lock on free page queue */ + +extern unsigned int vm_page_free_wanted; + /* how many threads are waiting for memory */ + +extern vm_offset_t vm_page_fictitious_addr; + /* (fake) phys_addr of fictitious pages */ + +extern void vm_page_bootstrap( + vm_offset_t *startp, + vm_offset_t *endp); +extern void vm_page_module_init(void); + +extern void vm_page_create( + vm_offset_t start, + vm_offset_t end); +extern vm_page_t vm_page_lookup( + vm_object_t object, + vm_offset_t offset); +extern vm_page_t vm_page_grab_fictitious(void); +extern void vm_page_release_fictitious(vm_page_t); +extern boolean_t vm_page_convert(vm_page_t); +extern void vm_page_more_fictitious(void); +extern vm_page_t vm_page_grab(void); +extern void vm_page_release(vm_page_t); +extern void vm_page_wait(void (*)(void)); +extern vm_page_t vm_page_alloc( + vm_object_t object, + vm_offset_t offset); +extern void vm_page_init( + vm_page_t mem, + vm_offset_t phys_addr); +extern void vm_page_free(vm_page_t); +extern void vm_page_activate(vm_page_t); +extern void vm_page_deactivate(vm_page_t); +extern void vm_page_rename( + vm_page_t mem, + vm_object_t new_object, + vm_offset_t new_offset); +extern void vm_page_insert( + vm_page_t mem, + vm_object_t object, + vm_offset_t offset); +extern void vm_page_remove( + vm_page_t mem); + +extern void vm_page_zero_fill(vm_page_t); +extern void vm_page_copy(vm_page_t src_m, vm_page_t dest_m); + +extern void vm_page_wire(vm_page_t); +extern void vm_page_unwire(vm_page_t); + +extern void vm_set_page_size(void); + +#if MACH_VM_DEBUG +extern unsigned int vm_page_info( + hash_info_bucket_t *info, + unsigned int count); +#endif + +/* + * Functions implemented as macros + */ + +#define PAGE_ASSERT_WAIT(m, interruptible) \ + MACRO_BEGIN \ + (m)->wanted = TRUE; \ + assert_wait((event_t) (m), (interruptible)); \ + MACRO_END + +#define PAGE_WAKEUP_DONE(m) \ + MACRO_BEGIN \ + (m)->busy = FALSE; \ + if ((m)->wanted) { \ + (m)->wanted = FALSE; \ + thread_wakeup(((event_t) m)); \ + } \ + MACRO_END + +#define PAGE_WAKEUP(m) \ + MACRO_BEGIN \ + if ((m)->wanted) { \ + (m)->wanted = FALSE; \ + thread_wakeup((event_t) (m)); \ + } \ + MACRO_END + +#define VM_PAGE_FREE(p) \ + MACRO_BEGIN \ + vm_page_lock_queues(); \ + vm_page_free(p); \ + vm_page_unlock_queues(); \ + MACRO_END + +/* + * Macro to be used in place of pmap_enter() + */ + +#define PMAP_ENTER(pmap, virtual_address, page, protection, wired) \ + MACRO_BEGIN \ + pmap_enter( \ + (pmap), \ + (virtual_address), \ + (page)->phys_addr, \ + (protection) & ~(page)->page_lock, \ + (wired) \ + ); \ + MACRO_END + +#define VM_PAGE_WAIT(continuation) vm_page_wait(continuation) + +#define vm_page_lock_queues() simple_lock(&vm_page_queue_lock) +#define vm_page_unlock_queues() simple_unlock(&vm_page_queue_lock) + +#define VM_PAGE_QUEUES_REMOVE(mem) \ + MACRO_BEGIN \ + if (mem->active) { \ + queue_remove(&vm_page_queue_active, \ + mem, vm_page_t, pageq); \ + mem->active = FALSE; \ + vm_page_active_count--; \ + } \ + \ + if (mem->inactive) { \ + queue_remove(&vm_page_queue_inactive, \ + mem, vm_page_t, pageq); \ + mem->inactive = FALSE; \ + vm_page_inactive_count--; \ + } \ + MACRO_END + +#endif /* _VM_VM_PAGE_H_ */ diff --git a/vm/vm_pageout.c b/vm/vm_pageout.c 
new file mode 100644 index 00000000..411531bb --- /dev/null +++ b/vm/vm_pageout.c @@ -0,0 +1,924 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_pageout.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1985 + * + * The proverbial page-out daemon. + */ + +#include <mach_pagemap.h> +#include <norma_vm.h> + +#include <mach/mach_types.h> +#include <mach/memory_object.h> +#include "memory_object_default.h" +#include "memory_object_user.h" +#include <mach/vm_param.h> +#include <mach/vm_statistics.h> +#include <kern/counters.h> +#include <kern/thread.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <machine/vm_tuning.h> + + + +#ifndef VM_PAGEOUT_BURST_MAX +#define VM_PAGEOUT_BURST_MAX 10 /* number of pages */ +#endif VM_PAGEOUT_BURST_MAX + +#ifndef VM_PAGEOUT_BURST_MIN +#define VM_PAGEOUT_BURST_MIN 5 /* number of pages */ +#endif VM_PAGEOUT_BURST_MIN + +#ifndef VM_PAGEOUT_BURST_WAIT +#define VM_PAGEOUT_BURST_WAIT 30 /* milliseconds per page */ +#endif VM_PAGEOUT_BURST_WAIT + +#ifndef VM_PAGEOUT_EMPTY_WAIT +#define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */ +#endif VM_PAGEOUT_EMPTY_WAIT + +#ifndef VM_PAGEOUT_PAUSE_MAX +#define VM_PAGEOUT_PAUSE_MAX 10 /* number of pauses */ +#endif VM_PAGEOUT_PAUSE_MAX + +/* + * To obtain a reasonable LRU approximation, the inactive queue + * needs to be large enough to give pages on it a chance to be + * referenced a second time. This macro defines the fraction + * of active+inactive pages that should be inactive. + * The pageout daemon uses it to update vm_page_inactive_target. + * + * If vm_page_free_count falls below vm_page_free_target and + * vm_page_inactive_count is below vm_page_inactive_target, + * then the pageout daemon starts running. + */ + +#ifndef VM_PAGE_INACTIVE_TARGET +#define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 2 / 3) +#endif VM_PAGE_INACTIVE_TARGET + +/* + * Once the pageout daemon starts running, it keeps going + * until vm_page_free_count meets or exceeds vm_page_free_target. + */ + +#ifndef VM_PAGE_FREE_TARGET +#define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80) +#endif VM_PAGE_FREE_TARGET + +/* + * The pageout daemon always starts running once vm_page_free_count + * falls below vm_page_free_min. 
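+ *
+ * As a rough example of the two formulas above: with about 2000 pages
+ * available beyond the reserve, VM_PAGE_FREE_MIN comes to roughly 30
+ * pages and VM_PAGE_FREE_TARGET to roughly 40 (vm_pageout() below adds
+ * vm_page_free_reserved to both before using them).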
+ */ + +#ifndef VM_PAGE_FREE_MIN +#define VM_PAGE_FREE_MIN(free) (10 + (free) / 100) +#endif VM_PAGE_FREE_MIN + +/* + * When vm_page_free_count falls below vm_page_free_reserved, + * only vm-privileged threads can allocate pages. vm-privilege + * allows the pageout daemon and default pager (and any other + * associated threads needed for default pageout) to continue + * operation by dipping into the reserved pool of pages. + */ + +#ifndef VM_PAGE_FREE_RESERVED +#define VM_PAGE_FREE_RESERVED 15 +#endif VM_PAGE_FREE_RESERVED + +/* + * When vm_page_free_count falls below vm_pageout_reserved_internal, + * the pageout daemon no longer trusts external pagers to clean pages. + * External pagers are probably all wedged waiting for a free page. + * It forcibly double-pages dirty pages belonging to external objects, + * getting the pages to the default pager to clean. + */ + +#ifndef VM_PAGEOUT_RESERVED_INTERNAL +#define VM_PAGEOUT_RESERVED_INTERNAL(reserve) ((reserve) - 5) +#endif VM_PAGEOUT_RESERVED_INTERNAL + +/* + * When vm_page_free_count falls below vm_pageout_reserved_really, + * the pageout daemon stops work entirely to let the default pager + * catch up (assuming the default pager has pages to clean). + * Beyond this point, it is too dangerous to consume memory + * even for memory_object_data_write messages to the default pager. + */ + +#ifndef VM_PAGEOUT_RESERVED_REALLY +#define VM_PAGEOUT_RESERVED_REALLY(reserve) ((reserve) - 10) +#endif VM_PAGEOUT_RESERVED_REALLY + +extern void vm_pageout_continue(); +extern void vm_pageout_scan_continue(); + +unsigned int vm_pageout_reserved_internal = 0; +unsigned int vm_pageout_reserved_really = 0; + +unsigned int vm_pageout_burst_max = 0; +unsigned int vm_pageout_burst_min = 0; +unsigned int vm_pageout_burst_wait = 0; /* milliseconds per page */ +unsigned int vm_pageout_empty_wait = 0; /* milliseconds */ +unsigned int vm_pageout_pause_count = 0; +unsigned int vm_pageout_pause_max = 0; + +/* + * These variables record the pageout daemon's actions: + * how many pages it looks at and what happens to those pages. + * No locking needed because only one thread modifies the variables. + */ + +unsigned int vm_pageout_active = 0; /* debugging */ +unsigned int vm_pageout_inactive = 0; /* debugging */ +unsigned int vm_pageout_inactive_nolock = 0; /* debugging */ +unsigned int vm_pageout_inactive_busy = 0; /* debugging */ +unsigned int vm_pageout_inactive_absent = 0; /* debugging */ +unsigned int vm_pageout_inactive_used = 0; /* debugging */ +unsigned int vm_pageout_inactive_clean = 0; /* debugging */ +unsigned int vm_pageout_inactive_dirty = 0; /* debugging */ +unsigned int vm_pageout_inactive_double = 0; /* debugging */ + +#if NORMA_VM +/* + * Define them here, since they won't be defined by memory_object_user.h. + */ +extern kern_return_t memory_object_data_initialize(); +extern kern_return_t memory_object_data_write(); +#endif NORMA_VM + +/* + * Routine: vm_pageout_setup + * Purpose: + * Set up a page for pageout. + * + * Move or copy the page to a new object, as part + * of which it will be sent to its memory manager + * in a memory_object_data_write or memory_object_initialize + * message. + * + * The "paging_offset" argument specifies the offset + * of the page within its external memory object. + * + * The "new_object" and "new_offset" arguments + * indicate where the page should be moved. + * + * The "flush" argument specifies whether the page + * should be flushed from its object. If not, a + * copy of the page is moved to the new object. 
+ * + * In/Out conditions: + * The page in question must not be on any pageout queues, + * and must be busy. The object to which it belongs + * must be unlocked, and the caller must hold a paging + * reference to it. The new_object must not be locked. + * + * If the page is flushed from its original object, + * this routine returns a pointer to a place-holder page, + * inserted at the same offset, to block out-of-order + * requests for the page. The place-holder page must + * be freed after the data_write or initialize message + * has been sent. If the page is copied, + * the holding page is VM_PAGE_NULL. + * + * The original page is put on a paging queue and marked + * not busy on exit. + */ +vm_page_t +vm_pageout_setup(m, paging_offset, new_object, new_offset, flush) + register vm_page_t m; + vm_offset_t paging_offset; + register vm_object_t new_object; + vm_offset_t new_offset; + boolean_t flush; +{ + register vm_object_t old_object = m->object; + register vm_page_t holding_page = 0; /*'=0'to quiet gcc warnings*/ + register vm_page_t new_m; + + assert(m->busy && !m->absent && !m->fictitious); + + /* + * If we are not flushing the page, allocate a + * page in the object. If we cannot get the + * page, flush instead. + */ + if (!flush) { + vm_object_lock(new_object); + new_m = vm_page_alloc(new_object, new_offset); + if (new_m == VM_PAGE_NULL) + flush = TRUE; + vm_object_unlock(new_object); + } + + if (flush) { + /* + * Create a place-holder page where the old one was, + * to prevent anyone from attempting to page in this + * page while we`re unlocked. + */ + while ((holding_page = vm_page_grab_fictitious()) + == VM_PAGE_NULL) + vm_page_more_fictitious(); + + vm_object_lock(old_object); + vm_page_lock_queues(); + vm_page_remove(m); + vm_page_unlock_queues(); + PAGE_WAKEUP_DONE(m); + + vm_page_lock_queues(); + vm_page_insert(holding_page, old_object, m->offset); + vm_page_unlock_queues(); + + /* + * Record that this page has been written out + */ +#if MACH_PAGEMAP + vm_external_state_set(old_object->existence_info, + paging_offset, + VM_EXTERNAL_STATE_EXISTS); +#endif MACH_PAGEMAP + + vm_object_unlock(old_object); + + vm_object_lock(new_object); + + /* + * Move this page into the new object + */ + + vm_page_lock_queues(); + vm_page_insert(m, new_object, new_offset); + vm_page_unlock_queues(); + + m->dirty = TRUE; + m->precious = FALSE; + m->page_lock = VM_PROT_NONE; + m->unlock_request = VM_PROT_NONE; + } + else { + /* + * Copy the data into the new page, + * and mark the new page as clean. + */ + vm_page_copy(m, new_m); + + vm_object_lock(old_object); + m->dirty = FALSE; + pmap_clear_modify(m->phys_addr); + + /* + * Deactivate old page. + */ + vm_page_lock_queues(); + vm_page_deactivate(m); + vm_page_unlock_queues(); + + PAGE_WAKEUP_DONE(m); + + /* + * Record that this page has been written out + */ + +#if MACH_PAGEMAP + vm_external_state_set(old_object->existence_info, + paging_offset, + VM_EXTERNAL_STATE_EXISTS); +#endif MACH_PAGEMAP + + vm_object_unlock(old_object); + + vm_object_lock(new_object); + + /* + * Use the new page below. + */ + m = new_m; + m->dirty = TRUE; + assert(!m->precious); + PAGE_WAKEUP_DONE(m); + } + + /* + * Make the old page eligible for replacement again; if a + * user-supplied memory manager fails to release the page, + * it will be paged out again to the default memory manager. + * + * Note that pages written to the default memory manager + * must be wired down -- in return, it guarantees to free + * this page, rather than reusing it. 
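+ *
+ * Concretely, the code below distinguishes three cases: a page that
+ * vm_pageout_scan has already marked with the laundry bit goes to the
+ * front of the inactive queue for immediate double-paging, a page
+ * bound for the default pager (internal object) is wired and counted
+ * as laundry, and any other page is simply reactivated.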
+ */ + + vm_page_lock_queues(); + vm_stat.pageouts++; + if (m->laundry) { + /* + * vm_pageout_scan is telling us to put this page + * at the front of the inactive queue, so it will + * be immediately paged out to the default pager. + */ + + assert(!old_object->internal); + m->laundry = FALSE; + + queue_enter_first(&vm_page_queue_inactive, m, + vm_page_t, pageq); + m->inactive = TRUE; + vm_page_inactive_count++; + } else if (old_object->internal) { + m->laundry = TRUE; + vm_page_laundry_count++; + + vm_page_wire(m); + } else + vm_page_activate(m); + vm_page_unlock_queues(); + + /* + * Since IPC operations may block, we drop locks now. + * [The placeholder page is busy, and we still have + * paging_in_progress incremented.] + */ + + vm_object_unlock(new_object); + + /* + * Return the placeholder page to simplify cleanup. + */ + return (flush ? holding_page : VM_PAGE_NULL); +} + +/* + * Routine: vm_pageout_page + * Purpose: + * Causes the specified page to be written back to + * the appropriate memory object. + * + * The "initial" argument specifies whether this + * data is an initialization only, and should use + * memory_object_data_initialize instead of + * memory_object_data_write. + * + * The "flush" argument specifies whether the page + * should be flushed from the object. If not, a + * copy of the data is sent to the memory object. + * + * In/out conditions: + * The page in question must not be on any pageout queues. + * The object to which it belongs must be locked. + * Implementation: + * Move this page to a completely new object, if flushing; + * copy to a new page in a new object, if not. + */ +void +vm_pageout_page(m, initial, flush) + register vm_page_t m; + boolean_t initial; + boolean_t flush; +{ + vm_map_copy_t copy; + register vm_object_t old_object; + register vm_object_t new_object; + register vm_page_t holding_page; + vm_offset_t paging_offset; + kern_return_t rc; + boolean_t precious_clean; + + assert(m->busy); + + /* + * Cleaning but not flushing a clean precious page is a + * no-op. Remember whether page is clean and precious now + * because vm_pageout_setup will mark it dirty and not precious. + * + * XXX Check if precious_clean && !flush can really happen. + */ + precious_clean = (!m->dirty) && m->precious; + if (precious_clean && !flush) { + PAGE_WAKEUP_DONE(m); + return; + } + + /* + * Verify that we really want to clean this page. + */ + if (m->absent || m->error || (!m->dirty && !m->precious)) { + VM_PAGE_FREE(m); + return; + } + + /* + * Create a paging reference to let us play with the object. + */ + old_object = m->object; + paging_offset = m->offset + old_object->paging_offset; + vm_object_paging_begin(old_object); + vm_object_unlock(old_object); + + /* + * Allocate a new object into which we can put the page. + */ + new_object = vm_object_allocate(PAGE_SIZE); + + /* + * Move the page into the new object. + */ + holding_page = vm_pageout_setup(m, + paging_offset, + new_object, + 0, /* new offset */ + flush); /* flush */ + + rc = vm_map_copyin_object(new_object, 0, PAGE_SIZE, ©); + assert(rc == KERN_SUCCESS); + + if (initial || old_object->use_old_pageout) { + rc = (*(initial ? 
memory_object_data_initialize + : memory_object_data_write)) + (old_object->pager, + old_object->pager_request, + paging_offset, (pointer_t) copy, PAGE_SIZE); + } + else { + rc = memory_object_data_return( + old_object->pager, + old_object->pager_request, + paging_offset, (pointer_t) copy, PAGE_SIZE, + !precious_clean, !flush); + } + + if (rc != KERN_SUCCESS) + vm_map_copy_discard(copy); + + /* + * Clean up. + */ + vm_object_lock(old_object); + if (holding_page != VM_PAGE_NULL) + VM_PAGE_FREE(holding_page); + vm_object_paging_end(old_object); +} + +/* + * vm_pageout_scan does the dirty work for the pageout daemon. + * It returns with vm_page_queue_free_lock held and + * vm_page_free_wanted == 0. + */ + +void vm_pageout_scan() +{ + unsigned int burst_count; + + /* + * We want to gradually dribble pages from the active queue + * to the inactive queue. If we let the inactive queue get + * very small, and then suddenly dump many pages into it, + * those pages won't get a sufficient chance to be referenced + * before we start taking them from the inactive queue. + * + * We must limit the rate at which we send pages to the pagers. + * data_write messages consume memory, for message buffers and + * for map-copy objects. If we get too far ahead of the pagers, + * we can potentially run out of memory. + * + * We can use the laundry count to limit directly the number + * of pages outstanding to the default pager. A similar + * strategy for external pagers doesn't work, because + * external pagers don't have to deallocate the pages sent them, + * and because we might have to send pages to external pagers + * even if they aren't processing writes. So we also + * use a burst count to limit writes to external pagers. + * + * When memory is very tight, we can't rely on external pagers to + * clean pages. They probably aren't running, because they + * aren't vm-privileged. If we kept sending dirty pages to them, + * we could exhaust the free list. However, we can't just ignore + * pages belonging to external objects, because there might be no + * pages belonging to internal objects. Hence, we get the page + * into an internal object and then immediately double-page it, + * sending it to the default pager. + * + * consider_zone_gc should be last, because the other operations + * might return memory to zones. When we pause we use + * vm_pageout_scan_continue as our continuation, so we will + * reenter vm_pageout_scan periodically and attempt to reclaim + * internal memory even if we never reach vm_page_free_target. + */ + + Restart: + stack_collect(); + net_kmsg_collect(); + consider_task_collect(); + consider_thread_collect(); + consider_zone_gc(); + + for (burst_count = 0;;) { + register vm_page_t m; + register vm_object_t object; + unsigned int free_count; + + /* + * Recalculate vm_page_inactivate_target. + */ + + vm_page_lock_queues(); + vm_page_inactive_target = + VM_PAGE_INACTIVE_TARGET(vm_page_active_count + + vm_page_inactive_count); + + /* + * Move pages from active to inactive. + */ + + while ((vm_page_inactive_count < vm_page_inactive_target) && + !queue_empty(&vm_page_queue_active)) { + register vm_object_t obj; + + vm_pageout_active++; + m = (vm_page_t) queue_first(&vm_page_queue_active); + assert(m->active && !m->inactive); + + obj = m->object; + if (!vm_object_lock_try(obj)) { + /* + * Move page to end and continue. 
+ */ + + queue_remove(&vm_page_queue_active, m, + vm_page_t, pageq); + queue_enter(&vm_page_queue_active, m, + vm_page_t, pageq); + vm_page_unlock_queues(); + vm_page_lock_queues(); + continue; + } + + /* + * If the page is busy, then we pull it + * off the active queue and leave it alone. + */ + + if (m->busy) { + vm_object_unlock(obj); + queue_remove(&vm_page_queue_active, m, + vm_page_t, pageq); + m->active = FALSE; + vm_page_active_count--; + continue; + } + + /* + * Deactivate the page while holding the object + * locked, so we know the page is still not busy. + * This should prevent races between pmap_enter + * and pmap_clear_reference. The page might be + * absent or fictitious, but vm_page_deactivate + * can handle that. + */ + + vm_page_deactivate(m); + vm_object_unlock(obj); + } + + /* + * We are done if we have met our target *and* + * nobody is still waiting for a page. + */ + + simple_lock(&vm_page_queue_free_lock); + free_count = vm_page_free_count; + if ((free_count >= vm_page_free_target) & + (vm_page_free_wanted == 0)) { + vm_page_unlock_queues(); + break; + } + simple_unlock(&vm_page_queue_free_lock); + + /* + * Sometimes we have to pause: + * 1) No inactive pages - nothing to do. + * 2) Flow control - wait for pagers to catch up. + * 3) Extremely low memory - sending out dirty pages + * consumes memory. We don't take the risk of doing + * this if the default pager already has work to do. + */ + + if (queue_empty(&vm_page_queue_inactive) || + (burst_count >= vm_pageout_burst_max) || + (vm_page_laundry_count >= vm_pageout_burst_max) || + ((free_count < vm_pageout_reserved_really) && + (vm_page_laundry_count > 0))) { + unsigned int pages, msecs; + + /* + * vm_pageout_burst_wait is msecs/page. + * If there is nothing for us to do, we wait + * at least vm_pageout_empty_wait msecs. + */ + + if (vm_page_laundry_count > burst_count) + pages = vm_page_laundry_count; + else + pages = burst_count; + msecs = pages * vm_pageout_burst_wait; + + if (queue_empty(&vm_page_queue_inactive) && + (msecs < vm_pageout_empty_wait)) + msecs = vm_pageout_empty_wait; + vm_page_unlock_queues(); + + thread_will_wait_with_timeout(current_thread(), msecs); + counter(c_vm_pageout_scan_block++); + thread_block(vm_pageout_scan_continue); +#ifndef CONTINUATIONS + /* + * Unfortunately, we don't have call_continuation + * so we can't rely on tail-recursion. + */ + + vm_pageout_scan_continue(); + goto Restart; +#else /* CONTINUATIONS */ + call_continuation(vm_pageout_scan_continue); + /*NOTREACHED*/ +#endif /* CONTINUATIONS */ + } + + vm_pageout_inactive++; + m = (vm_page_t) queue_first(&vm_page_queue_inactive); + assert(!m->active && m->inactive); + object = m->object; + + /* + * Try to lock object; since we've got the + * page queues lock, we can only try for this one. + */ + + if (!vm_object_lock_try(object)) { + /* + * Move page to end and continue. + */ + + queue_remove(&vm_page_queue_inactive, m, + vm_page_t, pageq); + queue_enter(&vm_page_queue_inactive, m, + vm_page_t, pageq); + vm_page_unlock_queues(); + vm_pageout_inactive_nolock++; + continue; + } + + /* + * Remove the page from the inactive list. + */ + + queue_remove(&vm_page_queue_inactive, m, vm_page_t, pageq); + vm_page_inactive_count--; + m->inactive = FALSE; + + if (m->busy || !object->alive) { + /* + * Somebody is already playing with this page. + * Leave it off the pageout queues. + */ + + vm_page_unlock_queues(); + vm_object_unlock(object); + vm_pageout_inactive_busy++; + continue; + } + + /* + * If it's absent, we can reclaim the page. 
+ */ + + if (m->absent) { + vm_pageout_inactive_absent++; + reclaim_page: + vm_page_free(m); + vm_page_unlock_queues(); + vm_object_unlock(object); + continue; + } + + /* + * If it's being used, reactivate. + * (Fictitious pages are either busy or absent.) + */ + + assert(!m->fictitious); + if (m->reference || pmap_is_referenced(m->phys_addr)) { + vm_object_unlock(object); + vm_page_activate(m); + vm_stat.reactivations++; + vm_page_unlock_queues(); + vm_pageout_inactive_used++; + continue; + } + + /* + * Eliminate all mappings. + */ + + m->busy = TRUE; + pmap_page_protect(m->phys_addr, VM_PROT_NONE); + if (!m->dirty) + m->dirty = pmap_is_modified(m->phys_addr); + + /* + * If it's clean and not precious, we can free the page. + */ + + if (!m->dirty && !m->precious) { + vm_pageout_inactive_clean++; + goto reclaim_page; + } + + /* + * If we are very low on memory, then we can't + * rely on an external pager to clean a dirty page, + * because external pagers are not vm-privileged. + * + * The laundry bit tells vm_pageout_setup to + * put the page back at the front of the inactive + * queue instead of activating the page. Hence, + * we will pick the page up again immediately and + * resend it to the default pager. + */ + + assert(!m->laundry); + if ((free_count < vm_pageout_reserved_internal) && + !object->internal) { + m->laundry = TRUE; + vm_pageout_inactive_double++; + } + vm_page_unlock_queues(); + + /* + * If there is no memory object for the page, create + * one and hand it to the default pager. + * [First try to collapse, so we don't create + * one unnecessarily.] + */ + + if (!object->pager_initialized) + vm_object_collapse(object); + if (!object->pager_initialized) + vm_object_pager_create(object); + if (!object->pager_initialized) + panic("vm_pageout_scan"); + + vm_pageout_inactive_dirty++; + vm_pageout_page(m, FALSE, TRUE); /* flush it */ + vm_object_unlock(object); + burst_count++; + } +} + +void vm_pageout_scan_continue() +{ + /* + * We just paused to let the pagers catch up. + * If vm_page_laundry_count is still high, + * then we aren't waiting long enough. + * If we have paused some vm_pageout_pause_max times without + * adjusting vm_pageout_burst_wait, it might be too big, + * so we decrease it. + */ + + vm_page_lock_queues(); + if (vm_page_laundry_count > vm_pageout_burst_min) { + vm_pageout_burst_wait++; + vm_pageout_pause_count = 0; + } else if (++vm_pageout_pause_count > vm_pageout_pause_max) { + vm_pageout_burst_wait = (vm_pageout_burst_wait * 3) / 4; + if (vm_pageout_burst_wait < 1) + vm_pageout_burst_wait = 1; + vm_pageout_pause_count = 0; + } + vm_page_unlock_queues(); + +#ifdef CONTINUATIONS + vm_pageout_continue(); + /*NOTREACHED*/ +#endif /* CONTINUATIONS */ +} + +/* + * vm_pageout is the high level pageout daemon. + */ + +void vm_pageout_continue() +{ + /* + * The pageout daemon is never done, so loop forever. + * We should call vm_pageout_scan at least once each + * time we are woken, even if vm_page_free_wanted is + * zero, to check vm_page_free_target and + * vm_page_inactive_target. + */ + + for (;;) { + vm_pageout_scan(); + /* we hold vm_page_queue_free_lock now */ + assert(vm_page_free_wanted == 0); + + assert_wait(&vm_page_free_wanted, FALSE); + simple_unlock(&vm_page_queue_free_lock); + counter(c_vm_pageout_block++); + thread_block(vm_pageout_continue); + } +} + +void vm_pageout() +{ + int free_after_reserve; + + current_thread()->vm_privilege = TRUE; + stack_privilege(current_thread()); + + /* + * Initialize some paging parameters. 
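+ *
+ * (For a sense of scale, under the default VM_PAGE_FREE_RESERVED of 15
+ * this leaves vm_pageout_reserved_internal at 10 and
+ * vm_pageout_reserved_really at 5 pages; free_min and free_target are
+ * then derived from whatever is free beyond the reserve.)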
+ */ + + if (vm_pageout_burst_max == 0) + vm_pageout_burst_max = VM_PAGEOUT_BURST_MAX; + + if (vm_pageout_burst_min == 0) + vm_pageout_burst_min = VM_PAGEOUT_BURST_MIN; + + if (vm_pageout_burst_wait == 0) + vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT; + + if (vm_pageout_empty_wait == 0) + vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT; + + if (vm_page_free_reserved == 0) + vm_page_free_reserved = VM_PAGE_FREE_RESERVED; + + if (vm_pageout_pause_max == 0) + vm_pageout_pause_max = VM_PAGEOUT_PAUSE_MAX; + + if (vm_pageout_reserved_internal == 0) + vm_pageout_reserved_internal = + VM_PAGEOUT_RESERVED_INTERNAL(vm_page_free_reserved); + + if (vm_pageout_reserved_really == 0) + vm_pageout_reserved_really = + VM_PAGEOUT_RESERVED_REALLY(vm_page_free_reserved); + + free_after_reserve = vm_page_free_count - vm_page_free_reserved; + + if (vm_page_free_min == 0) + vm_page_free_min = vm_page_free_reserved + + VM_PAGE_FREE_MIN(free_after_reserve); + + if (vm_page_free_target == 0) + vm_page_free_target = vm_page_free_reserved + + VM_PAGE_FREE_TARGET(free_after_reserve); + + if (vm_page_free_target < vm_page_free_min + 5) + vm_page_free_target = vm_page_free_min + 5; + + /* + * vm_pageout_scan will set vm_page_inactive_target. + */ + + vm_pageout_continue(); + /*NOTREACHED*/ +} diff --git a/vm/vm_pageout.h b/vm/vm_pageout.h new file mode 100644 index 00000000..5b47a5e0 --- /dev/null +++ b/vm/vm_pageout.h @@ -0,0 +1,46 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_pageout.h + * Author: Avadis Tevanian, Jr. + * Date: 1986 + * + * Declarations for the pageout daemon interface. + */ + +#ifndef _VM_VM_PAGEOUT_H_ +#define _VM_VM_PAGEOUT_H_ + +#include <vm/vm_page.h> + +/* + * Exported routines. + */ + +extern vm_page_t vm_pageout_setup(); +extern void vm_pageout_page(); + +#endif _VM_VM_PAGEOUT_H_ diff --git a/vm/vm_resident.c b/vm/vm_resident.c new file mode 100644 index 00000000..5c4f2822 --- /dev/null +++ b/vm/vm_resident.c @@ -0,0 +1,1505 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University. + * Copyright (c) 1993,1994 The University of Utah and + * the Computer Systems Laboratory (CSL). + * All rights reserved. 
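/*
 * [Editorial sketch, not part of this commit.]
 * The initialization above derives the pageout thresholds from the
 * boot-time free count.  VM_PAGE_FREE_MIN and VM_PAGE_FREE_TARGET are
 * defined outside the portion shown here, so the fractions below are
 * purely hypothetical stand-ins, used only to make the arithmetic
 * concrete:
 */
#include <stdio.h>

#define EX_FREE_RESERVED	50		/* hypothetical */
#define EX_FREE_MIN(free)	((free) / 20)	/* hypothetical */
#define EX_FREE_TARGET(free)	((free) / 10)	/* hypothetical */

int main(void)
{
	int free_count = 4000;			/* pages free at boot, say */
	int reserved = EX_FREE_RESERVED;
	int after_reserve = free_count - reserved;

	int free_min = reserved + EX_FREE_MIN(after_reserve);
	int free_target = reserved + EX_FREE_TARGET(after_reserve);

	/* Keep the target comfortably above the minimum, as above. */
	if (free_target < free_min + 5)
		free_target = free_min + 5;

	printf("min=%d target=%d\n", free_min, free_target);
	return 0;
}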
+ * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON, THE UNIVERSITY OF UTAH AND CSL ALLOW FREE USE OF + * THIS SOFTWARE IN ITS "AS IS" CONDITION, AND DISCLAIM ANY LIABILITY + * OF ANY KIND FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF + * THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_page.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * + * Resident memory management module. + */ +#include <cpus.h> + +#include <mach/vm_prot.h> +#include <kern/counters.h> +#include <kern/sched_prim.h> +#include <kern/task.h> +#include <kern/thread.h> +#include <mach/vm_statistics.h> +#include "vm_param.h" +#include <kern/xpr.h> +#include <kern/zalloc.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pageout.h> +#include <vm/vm_kern.h> + +#include <mach_vm_debug.h> +#if MACH_VM_DEBUG +#include <mach/kern_return.h> +#include <mach_debug/hash_info.h> +#include <vm/vm_user.h> +#endif + +/* in zalloc.c XXX */ +extern vm_offset_t zdata; +extern vm_size_t zdata_size; + +/* + * Associated with eacn page of user-allocatable memory is a + * page structure. + */ + +/* + * These variables record the values returned by vm_page_bootstrap, + * for debugging purposes. The implementation of pmap_steal_memory + * and pmap_startup here also uses them internally. + */ + +vm_offset_t virtual_space_start; +vm_offset_t virtual_space_end; + +/* + * The vm_page_lookup() routine, which provides for fast + * (virtual memory object, offset) to page lookup, employs + * the following hash table. The vm_page_{insert,remove} + * routines install and remove associations in the table. + * [This table is often called the virtual-to-physical, + * or VP, table.] + */ +typedef struct { + decl_simple_lock_data(,lock) + vm_page_t pages; +} vm_page_bucket_t; + +vm_page_bucket_t *vm_page_buckets; /* Array of buckets */ +unsigned int vm_page_bucket_count = 0; /* How big is array? */ +unsigned int vm_page_hash_mask; /* Mask for hash function */ + +/* + * Resident page structures are initialized from + * a template (see vm_page_alloc). + * + * When adding a new field to the virtual memory + * object structure, be sure to add initialization + * (see vm_page_bootstrap). + */ +struct vm_page vm_page_template; + +/* + * Resident pages that represent real memory + * are allocated from a free list. + */ +vm_page_t vm_page_queue_free; +vm_page_t vm_page_queue_fictitious; +decl_simple_lock_data(,vm_page_queue_free_lock) +unsigned int vm_page_free_wanted; +int vm_page_free_count; +int vm_page_fictitious_count; + +unsigned int vm_page_free_count_minimum; /* debugging */ + +/* + * Occasionally, the virtual memory system uses + * resident page structures that do not refer to + * real pages, for example to leave a page with + * important state information in the VP table. 
+ * + * These page structures are allocated the way + * most other kernel structures are. + */ +zone_t vm_page_zone; + +/* + * Fictitious pages don't have a physical address, + * but we must initialize phys_addr to something. + * For debugging, this should be a strange value + * that the pmap module can recognize in assertions. + */ +vm_offset_t vm_page_fictitious_addr = (vm_offset_t) -1; + +/* + * Resident page structures are also chained on + * queues that are used by the page replacement + * system (pageout daemon). These queues are + * defined here, but are shared by the pageout + * module. + */ +queue_head_t vm_page_queue_active; +queue_head_t vm_page_queue_inactive; +decl_simple_lock_data(,vm_page_queue_lock) +int vm_page_active_count; +int vm_page_inactive_count; +int vm_page_wire_count; + +/* + * Several page replacement parameters are also + * shared with this module, so that page allocation + * (done here in vm_page_alloc) can trigger the + * pageout daemon. + */ +int vm_page_free_target = 0; +int vm_page_free_min = 0; +int vm_page_inactive_target = 0; +int vm_page_free_reserved = 0; +int vm_page_laundry_count = 0; + +/* + * The VM system has a couple of heuristics for deciding + * that pages are "uninteresting" and should be placed + * on the inactive queue as likely candidates for replacement. + * These variables let the heuristics be controlled at run-time + * to make experimentation easier. + */ + +boolean_t vm_page_deactivate_behind = TRUE; +boolean_t vm_page_deactivate_hint = TRUE; + +/* + * vm_page_bootstrap: + * + * Initializes the resident memory module. + * + * Allocates memory for the page cells, and + * for the object/offset-to-page hash table headers. + * Each page cell is initialized and placed on the free list. + * Returns the range of available kernel virtual memory. + */ + +void vm_page_bootstrap( + vm_offset_t *startp, + vm_offset_t *endp) +{ + register vm_page_t m; + int i; + + /* + * Initialize the vm_page template. + */ + + m = &vm_page_template; + m->object = VM_OBJECT_NULL; /* reset later */ + m->offset = 0; /* reset later */ + m->wire_count = 0; + + m->inactive = FALSE; + m->active = FALSE; + m->laundry = FALSE; + m->free = FALSE; + + m->busy = TRUE; + m->wanted = FALSE; + m->tabled = FALSE; + m->fictitious = FALSE; + m->private = FALSE; + m->absent = FALSE; + m->error = FALSE; + m->dirty = FALSE; + m->precious = FALSE; + m->reference = FALSE; + + m->phys_addr = 0; /* reset later */ + + m->page_lock = VM_PROT_NONE; + m->unlock_request = VM_PROT_NONE; + + /* + * Initialize the page queues. + */ + + simple_lock_init(&vm_page_queue_free_lock); + simple_lock_init(&vm_page_queue_lock); + + vm_page_queue_free = VM_PAGE_NULL; + vm_page_queue_fictitious = VM_PAGE_NULL; + queue_init(&vm_page_queue_active); + queue_init(&vm_page_queue_inactive); + + vm_page_free_wanted = 0; + + /* + * Steal memory for the zone system. + */ + + kentry_data_size = kentry_count * sizeof(struct vm_map_entry); + kentry_data = pmap_steal_memory(kentry_data_size); + + zdata = pmap_steal_memory(zdata_size); + + /* + * Allocate (and initialize) the virtual-to-physical + * table hash buckets. + * + * The number of buckets should be a power of two to + * get a good hash function. The following computation + * chooses the first power of two that is greater + * than the number of physical pages in the system. 
+ */ + + if (vm_page_bucket_count == 0) { + unsigned int npages = pmap_free_pages(); + + vm_page_bucket_count = 1; + while (vm_page_bucket_count < npages) + vm_page_bucket_count <<= 1; + } + + vm_page_hash_mask = vm_page_bucket_count - 1; + + if (vm_page_hash_mask & vm_page_bucket_count) + printf("vm_page_bootstrap: WARNING -- strange page hash\n"); + + vm_page_buckets = (vm_page_bucket_t *) + pmap_steal_memory(vm_page_bucket_count * + sizeof(vm_page_bucket_t)); + + for (i = 0; i < vm_page_bucket_count; i++) { + register vm_page_bucket_t *bucket = &vm_page_buckets[i]; + + bucket->pages = VM_PAGE_NULL; + simple_lock_init(&bucket->lock); + } + + /* + * Machine-dependent code allocates the resident page table. + * It uses vm_page_init to initialize the page frames. + * The code also returns to us the virtual space available + * to the kernel. We don't trust the pmap module + * to get the alignment right. + */ + + pmap_startup(&virtual_space_start, &virtual_space_end); + virtual_space_start = round_page(virtual_space_start); + virtual_space_end = trunc_page(virtual_space_end); + + *startp = virtual_space_start; + *endp = virtual_space_end; + + printf("vm_page_bootstrap: %d free pages\n", vm_page_free_count); + vm_page_free_count_minimum = vm_page_free_count; +} + +#ifndef MACHINE_PAGES +/* + * We implement pmap_steal_memory and pmap_startup with the help + * of two simpler functions, pmap_virtual_space and pmap_next_page. + */ + +vm_offset_t pmap_steal_memory( + vm_size_t size) +{ + vm_offset_t addr, vaddr, paddr; + + /* + * We round the size to an integer multiple. + */ + + size = (size + 3) &~ 3; + + /* + * If this is the first call to pmap_steal_memory, + * we have to initialize ourself. + */ + + if (virtual_space_start == virtual_space_end) { + pmap_virtual_space(&virtual_space_start, &virtual_space_end); + + /* + * The initial values must be aligned properly, and + * we don't trust the pmap module to do it right. + */ + + virtual_space_start = round_page(virtual_space_start); + virtual_space_end = trunc_page(virtual_space_end); + } + + /* + * Allocate virtual memory for this request. + */ + + addr = virtual_space_start; + virtual_space_start += size; + + /* + * Allocate and map physical pages to back new virtual pages. + */ + + for (vaddr = round_page(addr); + vaddr < addr + size; + vaddr += PAGE_SIZE) { + if (!pmap_next_page(&paddr)) + panic("pmap_steal_memory"); + + /* + * XXX Logically, these mappings should be wired, + * but some pmap modules barf if they are. + */ + + pmap_enter(kernel_pmap, vaddr, paddr, + VM_PROT_READ|VM_PROT_WRITE, FALSE); + } + + return addr; +} + +void pmap_startup( + vm_offset_t *startp, + vm_offset_t *endp) +{ + unsigned int i, npages, pages_initialized; + vm_page_t pages; + vm_offset_t paddr; + + /* + * We calculate how many page frames we will have + * and then allocate the page structures in one chunk. + */ + + npages = ((PAGE_SIZE * pmap_free_pages() + + (round_page(virtual_space_start) - virtual_space_start)) / + (PAGE_SIZE + sizeof *pages)); + + pages = (vm_page_t) pmap_steal_memory(npages * sizeof *pages); + + /* + * Initialize the page frames. + */ + + for (i = 0, pages_initialized = 0; i < npages; i++) { + if (!pmap_next_page(&paddr)) + break; + + vm_page_init(&pages[i], paddr); + pages_initialized++; + } + + /* + * Release pages in reverse order so that physical pages + * initially get allocated in ascending addresses. This keeps + * the devices (which must address physical memory) happy if + * they require several consecutive pages. 
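/*
 * [Editorial sketch, not part of this commit.]
 * pmap_startup above sizes the vm_page array so that the remaining
 * physical memory pays for both the page frames and their descriptors:
 * each usable page costs PAGE_SIZE bytes of frame plus sizeof (struct
 * vm_page) bytes of bookkeeping.  A standalone restatement with
 * illustrative numbers; the small leftover term from rounding
 * virtual_space_start is ignored here for simplicity:
 */
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* PAGE_SIZE, say */
	unsigned long descriptor = 68;		/* sizeof (struct vm_page), hypothetical */
	unsigned long free_pages = 2048;	/* pmap_free_pages(), say */

	unsigned long npages =
		(page_size * free_pages) / (page_size + descriptor);

	/* Slightly fewer than free_pages; the difference funds the descriptors. */
	printf("%lu of %lu pages remain usable\n", npages, free_pages);
	return 0;
}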
+ */ + + for (i = pages_initialized; i > 0; i--) { + vm_page_release(&pages[i - 1]); + } + + /* + * We have to re-align virtual_space_start, + * because pmap_steal_memory has been using it. + */ + + virtual_space_start = round_page(virtual_space_start); + + *startp = virtual_space_start; + *endp = virtual_space_end; +} +#endif /* MACHINE_PAGES */ + +/* + * Routine: vm_page_module_init + * Purpose: + * Second initialization pass, to be done after + * the basic VM system is ready. + */ +void vm_page_module_init(void) +{ + vm_page_zone = zinit((vm_size_t) sizeof(struct vm_page), + VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS, + PAGE_SIZE, + 0, "vm pages"); +} + +/* + * Routine: vm_page_create + * Purpose: + * After the VM system is up, machine-dependent code + * may stumble across more physical memory. For example, + * memory that it was reserving for a frame buffer. + * vm_page_create turns this memory into available pages. + */ + +void vm_page_create( + vm_offset_t start, + vm_offset_t end) +{ + vm_offset_t paddr; + vm_page_t m; + + for (paddr = round_page(start); + paddr < trunc_page(end); + paddr += PAGE_SIZE) { + m = (vm_page_t) zalloc(vm_page_zone); + if (m == VM_PAGE_NULL) + panic("vm_page_create"); + + vm_page_init(m, paddr); + vm_page_release(m); + } +} + +/* + * vm_page_hash: + * + * Distributes the object/offset key pair among hash buckets. + * + * NOTE: To get a good hash function, the bucket count should + * be a power of two. + */ +#define vm_page_hash(object, offset) \ + (((unsigned int)(vm_offset_t)object + (unsigned int)atop(offset)) \ + & vm_page_hash_mask) + +/* + * vm_page_insert: [ internal use only ] + * + * Inserts the given mem entry into the object/object-page + * table and object list. + * + * The object and page must be locked. + */ + +void vm_page_insert( + register vm_page_t mem, + register vm_object_t object, + register vm_offset_t offset) +{ + register vm_page_bucket_t *bucket; + + VM_PAGE_CHECK(mem); + + if (mem->tabled) + panic("vm_page_insert"); + + /* + * Record the object/offset pair in this page + */ + + mem->object = object; + mem->offset = offset; + + /* + * Insert it into the object_object/offset hash table + */ + + bucket = &vm_page_buckets[vm_page_hash(object, offset)]; + simple_lock(&bucket->lock); + mem->next = bucket->pages; + bucket->pages = mem; + simple_unlock(&bucket->lock); + + /* + * Now link into the object's list of backed pages. + */ + + queue_enter(&object->memq, mem, vm_page_t, listq); + mem->tabled = TRUE; + + /* + * Show that the object has one more resident page. + */ + + object->resident_page_count++; + + /* + * Detect sequential access and inactivate previous page. + * We ignore busy pages. + */ + + if (vm_page_deactivate_behind && + (offset == object->last_alloc + PAGE_SIZE)) { + vm_page_t last_mem; + + last_mem = vm_page_lookup(object, object->last_alloc); + if ((last_mem != VM_PAGE_NULL) && !last_mem->busy) + vm_page_deactivate(last_mem); + } + object->last_alloc = offset; +} + +/* + * vm_page_replace: + * + * Exactly like vm_page_insert, except that we first + * remove any existing page at the given offset in object + * and we don't do deactivate-behind. + * + * The object and page must be locked. 
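/*
 * [Editorial sketch, not part of this commit.]
 * vm_page_hash and vm_page_insert above implement an open-hash table keyed
 * by the (object, offset) pair: the object pointer plus the page number is
 * masked with bucket_count - 1, which is why bootstrap forces the bucket
 * count to a power of two.  A user-space miniature of the same structure,
 * with locking omitted and illustrative names:
 */
#include <stddef.h>

#define EX_PAGE_SHIFT	12
#define EX_BUCKETS	256			/* must be a power of two */

struct ex_page {
	struct ex_page	*next;			/* hash chain, like mem->next */
	void		*object;
	unsigned long	offset;
};

static struct ex_page *ex_buckets[EX_BUCKETS];

static unsigned int
ex_hash(void *object, unsigned long offset)
{
	return (unsigned int)
		(((unsigned long) object + (offset >> EX_PAGE_SHIFT))
		 & (EX_BUCKETS - 1));
}

static void
ex_insert(struct ex_page *p, void *object, unsigned long offset)
{
	struct ex_page **bucket = &ex_buckets[ex_hash(object, offset)];

	p->object = object;
	p->offset = offset;
	p->next = *bucket;			/* push on the chain head */
	*bucket = p;
}

static struct ex_page *
ex_lookup(void *object, unsigned long offset)
{
	struct ex_page *p = ex_buckets[ex_hash(object, offset)];

	while (p != NULL && (p->object != object || p->offset != offset))
		p = p->next;
	return p;
}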
+ */ + +void vm_page_replace( + register vm_page_t mem, + register vm_object_t object, + register vm_offset_t offset) +{ + register vm_page_bucket_t *bucket; + + VM_PAGE_CHECK(mem); + + if (mem->tabled) + panic("vm_page_replace"); + + /* + * Record the object/offset pair in this page + */ + + mem->object = object; + mem->offset = offset; + + /* + * Insert it into the object_object/offset hash table, + * replacing any page that might have been there. + */ + + bucket = &vm_page_buckets[vm_page_hash(object, offset)]; + simple_lock(&bucket->lock); + if (bucket->pages) { + vm_page_t *mp = &bucket->pages; + register vm_page_t m = *mp; + do { + if (m->object == object && m->offset == offset) { + /* + * Remove page from bucket and from object, + * and return it to the free list. + */ + *mp = m->next; + queue_remove(&object->memq, m, vm_page_t, + listq); + m->tabled = FALSE; + object->resident_page_count--; + + /* + * Return page to the free list. + * Note the page is not tabled now, so this + * won't self-deadlock on the bucket lock. + */ + + vm_page_free(m); + break; + } + mp = &m->next; + } while ((m = *mp) != 0); + mem->next = bucket->pages; + } else { + mem->next = VM_PAGE_NULL; + } + bucket->pages = mem; + simple_unlock(&bucket->lock); + + /* + * Now link into the object's list of backed pages. + */ + + queue_enter(&object->memq, mem, vm_page_t, listq); + mem->tabled = TRUE; + + /* + * And show that the object has one more resident + * page. + */ + + object->resident_page_count++; +} + +/* + * vm_page_remove: [ internal use only ] + * + * Removes the given mem entry from the object/offset-page + * table and the object page list. + * + * The object and page must be locked. + */ + +void vm_page_remove( + register vm_page_t mem) +{ + register vm_page_bucket_t *bucket; + register vm_page_t this; + + assert(mem->tabled); + VM_PAGE_CHECK(mem); + + /* + * Remove from the object_object/offset hash table + */ + + bucket = &vm_page_buckets[vm_page_hash(mem->object, mem->offset)]; + simple_lock(&bucket->lock); + if ((this = bucket->pages) == mem) { + /* optimize for common case */ + + bucket->pages = mem->next; + } else { + register vm_page_t *prev; + + for (prev = &this->next; + (this = *prev) != mem; + prev = &this->next) + continue; + *prev = this->next; + } + simple_unlock(&bucket->lock); + + /* + * Now remove from the object's list of backed pages. + */ + + queue_remove(&mem->object->memq, mem, vm_page_t, listq); + + /* + * And show that the object has one fewer resident + * page. + */ + + mem->object->resident_page_count--; + + mem->tabled = FALSE; +} + +/* + * vm_page_lookup: + * + * Returns the page associated with the object/offset + * pair specified; if none is found, VM_PAGE_NULL is returned. + * + * The object must be locked. No side effects. + */ + +vm_page_t vm_page_lookup( + register vm_object_t object, + register vm_offset_t offset) +{ + register vm_page_t mem; + register vm_page_bucket_t *bucket; + + /* + * Search the hash table for this object/offset pair + */ + + bucket = &vm_page_buckets[vm_page_hash(object, offset)]; + + simple_lock(&bucket->lock); + for (mem = bucket->pages; mem != VM_PAGE_NULL; mem = mem->next) { + VM_PAGE_CHECK(mem); + if ((mem->object == object) && (mem->offset == offset)) + break; + } + simple_unlock(&bucket->lock); + return mem; +} + +/* + * vm_page_rename: + * + * Move the given memory entry from its + * current object to the specified target object/offset. + * + * The object must be locked. 
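/*
 * [Editorial sketch, not part of this commit.]
 * vm_page_replace and vm_page_remove above walk the bucket chain through a
 * pointer-to-pointer ("mp" / "prev"), so unlinking a node is a single
 * store with no special case for the head of the list.  The same idiom in
 * isolation, on a generic singly linked list:
 */
struct ex_node {
	struct ex_node *next;
	int key;
};

/* Remove the first node with the given key; return it, or null. */
static struct ex_node *
ex_unlink(struct ex_node **head, int key)
{
	struct ex_node **pp, *n;

	for (pp = head; (n = *pp) != 0; pp = &n->next) {
		if (n->key == key) {
			*pp = n->next;	/* works for head and interior alike */
			n->next = 0;
			return n;
		}
	}
	return 0;
}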
+ */ +void vm_page_rename( + register vm_page_t mem, + register vm_object_t new_object, + vm_offset_t new_offset) +{ + /* + * Changes to mem->object require the page lock because + * the pageout daemon uses that lock to get the object. + */ + + vm_page_lock_queues(); + vm_page_remove(mem); + vm_page_insert(mem, new_object, new_offset); + vm_page_unlock_queues(); +} + +/* + * vm_page_init: + * + * Initialize the fields in a new page. + * This takes a structure with random values and initializes it + * so that it can be given to vm_page_release or vm_page_insert. + */ +void vm_page_init( + vm_page_t mem, + vm_offset_t phys_addr) +{ + *mem = vm_page_template; + mem->phys_addr = phys_addr; +} + +/* + * vm_page_grab_fictitious: + * + * Remove a fictitious page from the free list. + * Returns VM_PAGE_NULL if there are no free pages. + */ + +vm_page_t vm_page_grab_fictitious(void) +{ + register vm_page_t m; + + simple_lock(&vm_page_queue_free_lock); + m = vm_page_queue_fictitious; + if (m != VM_PAGE_NULL) { + vm_page_fictitious_count--; + vm_page_queue_fictitious = (vm_page_t) m->pageq.next; + m->free = FALSE; + } + simple_unlock(&vm_page_queue_free_lock); + + return m; +} + +/* + * vm_page_release_fictitious: + * + * Release a fictitious page to the free list. + */ + +void vm_page_release_fictitious( + register vm_page_t m) +{ + simple_lock(&vm_page_queue_free_lock); + if (m->free) + panic("vm_page_release_fictitious"); + m->free = TRUE; + m->pageq.next = (queue_entry_t) vm_page_queue_fictitious; + vm_page_queue_fictitious = m; + vm_page_fictitious_count++; + simple_unlock(&vm_page_queue_free_lock); +} + +/* + * vm_page_more_fictitious: + * + * Add more fictitious pages to the free list. + * Allowed to block. + */ + +int vm_page_fictitious_quantum = 5; + +void vm_page_more_fictitious(void) +{ + register vm_page_t m; + int i; + + for (i = 0; i < vm_page_fictitious_quantum; i++) { + m = (vm_page_t) zalloc(vm_page_zone); + if (m == VM_PAGE_NULL) + panic("vm_page_more_fictitious"); + + vm_page_init(m, vm_page_fictitious_addr); + m->fictitious = TRUE; + vm_page_release_fictitious(m); + } +} + +/* + * vm_page_convert: + * + * Attempt to convert a fictitious page into a real page. + */ + +boolean_t vm_page_convert( + register vm_page_t m) +{ + register vm_page_t real_m; + + real_m = vm_page_grab(); + if (real_m == VM_PAGE_NULL) + return FALSE; + + m->phys_addr = real_m->phys_addr; + m->fictitious = FALSE; + + real_m->phys_addr = vm_page_fictitious_addr; + real_m->fictitious = TRUE; + + vm_page_release_fictitious(real_m); + return TRUE; +} + +/* + * vm_page_grab: + * + * Remove a page from the free list. + * Returns VM_PAGE_NULL if the free list is too small. + */ + +vm_page_t vm_page_grab(void) +{ + register vm_page_t mem; + + simple_lock(&vm_page_queue_free_lock); + + /* + * Only let privileged threads (involved in pageout) + * dip into the reserved pool. + */ + + if ((vm_page_free_count < vm_page_free_reserved) && + !current_thread()->vm_privilege) { + simple_unlock(&vm_page_queue_free_lock); + return VM_PAGE_NULL; + } + + if (vm_page_queue_free == VM_PAGE_NULL) + panic("vm_page_grab"); + + if (--vm_page_free_count < vm_page_free_count_minimum) + vm_page_free_count_minimum = vm_page_free_count; + mem = vm_page_queue_free; + vm_page_queue_free = (vm_page_t) mem->pageq.next; + mem->free = FALSE; + simple_unlock(&vm_page_queue_free_lock); + + /* + * Decide if we should poke the pageout daemon. 
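/*
 * [Editorial sketch, not part of this commit.]
 * vm_page_grab above refuses to dip below vm_page_free_reserved unless the
 * caller is a vm-privileged (pageout) thread; that reserve is what keeps
 * the pageout daemon itself from starving for pages.  A miniature of the
 * same policy over a plain free list, with locking, counters and wakeups
 * omitted and illustrative names:
 */
#include <stddef.h>

struct ex_frame {
	struct ex_frame *next;
};

static struct ex_frame *ex_free_list;
static int ex_free_count;
static int ex_free_reserved = 16;		/* illustrative reserve size */

static struct ex_frame *
ex_grab(int privileged)
{
	struct ex_frame *f;

	/* Only privileged (pageout) callers may consume the reserve. */
	if (ex_free_count < ex_free_reserved && !privileged)
		return NULL;

	f = ex_free_list;
	if (f != NULL) {
		ex_free_list = f->next;
		ex_free_count--;
	}
	return f;
}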
+ * We do this if the free count is less than the low + * water mark, or if the free count is less than the high + * water mark (but above the low water mark) and the inactive + * count is less than its target. + * + * We don't have the counts locked ... if they change a little, + * it doesn't really matter. + */ + + if ((vm_page_free_count < vm_page_free_min) || + ((vm_page_free_count < vm_page_free_target) && + (vm_page_inactive_count < vm_page_inactive_target))) + thread_wakeup((event_t) &vm_page_free_wanted); + + return mem; +} + +vm_offset_t vm_page_grab_phys_addr(void) +{ + vm_page_t p = vm_page_grab(); + if (p == VM_PAGE_NULL) + return -1; + else + return p->phys_addr; +} + +/* + * vm_page_grab_contiguous_pages: + * + * Take N pages off the free list, the pages should + * cover a contiguous range of physical addresses. + * [Used by device drivers to cope with DMA limitations] + * + * Returns the page descriptors in ascending order, or + * Returns KERN_RESOURCE_SHORTAGE if it could not. + */ + +/* Biggest phys page number for the pages we handle in VM */ + +vm_size_t vm_page_big_pagenum = 0; /* Set this before call! */ + +kern_return_t +vm_page_grab_contiguous_pages( + int npages, + vm_page_t pages[], + natural_t *bits) +{ + register int first_set; + int size, alloc_size; + kern_return_t ret; + vm_page_t mem, prevmem; + +#ifndef NBBY +#define NBBY 8 /* size in bits of sizeof()`s unity */ +#endif + +#define NBPEL (sizeof(natural_t)*NBBY) + + size = (vm_page_big_pagenum + NBPEL - 1) + & ~(NBPEL - 1); /* in bits */ + + size = size / NBBY; /* in bytes */ + + /* + * If we are called before the VM system is fully functional + * the invoker must provide us with the work space. [one bit + * per page starting at phys 0 and up to vm_page_big_pagenum] + */ + if (bits == 0) { + alloc_size = round_page(size); + if (kmem_alloc_wired(kernel_map, + (vm_offset_t *)&bits, + alloc_size) + != KERN_SUCCESS) + return KERN_RESOURCE_SHORTAGE; + } else + alloc_size = 0; + + bzero(bits, size); + + /* + * A very large granularity call, its rare so that is ok + */ + simple_lock(&vm_page_queue_free_lock); + + /* + * Do not dip into the reserved pool. + */ + + if (vm_page_free_count < vm_page_free_reserved) { + simple_unlock(&vm_page_queue_free_lock); + return KERN_RESOURCE_SHORTAGE; + } + + /* + * First pass through, build a big bit-array of + * the pages that are free. It is not going to + * be too large anyways, in 4k we can fit info + * for 32k pages. + */ + mem = vm_page_queue_free; + while (mem) { + register int word_index, bit_index; + + bit_index = (mem->phys_addr >> PAGE_SHIFT); + word_index = bit_index / NBPEL; + bit_index = bit_index - (word_index * NBPEL); + bits[word_index] |= 1 << bit_index; + + mem = (vm_page_t) mem->pageq.next; + } + + /* + * Second loop. Scan the bit array for NPAGES + * contiguous bits. That gives us, if any, + * the range of pages we will be grabbing off + * the free list. + */ + { + register int bits_so_far = 0, i; + + first_set = 0; + + for (i = 0; i < size; i += sizeof(natural_t)) { + + register natural_t v = bits[i / sizeof(natural_t)]; + register int bitpos; + + /* + * Bitscan this one word + */ + if (v) { + /* + * keep counting them beans ? + */ + bitpos = 0; + + if (bits_so_far) { +count_ones: + while (v & 1) { + bitpos++; + /* + * got enough beans ? 
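/*
 * [Editorial sketch, not part of this commit.]
 * vm_page_grab_contiguous_pages above works in three passes: mark every
 * free page's frame number in a bit array, scan that array for npages
 * consecutive set bits, then pull exactly those frames off the free list.
 * The middle pass, restated without the word-at-a-time bookkeeping (the
 * kernel code scans natural_t-sized words; this version is bit-at-a-time
 * for clarity, with illustrative names):
 */
#define EX_NBBY 8	/* bits per byte, like NBBY above */

static int
ex_bit_isset(const unsigned char *bits, unsigned long i)
{
	return (bits[i / EX_NBBY] >> (i % EX_NBBY)) & 1;
}

/* Return the first index of a run of `npages` set bits, or -1. */
static long
ex_find_contiguous(const unsigned char *bits, unsigned long nbits,
		   unsigned long npages)
{
	unsigned long i, run = 0;

	for (i = 0; i < nbits; i++) {
		if (ex_bit_isset(bits, i)) {
			if (++run == npages)
				return (long) (i - npages + 1);
		} else
			run = 0;
	}
	return -1;
}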
+ */ + if (++bits_so_far == npages) + goto found_em; + v >>= 1; + } + /* if we are being lucky, roll again */ + if (bitpos == NBPEL) + continue; + } + + /* + * search for beans here + */ + bits_so_far = 0; +count_zeroes: + while ((bitpos < NBPEL) && ((v & 1) == 0)) { + bitpos++; + v >>= 1; + } + if (v & 1) { + first_set = (i * NBBY) + bitpos; + goto count_ones; + } + } + /* + * No luck + */ + bits_so_far = 0; + } + } + + /* + * We could not find enough contiguous pages. + */ +not_found_em: + simple_unlock(&vm_page_queue_free_lock); + + ret = KERN_RESOURCE_SHORTAGE; + goto out; + + /* + * Final pass. Now we know which pages we want. + * Scan the list until we find them all, grab + * pages as we go. FIRST_SET tells us where + * in the bit-array our pages start. + */ +found_em: + vm_page_free_count -= npages; + if (vm_page_free_count < vm_page_free_count_minimum) + vm_page_free_count_minimum = vm_page_free_count; + + { + register vm_offset_t first_phys, last_phys; + + /* cache values for compare */ + first_phys = first_set << PAGE_SHIFT; + last_phys = first_phys + (npages << PAGE_SHIFT);/* not included */ + + /* running pointers */ + mem = vm_page_queue_free; + prevmem = VM_PAGE_NULL; + + while (mem) { + + register vm_offset_t addr; + + addr = mem->phys_addr; + + if ((addr >= first_phys) && + (addr < last_phys)) { + if (prevmem) + prevmem->pageq.next = mem->pageq.next; + pages[(addr - first_phys) >> PAGE_SHIFT] = mem; + mem->free = FALSE; + /* + * Got them all ? + */ + if (--npages == 0) break; + } else + prevmem = mem; + + mem = (vm_page_t) mem->pageq.next; + } + } + + simple_unlock(&vm_page_queue_free_lock); + + /* + * Decide if we should poke the pageout daemon. + * We do this if the free count is less than the low + * water mark, or if the free count is less than the high + * water mark (but above the low water mark) and the inactive + * count is less than its target. + * + * We don't have the counts locked ... if they change a little, + * it doesn't really matter. + */ + + if ((vm_page_free_count < vm_page_free_min) || + ((vm_page_free_count < vm_page_free_target) && + (vm_page_inactive_count < vm_page_inactive_target))) + thread_wakeup(&vm_page_free_wanted); + + ret = KERN_SUCCESS; +out: + if (alloc_size) + kmem_free(kernel_map, (vm_offset_t) bits, alloc_size); + + return ret; +} + +/* + * vm_page_release: + * + * Return a page to the free list. + */ + +void vm_page_release( + register vm_page_t mem) +{ + simple_lock(&vm_page_queue_free_lock); + if (mem->free) + panic("vm_page_release"); + mem->free = TRUE; + mem->pageq.next = (queue_entry_t) vm_page_queue_free; + vm_page_queue_free = mem; + vm_page_free_count++; + + /* + * Check if we should wake up someone waiting for page. + * But don't bother waking them unless they can allocate. + * + * We wakeup only one thread, to prevent starvation. + * Because the scheduling system handles wait queues FIFO, + * if we wakeup all waiting threads, one greedy thread + * can starve multiple niceguy threads. When the threads + * all wakeup, the greedy threads runs first, grabs the page, + * and waits for another page. It will be the first to run + * when the next page is freed. + * + * However, there is a slight danger here. + * The thread we wake might not use the free page. + * Then the other threads could wait indefinitely + * while the page goes unused. To forestall this, + * the pageout daemon will keep making free pages + * as long as vm_page_free_wanted is non-zero. 
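/*
 * [Editorial sketch, not part of this commit.]
 * The comment above explains why vm_page_release wakes only one waiter:
 * waking everyone lets a greedy thread win every freed page.  The same
 * wanted-counter shape expressed with POSIX threads, purely as a
 * user-space analogy (Mach's assert_wait/thread_wakeup_one are not
 * condition variables, and all names here are illustrative):
 */
#include <pthread.h>

static pthread_mutex_t ex_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  ex_page_freed = PTHREAD_COND_INITIALIZER;
static int ex_free_count;
static int ex_free_wanted;		/* like vm_page_free_wanted */

static void
ex_wait_for_page(void)
{
	pthread_mutex_lock(&ex_lock);
	while (ex_free_count == 0) {
		ex_free_wanted++;	/* tell the releaser someone is waiting */
		pthread_cond_wait(&ex_page_freed, &ex_lock);
	}
	ex_free_count--;
	pthread_mutex_unlock(&ex_lock);
}

static void
ex_release_page(void)
{
	pthread_mutex_lock(&ex_lock);
	ex_free_count++;
	if (ex_free_wanted > 0) {
		ex_free_wanted--;
		/* Wake exactly one waiter, as vm_page_release does. */
		pthread_cond_signal(&ex_page_freed);
	}
	pthread_mutex_unlock(&ex_lock);
}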
+ */ + + if ((vm_page_free_wanted > 0) && + (vm_page_free_count >= vm_page_free_reserved)) { + vm_page_free_wanted--; + thread_wakeup_one((event_t) &vm_page_free_count); + } + + simple_unlock(&vm_page_queue_free_lock); +} + +/* + * vm_page_wait: + * + * Wait for a page to become available. + * If there are plenty of free pages, then we don't sleep. + */ + +void vm_page_wait( + void (*continuation)(void)) +{ + +#ifndef CONTINUATIONS + assert (continuation == 0); +#endif + + /* + * We can't use vm_page_free_reserved to make this + * determination. Consider: some thread might + * need to allocate two pages. The first allocation + * succeeds, the second fails. After the first page is freed, + * a call to vm_page_wait must really block. + */ + + simple_lock(&vm_page_queue_free_lock); + if (vm_page_free_count < vm_page_free_target) { + if (vm_page_free_wanted++ == 0) + thread_wakeup((event_t)&vm_page_free_wanted); + assert_wait((event_t)&vm_page_free_count, FALSE); + simple_unlock(&vm_page_queue_free_lock); + if (continuation != 0) { + counter(c_vm_page_wait_block_user++); + thread_block(continuation); + } else { + counter(c_vm_page_wait_block_kernel++); + thread_block((void (*)(void)) 0); + } + } else + simple_unlock(&vm_page_queue_free_lock); +} + +/* + * vm_page_alloc: + * + * Allocate and return a memory cell associated + * with this VM object/offset pair. + * + * Object must be locked. + */ + +vm_page_t vm_page_alloc( + vm_object_t object, + vm_offset_t offset) +{ + register vm_page_t mem; + + mem = vm_page_grab(); + if (mem == VM_PAGE_NULL) + return VM_PAGE_NULL; + + vm_page_lock_queues(); + vm_page_insert(mem, object, offset); + vm_page_unlock_queues(); + + return mem; +} + +/* + * vm_page_free: + * + * Returns the given page to the free list, + * disassociating it with any VM object. + * + * Object and page queues must be locked prior to entry. + */ +void vm_page_free( + register vm_page_t mem) +{ + if (mem->free) + panic("vm_page_free"); + + if (mem->tabled) + vm_page_remove(mem); + VM_PAGE_QUEUES_REMOVE(mem); + + if (mem->wire_count != 0) { + if (!mem->private && !mem->fictitious) + vm_page_wire_count--; + mem->wire_count = 0; + } + + if (mem->laundry) { + vm_page_laundry_count--; + mem->laundry = FALSE; + } + + PAGE_WAKEUP_DONE(mem); + + if (mem->absent) + vm_object_absent_release(mem->object); + + /* + * XXX The calls to vm_page_init here are + * really overkill. + */ + + if (mem->private || mem->fictitious) { + vm_page_init(mem, vm_page_fictitious_addr); + mem->fictitious = TRUE; + vm_page_release_fictitious(mem); + } else { + vm_page_init(mem, mem->phys_addr); + vm_page_release(mem); + } +} + +/* + * vm_page_wire: + * + * Mark this page as wired down by yet + * another map, removing it from paging queues + * as necessary. + * + * The page's object and the page queues must be locked. + */ +void vm_page_wire( + register vm_page_t mem) +{ + VM_PAGE_CHECK(mem); + + if (mem->wire_count == 0) { + VM_PAGE_QUEUES_REMOVE(mem); + if (!mem->private && !mem->fictitious) + vm_page_wire_count++; + } + mem->wire_count++; +} + +/* + * vm_page_unwire: + * + * Release one wiring of this page, potentially + * enabling it to be paged again. + * + * The page's object and the page queues must be locked. 
+ */ +void vm_page_unwire( + register vm_page_t mem) +{ + VM_PAGE_CHECK(mem); + + if (--mem->wire_count == 0) { + queue_enter(&vm_page_queue_active, mem, vm_page_t, pageq); + vm_page_active_count++; + mem->active = TRUE; + if (!mem->private && !mem->fictitious) + vm_page_wire_count--; + } +} + +/* + * vm_page_deactivate: + * + * Returns the given page to the inactive list, + * indicating that no physical maps have access + * to this page. [Used by the physical mapping system.] + * + * The page queues must be locked. + */ +void vm_page_deactivate( + register vm_page_t m) +{ + VM_PAGE_CHECK(m); + + /* + * This page is no longer very interesting. If it was + * interesting (active or inactive/referenced), then we + * clear the reference bit and (re)enter it in the + * inactive queue. Note wired pages should not have + * their reference bit cleared. + */ + + if (m->active || (m->inactive && m->reference)) { + if (!m->fictitious && !m->absent) + pmap_clear_reference(m->phys_addr); + m->reference = FALSE; + VM_PAGE_QUEUES_REMOVE(m); + } + if (m->wire_count == 0 && !m->inactive) { + queue_enter(&vm_page_queue_inactive, m, vm_page_t, pageq); + m->inactive = TRUE; + vm_page_inactive_count++; + } +} + +/* + * vm_page_activate: + * + * Put the specified page on the active list (if appropriate). + * + * The page queues must be locked. + */ + +void vm_page_activate( + register vm_page_t m) +{ + VM_PAGE_CHECK(m); + + if (m->inactive) { + queue_remove(&vm_page_queue_inactive, m, vm_page_t, + pageq); + vm_page_inactive_count--; + m->inactive = FALSE; + } + if (m->wire_count == 0) { + if (m->active) + panic("vm_page_activate: already active"); + + queue_enter(&vm_page_queue_active, m, vm_page_t, pageq); + m->active = TRUE; + vm_page_active_count++; + } +} + +/* + * vm_page_zero_fill: + * + * Zero-fill the specified page. + */ +void vm_page_zero_fill( + vm_page_t m) +{ + VM_PAGE_CHECK(m); + + pmap_zero_page(m->phys_addr); +} + +/* + * vm_page_copy: + * + * Copy one page to another + */ + +void vm_page_copy( + vm_page_t src_m, + vm_page_t dest_m) +{ + VM_PAGE_CHECK(src_m); + VM_PAGE_CHECK(dest_m); + + pmap_copy_page(src_m->phys_addr, dest_m->phys_addr); +} + +#if MACH_VM_DEBUG +/* + * Routine: vm_page_info + * Purpose: + * Return information about the global VP table. + * Fills the buffer with as much information as possible + * and returns the desired size of the buffer. + * Conditions: + * Nothing locked. The caller should provide + * possibly-pageable memory. + */ + +unsigned int +vm_page_info( + hash_info_bucket_t *info, + unsigned int count) +{ + int i; + + if (vm_page_bucket_count < count) + count = vm_page_bucket_count; + + for (i = 0; i < count; i++) { + vm_page_bucket_t *bucket = &vm_page_buckets[i]; + unsigned int bucket_count = 0; + vm_page_t m; + + simple_lock(&bucket->lock); + for (m = bucket->pages; m != VM_PAGE_NULL; m = m->next) + bucket_count++; + simple_unlock(&bucket->lock); + + /* don't touch pageable memory while holding locks */ + info[i].hib_count = bucket_count; + } + + return vm_page_bucket_count; +} +#endif /* MACH_VM_DEBUG */ + +#include <mach_kdb.h> +#if MACH_KDB +#define printf kdbprintf + +/* + * Routine: vm_page_print [exported] + */ +void vm_page_print(p) + vm_page_t p; +{ + iprintf("Page 0x%X: object 0x%X,", (vm_offset_t) p, (vm_offset_t) p->object); + printf(" offset 0x%X", (vm_offset_t) p->offset); + printf("wire_count %d,", p->wire_count); + printf(" %s", + (p->active ? "active" : (p->inactive ? "inactive" : "loose"))); + printf("%s", + (p->free ? 
" free" : "")); + printf("%s ", + (p->laundry ? " laundry" : "")); + printf("%s", + (p->dirty ? "dirty" : "clean")); + printf("%s", + (p->busy ? " busy" : "")); + printf("%s", + (p->absent ? " absent" : "")); + printf("%s", + (p->error ? " error" : "")); + printf("%s", + (p->fictitious ? " fictitious" : "")); + printf("%s", + (p->private ? " private" : "")); + printf("%s", + (p->wanted ? " wanted" : "")); + printf("%s,", + (p->tabled ? "" : "not_tabled")); + printf("phys_addr = 0x%X, lock = 0x%X, unlock_request = 0x%X\n", + (vm_offset_t) p->phys_addr, + (vm_offset_t) p->page_lock, + (vm_offset_t) p->unlock_request); +} +#endif /* MACH_KDB */ diff --git a/vm/vm_user.c b/vm/vm_user.c new file mode 100644 index 00000000..ebe98449 --- /dev/null +++ b/vm/vm_user.c @@ -0,0 +1,397 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_user.c + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * + * User-exported virtual memory functions. + */ + +#include <mach/boolean.h> +#include <mach/kern_return.h> +#include <mach/mach_types.h> /* to get vm_address_t */ +#include <mach/memory_object.h> +#include <mach/std_types.h> /* to get pointer_t */ +#include <mach/vm_attributes.h> +#include <mach/vm_param.h> +#include <mach/vm_statistics.h> +#include <kern/host.h> +#include <kern/task.h> +#include <vm/vm_fault.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> + + + +vm_statistics_data_t vm_stat; + +/* + * vm_allocate allocates "zero fill" memory in the specfied + * map. + */ +kern_return_t vm_allocate(map, addr, size, anywhere) + register vm_map_t map; + register vm_offset_t *addr; + register vm_size_t size; + boolean_t anywhere; +{ + kern_return_t result; + + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + if (size == 0) { + *addr = 0; + return(KERN_SUCCESS); + } + + if (anywhere) + *addr = vm_map_min(map); + else + *addr = trunc_page(*addr); + size = round_page(size); + + result = vm_map_enter( + map, + addr, + size, + (vm_offset_t)0, + anywhere, + VM_OBJECT_NULL, + (vm_offset_t)0, + FALSE, + VM_PROT_DEFAULT, + VM_PROT_ALL, + VM_INHERIT_DEFAULT); + + return(result); +} + +/* + * vm_deallocate deallocates the specified range of addresses in the + * specified address map. 
+ */ +kern_return_t vm_deallocate(map, start, size) + register vm_map_t map; + vm_offset_t start; + vm_size_t size; +{ + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + if (size == (vm_offset_t) 0) + return(KERN_SUCCESS); + + return(vm_map_remove(map, trunc_page(start), round_page(start+size))); +} + +/* + * vm_inherit sets the inheritance of the specified range in the + * specified map. + */ +kern_return_t vm_inherit(map, start, size, new_inheritance) + register vm_map_t map; + vm_offset_t start; + vm_size_t size; + vm_inherit_t new_inheritance; +{ + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + switch (new_inheritance) { + case VM_INHERIT_NONE: + case VM_INHERIT_COPY: + case VM_INHERIT_SHARE: + break; + default: + return(KERN_INVALID_ARGUMENT); + } + + /*Check if range includes projected buffer; + user is not allowed direct manipulation in that case*/ + if (projected_buffer_in_range(map, start, start+size)) + return(KERN_INVALID_ARGUMENT); + + return(vm_map_inherit(map, + trunc_page(start), + round_page(start+size), + new_inheritance)); +} + +/* + * vm_protect sets the protection of the specified range in the + * specified map. + */ + +kern_return_t vm_protect(map, start, size, set_maximum, new_protection) + register vm_map_t map; + vm_offset_t start; + vm_size_t size; + boolean_t set_maximum; + vm_prot_t new_protection; +{ + if ((map == VM_MAP_NULL) || + (new_protection & ~(VM_PROT_ALL|VM_PROT_NOTIFY))) + return(KERN_INVALID_ARGUMENT); + + /*Check if range includes projected buffer; + user is not allowed direct manipulation in that case*/ + if (projected_buffer_in_range(map, start, start+size)) + return(KERN_INVALID_ARGUMENT); + + return(vm_map_protect(map, + trunc_page(start), + round_page(start+size), + new_protection, + set_maximum)); +} + +kern_return_t vm_statistics(map, stat) + vm_map_t map; + vm_statistics_data_t *stat; +{ + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + *stat = vm_stat; + + stat->pagesize = PAGE_SIZE; + stat->free_count = vm_page_free_count; + stat->active_count = vm_page_active_count; + stat->inactive_count = vm_page_inactive_count; + stat->wire_count = vm_page_wire_count; + + return(KERN_SUCCESS); +} + +/* + * Handle machine-specific attributes for a mapping, such + * as cachability, migrability, etc. 
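/*
 * [Editorial sketch, not part of this commit.]
 * The routines above are the kernel ends of the user-visible VM calls.
 * Assuming the MIG-generated user stubs mirror these argument lists, with
 * the caller's task port standing in for the vm_map_t, a client would use
 * them roughly as follows.  The header name, the stubs' availability and
 * this helper are assumptions about the user environment, not something
 * this file defines:
 */
#include <mach.h>

static kern_return_t
ex_scratch_buffer(vm_size_t size)
{
	vm_address_t addr;
	kern_return_t kr;

	/* Zero-filled memory, placed anywhere in our address space. */
	kr = vm_allocate(mach_task_self(), &addr, size, TRUE);
	if (kr != KERN_SUCCESS)
		return kr;

	/* Drop write permission once the buffer is set up. */
	kr = vm_protect(mach_task_self(), addr, size, FALSE, VM_PROT_READ);
	if (kr != KERN_SUCCESS) {
		(void) vm_deallocate(mach_task_self(), addr, size);
		return kr;
	}

	return vm_deallocate(mach_task_self(), addr, size);
}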
+ */ +kern_return_t vm_machine_attribute(map, address, size, attribute, value) + vm_map_t map; + vm_address_t address; + vm_size_t size; + vm_machine_attribute_t attribute; + vm_machine_attribute_val_t* value; /* IN/OUT */ +{ + extern kern_return_t vm_map_machine_attribute(); + + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + /*Check if range includes projected buffer; + user is not allowed direct manipulation in that case*/ + if (projected_buffer_in_range(map, address, address+size)) + return(KERN_INVALID_ARGUMENT); + + return vm_map_machine_attribute(map, address, size, attribute, value); +} + +kern_return_t vm_read(map, address, size, data, data_size) + vm_map_t map; + vm_address_t address; + vm_size_t size; + pointer_t *data; + vm_size_t *data_size; +{ + kern_return_t error; + vm_map_copy_t ipc_address; + + if (map == VM_MAP_NULL) + return(KERN_INVALID_ARGUMENT); + + if ((error = vm_map_copyin(map, + address, + size, + FALSE, /* src_destroy */ + &ipc_address)) == KERN_SUCCESS) { + *data = (pointer_t) ipc_address; + *data_size = size; + } + return(error); +} + +kern_return_t vm_write(map, address, data, size) + vm_map_t map; + vm_address_t address; + pointer_t data; + vm_size_t size; +{ + if (map == VM_MAP_NULL) + return KERN_INVALID_ARGUMENT; + + return vm_map_copy_overwrite(map, address, (vm_map_copy_t) data, + FALSE /* interruptible XXX */); +} + +kern_return_t vm_copy(map, source_address, size, dest_address) + vm_map_t map; + vm_address_t source_address; + vm_size_t size; + vm_address_t dest_address; +{ + vm_map_copy_t copy; + kern_return_t kr; + + if (map == VM_MAP_NULL) + return KERN_INVALID_ARGUMENT; + + kr = vm_map_copyin(map, source_address, size, + FALSE, ©); + if (kr != KERN_SUCCESS) + return kr; + + kr = vm_map_copy_overwrite(map, dest_address, copy, + FALSE /* interruptible XXX */); + if (kr != KERN_SUCCESS) { + vm_map_copy_discard(copy); + return kr; + } + + return KERN_SUCCESS; +} + +/* + * Routine: vm_map + */ +kern_return_t vm_map( + target_map, + address, size, mask, anywhere, + memory_object, offset, + copy, + cur_protection, max_protection, inheritance) + vm_map_t target_map; + vm_offset_t *address; + vm_size_t size; + vm_offset_t mask; + boolean_t anywhere; + ipc_port_t memory_object; + vm_offset_t offset; + boolean_t copy; + vm_prot_t cur_protection; + vm_prot_t max_protection; + vm_inherit_t inheritance; +{ + register + vm_object_t object; + register + kern_return_t result; + + if ((target_map == VM_MAP_NULL) || + (cur_protection & ~VM_PROT_ALL) || + (max_protection & ~VM_PROT_ALL)) + return(KERN_INVALID_ARGUMENT); + + switch (inheritance) { + case VM_INHERIT_NONE: + case VM_INHERIT_COPY: + case VM_INHERIT_SHARE: + break; + default: + return(KERN_INVALID_ARGUMENT); + } + + *address = trunc_page(*address); + size = round_page(size); + + if (!IP_VALID(memory_object)) { + object = VM_OBJECT_NULL; + offset = 0; + copy = FALSE; + } else if ((object = vm_object_enter(memory_object, size, FALSE)) + == VM_OBJECT_NULL) + return(KERN_INVALID_ARGUMENT); + + /* + * Perform the copy if requested + */ + + if (copy) { + vm_object_t new_object; + vm_offset_t new_offset; + + result = vm_object_copy_strategically(object, offset, size, + &new_object, &new_offset, + ©); + + /* + * Throw away the reference to the + * original object, as it won't be mapped. 
+ */ + + vm_object_deallocate(object); + + if (result != KERN_SUCCESS) + return (result); + + object = new_object; + offset = new_offset; + } + + if ((result = vm_map_enter(target_map, + address, size, mask, anywhere, + object, offset, + copy, + cur_protection, max_protection, inheritance + )) != KERN_SUCCESS) + vm_object_deallocate(object); + return(result); +} + +/* + * Specify that the range of the virtual address space + * of the target task must not cause page faults for + * the indicated accesses. + * + * [ To unwire the pages, specify VM_PROT_NONE. ] + */ +kern_return_t vm_wire(host, map, start, size, access) + host_t host; + register vm_map_t map; + vm_offset_t start; + vm_size_t size; + vm_prot_t access; +{ + if (host == HOST_NULL) + return KERN_INVALID_HOST; + + if (map == VM_MAP_NULL) + return KERN_INVALID_TASK; + + if (access & ~VM_PROT_ALL) + return KERN_INVALID_ARGUMENT; + + /*Check if range includes projected buffer; + user is not allowed direct manipulation in that case*/ + if (projected_buffer_in_range(map, start, start+size)) + return(KERN_INVALID_ARGUMENT); + + return vm_map_pageable_user(map, + trunc_page(start), + round_page(start+size), + access); +} diff --git a/vm/vm_user.h b/vm/vm_user.h new file mode 100644 index 00000000..f8740107 --- /dev/null +++ b/vm/vm_user.h @@ -0,0 +1,50 @@ +/* + * Mach Operating System + * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University + * All Rights Reserved. + * + * Permission to use, copy, modify and distribute this software and its + * documentation is hereby granted, provided that both the copyright + * notice and this permission notice appear in all copies of the + * software, derivative works or modified versions, and any portions + * thereof, and that both notices appear in supporting documentation. + * + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. + * + * Carnegie Mellon requests users of this software to return to + * + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU + * School of Computer Science + * Carnegie Mellon University + * Pittsburgh PA 15213-3890 + * + * any improvements or extensions that they make and grant Carnegie Mellon + * the rights to redistribute these changes. + */ +/* + * File: vm/vm_user.h + * Author: Avadis Tevanian, Jr., Michael Wayne Young + * Date: 1986 + * + * Declarations of user-visible virtual address space + * management functionality. + */ + +#ifndef _VM_VM_USER_H_ +#define _VM_VM_USER_H_ + +#include <mach/kern_return.h> + +extern kern_return_t vm_allocate(); +extern kern_return_t vm_deallocate(); +extern kern_return_t vm_inherit(); +extern kern_return_t vm_protect(); +extern kern_return_t vm_statistics(); +extern kern_return_t vm_read(); +extern kern_return_t vm_write(); +extern kern_return_t vm_copy(); +extern kern_return_t vm_map(); + +#endif _VM_VM_USER_H_ |
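/*
 * [Editorial sketch, end-of-section note, not part of this commit.]
 * The declarations just above are old-style externs without parameter
 * lists.  From the definitions in vm_user.c earlier in this diff, ANSI
 * prototypes would read roughly as follows; they would also need the
 * headers that define vm_map_t, vm_prot_t and friends, which this header
 * does not pull in:
 */
extern kern_return_t vm_allocate(vm_map_t map, vm_offset_t *addr,
				 vm_size_t size, boolean_t anywhere);
extern kern_return_t vm_deallocate(vm_map_t map, vm_offset_t start,
				   vm_size_t size);
extern kern_return_t vm_inherit(vm_map_t map, vm_offset_t start,
				vm_size_t size, vm_inherit_t new_inheritance);
extern kern_return_t vm_protect(vm_map_t map, vm_offset_t start,
				vm_size_t size, boolean_t set_maximum,
				vm_prot_t new_protection);
extern kern_return_t vm_statistics(vm_map_t map, vm_statistics_data_t *stat);
extern kern_return_t vm_read(vm_map_t map, vm_address_t address,
			     vm_size_t size, pointer_t *data,
			     vm_size_t *data_size);
extern kern_return_t vm_write(vm_map_t map, vm_address_t address,
			      pointer_t data, vm_size_t size);
extern kern_return_t vm_copy(vm_map_t map, vm_address_t source_address,
			     vm_size_t size, vm_address_t dest_address);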