KSM 的启动是通过 ksm_subvert 这个函数来完成的，函数原型为：int ksm_subvert(struct ksm *k);

/*
 * Per-CPU dispatch helpers.
 *
 * __g_dpc_logical_rval accumulates the bitwise OR of the per-CPU callback
 * return values, so DPC_RET() is non-zero if the call failed on at least
 * one CPU.
 */
static int __g_dpc_logical_rval = 0;

#ifndef __linux__
NTKERNELAPI VOID KeGenericCallDpc(PKDEFERRED_ROUTINE Routine,
				  PVOID Context);
NTKERNELAPI VOID KeSignalCallDpcDone(PVOID SystemArgument1);
NTKERNELAPI LOGICAL KeSignalCallDpcSynchronize(PVOID SystemArgument2);

/*
 * Define a DPC routine that runs `call' on whatever CPU it was queued to,
 * ORs the result into the shared return value, then signals completion so
 * KeGenericCallDpc() can release the other processors.
 */
#define DEFINE_DPC(name, call, ...) \
	VOID __percpu_##name(PRKDPC dpc, void *ctx, void *sys0, void *sys1) \
	{ \
		UNREFERENCED_PARAMETER(dpc); \
		__g_dpc_logical_rval |= (call) (__VA_ARGS__); \
		KeSignalCallDpcSynchronize(sys1); \
		KeSignalCallDpcDone(sys0); \
	}

/* Run the named DPC routine on every processor.  */
#define CALL_DPC(name, ...) do { \
	__g_dpc_logical_rval = 0; \
	KeGenericCallDpc(__percpu_##name, __VA_ARGS__); \
} while (0)

/*
 * Queue the named DPC routine on one specific processor:
 * initialize the DPC object, raise its importance, bind it to the
 * target CPU, then insert it into that CPU's DPC queue.
 *
 * Background (from the original commentary): in the Windows kernel each
 * CPU has its own KPRCB; queued DPCs are maintained in its DpcData[2] and
 * DpcStack members:
 *
 *	typedef struct _KDPC_DATA {
 *		LIST_ENTRY DpcListHead;       // head of the DPC queue
 *		ULONG DpcLock;                // queue lock, taken before touching the list
 *		volatile ULONG DpcQueueDepth;
 *		ULONG DpcCount;
 *	} KDPC_DATA, *PKDPC_DATA;
 *
 * There are two kinds of DPCs: normal DPCs, which may run in any thread
 * context, and threaded DPCs, which are executed by a dedicated kernel
 * thread.
 */
#define CALL_DPC_ON_CPU(cpu, name, fail, ...) do { \
	__g_dpc_logical_rval = 0; \
	PROCESSOR_NUMBER proc_nr; \
	KeGetProcessorNumberFromIndex((cpu), &proc_nr); \
	PKDPC dpc = mm_alloc_pool(sizeof(*dpc)); \
	if (!dpc) \
		fail; \
	KeInitializeDpc(dpc, __percpu_##name, __VA_ARGS__); \
	KeSetImportanceDpc(dpc, HighImportance); \
	KeSetTargetProcessorDpcEx(dpc, &proc_nr); \
	KeInsertQueueDpc(dpc, NULL, NULL); \
} while (0)
#else
/* Linux: the callback runs via a synchronous IPI, no DPC bookkeeping.  */
#define DEFINE_DPC(name, call, ...) \
	void __percpu_##name(void *ctx) \
	{ \
		__g_dpc_logical_rval |= (call) (__VA_ARGS__); \
	}

/* Run the callback synchronously (wait = 1) on every online CPU.  */
#define CALL_DPC(name, ...) do { \
	int cpu; \
	__g_dpc_logical_rval = 0; \
	for_each_online_cpu(cpu) \
		smp_call_function_single(cpu, __percpu_##name, __VA_ARGS__, 1); \
} while (0)

#define CALL_DPC_ON_CPU(cpu, name, fail, ...) do { \
	__g_dpc_logical_rval = 0; \
	smp_call_function_single(cpu, __percpu_##name, __VA_ARGS__, 1); \
} while (0)
#endif

/* OR of all per-CPU return values; non-zero means at least one CPU failed.  */
#define DPC_RET() __g_dpc_logical_rval
#endif
不得不说作者的代码看着很舒服，我一直在学习这种人的编码风格，asamy 写的宏真的看着很舒服。
/* One DPC per CPU: each invocation calls __ksm_init_cpu(ctx).  */
static DEFINE_DPC(__call_init, __ksm_init_cpu, ctx);

/*
 * Entry point: virtualize ("subvert") every online CPU by running
 * __ksm_init_cpu(k) on each of them via the DPC machinery above.
 *
 * Returns 0 on success, otherwise the OR of the per-CPU error codes.
 */
int ksm_subvert(struct ksm *k)
{
	CALL_DPC(__call_init, k);	/* start virtualization on each CPU */
	return DPC_RET();
}
/*
* Virtualizes current CPU.
*/
int __ksm_init_cpu(struct ksm *k)
{
struct vcpu *vcpu;
int ret = ERR_NOMEM;
u64 feat_ctl;
u64 required_feat_bits = FEATURE_CONTROL_LOCKED |
FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

vcpu = ksm_cpu(k);
if (vcpu->subverted) {
是否侵染过了
KSM_DEBUG_RAW("CPU already subverted\n");
return 0;
}

#ifdef __linux__
if (tboot_enabled())
required_feat_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
#endif

feat_ctl = __readmsr(MSR_IA32_FEATURE_CONTROL);
if ((feat_ctl & required_feat_bits) != required_feat_bits) {
if (feat_ctl & FEATURE_CONTROL_LOCKED)
return ERR_BUSY;

__writemsr(MSR_IA32_FEATURE_CONTROL, feat_ctl | required_feat_bits);
feat_ctl = __readmsr(MSR_IA32_FEATURE_CONTROL);
if ((feat_ctl & required_feat_bits) != required_feat_bits)
return ERR_DENIED;
}
设置进入hv层的MSR
ret = vcpu_init(vcpu);
if (ret < 0) { KSM_DEBUG_RAW("failed to create vcpu, oom?\n"); return ret; } /* Saves state and calls vcpu_run() (Defined in assembly, vmx.{S,asm} */ ret = __vmx_vminit(vcpu);初始化 idt shadow nested cpu ept list等数据 KSM_DEBUG("%s: Started: %d\n", proc_name(), !ret); if (ret < 0) goto out; vcpu->subverted = true;
k->active_vcpus++; // 总结构CPU个数+1 ksm管理vcpu结构是这样的 申请一大片内存 然后均分对应 达到 vcpumem[currentCpu] return 0;

out:
vcpu_free(vcpu);//释放vcpu内存
__writecr4(__readcr4() & ~X86_CR4_VMXE);//还原vmxme
return ret;
}
上面的 __ksm_init_cpu 就是实际对每个 vCPU 进行侵染（虚拟化）的函数。

下面来看 int vcpu_init(struct vcpu *vcpu) 这个函数，它对 vcpu 数据结构进行填充，例如 KSM 实现的 shadow IDT、EPT switch、NESTED_VMX、cr0_read_shadow 等等。
/*
 * Fill in the per-CPU vcpu structure: shadow IDT page, EPT pointer
 * tables, nested-VMX feature control, and the CR0/CR4 guest/host masks.
 *
 * Returns 0 on success, ERR_NOMEM if a page allocation failed.
 */
int vcpu_init(struct vcpu *vcpu)
{
/*
 * This is gonna hold the shadow IDT, which they won't see, but it's
 * the one that they'll be using.
 */
vcpu->idt.limit = PAGE_SIZE - 1;
vcpu->idt.base = (uintptr_t)mm_alloc_page();	/* page for the shadow IDT (keeps a copy of the original IDT entries) */
if (!vcpu->idt.base)
return ERR_NOMEM;

if (!init_ept(&vcpu->ept)) {	/* allocate the EPT pointer tables */
mm_free_page((void *)vcpu->idt.base);
return ERR_NOMEM;
}

#ifdef NESTED_VMX
/* Expose IA32_FEATURE_CONTROL to the nested guest, but unlocked.  */
vcpu->nested_vcpu.feat_ctl = __readmsr(MSR_IA32_FEATURE_CONTROL) & ~FEATURE_CONTROL_LOCKED;
#endif

/*
 * Leave cr0 guest host mask empty, we support all.
 * Set VMXE bit in cr4 guest host mask so they VM-exit to us when
 * they try to set that bit.
 *
 * Note: These bits are also removed from CRx_READ_SHADOW fields, if
 * you want to opt-in a VM exit without having to remove that bit
 * completely from their CR0, then you'd probably want to make
 * a different variable, e.g. `cr0_read_shadow = X86_CR0_PE` and OR it
 * in CR0_GUEST_HOST_MASK, without masking it in CR0_READ_SHADOW...
 */
vcpu->cr0_guest_host_mask = 0;
vcpu->cr4_guest_host_mask = X86_CR4_VMXE;

/* Stash the vcpu pointer at the top of its host stack so the VM-exit
 * path can recover it.  */
*(struct vcpu **)((uintptr_t)vcpu->stack + KERNEL_STACK_SIZE - 8) = vcpu;
return 0;
}
下面是初始化 EPT 的代码，这里和网上常见的代码有所不同。
static inline bool init_ept(struct ept *ept)
{
int i;
u16 dontcare;

for (i = 0; i < EPTP_INIT_USED; ++i) { EPTP_INIT_USED使用了几张表 比如读的时候要切换的表 写的时候要切换的表 原来没有修改过的表 if (!ept_create_ptr(ept, EPT_ACCESS_ALL, &dontcare)) {//创建ept table ptr free_pml4_list(ept); return false; } } return true; } bool ept_create_ptr(struct ept *ept, int access, u16 *out) { u64 **pml4; u16 eptp; eptp = (u16)find_first_zero_bit(ept->ptr_bitmap, sizeof(ept->ptr_bitmap));//KSM默认是支持512张EPT表的 获取已经可以使用的EPT LIST INDEX
if (eptp == sizeof(ept->ptr_bitmap)) //和最大支持相等 就返回失败了
return false;

pml4 = &EPT4(ept, eptp);//从ept_list中获取pml4的ptr
if (!(*pml4 = mm_alloc_page()))//分配内存
return false;

if (!setup_pml4(ept, access, eptp)) {//安装ept表
__mm_free_page(*pml4);
return false;
}

EPTP(ept, eptp) = mkeptp(__pa(*pml4));
set_bit(eptp, ept->ptr_bitmap);
*out = eptp;
return true;
}
/*
 * Identity-map all cached physical memory ranges (plus the local APIC
 * page) into the EPT hierarchy for slot `eptp', using `access' as the
 * default permission.
 */
static bool setup_pml4(struct ept *ept, int access, u16 eptp)
{
/*
 * On Linux, this doesn't have to be done, and we can get each
 * one as a violation, on Windows, the kernel screams and hangs.
 *
 * See mm_cache_ram_ranges() in mm.c for how this is obtained.
 */
int i;
u64 addr;
u64 apic;
struct pmem_range *range;

for (i = 0; i < ksm->range_count; ++i) {
range = &ksm->ranges[i];
for (addr = range->start; addr < range->end; addr += PAGE_SIZE) {	/* walk the physical range page by page */
int r = access;
if (access != EPT_ACCESS_ALL && mm_is_kernel_addr(__va(addr)))	/* pages whose VA is kernel memory always get full access */
r = EPT_ACCESS_ALL;

if (!ept_alloc_page(EPT4(ept, eptp), r, addr, addr))	/* identity map: gpa == hpa */
return false;
}
}

/* Allocate APIC page */
apic = __readmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BASE;
if (!ept_alloc_page(EPT4(ept, eptp), EPT_ACCESS_ALL, apic, apic))
return false;

return true;
}

/*
 * Map guest physical address `gpa' to host physical address `hpa' in the
 * given PML4 hierarchy with the given EPT access bits, allocating the
 * intermediate tables (PDPT/PDT/PT) on demand with full access.
 *
 * Returns a pointer to the final 4 KB EPT PTE, or NULL on allocation
 * failure.
 */
u64 *ept_alloc_page(u64 *pml4, int access, u64 gpa, u64 hpa)
{
	/* PML4 (512 GB) */
	u64 *pml4e = &pml4[PGD_INDEX_P(gpa)];
	u64 *pdpt = ept_page_addr(pml4e);

	if (!pdpt) {
		pdpt = mm_alloc_page();
		if (!pdpt)
			return NULL;

		*pml4e = mkepte(EPT_ACCESS_ALL, __pa(pdpt));
	}

	/* PDPT (1 GB) */
	u64 *pdpte = &pdpt[PUD_INDEX_P(gpa)];
	u64 *pdt = ept_page_addr(pdpte);
	if (!pdt) {
		pdt = mm_alloc_page();
		if (!pdt)
			return NULL;

		*pdpte = mkepte(EPT_ACCESS_ALL, __pa(pdt));
	}

	/* PDT (2 MB) */
	u64 *pdte = &pdt[PMD_INDEX_P(gpa)];
	u64 *pt = ept_page_addr(pdte);
	if (!pt) {
		pt = mm_alloc_page();
		if (!pt)
			return NULL;

		*pdte = mkepte(EPT_ACCESS_ALL, __pa(pt));
	}

	/* PT (4 KB) */
	u64 *page = &pt[PTE_INDEX_P(gpa)];
	*page = mkepte(access, hpa);
	/* Normal RAM: use the write-back memory type.  */
	*page |= EPT_MT_WRITEBACK << VMX_EPT_MT_EPTE_SHIFT;
	return page;
}

/*
 * Get a PTE for the specified guest physical address, this can be used
 * to get the host physical address it redirects to or redirect to it.
 *
 * To redirect to an HPA (Host physical address):
 * \code
 *	struct ept *ept = &vcpu->ept;
 *	u64 *epte = ept_pte(EPT4(ept, EPTP_EXHOOK), gpa);
 *	__set_epte_pfn(epte, hpa >> PAGE_SHIFT);
 *	__invept_all();
 * \endcode
 *
 * Similarly, to get the HPA:
 * \code
 *	struct ept *ept = &vcpu->ept;
 *	u64 *epte = ept_pte(EPT4(ept, EPTP_EXHOOK), gpa);
 *	u64 hpa = *epte & PAGE_PA_MASK;
 *	u64 hfn = hpa >> PAGE_SHIFT;
 * \endcode
 */
/*
 * Walk the EPT hierarchy for `gpa' and return a pointer to the entry that
 * maps it: the 4 KB PTE in the common case, or the 1 GB / 2 MB entry when
 * a large page is encountered.  Returns 0 if any level is unmapped.
 */
u64 *ept_pte(u64 *pml4, u64 gpa)
{
	u64 *pdpt_tbl = ept_page_addr(&pml4[PGD_INDEX_P(gpa)]);
	if (!pdpt_tbl)
		return 0;

	u64 *pdpte = &pdpt_tbl[PUD_INDEX_P(gpa)];
	u64 *pd_tbl = ept_page_addr(pdpte);
	if (!pd_tbl)
		return 0;

	if (*pdpte & PAGE_LARGE)	/* entry maps a whole 1 GB region */
		return pdpte;

	u64 *pde = &pd_tbl[PMD_INDEX_P(gpa)];
	u64 *pt_tbl = ept_page_addr(pde);
	if (!pt_tbl)
		return 0;

	if (*pde & PAGE_LARGE)	/* entry maps a whole 2 MB region */
		return pde;

	return &pt_tbl[PTE_INDEX_P(gpa)];	/* regular 4 KB page */
}
上面两个函数和其他的 VT 开源项目就没什么区别了：初始化页表并打上标记。
回到 __ksm_init_cpu，填充完数据之后就是真正开始侵染的部分：

/* Saves state and calls vcpu_run() (Defined in assembly, vmx.{S,asm}) */
ret = __vmx_vminit(vcpu);                // 开始侵染
KSM_DEBUG("%s: Started: %d\n", proc_name(), !ret);

if (ret < 0)
        goto out;

vcpu->subverted = true;                  // 侵染成功
k->active_vcpus++;                       // 虚拟 CPU 数 +1
return 0;

到这里数据就初始化完了，准备填写 VMCS 了。
; __vmx_vminit: save the current flags and general-purpose registers, then
; call vcpu_run() to enter VMX root mode.  Returns 0 in eax when the launch
; succeeds (execution resumes as the guest at do_resume), -1 on failure.
__vmx_vminit PROC
pushfq
PUSHAQ ; -8 * 16
; rcx contains vcpu
mov rdx, rsp ; SP: stack pointer the guest resumes with after entering the host (guest RSP)
mov r8, do_resume ; IP after success: where execution resumes as the guest (guest RIP)

sub rsp, 20h ; x64 calling convention: 32-byte shadow space
call vcpu_run
add rsp, 20h

; if we get here, we failed
POPAQ
popfq

mov eax, -1
ret

do_resume:
; launch succeeded -- we are now running as the guest; restore state
POPAQ
popfq

xor eax, eax
ret
__vmx_vminit ENDP

……班门弄斧了。KSM 的版本更新比较频繁，我也很久没有写过代码了，这篇文章是凭记忆写下的，如果有误，还请大家指正。

本站文章基于国际协议BY-NA-SA 4.0协议共享;
如未特殊说明,本站文章皆为原创文章,请规范转载。

0

博客管理员