-
Notifications
You must be signed in to change notification settings - Fork 65
/
kvm_x86.c
5330 lines (4452 loc) · 123 KB
/
kvm_x86.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* GPL HEADER START
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* GPL HEADER END
*
* Copyright 2011 various Linux Kernel contributors.
* Copyright 2019 Joyent, Inc.
*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/mutex.h>
#include <sys/ksynch.h>
#include <sys/condvar_impl.h>
#include <sys/ddi.h>
#include <sys/regset.h>
#include <sys/fp.h>
#include <sys/tss.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/smt.h>
#include <sys/machsystm.h>
#include <vm/page.h>
#include <vm/hat.h>
#include <asm/cpu.h>
#include "kvm_bitops.h"
#include "kvm_vmx.h"
#include "msr-index.h"
#include "kvm_msr.h"
#include "kvm_host.h"
#include "kvm_lapic.h"
#include "processor-flags.h"
#include "kvm_cpuid.h"
#include "hyperv.h"
#include "kvm_apicdef.h"
#include "kvm_iodev.h"
#include "kvm.h"
#include "kvm_x86impl.h"
#include "kvm_irq.h"
#include "kvm_ioapic.h"
#include "kvm_coalesced_mmio.h"
#include "kvm_i8254.h"
#include "kvm_mmu.h"
#include "kvm_cache_regs.h"
#include "kvm_para.h"
extern caddr_t smmap64(caddr_t addr, size_t len, int prot, int flags,
int fd, off_t pos);
extern int memcntl(caddr_t, size_t, int, caddr_t, int, int);
extern int lwp_sigmask(int, uint_t, uint_t, uint_t, uint_t);
extern uint64_t cpu_freq_hz;
static unsigned long empty_zero_page[PAGESIZE / sizeof (unsigned long)];
static uint64_t cpu_tsc_khz;
/*
* Globals
*/
struct kvm_x86_ops *kvm_x86_ops;
int ignore_msrs = 0;
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
| X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
| X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
/*
* EFER defaults:
* - enable syscall by default because it is emulated by KVM
* - enable LME and LMA per default on 64 bit KVM
*/
static uint64_t efer_reserved_bits = 0xfffffffffffffafeULL;
static void update_cr8_intercept(struct kvm_vcpu *);
static struct kvm_shared_msrs_global shared_msrs_global;
static struct kvm_shared_msrs *shared_msrs;
/*
* Replace the current thread's held-signal mask with *setp by calling
* lwp_sigmask(), optionally reporting the mask held beforehand.
*
* how: must be SIG_SETMASK (asserted); no other operation is handled.
* setp: the mask to install; must not be NULL (asserted).
* osetp: if non-NULL, receives curthread->t_hold, converted to a sigset_t,
* as it stood before lwp_sigmask() is called.
*/
void
kvm_sigprocmask(int how, sigset_t *setp, sigset_t *osetp)
{
k_sigset_t kset;
ASSERT(how == SIG_SETMASK);
ASSERT(setp != NULL);
/* Convert the new mask first, then capture the old one. */
sigutok(setp, &kset);
if (osetp != NULL)
sigktou(&curthread->t_hold, osetp);
/* Three words of the converted mask are passed; the fourth is zero. */
(void) lwp_sigmask(SIG_SETMASK,
kset.__sigbits[0], kset.__sigbits[1], kset.__sigbits[2], 0);
}
/*
 * User-return notifier callback.  For every defined shared MSR slot whose
 * current value differs from the saved host value, write the host value back
 * to the MSR; then mark the notifier as no longer registered and unregister
 * it.
 *
 * The kvm_shared_msrs record being serviced is recovered from `urn', which is
 * its embedded `urn' member.
 */
static void
kvm_on_user_return(struct kvm_vcpu *vcpu, struct kvm_user_return_notifier *urn)
{
	struct kvm_shared_msrs *smsr;
	unsigned i;

	smsr = (struct kvm_shared_msrs *)((caddr_t)urn -
	    offsetof(struct kvm_shared_msrs, urn));

	for (i = 0; i < shared_msrs_global.nr; i++) {
		struct kvm_shared_msr_values *v = &smsr->values[i];

		/* Nothing to restore when the host value is still loaded. */
		if (v->host == v->curr)
			continue;

		wrmsrl(shared_msrs_global.msrs[i], v->host);
		v->curr = v->host;
	}

	smsr->registered = 0;

	/*
	 * This handler runs when the thread is either returning to userspace
	 * or going off-cpu, so the host MSR values must be read afresh before
	 * the next VM entry.
	 */
	smsr->host_saved = 0;

	kvm_user_return_notifier_unregister(vcpu, urn);
}
void
kvm_define_shared_msr(unsigned slot, uint32_t msr)
{
if (slot >= shared_msrs_global.nr)
shared_msrs_global.nr = slot + 1;
shared_msrs_global.msrs[slot] = msr;
/* we need ensured the shared_msr_global have been updated */
smp_wmb();
}
void
kvm_set_shared_msr(struct kvm_vcpu *vcpu, unsigned slot, uint64_t value,
uint64_t mask)
{
struct kvm_shared_msrs *smsr = &shared_msrs[CPU->cpu_id];
const uint32_t msr = shared_msrs_global.msrs[slot];
const uint_t slot_bit = 1 << slot;
ASSERT(slot < KVM_NR_SHARED_MSRS);
/* Preserve host MSR values prior to loading the guest data. */
if ((smsr->host_saved & slot_bit) == 0) {
uint64_t temp;
rdmsrl_safe(msr, (unsigned long long *)&temp);
smsr->values[slot].host = temp;
smsr->values[slot].curr = temp;
smsr->host_saved |= slot_bit;
}
if (((value ^ smsr->values[slot].curr) & mask) == 0)
return;
smsr->values[slot].curr = value;
wrmsrl(msr, value);
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
kvm_user_return_notifier_register(vcpu, &smsr->urn);
smsr->registered = 1;
}
}
/*
 * Return the base address encoded in the descriptor that `selector' refers
 * to, or 0 for the null selector.
 *
 * The descriptor is looked up in the GDT, unless bit 2 of the selector is
 * set, in which case the table base is obtained by resolving the current LDT
 * selector through a recursive call.  For a system descriptor (s == 0) of
 * type 2, 9 or 11, the `base3' field of the 16-byte descriptor form supplies
 * bits 32-63 of the result.
 */
unsigned long
segment_base(uint16_t selector)
{
	struct descriptor_table gdt;
	struct desc_struct *desc;
	unsigned long tbl;
	unsigned long base;

	if (selector == 0)
		return (0);

	kvm_get_gdt(&gdt);
	if (selector & 4) {
		/* selector names an LDT entry */
		uint16_t ldt_sel = kvm_read_ldt();

		tbl = segment_base(ldt_sel);
	} else {
		tbl = gdt.base;
	}

	desc = (struct desc_struct *)(tbl + (selector & ~7));
	base = get_desc_base(desc);

	if (desc->c.b.s == 0) {
		switch (desc->c.b.type) {
		case 2:
		case 9:
		case 11:
			base |= ((unsigned long)
			    ((struct ldttss_desc64 *)desc)->base3) << 32;
			break;
		default:
			break;
		}
	}

	return (base);
}
/*
* Return the APIC base value recorded in vcpu->arch.apic_base.
*
* NOTE(review): both arms of the irqchip_in_kernel() test return the same
* field, so the result does not depend on the outcome of that test.
*/
uint64_t
kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
if (irqchip_in_kernel(vcpu->kvm))
return (vcpu->arch.apic_base);
else
return (vcpu->arch.apic_base);
}
/*
 * Set the APIC base for `vcpu' to `data'.  When the irqchip is handled in
 * the kernel the value is handed to kvm_lapic_set_base(); otherwise it is
 * stored directly in vcpu->arch.apic_base.
 */
void
kvm_set_apic_base(struct kvm_vcpu *vcpu, uint64_t data)
{
	/* TODO: reserve bits check */
	if (!irqchip_in_kernel(vcpu->kvm)) {
		vcpu->arch.apic_base = data;
		return;
	}

	kvm_lapic_set_base(vcpu, data);
}
#define EXCPT_BENIGN 0
#define EXCPT_CONTRIBUTORY 1
#define EXCPT_PF 2
/*
 * Classify an exception vector for the purpose of deciding how two
 * back-to-back exceptions combine (see kvm_multiple_exception()).
 *
 * Returns EXCPT_PF for the page-fault vector, EXCPT_CONTRIBUTORY for the
 * DE, TS, NP, SS and GP vectors, and EXCPT_BENIGN for everything else.
 */
static int
exception_class(int vector)
{
	if (vector == PF_VECTOR)
		return (EXCPT_PF);

	if (vector == DE_VECTOR || vector == TS_VECTOR ||
	    vector == NP_VECTOR || vector == SS_VECTOR ||
	    vector == GP_VECTOR)
		return (EXCPT_CONTRIBUTORY);

	return (EXCPT_BENIGN);
}
/*
 * Queue exception `nr' on `vcpu', taking into account an exception that may
 * already be pending.
 *
 * vcpu:       target vCPU.
 * nr:         vector of the new exception.
 * has_error:  non-zero if the exception carries an error code.
 * error_code: the error code, meaningful only when has_error is set.
 *
 * - Nothing pending: the new exception is queued as is.
 * - A double fault is pending: a triple-fault request is raised on the vCPU
 *   and nothing is queued.
 * - The pending and new exceptions combine into a double fault (both
 *   contributory, or a page fault followed by anything non-benign): the
 *   pending exception is replaced by DF_VECTOR with error code 0.
 * - Otherwise the new exception replaces the pending one.
 */
static void
kvm_multiple_exception(struct kvm_vcpu *vcpu,
    unsigned nr, int has_error, uint32_t error_code)
{
	if (vcpu->arch.exception.pending) {
		uint32_t pending_nr = vcpu->arch.exception.nr;
		int pending_class, new_class;

		if (pending_nr == DF_VECTOR) {
			/* triple fault -> shutdown */
			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
			return;
		}

		pending_class = exception_class(pending_nr);
		new_class = exception_class(nr);

		if ((pending_class == EXCPT_CONTRIBUTORY &&
		    new_class == EXCPT_CONTRIBUTORY) ||
		    (pending_class == EXCPT_PF && new_class != EXCPT_BENIGN)) {
			/* generate double fault per SDM Table 5-5 */
			has_error = 1;
			nr = DF_VECTOR;
			error_code = 0;
		}
		/*
		 * In the remaining case the previous exception is simply
		 * replaced by the new one, in the hope that re-executing the
		 * instruction will regenerate the lost exception.
		 */
	}

	vcpu->arch.exception.pending = 1;
	vcpu->arch.exception.has_error_code = has_error;
	vcpu->arch.exception.nr = nr;
	vcpu->arch.exception.error_code = error_code;
}
/*
* Queue exception vector nr on vcpu without an error code, by way of
* kvm_multiple_exception().
*/
void
kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
kvm_multiple_exception(vcpu, nr, 0, 0);
}
/*
* Queue a page fault for the guest: bump the kvmvs_pf_guest statistic,
* record the faulting address in vcpu->arch.cr2 and queue PF_VECTOR with
* the given error code.
*/
void
kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
uint32_t error_code)
{
KVM_VCPU_KSTAT_INC(vcpu, kvmvs_pf_guest);
vcpu->arch.cr2 = addr;
kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
}
/*
* Flag an NMI as pending on vcpu by setting vcpu->arch.nmi_pending.
*/
void
kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
vcpu->arch.nmi_pending = 1;
}
/*
* Queue a general-protection fault (GP_VECTOR) on vcpu with the given
* error code.
*/
void
kvm_inject_gp(struct kvm_vcpu *vcpu, uint32_t error_code)
{
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}
/*
* Queue exception vector nr on vcpu together with an error code, by way of
* kvm_multiple_exception().
*/
void
kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, uint32_t error_code)
{
kvm_multiple_exception(vcpu, nr, 1, error_code);
}
/*
 * Privilege check for the guest's current privilege level (CPL).
 *
 * Returns 1 when the CPL reported by kvm_x86_ops->get_cpl() is numerically
 * no greater than required_cpl.  Otherwise a #GP with error code 0 is queued
 * on the vCPU and 0 is returned.
 */
int
kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	int permitted = (kvm_x86_ops->get_cpl(vcpu) <= required_cpl);

	if (!permitted)
		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);

	return (permitted);
}
/*
 * Load the PAE page-directory-pointer entries referenced by `cr3' into
 * vcpu->arch.pdptrs.
 *
 * Returns 1 if the entries could be read from guest memory and none of the
 * present ones has a reserved bit set; in that case the cached copy is
 * replaced and VCPU_EXREG_PDPTR is marked both available and dirty.
 * Returns 0 otherwise, leaving the cached copy untouched.
 */
int
load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	uint64_t entries[ARRAY_SIZE(vcpu->arch.pdptrs)];
	gfn_t table_gfn = cr3 >> PAGESHIFT;
	/*
	 * Position of the table within its page, in units of uint64_t: the
	 * in-page part of cr3 is rounded down to a multiple of 32 bytes.
	 */
	unsigned entry_off = ((cr3 & (PAGESIZE-1)) >> 5) << 2;
	int idx;
	int rc;

	rc = kvm_read_guest_page(vcpu->kvm, table_gfn,
	    entries, entry_off * sizeof (uint64_t), sizeof (entries));
	if (rc < 0)
		return (0);

	for (idx = 0; idx < ARRAY_SIZE(entries); idx++) {
		if (!is_present_gpte(entries[idx]))
			continue;
		if (entries[idx] & vcpu->arch.mmu.rsvd_bits_mask[0][2])
			return (0);
	}

	memcpy(vcpu->arch.pdptrs, entries, sizeof (vcpu->arch.pdptrs));
	__set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_dirty);

	return (1);
}
/*
 * Report whether the PAE page-directory-pointer entries in guest memory
 * differ from the copy cached in vcpu->arch.pdptrs.
 *
 * Returns 0 when the vCPU is in long mode or is not using PAE.  Returns 1
 * when the cached copy is not marked available, when guest memory cannot be
 * read, or when the freshly read entries differ from the cached ones.
 */
static int
pdptrs_changed(struct kvm_vcpu *vcpu)
{
	uint64_t fresh[ARRAY_SIZE(vcpu->arch.pdptrs)];
	int uses_pdptrs = !is_long_mode(vcpu) && is_pae(vcpu);

	if (!uses_pdptrs)
		return (0);

	if (!test_bit(VCPU_EXREG_PDPTR,
	    (unsigned long *)&vcpu->arch.regs_avail) ||
	    kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u,
	    fresh, sizeof (fresh)) < 0) {
		return (1);
	}

	return (memcmp(fresh, vcpu->arch.pdptrs, sizeof (fresh)) != 0);
}
/*
 * Handle a guest write of `cr0'.  The ET bit is always forced on and the
 * reserved bits are silently dropped.  A #GP with error code 0 is injected,
 * and the write discarded, when:
 *
 *  - any of the upper 32 bits is set;
 *  - NW is set without CD, or PG is set without PE;
 *  - paging is being switched on with EFER.LME set while PAE is off or the
 *    code segment has its L bit set;
 *  - paging is being switched on without EFER.LME, PAE is on, and the
 *    PDPTRs cannot be loaded from the current cr3.
 *
 * On success the value is passed to the vendor set_cr0 hook, recorded in
 * vcpu->arch.cr0, and the MMU context is reset.
 */
void
kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	cr0 |= X86_CR0_ET;

	if (cr0 & 0xffffffff00000000UL)
		goto inject_gp;

	cr0 &= ~CR0_RESERVED_BITS;

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
		goto inject_gp;

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
		goto inject_gp;

	/* Extra checks apply only when paging goes from off to on. */
	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
		if (vcpu->arch.efer & EFER_LME) {
			int cs_db, cs_l;

			if (!is_pae(vcpu))
				goto inject_gp;

			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l)
				goto inject_gp;
		} else if (is_pae(vcpu) &&
		    !load_pdptrs(vcpu, vcpu->arch.cr3)) {
			goto inject_gp;
		}
	}

	kvm_x86_ops->set_cr0(vcpu, cr0);
	vcpu->arch.cr0 = cr0;
	kvm_mmu_reset_context(vcpu);
	return;

inject_gp:
	kvm_inject_gp(vcpu, 0);
}
/*
* Apply a machine-status-word write: hand kvm_set_cr0() the result of
* kvm_read_cr0_bits(vcpu, ~0x0eul) OR-ed with the low four bits of msw.
*
* Presumably kvm_read_cr0_bits() returns the current CR0 restricted to the
* given mask (confirm); if so, bits 1-3 come from msw alone, bit 0 can be
* set by msw but not cleared, and all higher bits are carried over.
*/
void
kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
/*
 * Handle a guest write of `cr4'.  A #GP with error code 0 is injected, and
 * the write discarded, when:
 *
 *  - a reserved bit is set;
 *  - the vCPU is in long mode and PAE is being cleared;
 *  - outside long mode, with paging and PAE on, one of PGE/PSE/PAE changes
 *    and the PDPTRs cannot be reloaded from the current cr3;
 *  - VMXE is set.
 *
 * On success the value is passed to the vendor set_cr4 hook, recorded in
 * vcpu->arch.cr4, the MMU role's cr4_pge flag is recomputed, and the MMU
 * context is reset.
 */
void
kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long prev_cr4 = kvm_read_cr4(vcpu);
	unsigned long reload_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;

	if (cr4 & CR4_RESERVED_BITS)
		goto inject_gp;

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE))
			goto inject_gp;
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) &&
	    ((cr4 ^ prev_cr4) & reload_bits) &&
	    !load_pdptrs(vcpu, vcpu->arch.cr3)) {
		goto inject_gp;
	}

	if (cr4 & X86_CR4_VMXE)
		goto inject_gp;

	kvm_x86_ops->set_cr4(vcpu, cr4);
	vcpu->arch.cr4 = cr4;
	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
	kvm_mmu_reset_context(vcpu);
	return;

inject_gp:
	kvm_inject_gp(vcpu, 0);
}
/*
 * Handle a guest write of `cr3'.
 *
 * Rewriting the current value while the PDPTRs are unchanged only syncs the
 * MMU roots and flushes the TLB.  Otherwise the value is validated and, if
 * acceptable, recorded in vcpu->arch.cr3 and announced to the MMU through
 * its new_cr3 hook.  A #GP with error code 0 is injected, and the write
 * discarded, when a reserved bit is set (long mode or PAE), when the PDPTRs
 * cannot be loaded (PAE with paging on), or when the value does not point
 * into a memory slot.
 */
void
kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
		kvm_mmu_sync_roots(vcpu);
		kvm_mmu_flush_tlb(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS)
			goto inject_gp;
	} else if (is_pae(vcpu)) {
		if (cr3 & CR3_PAE_RESERVED_BITS)
			goto inject_gp;
		if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
			goto inject_gp;
	}
	/*
	 * Reserved bits are deliberately not checked in non-PAE mode: real
	 * hardware does not enforce them, and VMware depends on this.
	 */

	/*
	 * Does the new cr3 value map to physical memory?  An invalid cr3 is
	 * caught even in real mode, because it would cause trouble later on
	 * when paging is turned on anyway.
	 *
	 * A real CPU would silently accept an invalid cr3 and would attempt
	 * to use it - with largely undefined (and often hard to debug)
	 * behavior on the guest side.
	 */
	if (!gfn_to_memslot(vcpu->kvm, cr3 >> PAGESHIFT))
		goto inject_gp;

	vcpu->arch.cr3 = cr3;
	vcpu->arch.mmu.new_cr3(vcpu);
	return;

inject_gp:
	kvm_inject_gp(vcpu, 0);
}
/*
 * Handle a guest write of `cr8'.  A value with reserved bits set draws a #GP
 * with error code 0.  Otherwise the value goes to the in-kernel local APIC
 * via kvm_lapic_set_tpr() when the irqchip is in the kernel, or is stored in
 * vcpu->arch.cr8 when it is not.
 */
void
kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS) {
		kvm_inject_gp(vcpu, 0);
		return;
	}

	if (!irqchip_in_kernel(vcpu->kvm)) {
		vcpu->arch.cr8 = cr8;
		return;
	}

	kvm_lapic_set_tpr(vcpu, cr8);
}
/*
 * Return the guest's CR8 value: from the in-kernel local APIC via
 * kvm_lapic_get_cr8() when the irqchip is in the kernel, otherwise from
 * vcpu->arch.cr8.
 */
unsigned long
kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (!irqchip_in_kernel(vcpu->kvm))
		return (vcpu->arch.cr8);

	return (kvm_lapic_get_cr8(vcpu));
}
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
*
* This list is modified at module load time to reflect the
* capabilities of the host cpu. This capabilities test skips MSRs that are
* kvm-specific. Those are put in the beginning of the list.
*/
/*
* Number of entries at the head of msrs_to_save[] that are kvm-specific:
* the MSR_KVM_* and HV_X64_* entries listed first below.
*/
#define KVM_SAVE_MSRS_BEGIN 5
static uint32_t msrs_to_save[] = {
/* kvm-specific entries; KVM_SAVE_MSRS_BEGIN of them */
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
HV_X64_MSR_APIC_ASSIST_PAGE,
/* remaining entries */
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_K6_STAR,
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};
/* Presumably the count of usable msrs_to_save[] entries; set elsewhere. */
static unsigned num_msrs_to_save;
/* NOTE(review): presumably MSRs emulated by KVM itself; confirm at use site. */
static uint32_t emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
};
/*
 * Handle a guest write of the EFER MSR.
 *
 * Returns 1, without changing any state, when the value sets a reserved
 * bit, tries to flip LME while paging is on, or enables FFXSR or SVME
 * without the matching feature bit in the guest's CPUID leaf 0x80000001.
 * Returns 0 on success.
 *
 * The LMA bit is never taken from the written value; it is carried over
 * from the current vcpu->arch.efer.  On success the value is passed to the
 * vendor set_efer hook, recorded, the MMU role's nxe flag is recomputed and
 * the MMU context is reset.
 */
static int
set_efer(struct kvm_vcpu *vcpu, uint64_t efer)
{
	struct kvm_cpuid_entry2 *leaf;

	if (efer & efer_reserved_bits)
		return (1);

	if (is_paging(vcpu) && ((vcpu->arch.efer ^ efer) & EFER_LME))
		return (1);

	if (efer & EFER_FFXSR) {
		leaf = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!leaf || !(leaf->edx & bit(X86_FEATURE_FXSR_OPT)))
			return (1);
	}

	if (efer & EFER_SVME) {
		leaf = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!leaf || !(leaf->ecx & bit(X86_FEATURE_SVM)))
			return (1);
	}

	/* Keep the existing LMA state regardless of what was written. */
	efer = (efer & ~EFER_LMA) | (vcpu->arch.efer & EFER_LMA);

	kvm_x86_ops->set_efer(vcpu, efer);
	vcpu->arch.efer = efer;
	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
	kvm_mmu_reset_context(vcpu);

	return (0);
}
/*
* Remove the bits in mask from efer_reserved_bits, so that set_efer() no
* longer rejects EFER values that have them set.
*/
void
kvm_enable_efer_bits(uint64_t mask)
{
efer_reserved_bits &= ~mask;
}
/*
* Writes msr value into the appropriate "register" by calling the
* set_msr hook of kvm_x86_ops.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
int
kvm_set_msr(struct kvm_vcpu *vcpu, uint32_t msr_index, uint64_t data)
{
return (kvm_x86_ops->set_msr(vcpu, msr_index, data));
}
/*
* Adapt kvm_set_msr() to msr_io()'s calling convention: the value to write
* is passed by pointer and dereferenced here. Returns what kvm_set_msr()
* returns.
*/
static int
do_set_msr(struct kvm_vcpu *vcpu, unsigned index, uint64_t *data)
{
return (kvm_set_msr(vcpu, index, *data));
}
/*
* Write the wall-clock time recorded for this VM (kvm->arch.boot_wallclock)
* into the pvclock_wall_clock structure at guest-physical address
* wall_clock.
*
* Nothing is done if wall_clock is zero or if the current version word
* cannot be read from the guest. The version word is written as an odd
* value before the structure is filled in and as the following even value
* once it is complete.
*
* NOTE(review): the return values of the kvm_write_guest() calls are not
* checked.
*/
static void
kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
uint32_t version;
struct pvclock_wall_clock wc;
timespec_t ts;
if (!wall_clock)
return;
if (kvm_read_guest(kvm, wall_clock, &version, sizeof (version)) != 0)
return;
if (version & 1)
version++; /* first time write, random junk */
/* version is even here; make it odd while the data is being written */
version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof (version));
/* Use recorded time at VM creation */
wc.sec = kvm->arch.boot_wallclock.tv_sec;
wc.nsec = kvm->arch.boot_wallclock.tv_nsec;
wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof (wc));
/* back to an even version now that the data is in place */
version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof (version));
}
/*
* Return the 32-bit quotient of (dividend << 32) / divisor.
*
* The divl instruction divides the 64-bit value in EDX:EAX by its operand.
* The constraints below load EDX with dividend and EAX with zero, which
* forms dividend << 32. The remainder is produced but not returned.
*
* NOTE(review): divl faults when the quotient does not fit in 32 bits, which
* is the case unless dividend < divisor; callers presumably guarantee this
* -- confirm.
*/
static uint32_t
div_frac(uint32_t dividend, uint32_t divisor)
{
uint32_t quotient, remainder;
/*
* Don't try to replace with do_div(), this one calculates
* "(dividend << 32) / divisor"
*/
__asm__("divl %4"
: "=a" (quotient), "=d" (remainder)
: "0" (0), "1" (dividend), "r" (divisor));
return (quotient);
}
/*
* Refresh the pvclock_vcpu_time_info structure that the guest registered
* at vcpu->time_addr with the current TSC/hrtime parameters.
*
* Nothing is done when no address is registered (time_addr == 0). If the
* page backing the address cannot be obtained, time_addr is cleared, which
* turns later calls into no-ops. On success vcpu->time_update records the
* hrtime of this update and the guest page is marked dirty.
*/
static void
kvm_write_guest_time(struct kvm_vcpu *v)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
page_t *page;
struct pvclock_vcpu_time_info *pvclock;
hrtime_t hrt;
uint64_t tsc;
uint32_t scale, version;
uint8_t shift;
if (vcpu->time_addr == 0)
return;
page = gfn_to_page(v->kvm, vcpu->time_addr >> PAGESHIFT);
if (page == bad_page) {
vcpu->time_addr = 0;
return;
}
/* Locate the structure inside the page that was just looked up. */
pvclock = (void *)((uintptr_t)page_address(page) +
offset_in_page(vcpu->time_addr));
version = pvclock->version;
/*
* A note from Linux upstream about the role of the 'version' field in
* the pvclock_vcpu_time_info structure:
*
* This VCPU is paused, but it's legal for a guest to read another
* VCPU's kvmclock, so we really have to follow the specification where
* it says that version is odd if data is being modified, and even
* after it is consistent.
*/
if (version & 1) {
/* uninitialized state with update bit set */
version += 2;
} else {
/* indicate update in progress */
version++;
}
pvclock->version = version;
/* The odd version must be stored before any of the fields change. */
membar_producer();
hrt = tsc_gethrtime_params(&tsc, &scale, &shift);
pvclock->tsc_timestamp = tsc + vcpu->tsc_offset;
/* System time is reported relative to the VM's recorded boot hrtime. */
pvclock->system_time = hrt - v->kvm->arch.boot_hrtime;
pvclock->tsc_to_system_mul = scale;
pvclock->tsc_shift = shift;
pvclock->flags = PVCLOCK_TSC_STABLE_BIT;
/* All fields must be stored before the version becomes even again. */
membar_producer();
/* indicate update finished */
pvclock->version = version + 1;
vcpu->time_update = hrt;
kvm_release_page_dirty(page);
mark_page_dirty(v->kvm, vcpu->time_addr >> PAGESHIFT);
}
/*
* In the upstream Linux KVM, routine updates to pvclock data are throttled to
* a 100ms interval. We use that value as well.
*/
#define KVMCLOCK_UPDATE_INTERVAL (100000000U) /* 100ms in ns */
/*
 * Ask for the guest's pvclock data to be refreshed by raising
 * KVM_REQ_KVMCLOCK_UPDATE on the vCPU.
 *
 * v:     the vCPU whose pvclock data should be refreshed.
 * force: when true, the request is raised regardless of how recently the
 *        last update took place.
 *
 * Returns 1 if the request was raised, 0 if it was skipped because the
 * guest has not registered a pvclock address or because the last update is
 * less than KVMCLOCK_UPDATE_INTERVAL old.
 */
static int
kvm_request_guest_time_update(struct kvm_vcpu *v, boolean_t force)
{
	struct kvm_vcpu_arch *vcpu = &v->arch;

	if (vcpu->time_addr == 0)
		return (0);

	/*
	 * Throttling applies only when the request is neither forced nor the
	 * first one (time_update still zero).  In that case, check whether a
	 * reasonable (and somewhat arbitrary) amount of time has passed; if
	 * the last update was recent, skip this request to keep the pvclock
	 * write rate down.
	 */
	if (!force && vcpu->time_update != 0) {
		hrtime_t now = gethrtime();

		if ((now - vcpu->time_update) < KVMCLOCK_UPDATE_INTERVAL)
			return (0);
	}

	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);

	return (1);
}
/*
 * Return 1 if `msr' is one of the MTRR-related MSRs handled here, 0 if not.
 *
 * Accepted are the 2 * KVM_NR_VAR_MTRR registers starting at 0x200 (the
 * variable-range pairs), the fixed-range MTRRs, MSR_MTRRdefType,
 * MSR_IA32_CR_PAT and MSR 0x2f8.
 */
static int
msr_mtrr_valid(unsigned msr)
{
	/* variable-range registers */
	if (msr >= 0x200 && msr <= 0x200 + 2 * KVM_NR_VAR_MTRR - 1)
		return (1);

	switch (msr) {
	case MSR_MTRRfix64K_00000:
	case MSR_MTRRfix16K_80000:
	case MSR_MTRRfix16K_A0000:
	case MSR_MTRRfix4K_C0000:
	case MSR_MTRRfix4K_C8000:
	case MSR_MTRRfix4K_D0000:
	case MSR_MTRRfix4K_D8000:
	case MSR_MTRRfix4K_E0000:
	case MSR_MTRRfix4K_E8000:
	case MSR_MTRRfix4K_F0000:
	case MSR_MTRRfix4K_F8000:
	case MSR_MTRRdefType:
	case MSR_IA32_CR_PAT:
	case 0x2f8:
		return (1);
	default:
		return (0);
	}
}
/*
 * Return 1 if `t' is an acceptable PAT memory type, 0 otherwise.
 * The accepted encodings are 0, 1, 4, 5, 6 and 7 (bit set 0xf3).
 */
static int
valid_pat_type(unsigned t)
{
	if (t >= 8)
		return (0);

	return (((0xf3u >> t) & 1u) != 0);
}
/*
 * Return 1 if `t' is an acceptable MTRR memory type, 0 otherwise.
 * The accepted encodings are 0, 1, 4, 5 and 6 (bit set 0x73).
 */
static int
valid_mtrr_type(unsigned t)
{
	if (t >= 8)
		return (0);

	return (((0x73u >> t) & 1u) != 0);
}
/*
 * Check whether `data' is an acceptable value for MTRR-related MSR `msr'.
 * Returns 1 if it is, 0 if not (including when `msr' is not an MSR that
 * msr_mtrr_valid() recognizes).
 *
 *  - MSR_IA32_CR_PAT: each of the eight bytes must be a valid PAT type.
 *  - MSR_MTRRdefType: no bit outside 0xcff may be set and the low byte must
 *    be a valid MTRR type.
 *  - fixed-range MTRRs: each of the eight bytes must be a valid MTRR type.
 *  - anything else: only the low byte is checked, as an MTRR type.
 */
static int
mtrr_valid(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
{
	uint64_t rest;
	int i;

	if (!msr_mtrr_valid(msr))
		return (0);

	if (msr == MSR_IA32_CR_PAT) {
		for (i = 0, rest = data; i < 8; i++, rest >>= 8) {
			if (!valid_pat_type(rest & 0xff))
				return (0);
		}
		return (1);
	}

	if (msr == MSR_MTRRdefType) {
		if (data & ~0xcff)
			return (0);
		return (valid_mtrr_type(data & 0xff));
	}

	if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
		for (i = 0, rest = data; i < 8; i++, rest >>= 8) {
			if (!valid_mtrr_type(rest & 0xff))
				return (0);
		}
		return (1);
	}

	/* variable MTRRs */
	return (valid_mtrr_type(data & 0xff));
}
/*
 * Handle a guest write of `data' to MTRR-related MSR `msr'.
 *
 * Returns 1 if mtrr_valid() rejects the MSR/value pair.  Otherwise the value
 * is stored in the matching part of vcpu->arch.mtrr_state (or in
 * vcpu->arch.pat for MSR_IA32_CR_PAT), the MMU context is reset, and 0 is
 * returned.
 *
 * The fixed-range registers are stored as consecutive 64-bit words over
 * fixed_ranges: word 0 for the 64K register, words 1-2 for the 16K
 * registers, and words 3 onward for the 4K registers.
 */
static int
set_msr_mtrr(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
{
	struct mtrr_state_type *mtrr = &vcpu->arch.mtrr_state;
	uint64_t *fixed = (uint64_t *)&mtrr->fixed_ranges;

	if (!mtrr_valid(vcpu, msr, data))
		return (1);

	if (msr == MSR_MTRRdefType) {
		mtrr->def_type = data;
		mtrr->enabled = (data & 0xc00) >> 10;
	} else if (msr == MSR_MTRRfix64K_00000) {
		fixed[0] = data;
	} else if (msr == MSR_MTRRfix16K_80000 ||
	    msr == MSR_MTRRfix16K_A0000) {
		fixed[1 + msr - MSR_MTRRfix16K_80000] = data;
	} else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) {
		fixed[3 + msr - MSR_MTRRfix4K_C0000] = data;
	} else if (msr == MSR_IA32_CR_PAT) {
		vcpu->arch.pat = data;
	} else {
		/*
		 * Variable MTRRs come in base/mask pairs starting at 0x200:
		 * even offsets are bases, odd offsets are masks.
		 */
		int pair = (msr - 0x200) >> 1;
		int is_mask = (msr - 0x200) & 1;
		uint64_t *slot;

		slot = is_mask ?
		    (uint64_t *)&mtrr->var_ranges[pair].mask_lo :
		    (uint64_t *)&mtrr->var_ranges[pair].base_lo;
		*slot = data;
	}

	kvm_mmu_reset_context(vcpu);

	return (0);
}
/*
 * Handle a guest write of `data' to a machine-check MSR.
 *
 * Returns 0 when the value was stored, 1 when the MSR is not handled here
 * (or MCG_CTL is written while MCG_CTL_P is clear in mcg_cap), and -1 when
 * the value is not acceptable for a control register.
 *
 * The number of banks is taken from the low byte of vcpu->arch.mcg_cap; each
 * bank owns four consecutive MSRs starting at MSR_IA32_MC0_CTL.
 */
static int
set_msr_mce(struct kvm_vcpu *vcpu, uint32_t msr, uint64_t data)
{
	uint64_t cap = vcpu->arch.mcg_cap;
	unsigned nbanks = cap & 0xff;
	uint32_t bank_off;

	if (msr == MSR_IA32_MCG_STATUS) {
		vcpu->arch.mcg_status = data;
		return (0);
	}

	if (msr == MSR_IA32_MCG_CTL) {
		if (!(cap & MCG_CTL_P))
			return (1);
		if (data != 0 && data != ~(uint64_t)0)
			return (-1);
		vcpu->arch.mcg_ctl = data;
		return (0);
	}

	if (msr < MSR_IA32_MC0_CTL || msr >= MSR_IA32_MC0_CTL + 4 * nbanks)
		return (1);

	bank_off = msr - MSR_IA32_MC0_CTL;

	/*
	 * Only 0 or all 1s can be written to IA32_MCi_CTL.  Some Linux
	 * kernels, though, clear bit 10 in bank 4 to work around a BIOS/GART
	 * TBL issue on AMD K8s; tolerate that to avoid an uncaught #GP in
	 * the guest.
	 */
	if ((bank_off & 0x3) == 0 &&
	    data != 0 && (data | (1 << 10)) != ~(uint64_t)0)
		return (-1);

	vcpu->arch.mce_banks[bank_off] = data;

	return (0);
}
/*
 * Copy one page of the configured Xen HVM blob into guest memory.
 *
 * `data' is split with PAGEMASK: the part selected by ~PAGEMASK is the index
 * of the page within the blob, and the part selected by PAGEMASK is the
 * guest-physical address the page is written to.  The 64-bit or 32-bit blob
 * is used depending on whether the vCPU is in long mode.
 *
 * Returns 0 on success, E2BIG if the page index is not below the blob size,
 * or EFAULT if the page cannot be copied in from the blob address or cannot
 * be written to the guest.
 */
static int
xen_hvm_config(struct kvm_vcpu *vcpu, uint64_t data)
{
	struct kvm *kvm = vcpu->kvm;
	uint32_t page_num = data & ~PAGEMASK;
	uint64_t page_addr = data & PAGEMASK;
	uint8_t *blob_addr;
	uint8_t blob_size;
	uint8_t *buf;
	int err;

	if (is_long_mode(vcpu)) {
		blob_addr =
		    (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_64;
		blob_size = kvm->arch.xen_hvm_config.blob_size_64;
	} else {
		blob_addr =
		    (uint8_t *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
		blob_size = kvm->arch.xen_hvm_config.blob_size_32;
	}

	if (page_num >= blob_size)
		return (E2BIG);

	/* Stage the page in a kernel buffer before writing it to the guest. */
	buf = kmem_alloc(PAGESIZE, KM_SLEEP);

	if (copyin(blob_addr + (page_num * PAGESIZE), buf, PAGESIZE) ||
	    kvm_write_guest(kvm, page_addr, buf, PAGESIZE))
		err = EFAULT;
	else
		err = 0;

	kmem_free(buf, PAGESIZE);

	return (err);
}
/*
* Return non-zero if the HV_X64_MSR_HYPERCALL_ENABLE bit is set in the
* hypercall value recorded for this VM (kvm->arch.hv_hypercall).
*/
static int
kvm_hv_hypercall_enabled(struct kvm *kvm)
{
return (kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE);
}
/*
 * Return 1 if `msr' is one of the Hyper-V MSRs treated as partition-wide
 * here (HV_X64_MSR_GUEST_OS_ID or HV_X64_MSR_HYPERCALL), 0 otherwise.
 */
static int
kvm_hv_msr_partition_wide(uint32_t msr)
{
	return (msr == HV_X64_MSR_GUEST_OS_ID || msr == HV_X64_MSR_HYPERCALL);
}