/******************************************************************************
 * flushtlb.c
 *
 * TLB flushes are timestamped using a global virtual 'clock' which ticks
 * on any TLB flush on any processor.
 *
 * Copyright (c) 2003-2006, K A Fraser
 */

#include <xen/paging.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/softirq.h>
#include <asm/flushtlb.h>
#include <asm/invpcid.h>
#include <asm/nops.h>
#include <asm/page.h>
#include <asm/pv/domain.h>
#include <asm/spec_ctrl.h>

/* Debug builds: Wrap frequently to stress-test the wrap logic. */
#ifdef NDEBUG
#define WRAP_MASK (0xFFFFFFFFU)
#else
#define WRAP_MASK (0x000003FFU)
#endif

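/*
 * Xen only makes use of PCIDs for PV domains.  With no PV support configured,
 * force X86_CR4_PCIDE to 0 so the PCID-specific paths below get compiled out.
 */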
#ifndef CONFIG_PV
# undef X86_CR4_PCIDE
# define X86_CR4_PCIDE 0
#endif

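/*
 * tlbflush_clock: global virtual clock, ticked in pre_flush() before every
 *                 timestamped flush.
 * tlbflush_time:  each CPU's copy of the clock value at its last completed
 *                 flush, updated in post_flush().
 */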
u32 tlbflush_clock = 1U;
DEFINE_PER_CPU(u32, tlbflush_time);

/* Signals whether the TLB flush clock is in use. */
bool __read_mostly tlb_clk_enabled = true;

/*
 * pre_flush(): Increment the virtual TLB-flush clock. Returns new clock value.
 *
 * This must happen *before* we flush the TLB. If we did it afterwards, we
 * would race other CPUs invalidating PTEs. For example, a page invalidated
 * after the flush might be stamped with the old clock value, yet this CPU
 * could still speculatively pull the stale mapping into its TLB after the
 * flush but before the clock is incremented.
 */
static u32 pre_flush(void)
{
    u32 t, t1, t2;

    t = tlbflush_clock;
    do {
        t1 = t2 = t;
        /* Clock wrapped: someone else is leading a global TLB shootdown. */
        if ( unlikely(t1 == 0) )
            goto skip_clocktick;
        t2 = (t + 1) & WRAP_MASK;
    }
    while ( unlikely((t = cmpxchg(&tlbflush_clock, t1, t2)) != t1) );

    /* Clock wrapped: we will lead a global TLB shootdown. */
    if ( unlikely(t2 == 0) )
        raise_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ);

 skip_clocktick:
    return t2;
}

/*
 * post_flush(): Update this CPU's timestamp with specified clock value.
 *
 * Note that this happens *after* flushing the TLB, as otherwise we can race a
 * NEED_FLUSH() test on another CPU (e.g. the other CPU sees the updated CPU
 * stamp and so does not force a synchronous TLB flush, while the flush in
 * this function hasn't yet occurred and so the TLB might still hold stale
 * entries). The ordering would only actually matter if this function were
 * interruptible, and something that abuses the stale mapping could exist in
 * an interrupt handler. In fact neither of these is the case, so really we
 * are being ultra paranoid.
 */
static void post_flush(u32 t)
{
    this_cpu(tlbflush_time) = t;
}

static void do_tlb_flush(void)
{
    unsigned long flags, cr4;
    u32 t = 0;

    /* This non-reentrant function is sometimes called in interrupt context. */
    local_irq_save(flags);

    if ( tlb_clk_enabled )
        t = pre_flush();

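    /*
     * Flush everything: INVPCID (if available) invalidates all entries
     * including globals; failing that, toggling CR4.PGE flushes the entire
     * TLB including global entries; otherwise PGE is clear, no global
     * entries exist, and reloading CR3 suffices.
     */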
    if ( use_invpcid )
        invpcid_flush_all();
    else if ( (cr4 = read_cr4()) & X86_CR4_PGE )
    {
        write_cr4(cr4 & ~X86_CR4_PGE);
        write_cr4(cr4);
    }
    else
        write_cr3(read_cr3());

    if ( tlb_clk_enabled )
        post_flush(t);

    local_irq_restore(flags);
}

void switch_cr3_cr4(unsigned long cr3, unsigned long cr4)
{
    unsigned long flags, old_cr4;
    u32 t = 0;

    /* Throughout this function we make this assumption: */
    ASSERT(!(cr4 & X86_CR4_PCIDE) || !(cr4 & X86_CR4_PGE));

    /* This non-reentrant function is sometimes called in interrupt context. */
    local_irq_save(flags);

    if ( tlb_clk_enabled )
        t = pre_flush();
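
    /*
     * Guest TLB entries are tagged with ASIDs/VPIDs, which the CR3/CR4
     * writes below don't touch, so tick the ASID generation as well.
     */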
    hvm_flush_guest_tlbs();

    old_cr4 = read_cr4();
    ASSERT(!(old_cr4 & X86_CR4_PCIDE) || !(old_cr4 & X86_CR4_PGE));

    /*
     * We need to write CR4 before CR3 if we're about to enable PCIDE, at the
     * very least when the new PCID is non-zero.
     *
     * As we also need to do two CR4 writes in total when PGE is enabled and
     * is to remain enabled, do the one temporarily turning off the bit right
     * here as well.
     *
     * The only TLB flushing effect we depend on here is in case we move from
     * PGE set to PCIDE set, where we want global page entries gone (and none
     * to re-appear) after this write.
     */
    if ( !(old_cr4 & X86_CR4_PCIDE) &&
         ((cr4 & X86_CR4_PCIDE) || (cr4 & old_cr4 & X86_CR4_PGE)) )
    {
        old_cr4 = cr4 & ~X86_CR4_PGE;
        write_cr4(old_cr4);
    }

    /*
     * If the CR4 write is to turn off PCIDE, we don't need the CR3 write to
     * flush anything, as that transition is a full flush itself.
     */
    if ( (old_cr4 & X86_CR4_PCIDE) > (cr4 & X86_CR4_PCIDE) )
        cr3 |= X86_CR3_NOFLUSH;
    write_cr3(cr3);

    if ( old_cr4 != cr4 )
        write_cr4(cr4);

    /*
     *  PGE  | PCIDE | flush at
     * ------+-------+------------------------
     *  0->0 | 0->0  | CR3 write
     *  0->0 | 0->1  | n/a (see 1st CR4 write)
     *  0->x | 1->0  | CR4 write
     *  x->1 | x->1  | n/a
     *  0->0 | 1->1  | INVPCID
     *  0->1 | 0->0  | CR3 and CR4 writes
     *  1->0 | 0->0  | CR4 write
     *  1->0 | 0->1  | n/a (see 1st CR4 write)
     *  1->1 | 0->0  | n/a (see 1st CR4 write)
     *  1->x | 1->x  | n/a
     */
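    /*
     * With PCIDE set, the CR3 write above affects at most the current PCID,
     * so flush non-global entries for all PCIDs here.  Non-global entries
     * suffice, as PGE is guaranteed clear (see the ASSERT above).
     */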
    if ( cr4 & X86_CR4_PCIDE )
        invpcid_flush_all_nonglobals();

    if ( tlb_clk_enabled )
        post_flush(t);

    local_irq_restore(flags);
}

/*
 * The return value of this function is the passed-in "flags" argument, with
 * those bits cleared that have been fully (i.e. system-wide) taken care of
 * and hence require no further action on remote CPUs.
 */
unsigned int flush_area_local(const void *va, unsigned int flags)
{
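    /* The order is stored with a +1 bias (see FLUSH_ORDER()); undo it here. */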
    unsigned int order = (flags - 1) & FLUSH_ORDER_MASK;

    if ( flags & (FLUSH_TLB|FLUSH_TLB_GLOBAL) )
    {
        if ( order == 0 )
        {
            /*
             * We don't INVLPG multi-page regions because the 2M/4M/1G
             * region may not have been mapped with a superpage. Also there
             * are various errata surrounding INVLPG usage on superpages, and
             * a full flush is in any case not *that* expensive.
             */
            if ( read_cr4() & X86_CR4_PCIDE )
            {
                unsigned long addr = (unsigned long)va;

                /*
                 * Flush the addresses for all potential address spaces.
                 * We can't check the current domain for being subject to
                 * XPTI as current might be the idle vcpu while we still have
                 * some XPTI domain TLB entries.
                 * Using invpcid is okay here, as with PCID enabled we always
                 * have global pages disabled.
                 */
                invpcid_flush_one(PCID_PV_PRIV, addr);
                invpcid_flush_one(PCID_PV_USER, addr);
                if ( opt_xpti_hwdom || opt_xpti_domu )
                {
                    invpcid_flush_one(PCID_PV_PRIV | PCID_PV_XPTI, addr);
                    invpcid_flush_one(PCID_PV_USER | PCID_PV_XPTI, addr);
                }
            }
            else
                asm volatile ( "invlpg %0"
                               : : "m" (*(const char *)(va)) : "memory" );
        }
        else
            do_tlb_flush();
    }

    if ( flags & FLUSH_HVM_ASID_CORE )
        hvm_flush_guest_tlbs();

    if ( flags & FLUSH_CACHE )
    {
        const struct cpuinfo_x86 *c = &current_cpu_data;
        unsigned long i, sz = 0;

        if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
            sz = 1UL << (order + PAGE_SHIFT);

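        /*
         * Use a targeted CLFLUSH loop only when the VA can still be safely
         * dereferenced (no TLB flush requested, or FLUSH_VA_VALID set), the
         * flush stride and cache size are known, and the region is smaller
         * than the cache.  Otherwise fall back to a full WBINVD.
         */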
        if ( (!(flags & (FLUSH_TLB|FLUSH_TLB_GLOBAL)) ||
              (flags & FLUSH_VA_VALID)) &&
             c->x86_clflush_size && c->x86_cache_size && sz &&
             ((sz >> 10) < c->x86_cache_size) )
        {
            alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
            for ( i = 0; i < sz; i += c->x86_clflush_size )
                alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";"
                                  " clflush %0",
                                  "data16 clflush %0",      /* clflushopt */
                                  X86_FEATURE_CLFLUSHOPT,
                                  "m" (((const char *)va)[i]));
            flags &= ~FLUSH_CACHE;
        }
        else
        {
            wbinvd();
        }
    }

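    /*
     * Flag the change so this CPU's (XPTI) root page table gets re-synced
     * before the next exit to guest context.
     */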
    if ( flags & FLUSH_ROOT_PGTBL )
        get_cpu_info()->root_pgt_changed = true;

    return flags;
}

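/*
 * Flush flags needed to invalidate a guest's view of its TLB: shadow guests
 * have their mappings cached in the host TLB (FLUSH_TLB), while HVM guests
 * running with ASID-tagged TLBs (always the case on SVM, otherwise when
 * shadow paging is in use) additionally need a fresh ASID/VPID
 * (FLUSH_HVM_ASID_CORE).
 */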
unsigned int guest_flush_tlb_flags(const struct domain *d)
{
    bool shadow = paging_mode_shadow(d);
    bool asid = is_hvm_domain(d) && (cpu_has_svm || shadow);

    return (shadow ? FLUSH_TLB : 0) | (asid ? FLUSH_HVM_ASID_CORE : 0);
}

void guest_flush_tlb_mask(const struct domain *d, const cpumask_t *mask)
{
    unsigned int flags = guest_flush_tlb_flags(d);

    if ( flags )
        flush_mask(mask, flags);
}