/******************************************************************************
 * flushtlb.c
 *
 * TLB flushes are timestamped using a global virtual 'clock' which ticks
 * on any TLB flush on any processor.
 *
 * Copyright (c) 2003-2006, K A Fraser
 */

#include <xen/paging.h>
#include <xen/sched.h>
#include <xen/smp.h>
#include <xen/softirq.h>
#include <asm/flushtlb.h>
#include <asm/invpcid.h>
#include <asm/nops.h>
#include <asm/page.h>
#include <asm/pv/domain.h>
#include <asm/spec_ctrl.h>

/* Debug builds: Wrap frequently to stress-test the wrap logic. */
#ifdef NDEBUG
#define WRAP_MASK (0xFFFFFFFFU)
#else
#define WRAP_MASK (0x000003FFU)
#endif

#ifndef CONFIG_PV
# undef X86_CR4_PCIDE
# define X86_CR4_PCIDE 0
#endif

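/*
 * The virtual clock itself, plus each CPU's record of the clock value at its
 * own most recent flush.  NEED_FLUSH() compares a stamp taken when a mapping
 * was last in use against a CPU's tlbflush_time to decide whether that CPU
 * might still hold a stale TLB entry.
 */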
u32 tlbflush_clock = 1U;
DEFINE_PER_CPU(u32, tlbflush_time);

/* Signals whether the TLB flush clock is in use. */
bool __read_mostly tlb_clk_enabled = true;

/*
 * pre_flush(): Increment the virtual TLB-flush clock. Returns new clock value.
 *
 * This must happen *before* we flush the TLB. If we do it after, we race other
 * CPUs invalidating PTEs. For example, a page invalidated after the flush
 * might get the old timestamp, but this CPU can speculatively fetch the
 * mapping into its TLB after the flush but before inc'ing the clock.
 */
static u32 pre_flush(void)
{
    u32 t, t1, t2;

    t = tlbflush_clock;
    do {
        t1 = t2 = t;
        /* Clock wrapped: someone else is leading a global TLB shootdown. */
        if ( unlikely(t1 == 0) )
            goto skip_clocktick;
        t2 = (t + 1) & WRAP_MASK;
    }
    while ( unlikely((t = cmpxchg(&tlbflush_clock, t1, t2)) != t1) );

    /* Clock wrapped: we will lead a global TLB shootdown. */
    if ( unlikely(t2 == 0) )
        raise_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ);

 skip_clocktick:
    return t2;
}

/*
 * post_flush(): Update this CPU's timestamp with specified clock value.
 *
 * Note that this happens *after* flushing the TLB, as otherwise we can race a
 * NEED_FLUSH() test on another CPU. (e.g., other CPU sees the updated CPU
 * stamp and so does not force a synchronous TLB flush, but the flush in this
 * function hasn't yet occurred and so the TLB might be stale). The ordering
 * would only actually matter if this function were interruptible, and
 * something that abuses the stale mapping could exist in an interrupt
 * handler. In fact neither of these is the case, so really we are being ultra
 * paranoid.
 */
static void post_flush(u32 t)
{
    this_cpu(tlbflush_time) = t;
}

static void do_tlb_flush(void)
{
    unsigned long flags, cr4;
    u32 t = 0;

    /* This non-reentrant function is sometimes called in interrupt context. */
    local_irq_save(flags);

    if ( tlb_clk_enabled )
        t = pre_flush();

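    /*
     * Pick a full-flush mechanism: an all-context INVPCID and a CR4.PGE
     * toggle both drop global entries as well, while a plain CR3 reload only
     * covers non-global entries, which suffices when PGE is clear.
     */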
    if ( use_invpcid )
        invpcid_flush_all();
    else if ( (cr4 = read_cr4()) & X86_CR4_PGE )
    {
        write_cr4(cr4 & ~X86_CR4_PGE);
        write_cr4(cr4);
    }
    else
        write_cr3(read_cr3());

    if ( tlb_clk_enabled )
        post_flush(t);

    local_irq_restore(flags);
}

void switch_cr3_cr4(unsigned long cr3, unsigned long cr4)
{
    unsigned long flags, old_cr4;
    u32 t = 0;

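    /*
     * Global TLB entries match regardless of the active PCID, so PGE and
     * PCIDE must never be enabled at the same time for the per-PCID flushing
     * below to be correct.
     */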
    /* Throughout this function we make this assumption: */
    ASSERT(!(cr4 & X86_CR4_PCIDE) || !(cr4 & X86_CR4_PGE));

    /* This non-reentrant function is sometimes called in interrupt context. */
    local_irq_save(flags);

    if ( tlb_clk_enabled )
        t = pre_flush();
    hvm_flush_guest_tlbs();

    old_cr4 = read_cr4();
    ASSERT(!(old_cr4 & X86_CR4_PCIDE) || !(old_cr4 & X86_CR4_PGE));

    /*
     * We need to write CR4 before CR3 if we're about to enable PCIDE, at the
     * very least when the new PCID is non-zero.
     *
     * As we also need to do two CR4 writes in total when PGE is enabled and
     * is to remain enabled, do the one temporarily turning off the bit right
     * here as well.
     *
     * The only TLB flushing effect we depend on here is in case we move from
     * PGE set to PCIDE set, where we want global page entries gone (and none
     * to re-appear) after this write.
     */
    if ( !(old_cr4 & X86_CR4_PCIDE) &&
         ((cr4 & X86_CR4_PCIDE) || (cr4 & old_cr4 & X86_CR4_PGE)) )
    {
        old_cr4 = cr4 & ~X86_CR4_PGE;
        write_cr4(old_cr4);
    }

    /*
     * If the CR4 write is to turn off PCIDE, we don't need the CR3 write to
     * flush anything, as that transition is a full flush itself.
     */
    if ( (old_cr4 & X86_CR4_PCIDE) > (cr4 & X86_CR4_PCIDE) )
        cr3 |= X86_CR3_NOFLUSH;
    write_cr3(cr3);

    if ( old_cr4 != cr4 )
        write_cr4(cr4);

    /*
     *  PGE  | PCIDE | flush at
     * ------+-------+------------------------
     *  0->0 | 0->0  | CR3 write
     *  0->0 | 0->1  | n/a (see 1st CR4 write)
     *  0->x | 1->0  | CR4 write
     *  x->1 | x->1  | n/a
     *  0->0 | 1->1  | INVPCID
     *  0->1 | 0->0  | CR3 and CR4 writes
     *  1->0 | 0->0  | CR4 write
     *  1->0 | 0->1  | n/a (see 1st CR4 write)
     *  1->1 | 0->0  | n/a (see 1st CR4 write)
     *  1->x | 1->x  | n/a
     */
    if ( cr4 & X86_CR4_PCIDE )
        invpcid_flush_all_nonglobals();

    if ( tlb_clk_enabled )
        post_flush(t);

    local_irq_restore(flags);
}

/*
 * The return value of this function is the passed-in "flags" argument with
 * those bits cleared that have been fully (i.e. system-wide) taken care of,
 * and hence require no further action on remote CPUs.
 */
unsigned int flush_area_local(const void *va, unsigned int flags)
{
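    /* FLUSH_ORDER() stores the order biased by one in the low flag bits. */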
    unsigned int order = (flags - 1) & FLUSH_ORDER_MASK;

    if ( flags & (FLUSH_TLB|FLUSH_TLB_GLOBAL) )
    {
        if ( order == 0 )
        {
            /*
             * We don't INVLPG multi-page regions because the 2M/4M/1G
             * region may not have been mapped with a superpage. Also there
             * are various errata surrounding INVLPG usage on superpages, and
             * a full flush is in any case not *that* expensive.
             */
            if ( read_cr4() & X86_CR4_PCIDE )
            {
                unsigned long addr = (unsigned long)va;

                /*
                 * Flush the addresses for all potential address spaces.
                 * We can't check the current domain for being subject to
                 * XPTI as current might be the idle vcpu while we still have
                 * some XPTI domain TLB entries.
                 * Using invpcid is okay here, as with PCID enabled we always
                 * have global pages disabled.
                 */
                invpcid_flush_one(PCID_PV_PRIV, addr);
                invpcid_flush_one(PCID_PV_USER, addr);
                if ( opt_xpti_hwdom || opt_xpti_domu )
                {
                    invpcid_flush_one(PCID_PV_PRIV | PCID_PV_XPTI, addr);
                    invpcid_flush_one(PCID_PV_USER | PCID_PV_XPTI, addr);
                }
            }
            else
                asm volatile ( "invlpg %0"
                               : : "m" (*(const char *)(va)) : "memory" );
        }
        else
            do_tlb_flush();
    }

    if ( flags & FLUSH_HVM_ASID_CORE )
        hvm_flush_guest_tlbs();

    if ( flags & FLUSH_CACHE )
    {
        const struct cpuinfo_x86 *c = &current_cpu_data;
        unsigned long i, sz = 0;

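        /*
         * Orders too large for the shift leave sz at 0 and therefore always
         * take the WBINVD path below.
         */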
        if ( order < (BITS_PER_LONG - PAGE_SHIFT) )
            sz = 1UL << (order + PAGE_SHIFT);

        if ( (!(flags & (FLUSH_TLB|FLUSH_TLB_GLOBAL)) ||
              (flags & FLUSH_VA_VALID)) &&
             c->x86_clflush_size && c->x86_cache_size && sz &&
             ((sz >> 10) < c->x86_cache_size) )
        {
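            /*
             * CLFLUSHOPT (the data16-prefixed CLFLUSH below) is more weakly
             * ordered than CLFLUSH and needs an SFENCE/MFENCE for ordering
             * against other stores, hence the fence on CPUs where the loop
             * gets patched to use it.
             */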
            alternative("", "sfence", X86_FEATURE_CLFLUSHOPT);
            for ( i = 0; i < sz; i += c->x86_clflush_size )
                alternative_input(".byte " __stringify(NOP_DS_PREFIX) ";"
                                  " clflush %0",
                                  "data16 clflush %0",      /* clflushopt */
                                  X86_FEATURE_CLFLUSHOPT,
                                  "m" (((const char *)va)[i]));
            flags &= ~FLUSH_CACHE;
        }
        else
        {
            wbinvd();
        }
    }

    if ( flags & FLUSH_ROOT_PGTBL )
        get_cpu_info()->root_pgt_changed = true;

    return flags;
}

unsigned int guest_flush_tlb_flags(const struct domain *d)
{
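    /*
     * Shadow-mode guests have their mappings reflected in Xen-maintained
     * page tables, so the host TLB needs flushing; HVM guests covered here
     * also get a fresh ASID so that tagged guest TLB entries are dropped.
     */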
    bool shadow = paging_mode_shadow(d);
    bool asid = is_hvm_domain(d) && (cpu_has_svm || shadow);

    return (shadow ? FLUSH_TLB : 0) | (asid ? FLUSH_HVM_ASID_CORE : 0);
}

void guest_flush_tlb_mask(const struct domain *d, const cpumask_t *mask)
{
    unsigned int flags = guest_flush_tlb_flags(d);

    if ( flags )
        flush_mask(mask, flags);
}