Coverage Report

Created: 2017-10-25 09:10

/root/src/xen/xen/arch/x86/acpi/cpuidle_menu.c
Line   Count  Source
   1          /*
   2           * cpuidle_menu - menu governor for cpu idle, main idea comes from Linux:
   3           *            drivers/cpuidle/governors/menu.c
   4           *
   5           *  Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
   6           *  Copyright (C) 2007, 2008 Intel Corporation
   7           *
   8           * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   9           *
  10           *  This program is free software; you can redistribute it and/or modify
  11           *  it under the terms of the GNU General Public License as published by
  12           *  the Free Software Foundation; either version 2 of the License, or (at
  13           *  your option) any later version.
  14           *
  15           *  This program is distributed in the hope that it will be useful, but
  16           *  WITHOUT ANY WARRANTY; without even the implied warranty of
  17           *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18           *  General Public License for more details.
  19           *
  20           *  You should have received a copy of the GNU General Public License along
  21           *  with this program; If not, see <http://www.gnu.org/licenses/>.
  22           *
  23           * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  24           */
  25          #include <xen/errno.h>
  26          #include <xen/lib.h>
  27          #include <xen/types.h>
  28          #include <xen/acpi.h>
  29          #include <xen/timer.h>
  30          #include <xen/cpuidle.h>
  31          #include <asm/irq.h>
  32
  33          #define BUCKETS 6
  34   1.84M  #define RESOLUTION 1024
  35   11.3M  #define DECAY 4
  36   1.85M  #define MAX_INTERESTING 50000
  37    172k  #define LATENCY_MULTIPLIER 10
  38
  39          /*
  40           * Concepts and ideas behind the menu governor
  41           *
  42           * For the menu governor, there are 3 decision factors for picking a C
  43           * state:
  44           * 1) Energy break-even point
  45           * 2) Performance impact
  46           * 3) Latency tolerance (TBD: from guest virtual C state)
  47           * These three factors are treated independently.
  48           *
  49           * Energy break-even point
  50           * -----------------------
  51           * C state entry and exit have an energy cost, and a certain amount of time
  52           * in the C state is required to actually break even on this cost. CPUIDLE
  53           * provides us this duration in the "target_residency" field. So all that we
  54           * need is a good prediction of how long we'll be idle. Like the traditional
  55           * menu governor, we start with the actual known "next timer event" time.
  56           *
  57           * Since there are other sources of wakeup (interrupts, for example) than
  58           * the next timer event, this estimate is rather optimistic. To get a more
  59           * realistic estimate, a correction factor is applied to it, based on
  60           * historic behavior. For example, if in the past the actual duration was
  61           * always 50% of the next timer tick, the correction factor will be 0.5.
  62           *
  63           * menu uses a running average for this correction factor; however, it uses
  64           * a set of factors, not just a single factor. This stems from the
  65           * realization that the ratio is dependent on the order of magnitude of the
  66           * expected duration; if we expect 500 milliseconds of idle time, the
  67           * likelihood of getting an interrupt very early is much higher than if we
  68           * expect 50 microseconds of idle time.
  69           * For this reason we keep an array of 6 independent factors that gets
  70           * indexed based on the magnitude of the expected duration.
  71           *
  72           * Limiting Performance Impact
  73           * ---------------------------
  74           * C states, especially those with large exit latencies, can have a real,
  75           * noticeable impact on workloads, which is not acceptable for most
  76           * sysadmins; in addition, less performance has a power price of its own.
  77           *
  78           * As a general rule of thumb, menu assumes that the following heuristic
  79           * holds:
  80           *     The busier the system, the less impact of C states is acceptable.
  81           *
  82           * This rule of thumb is implemented using the average interrupt interval:
  83           * if the exit latency times a multiplier is longer than the average
  84           * interrupt interval, the C state is not considered a candidate for
  85           * selection due to too high a performance impact. So the smaller the
  86           * average interrupt interval is, the smaller the C state latency should
  87           * be, and thus the less likely a busy CPU will hit such a deep C state.
  88           *
  89           * As an additional rule to reduce the performance impact, menu tries to
  90           * limit the exit latency duration to no more than 10% of the decaying
  91           * measured idle time.
  92           *
  93           */
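
The fixed-point prediction described in the comment above can be worked through with a minimal standalone sketch. This is illustrative only, not part of the instrumented source; DIV_ROUND here assumes Xen's round-to-nearest semantics:

    #include <stdio.h>
    #include <stdint.h>

    #define RESOLUTION 1024
    #define DECAY 4
    /* assumed to match Xen's round-to-nearest DIV_ROUND */
    #define DIV_ROUND(n, d) (((n) + (d) / 2) / (d))

    int main(void)
    {
        /* RESOLUTION * DECAY encodes a factor of 1.0, so 2048 encodes 0.5:
         * "expect half of the timer-based estimate" */
        uint64_t factor = (RESOLUTION * DECAY) / 2;
        unsigned int expected_us = 500;   /* next timer event 500us away */
        uint64_t predicted_us =
            DIV_ROUND((uint64_t)expected_us * factor, RESOLUTION * DECAY);

        printf("predicted_us = %llu\n", (unsigned long long)predicted_us); /* 250 */
        return 0;
    }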
  94
  95          struct perf_factor {
  96              s_time_t    time_stamp;
  97              s_time_t    duration;
  98              unsigned int irq_count_stamp;
  99              unsigned int irq_sum;
 100          };
 101
 102          struct menu_device
 103          {
 104              int             last_state_idx;
 105              unsigned int    expected_us;
 106              u64             predicted_us;
 107              u64             latency_factor;
 108              unsigned int    measured_us;
 109              unsigned int    exit_us;
 110              unsigned int    bucket;
 111              u64             correction_factor[BUCKETS];
 112              struct perf_factor pf;
 113          };
 114
 115          static DEFINE_PER_CPU(struct menu_device, menu_devices);
 116
 117          static inline int which_bucket(unsigned int duration)
 118   1.82M  {
 119   1.82M      int bucket = 0;
 120   1.82M
 121   1.82M      if (duration < 10)
 122     229          return bucket;
 123   1.82M      if (duration < 100)
 124   21.0k          return bucket + 1;
 125   1.80M      if (duration < 1000)
 126    181k          return bucket + 2;
 127   1.62M      if (duration < 10000)
 128   1.64M          return bucket + 3;
 129   18.4E      if (duration < 100000)
 130       0          return bucket + 4;
 131   18.4E      return bucket + 5;
 132   18.4E  }
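
A standalone copy of the thresholds above shows how each decade of expected idle time lands in its own bucket (illustrative only; example_bucket is a hypothetical name, not from the report):

    #include <stdio.h>

    /* copy of the which_bucket() thresholds above, for illustration */
    static int example_bucket(unsigned int duration_us)
    {
        if (duration_us < 10)     return 0;
        if (duration_us < 100)    return 1;
        if (duration_us < 1000)   return 2;
        if (duration_us < 10000)  return 3;
        if (duration_us < 100000) return 4;
        return 5;
    }

    int main(void)
    {
        /* 50us, 500us and 8ms of expected idle land in different buckets,
         * so each order of magnitude trains its own correction factor */
        printf("%d %d %d\n", example_bucket(50),
               example_bucket(500), example_bucket(8000));  /* 1 2 3 */
        return 0;
    }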
 133
 134          /*
 135           * Return the average interrupt interval, to take I/O performance
 136           * requirements into account. The smaller the average interrupt
 137           * interval is, the busier the I/O activity, and thus the higher
 138           * the barrier to entering an expensive C state.
 139           */
 140
 141          /* 5 millisecond sampling period */
 142   1.91M  #define SAMPLING_PERIOD     5000000
 143
 144          /* for I/O interrupts, we give an 8x multiplier compared to C state latency */
 145    178k  #define IO_MULTIPLIER       8
 146
 147          static inline s_time_t avg_intr_interval_us(void)
 148   1.91M  {
 149   1.91M      struct menu_device *data = &__get_cpu_var(menu_devices);
 150   1.91M      s_time_t    duration, now;
 151   1.91M      s_time_t    avg_interval;
 152   1.91M      unsigned int irq_sum;
 153   1.91M
 154   1.91M      now = NOW();
 155   1.91M      duration = (data->pf.duration + (now - data->pf.time_stamp)
 156   1.91M              * (DECAY - 1)) / DECAY;
 157   1.91M
 158   1.91M      irq_sum = (data->pf.irq_sum + (this_cpu(irq_count) - data->pf.irq_count_stamp)
 159   1.91M              * (DECAY - 1)) / DECAY;
 160   1.91M
 161   1.91M      if (irq_sum == 0)
 162   1.91M          /* no irq recently, so return a big enough interval: 1 sec */
 163   51.8k          avg_interval = 1000000;
 164   1.91M      else
 165   1.86M          avg_interval = duration / irq_sum / 1000; /* in us */
 166   1.91M
 167   1.91M      if ( duration >= SAMPLING_PERIOD ) {
 168   32.2k          data->pf.time_stamp = now;
 169   32.2k          data->pf.duration = duration;
 170   32.2k          data->pf.irq_count_stamp = this_cpu(irq_count);
 171   32.2k          data->pf.irq_sum = irq_sum;
 172   32.2k      }
 173   1.91M
 174   1.91M      return avg_interval;
 175   1.91M  }
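
The decayed averaging above can be traced with hypothetical numbers (all values illustrative; duration is in nanoseconds, as NOW() returns ns):

    #include <stdio.h>
    #include <stdint.h>

    #define DECAY 4

    int main(void)
    {
        /* hypothetical sample: 3ms of decayed history, 2ms since the last
         * stamp, 10 decayed irqs plus 6 new ones */
        int64_t duration_ns = (3000000 + 2000000 * (DECAY - 1)) / DECAY;  /* 2.25ms */
        unsigned int irq_sum = (10 + 6 * (DECAY - 1)) / DECAY;            /* 7 */

        /* same conversion as above: ns / irqs / 1000 gives us per irq */
        printf("avg interval: %lld us\n",
               (long long)(duration_ns / irq_sum / 1000));                /* 321 */
        return 0;
    }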
 176
 177          static unsigned int get_sleep_length_us(void)
 178   1.98M  {
 179   1.98M      s_time_t us = (this_cpu(timer_deadline) - NOW()) / 1000;
 180   1.98M      /*
 181   1.98M       * When us < 0 or us > (u32)-1, return a large u32. We choose
 182   1.98M       * (unsigned int)-2000 to avoid wrapping when the exit latency is
 183   1.98M       * added, because the latency should not be larger than 2ms.
 184   1.98M       */
 185   1.98M      return (us >> 32) ? (unsigned int)-2000 : (unsigned int)us;
 186   1.98M  }
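
The (us >> 32) clamp above can be demonstrated in isolation. A sketch only (example_clamp is a hypothetical name; it relies on arithmetic right shift of negative values, as on the compilers Xen targets):

    #include <stdio.h>
    #include <stdint.h>

    /* negative deadlines (already in the past) and values over 32 bits
     * both have bits set above bit 31, so one (us >> 32) test catches
     * both cases */
    static unsigned int example_clamp(int64_t us)
    {
        return (us >> 32) ? (unsigned int)-2000 : (unsigned int)us;
    }

    int main(void)
    {
        printf("%u\n", example_clamp(1500));  /* 1500: passed through */
        printf("%u\n", example_clamp(-5));    /* 4294965296: clamped */
        return 0;
    }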
 187
 188          static int menu_select(struct acpi_processor_power *power)
 189   2.02M  {
 190   2.02M      struct menu_device *data = &__get_cpu_var(menu_devices);
 191   2.02M      int i;
 192   2.02M      s_time_t    io_interval;
 193   2.02M
 194   2.02M      /* TBD: change to 0 if C0 (polling mode) support is added later */
 195   2.02M      data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
 196   2.02M      data->exit_us = 0;
 197   2.02M
 198   2.02M      /* determine the expected residency time, round up */
 199   2.02M      data->expected_us = get_sleep_length_us();
 200   2.02M
 201   2.02M      data->bucket = which_bucket(data->expected_us);
 202   2.02M
 203   2.02M      io_interval = avg_intr_interval_us();
 204   2.02M
 205   2.02M      data->latency_factor = DIV_ROUND(
 206   2.02M              data->latency_factor * (DECAY - 1) + data->measured_us,
 207   2.02M              DECAY);
 208   2.02M
 209   2.02M      /*
 210   2.02M       * If the correction factor is 0 (e.g. first-time init or CPU
 211   2.02M       * hotplug), we actually want to start out with a unity factor.
 212   2.02M       */
 213   2.02M      if (data->correction_factor[data->bucket] == 0)
 214      49          data->correction_factor[data->bucket] = RESOLUTION * DECAY;
 215   2.02M
 216   2.02M      /* Make sure to round up for half microseconds */
 217   2.02M      data->predicted_us = DIV_ROUND(
 218   2.02M              data->expected_us * data->correction_factor[data->bucket],
 219   2.02M              RESOLUTION * DECAY);
 220   2.02M
 221   2.02M      /* find the deepest idle state that satisfies our constraints */
 222   2.17M      for ( i = CPUIDLE_DRIVER_STATE_START + 1; i < power->count; i++ )
 223   1.93M      {
 224   1.93M          struct acpi_processor_cx *s = &power->states[i];
 225   1.93M
 226   1.93M          if (s->target_residency > data->predicted_us)
 227   1.75M              break;
 228    178k          if (s->latency * IO_MULTIPLIER > io_interval)
 229   5.75k              break;
 230    172k          if (s->latency * LATENCY_MULTIPLIER > data->latency_factor)
 231   18.9k              break;
 232    172k          /* TBD: we need to check the QoS requirement in the future */
 233    153k          data->exit_us = s->latency;
 234    153k          data->last_state_idx = i;
 235    153k      }
 236   2.02M
 237   2.02M      return data->last_state_idx;
 238   2.02M  }
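
The three break conditions in the selection loop above can be traced with a hypothetical C-state table (names and numbers are illustrative, not taken from the report):

    #include <stdio.h>

    /* hypothetical C-state table: latency / target_residency in us */
    struct example_cx { unsigned int latency, target_residency; };

    int main(void)
    {
        struct example_cx states[] = { { 1, 2 }, { 50, 150 }, { 200, 800 } };
        unsigned long predicted_us = 300, io_interval = 5000, latency_factor = 900;
        int i, chosen = 0;

        for (i = 0; i < 3; i++) {
            if (states[i].target_residency > predicted_us)
                break;                                    /* energy break-even */
            if (states[i].latency * 8 > io_interval)      /* IO_MULTIPLIER */
                break;
            if (states[i].latency * 10 > latency_factor)  /* LATENCY_MULTIPLIER */
                break;
            chosen = i;
        }
        /* index 2 fails the residency check, so index 1 is chosen */
        printf("chosen index: %d\n", chosen);
        return 0;
    }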
 239
 240          static void menu_reflect(struct acpi_processor_power *power)
 241   1.83M  {
 242   1.83M      struct menu_device *data = &__get_cpu_var(menu_devices);
 243   1.83M      u64 new_factor;
 244   1.83M
 245   1.83M      data->measured_us = power->last_residency;
 246   1.83M
 247   1.83M      /*
 248   1.83M       * We correct for the exit latency; we are assuming here that the
 249   1.83M       * exit latency happens after the event that we're interested in.
 250   1.83M       */
 251   1.83M      if (data->measured_us > data->exit_us)
 252   1.84M          data->measured_us -= data->exit_us;
 253   1.83M
 254   1.83M      /* update our correction ratio */
 255   1.83M
 256   1.83M      new_factor = data->correction_factor[data->bucket]
 257   1.83M          * (DECAY - 1) / DECAY;
 258   1.83M
 259   1.85M      if (data->expected_us > 0 && data->measured_us < MAX_INTERESTING)
 260   1.86M          new_factor += RESOLUTION * data->measured_us / data->expected_us;
 261   1.83M      else
 262   1.83M          /*
 263   1.83M           * we were idle so long that we count it as a perfect
 264   1.83M           * prediction
 265   1.83M           */
 266   18.4E          new_factor += RESOLUTION;
 267   1.83M
 268   1.83M      /*
 269   1.83M       * We don't want 0 as a factor; we always want at least
 270   1.83M       * a tiny bit of estimated time.
 271   1.83M       */
 272   1.83M      if (new_factor == 0)
 273    997k          new_factor = 1;
 274   1.83M
 275   1.83M      data->correction_factor[data->bucket] = new_factor;
 276   1.83M  }
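
One reflect step of the correction-factor update above, with hypothetical numbers (a sketch only):

    #include <stdio.h>
    #include <stdint.h>

    #define RESOLUTION 1024
    #define DECAY 4

    int main(void)
    {
        uint64_t factor = RESOLUTION * DECAY;   /* unity factor: 4096 */
        unsigned int expected_us = 1000;        /* what we predicted from */
        unsigned int measured_us = 500;         /* what actually happened */

        /* one reflect step: keep (DECAY-1)/DECAY of the old factor and add
         * the newly observed measured/expected ratio, scaled by RESOLUTION */
        factor = factor * (DECAY - 1) / DECAY
                 + (uint64_t)RESOLUTION * measured_us / expected_us;

        /* 3072 + 512 = 3584; repeated 0.5 observations converge on 2048 */
        printf("factor = %llu\n", (unsigned long long)factor);
        return 0;
    }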
 277
 278          static int menu_enable_device(struct acpi_processor_power *power)
 279       0  {
 280       0      if (!cpu_online(power->cpu))
 281       0          return -1;
 282       0
 283       0      memset(&per_cpu(menu_devices, power->cpu), 0, sizeof(struct menu_device));
 284       0
 285       0      return 0;
 286       0  }
 287
 288          static struct cpuidle_governor menu_governor =
 289          {
 290              .name =         "menu",
 291              .rating =       20,
 292              .enable =       menu_enable_device,
 293              .select =       menu_select,
 294              .reflect =      menu_reflect,
 295          };
 296
 297          struct cpuidle_governor *cpuidle_current_governor = &menu_governor;
 298          void menu_get_trace_data(u32 *expected, u32 *pred)
 299   1.86M  {
 300   1.86M      struct menu_device *data = &__get_cpu_var(menu_devices);
 301   1.86M      *expected = data->expected_us;
 302   1.86M      *pred = data->predicted_us;
 303   1.86M  }