/*
Candidate compiler option strings. Each quoted entry holds one or more
command-line flags written together in a single string.

NOTE(review): the meaning of the `list`, `i-list`, `greed` and `all`
keywords and of the dotted suffixes (.lto, .dt46, .gcc46, .gcc48) is
defined by the tool that reads this file, not here. Presumably `greed`
requests a greedy search over the entries and a suffix restricts an
entry to a particular toolchain or configuration -- confirm against the
tool's documentation before relying on this.
*/
list greed {
  /* Link-time / whole-program entries (tagged .lto). The option notes
     further down in this file state that -fipa-struct-reorg and
     -fipa-matrix-reorg both need -fwhole-program.
     NOTE(review): the second entry does not carry -fwhole-program
     itself; confirm the tool applies it together with the first. */
  "-flto -ffat-lto-objects -fwhole-program".lto
  "-fipa-struct-reorg -fipa-matrix-reorg".lto

  /* Inlining limit: one small and one very large value. Per the option
     notes below, -finline-limit=n sets max-inline-insns-single and
     max-inline-insns-auto to n/2. */
  "-finline-limit=40"
  "-finline-limit=99999"
  /* Per the option notes below, this one results in non-conforming
     behavior for languages like C or C++. */
  "-fmerge-all-constants"
  /* Extra GCSE-related passes, grouped in an `all` block. Per the
     option notes below, -fgcse-sm and -fgcse-las are not enabled at
     any optimization level. */
  all {
    "-fgcse-sm"
    "-fgcse-las"
    "-fgcse-after-reload"
  }
  "-fdelete-null-pointer-checks"

  /* Alignment candidates.
     NOTE(review): -falign-loops=9 is the only alignment value here
     that is not 16 -- confirm it is intentional. */
  "-fno-align-labels"
  "-fno-align-loops"
  "-falign-loops=9"
  "-falign-functions=16"
  "-falign-loops=16"
  "-falign-jumps=16"

  /* -fno-builtin-<function> entries for strcpy and memcpy. */
  "-fno-builtin-strcpy"
  "-fno-builtin-memcpy"

  /* These are Devtech optimizations (all tagged .dt46).
     NOTE(review): -fuse-load-updates and -fno-use-load-updates are
     both listed as separate candidates -- presumably so that either
     polarity can be selected; confirm. */
  "-mcopy-unaligned".dt46
  "--param case-values-threshold=30".dt46
  "-fwiden-types".dt46
  "-mslow-mfocr".dt46
  "-fbypass-load-on-store".dt46
  "-fopt-array-offset".dt46
  "-fuse-load-updates".dt46
  "-fno-use-load-updates".dt46

  /* Register-renaming and instruction-scheduling candidates. Dependent
     flags are kept in the same string as the flag they rely on: per
     the option notes below, -fmodulo-sched-allow-regmoves is effective
     only with -fmodulo-sched, and -fsel-sched-pipelining has no effect
     until selective scheduling is turned on. */
  list i-list greed {
    "-frename-registers"
    "-fno-ira-loop-pressure"
    "-fsched-pressure"
    "-fsched-spec-load -fsched-spec-load-dangerous"
    "-fmodulo-sched"
    "-fmodulo-sched -fmodulo-sched-allow-regmoves"
    "-fsched2-use-superblocks"
    "-fselective-scheduling -fselective-scheduling2 -fsel-sched-pipelining"
  }

  /* Loop, IPA, section-layout and miscellaneous candidates.
     NOTE(review): "-finline-limit=40" also appears in the enclosing
     list above -- confirm whether the duplicate is intended. */
  list greed {
    "-fivopts"
    "-fipa-pta"
    "-ftracer"
    "-funroll-all-loops -ftree-loop-ivcanon -fvariable-expansion-in-unroller"
    "-ffunction-sections -fdata-sections"
    "-fbranch-target-load-optimize".gcc46
    "-fbranch-target-load-optimize -fbranch-target-load-optimize2".gcc46
    "-fsection-anchors"
    "-funsafe-loop-optimizations"
    "-fpeel-loops"
    "-funswitch-loops"
    "-floop-interchange"
    "-floop-strip-mine"
    "-floop-block"
    "-ftree-loop-distribution"
    "-floop-flatten".gcc46
    "-ftree-loop-if-convert-stores".gcc46
    "-ftree-loop-distribute-patterns".gcc46
    "-ftree-loop-im"
    "-fvect-cost-model"
    "-finline-limit=40"
    "-ffp-contract=fast".gcc48
    "-fno-auto-inc-dec"
    "-fira-region=all".gcc46
  }

  /* --param tuning candidates. max-inline-insns-auto is listed twice
     (10 and 1000); per the option notes below, -finline-limit=n also
     sets this parameter.
     NOTE(review): how these interact when several are selected
     together depends on the tool and on option order -- confirm. */
  list i-list greed {
    "--param max-sched-extend-regions-iters=4"
    "--param sched-mem-true-dep-cost=4"
    "--param max-inline-insns-auto=10"
    "--param max-inline-insns-auto=1000"
    "--param max-reload-search-insns=1000"
    "--param max-cselib-memory-locations=9999"
    "--param max-unrolled-insns=999999"
    "--param max-average-unrolled-insns=99999999"
    "--param iv-max-considered-uses=9999999"
    "--param iv-consider-all-candidates-bound=99999"
    "--param iv-always-prune-cand-set-bound=999999"
  }

  /* Floating-point relaxation candidates; the second string repeats the
     flags of the first and adds two more. */
  "-ffast-math -fno-math-errno"
  "-ffast-math -fno-math-errno -fassociative-math -funsafe-math-optimizations"
}

/*
########################################################################
The following options control specific optimizations. They are either
activated by -O options or are related to ones that are. You can use
the following flags in the rare cases when “fine-tuning” of
optimizations to be performed is desired.
########################################################################

-fno-default-inline

    Do not make member functions inline by default merely because they
    are defined inside the class scope (C++ only). Otherwise, when you
    specify -O, member functions defined inside class scope are
    compiled inline by default; i.e., you don't need to add 'inline'
    in front of the member function name.

-fforward-propagate

    Perform a forward propagation pass on RTL. The pass tries to
    combine two instructions and checks if the result can be
    simplified. If loop unrolling is active, two passes are performed
    and the second is scheduled after loop unrolling.

    This option is enabled by default at optimization levels -O, -O2, -O3, -Os.

-ffp-contract=style

    -ffp-contract=off disables floating-point expression
     contraction. -ffp-contract=fast enables floating-point expression
     contraction such as forming of fused multiply-add operations if
     the target has native support for them. -ffp-contract=on enables
     floating-point expression contraction if allowed by the language
     standard. This is currently not implemented and treated equal to
     -ffp-contract=off.

    The default is -ffp-contract=fast.

-fomit-frame-pointer

    Don't keep the frame pointer in a register for functions that
    don't need one. This avoids the instructions to save, set up and
    restore frame pointers; it also makes an extra register available
    in many functions. It also makes debugging impossible on some
    machines.

    On some machines, such as the VAX, this flag has no effect,
    because the standard calling sequence automatically handles the
    frame pointer and nothing is saved by pretending it doesn't
    exist. The machine-description macro FRAME_POINTER_REQUIRED
    controls whether a target machine supports this flag. See Register
    Usage.

    Starting with GCC version 4.6, the default setting (when not
    optimizing for size) for 32-bit Linux x86 and 32-bit Darwin x86
    targets has been changed to -fomit-frame-pointer. The default can
    be reverted to -fno-omit-frame-pointer by configuring GCC with the
    --enable-frame-pointer configure option.

    Enabled at levels -O, -O2, -O3, -Os.

-foptimize-sibling-calls

    Optimize sibling and tail recursive calls.

    Enabled at levels -O2, -O3, -Os.

-fno-inline

    Don't pay attention to the inline keyword. Normally this option is
    used to keep the compiler from expanding any functions
    inline. Note that if you are not optimizing, no functions can be
    expanded inline.

-finline-small-functions

    Integrate functions into their callers when their body is smaller
    than expected function call code (so overall size of program gets
    smaller). The compiler heuristically decides which functions are
    simple enough to be worth integrating in this way.

    Enabled at level -O2.

-findirect-inlining

    Inline also indirect calls that are discovered to be known at
    compile time thanks to previous inlining. This option has an
    effect only when inlining itself is turned on by the
    -finline-functions or -finline-small-functions options.

    Enabled at level -O2.

-finline-functions

    Integrate all simple functions into their callers. The compiler
    heuristically decides which functions are simple enough to be
    worth integrating in this way.

    If all calls to a given function are integrated, and the function
    is declared static, then the function is normally not output as
    assembler code in its own right.

    Enabled at level -O3.

-finline-functions-called-once

    Consider all static functions called once for inlining into their
    caller even if they are not marked inline. If a call to a given
    function is integrated, then the function is not output as
    assembler code in its own right.

    Enabled at levels -O1, -O2, -O3 and -Os.

-fearly-inlining

    Inline functions marked by always_inline and functions whose body
    seems smaller than the function call overhead early before doing
    -fprofile-generate instrumentation and real inlining pass. Doing
    so makes profiling significantly cheaper and usually inlining
    faster on programs having large chains of nested wrapper
    functions.

    Enabled by default.

-fipa-sra

    Perform interprocedural scalar replacement of aggregates, removal
    of unused parameters and replacement of parameters passed by
    reference by parameters passed by value.

    Enabled at levels -O2, -O3 and -Os.

-finline-limit=n

    By default, GCC limits the size of functions that can be
    inlined. This flag allows coarse control of this limit. n is the
    size of functions that can be inlined in number of pseudo
    instructions.

    Inlining is actually controlled by a number of parameters, which
    may be specified individually by using --param name=value. The
    -finline-limit=n option sets some of these parameters as follows:

    max-inline-insns-single
        is set to n/2.
    max-inline-insns-auto
        is set to n/2. 

    See below for a documentation of the individual parameters
    controlling inlining and for the defaults of these parameters.

    Note: there may be no value to -finline-limit that results in
    default behavior.

    Note: a pseudo instruction represents, in this particular context,
    an abstract measurement of a function's size. In no way does it
    represent a count of assembly instructions and as such its exact
    meaning might change from one release to another.

-fno-keep-inline-dllexport

    This is a more fine-grained version of -fkeep-inline-functions,
    which applies only to functions that are declared using the
    dllexport attribute or declspec (See Declaring Attributes of
    Functions.)

-fkeep-inline-functions

    In C, emit static functions that are declared inline into the
    object file, even if the function has been inlined into all of its
    callers. This switch does not affect functions using the extern
    inline extension in GNU C90. In C++, emit any and all inline
    functions into the object file.

-fkeep-static-consts

    Emit variables declared static const when optimization isn't
    turned on, even if the variables aren't referenced.

    GCC enables this option by default. If you want to force the
    compiler to check if the variable was referenced, regardless of
    whether or not optimization is turned on, use the
    -fno-keep-static-consts option.

-fmerge-constants

    Attempt to merge identical constants (string constants and
    floating point constants) across compilation units.

    This option is the default for optimized compilation if the
    assembler and linker support it. Use -fno-merge-constants to
    inhibit this behavior.

    Enabled at levels -O, -O2, -O3, -Os.

-fmerge-all-constants

    Attempt to merge identical constants and identical variables.

    This option implies -fmerge-constants. In addition to
    -fmerge-constants this considers e.g. even constant initialized
    arrays or initialized constant variables with integral or floating
    point types. Languages like C or C++ require each variable,
    including multiple instances of the same variable in recursive
    calls, to have distinct locations, so using this option will
    result in non-conforming behavior.

-fmodulo-sched

    Perform swing modulo scheduling immediately before the first
    scheduling pass. This pass looks at innermost loops and reorders
    their instructions by overlapping different iterations.

-fmodulo-sched-allow-regmoves

    Perform more aggressive SMS based modulo scheduling with register
    moves allowed. By setting this flag, certain anti-dependence edges
    will be deleted, which will trigger the generation of reg-moves
    based on the life-range analysis. This option is effective only
    with -fmodulo-sched enabled.

-fno-branch-count-reg

    Do not use “decrement and branch” instructions on a count
    register, but instead generate a sequence of instructions that
    decrement a register, compare it against zero, then branch based
    upon the result. This option is only meaningful on architectures
    that support such instructions, which include x86, PowerPC, IA-64
    and S/390.

    The default is -fbranch-count-reg.

-fno-function-cse

    Do not put function addresses in registers; make each instruction
    that calls a constant function contain the function's address
    explicitly.

    This option results in less efficient code, but some strange hacks
    that alter the assembler output may be confused by the
    optimizations performed when this option is not used.

    The default is -ffunction-cse.

-fno-zero-initialized-in-bss

    If the target supports a BSS section, GCC by default puts
    variables that are initialized to zero into BSS. This can save
    space in the resulting code.

    This option turns off this behavior because some programs
    explicitly rely on variables going to the data section. E.g., so
    that the resulting executable can find the beginning of that
    section and/or make assumptions based on that.

    The default is -fzero-initialized-in-bss.

-fmudflap -fmudflapth -fmudflapir

    For front-ends that support it (C and C++), instrument all risky
    pointer/array dereferencing operations, some standard library
    string/heap functions, and some other associated constructs with
    range/validity tests. Modules so instrumented should be immune to
    buffer overflows, invalid heap use, and some other classes of
    C/C++ programming errors. The instrumentation relies on a separate
    runtime library (libmudflap), which will be linked into a program
    if -fmudflap is given at link time. Run-time behavior of the
    instrumented program is controlled by the MUDFLAP_OPTIONS
    environment variable. See env MUDFLAP_OPTIONS=-help a.out for its
    options.

    Use -fmudflapth instead of -fmudflap to compile and to link if
    your program is multi-threaded. Use -fmudflapir, in addition to
    -fmudflap or -fmudflapth, if instrumentation should ignore pointer
    reads. This produces less instrumentation (and therefore faster
    execution) and still provides some protection against outright
    memory corrupting writes, but allows erroneously read data to
    propagate within a program.

-fthread-jumps

    Perform optimizations where we check to see if a jump branches to
    a location where another comparison subsumed by the first is
    found. If so, the first branch is redirected to either the
    destination of the second branch or a point immediately following
    it, depending on whether the condition is known to be true or
    false.

    Enabled at levels -O2, -O3, -Os.

-fsplit-wide-types

    When using a type that occupies multiple registers, such as long
    long on a 32-bit system, split the registers apart and allocate
    them independently. This normally generates better code for those
    types, but may make debugging more difficult.

    Enabled at levels -O, -O2, -O3, -Os.

-fcse-follow-jumps

    In common subexpression elimination (CSE), scan through jump
    instructions when the target of the jump is not reached by any
    other path. For example, when CSE encounters an if statement with
    an else clause, CSE will follow the jump when the condition tested
    is false.

    Enabled at levels -O2, -O3, -Os.

-fcse-skip-blocks

    This is similar to -fcse-follow-jumps, but causes CSE to follow
    jumps which conditionally skip over blocks. When CSE encounters a
    simple if statement with no else clause, -fcse-skip-blocks causes
    CSE to follow the jump around the body of the if.

    Enabled at levels -O2, -O3, -Os.

-frerun-cse-after-loop

    Re-run common subexpression elimination after loop optimizations
    have been performed.

    Enabled at levels -O2, -O3, -Os.

-fgcse

    Perform a global common subexpression elimination pass. This pass
    also performs global constant and copy propagation.

    Note: When compiling a program using computed gotos, a GCC
    extension, you may get better runtime performance if you disable
    the global common subexpression elimination pass by adding
    -fno-gcse to the command line.

    Enabled at levels -O2, -O3, -Os.

-fgcse-lm

    When -fgcse-lm is enabled, global common subexpression elimination
    will attempt to move loads which are only killed by stores into
    themselves. This allows a loop containing a load/store sequence to
    be changed to a load outside the loop, and a copy/store within the
    loop.

    Enabled by default when gcse is enabled.

-fgcse-sm

    When -fgcse-sm is enabled, a store motion pass is run after global
    common subexpression elimination. This pass will attempt to move
    stores out of loops. When used in conjunction with -fgcse-lm,
    loops containing a load/store sequence can be changed to a load
    before the loop and a store after the loop.

    Not enabled at any optimization level.

-fgcse-las

    When -fgcse-las is enabled, the global common subexpression
    elimination pass eliminates redundant loads that come after stores
    to the same memory location (both partial and full redundancies).

    Not enabled at any optimization level.

-fgcse-after-reload

    When -fgcse-after-reload is enabled, a redundant load elimination
    pass is performed after reload. The purpose of this pass is to
    cleanup redundant spilling.

-funsafe-loop-optimizations

    If given, the loop optimizer will assume that loop indices do not
    overflow, and that the loops with nontrivial exit condition are
    not infinite. This enables a wider range of loop optimizations
    even if the loop optimizer itself cannot prove that these
    assumptions are valid. Using -Wunsafe-loop-optimizations, the
    compiler will warn you if it finds this kind of loop.

-fcrossjumping

    Perform cross-jumping transformation. This transformation unifies
    equivalent code and saves code size. The resulting code may or may
    not perform better than without cross-jumping.

    Enabled at levels -O2, -O3, -Os.

-fauto-inc-dec

    Combine increments or decrements of addresses with memory
    accesses. This pass is always skipped on architectures that do not
    have instructions to support this. Enabled by default at -O and
    higher on architectures that support this.

-fdce

    Perform dead code elimination (DCE) on RTL. Enabled by default at
    -O and higher.

-fdse

    Perform dead store elimination (DSE) on RTL. Enabled by default at
    -O and higher.

-fif-conversion

    Attempt to transform conditional jumps into branch-less
    equivalents. This includes the use of conditional moves, min, max,
    set flags and abs instructions, and some tricks doable by standard
    arithmetic. The use of conditional execution on chips where it is
    available is controlled by -fif-conversion2.

    Enabled at levels -O, -O2, -O3, -Os.

-fif-conversion2

    Use conditional execution (where available) to transform
    conditional jumps into branch-less equivalents.

    Enabled at levels -O, -O2, -O3, -Os.

-fdelete-null-pointer-checks

    Assume that programs cannot safely dereference null pointers, and
    that no code or data element resides there. This enables simple
    constant folding optimizations at all optimization levels. In
    addition, other optimization passes in GCC use this flag to
    control global dataflow analyses that eliminate useless checks for
    null pointers; these assume that if a pointer is checked after it
    has already been dereferenced, it cannot be null.

    Note however that in some environments this assumption is not
    true. Use -fno-delete-null-pointer-checks to disable this
    optimization for programs which depend on that behavior.

    Some targets, especially embedded ones, disable this option at all
    levels. Otherwise it is enabled at all levels: -O0, -O1, -O2, -O3,
    -Os. Passes that use the information are enabled independently at
    different optimization levels.

-fdevirtualize

    Attempt to convert calls to virtual functions to direct
    calls. This is done both within a procedure and interprocedurally
    as part of indirect inlining (-findirect-inlining) and
    interprocedural constant propagation (-fipa-cp).

    Enabled at levels -O2, -O3, -Os.

-fexpensive-optimizations

    Perform a number of minor optimizations that are relatively
    expensive.

    Enabled at levels -O2, -O3, -Os.

-foptimize-register-move
-fregmove

    Attempt to reassign register numbers in move instructions and as
    operands of other simple instructions in order to maximize the
    amount of register tying. This is especially helpful on machines
    with two-operand instructions.

    Note -fregmove and -foptimize-register-move are the same
    optimization.

    Enabled at levels -O2, -O3, -Os.

-fira-algorithm=algorithm

    Use specified coloring algorithm for the integrated register
    allocator. The algorithm argument should be priority or CB. The
    first algorithm specifies Chow's priority coloring, the second one
    specifies Chaitin-Briggs coloring. The second algorithm can be
    unimplemented for some architectures. If it is implemented, it is
    the default because Chaitin-Briggs coloring as a rule generates a
    better code.

-fira-region=region

    Use specified regions for the integrated register allocator. The
    region argument should be one of all, mixed, or one. The first
    value means using all loops as register allocation regions, the
    second value, which is the default, means using all loops except
    for loops with small register pressure as the regions, and the
    third one means using the whole function as a single region. The
    first value can give the best results for machines with a small
    and/or irregular register set, the third one results in faster
    register allocation and generates decent code with the smallest
    size, and the default value usually gives the best results in most
    cases and for most architectures.

-fira-loop-pressure

    Use IRA to evaluate register pressure in loops for decisions to
    move loop invariants. Usage of this option usually results in
    generation of faster and smaller code on machines with big
    register files (>= 32 registers) but it can slow the compiler down.

    This option is enabled at level -O3 for some targets.

-fno-ira-share-save-slots

    Switch off sharing stack slots used for saving call used hard
    registers living through a call. Each hard register will get a
    separate stack slot and as a result function stack frame will be
    bigger.

-fno-ira-share-spill-slots

    Switch off sharing stack slots allocated for
    pseudo-registers. Each pseudo-register which did not get a hard
    register will get a separate stack slot and as a result function
    stack frame will be bigger.

-fira-verbose=n

    Control how verbose the dump file for the integrated register
    allocator will be. Default value is 5. If the value n is greater
    than or equal to 10, the dump output is sent to stderr using the
    same format as n minus 10.

-fdelayed-branch

    If supported for the target machine, attempt to reorder
    instructions to exploit instruction slots available after delayed
    branch instructions.

    Enabled at levels -O, -O2, -O3, -Os.

-fschedule-insns

    If supported for the target machine, attempt to reorder
    instructions to eliminate execution stalls due to required data
    being unavailable. This helps machines that have slow floating
    point or memory load instructions by allowing other instructions
    to be issued until the result of the load or floating point
    instruction is required.

    Enabled at levels -O2, -O3.

-fschedule-insns2

    Similar to -fschedule-insns, but requests an additional pass of
    instruction scheduling after register allocation has been
    done. This is especially useful on machines with a relatively
    small number of registers and where memory load instructions take
    more than one cycle.

    Enabled at levels -O2, -O3, -Os.

-fno-sched-interblock

    Don't schedule instructions across basic blocks. This is normally
    enabled by default when scheduling before register allocation,
    i.e. with -fschedule-insns or at -O2 or higher.

-fno-sched-spec

    Don't allow speculative motion of non-load instructions. This is
    normally enabled by default when scheduling before register
    allocation, i.e. with -fschedule-insns or at -O2 or higher.

-fsched-pressure

    Enable register pressure sensitive insn scheduling before the
    register allocation. This only makes sense when scheduling before
    register allocation is enabled, i.e. with -fschedule-insns or at
    -O2 or higher. Usage of this option can improve the generated code
    and decrease its size by preventing register pressure increase
    above the number of available hard registers and as a consequence
    register spills in the register allocation.

-fsched-spec-load

    Allow speculative motion of some load instructions. This only
    makes sense when scheduling before register allocation, i.e. with
    -fschedule-insns or at -O2 or higher.

-fsched-spec-load-dangerous

    Allow speculative motion of more load instructions. This only
    makes sense when scheduling before register allocation, i.e. with
    -fschedule-insns or at -O2 or higher.

-fsched-stalled-insns
-fsched-stalled-insns=n

    Define how many insns (if any) can be moved prematurely from the
    queue of stalled insns into the ready list, during the second
    scheduling pass. -fno-sched-stalled-insns means that no insns will
    be moved prematurely, -fsched-stalled-insns=0 means there is no
    limit on how many queued insns can be moved
    prematurely. -fsched-stalled-insns without a value is equivalent
    to -fsched-stalled-insns=1.

-fsched-stalled-insns-dep
-fsched-stalled-insns-dep=n

    Define how many insn groups (cycles) will be examined for a
    dependency on a stalled insn that is candidate for premature
    removal from the queue of stalled insns. This has an effect only
    during the second scheduling pass, and only if
    -fsched-stalled-insns is used. -fno-sched-stalled-insns-dep is
    equivalent to
    -fsched-stalled-insns-dep=0. -fsched-stalled-insns-dep without a
    value is equivalent to -fsched-stalled-insns-dep=1.

-fsched2-use-superblocks

    When scheduling after register allocation, use the superblock
    scheduling algorithm. Superblock scheduling allows motion across
    basic block boundaries, resulting in faster schedules. This option
    is experimental, as not all machine descriptions used by GCC model
    the CPU closely enough to avoid unreliable results from the
    algorithm.

    This only makes sense when scheduling after register allocation,
    i.e. with -fschedule-insns2 or at -O2 or higher.

-fsched-group-heuristic

    Enable the group heuristic in the scheduler. This heuristic favors
    the instruction that belongs to a schedule group. This is enabled
    by default when scheduling is enabled, i.e. with -fschedule-insns
    or -fschedule-insns2 or at -O2 or higher.

-fsched-critical-path-heuristic

    Enable the critical-path heuristic in the scheduler. This
    heuristic favors instructions on the critical path. This is
    enabled by default when scheduling is enabled, i.e. with
    -fschedule-insns or -fschedule-insns2 or at -O2 or higher.

-fsched-spec-insn-heuristic

    Enable the speculative instruction heuristic in the
    scheduler. This heuristic favors speculative instructions with
    greater dependency weakness. This is enabled by default when
    scheduling is enabled, i.e. with -fschedule-insns or
    -fschedule-insns2 or at -O2 or higher.

-fsched-rank-heuristic

    Enable the rank heuristic in the scheduler. This heuristic favors
    the instruction belonging to a basic block with greater size or
    frequency. This is enabled by default when scheduling is enabled,
    i.e. with -fschedule-insns or -fschedule-insns2 or at -O2 or
    higher.

-fsched-last-insn-heuristic

    Enable the last-instruction heuristic in the scheduler. This
    heuristic favors the instruction that is less dependent on the
    last instruction scheduled. This is enabled by default when
    scheduling is enabled, i.e. with -fschedule-insns or
    -fschedule-insns2 or at -O2 or higher.

-fsched-dep-count-heuristic

    Enable the dependent-count heuristic in the scheduler. This
    heuristic favors the instruction that has more instructions
    depending on it. This is enabled by default when scheduling is
    enabled, i.e. with -fschedule-insns or -fschedule-insns2 or at -O2
    or higher.

-freschedule-modulo-scheduled-loops

    Modulo scheduling comes before the traditional scheduling. If a
    loop was modulo scheduled, we may want to prevent the later
    scheduling passes from changing its schedule; use this option to
    control that.

-fselective-scheduling

    Schedule instructions using selective scheduling
    algorithm. Selective scheduling runs instead of the first
    scheduler pass.

-fselective-scheduling2

    Schedule instructions using selective scheduling
    algorithm. Selective scheduling runs instead of the second
    scheduler pass.

-fsel-sched-pipelining

    Enable software pipelining of innermost loops during selective
    scheduling. This option has no effect until one of
    -fselective-scheduling or -fselective-scheduling2 is turned on.

-fsel-sched-pipelining-outer-loops

    When pipelining loops during selective scheduling, also pipeline
    outer loops. This option has no effect until
    -fsel-sched-pipelining is turned on.

-fcaller-saves

    Enable values to be allocated in registers that will be clobbered
    by function calls, by emitting extra instructions to save and
    restore the registers around such calls. Such allocation is done
    only when it seems to result in better code than would otherwise
    be produced.

    This option is always enabled by default on certain machines,
    usually those which have no call-preserved registers to use
    instead.

    Enabled at levels -O2, -O3, -Os.

-fcombine-stack-adjustments

    Tracks stack adjustments (pushes and pops) and stack memory
    references and then tries to find ways to combine them.

    Enabled by default at -O1 and higher.

-fconserve-stack

    Attempt to minimize stack usage. The compiler will attempt to use
    less stack space, even if that makes the program slower. This
    option implies setting the large-stack-frame parameter to 100 and
    the large-stack-frame-growth parameter to 400.

-ftree-reassoc

    Perform reassociation on trees. This flag is enabled by default at
    -O and higher.

-ftree-pre

    Perform partial redundancy elimination (PRE) on trees. This flag
    is enabled by default at -O2 and -O3.

-ftree-forwprop

    Perform forward propagation on trees. This flag is enabled by
    default at -O and higher.

-ftree-fre

    Perform full redundancy elimination (FRE) on trees. The difference
    between FRE and PRE is that FRE only considers expressions that
    are computed on all paths leading to the redundant
    computation. This analysis is faster than PRE, though it exposes
    fewer redundancies. This flag is enabled by default at -O and
    higher.

-ftree-phiprop

    Perform hoisting of loads from conditional pointers on trees. This
    pass is enabled by default at -O and higher.

-ftree-copy-prop

    Perform copy propagation on trees. This pass eliminates
    unnecessary copy operations. This flag is enabled by default at -O
    and higher.

-fipa-pure-const

    Discover which functions are pure or constant. Enabled by default
    at -O and higher.

-fipa-reference

    Discover which static variables do not escape the compilation
    unit. Enabled by default at -O and higher.

-fipa-struct-reorg

    Perform structure reorganization optimization, which changes the
    layout of C-like structures in order to better utilize spatial
    locality. This transformation is effective for programs containing
    arrays of structures. Available in two compilation modes:
    profile-based (enabled with -fprofile-generate) or static (which
    uses built-in heuristics). It works only in whole program mode, so
    it requires -fwhole-program to be enabled. Structures considered
    'cold' by this transformation are not affected (see --param
    struct-reorg-cold-struct-ratio=value).

    With this flag, the program debug info reflects a new structure
    layout.

-fipa-pta

    Perform interprocedural pointer analysis and interprocedural
    modification and reference analysis. This option can cause
    excessive memory and compile-time usage on large compilation
    units. It is not enabled by default at any optimization level.

-fipa-profile

    Perform interprocedural profile propagation. The functions called
    only from cold functions are marked as cold. Also functions
    executed once (such as cold, noreturn, static constructors or
    destructors) are identified. Cold functions and loopless parts of
    functions executed once are then optimized for size. Enabled by
    default at -O and higher.

-fipa-cp

    Perform interprocedural constant propagation. This optimization
    analyzes the program to determine when values passed to functions
    are constants and then optimizes accordingly. This optimization
    can substantially increase performance if the application has
    constants passed to functions. This flag is enabled by default at
    -O2, -Os and -O3.

-fipa-cp-clone

    Perform function cloning to make interprocedural constant
    propagation stronger. When enabled, interprocedural constant
    propagation will perform function cloning when externally visible
    function can be called with constant arguments. Because this
    optimization can create multiple copies of functions, it may
    significantly increase code size (see --param
    ipcp-unit-growth=value). This flag is enabled by default at -O3.

-fipa-matrix-reorg

    Perform matrix flattening and transposing. Matrix flattening tries
    to replace an m-dimensional matrix with its equivalent
    n-dimensional matrix, where n < m. This reduces the level of
    indirection needed for accessing the elements of the matrix. The
    second optimization is matrix transposing that attempts to change
    the order of the matrix's dimensions in order to improve cache
    locality. Both optimizations need the -fwhole-program
    flag. Transposing is enabled only if profiling information is
    available.

-ftree-sink

    Perform forward store motion on trees. This flag is enabled by
    default at -O and higher.

-ftree-bit-ccp

    Perform sparse conditional bit constant propagation on trees and
    propagate pointer alignment information. This pass only operates
    on local scalar variables and is enabled by default at -O and
    higher. It requires that -ftree-ccp is enabled.

-ftree-ccp

    Perform sparse conditional constant propagation (CCP) on
    trees. This pass only operates on local scalar variables and is
    enabled by default at -O and higher.

-ftree-switch-conversion

    Perform conversion of simple initializations in a switch to
    initializations from a scalar array. This flag is enabled by
    default at -O2 and higher.

-ftree-dce

    Perform dead code elimination (DCE) on trees. This flag is enabled
    by default at -O and higher.

-ftree-builtin-call-dce

    Perform conditional dead code elimination (DCE) for calls to
    builtin functions that may set errno but are otherwise side-effect
    free. This flag is enabled by default at -O2 and higher if -Os is
    not also specified.

-ftree-dominator-opts

    Perform a variety of simple scalar cleanups (constant/copy
    propagation, redundancy elimination, range propagation and
    expression simplification) based on a dominator tree
    traversal. This also performs jump threading (to reduce jumps to
    jumps). This flag is enabled by default at -O and higher.

-ftree-dse

    Perform dead store elimination (DSE) on trees. A dead store is a
    store into a memory location which will later be overwritten by
    another store without any intervening loads. In this case the
    earlier store can be deleted. This flag is enabled by default at
    -O and higher.

-ftree-ch

    Perform loop header copying on trees. This is beneficial since it
    increases effectiveness of code motion optimizations. It also
    saves one jump. This flag is enabled by default at -O and
    higher. It is not enabled for -Os, since it usually increases code
    size.

-ftree-loop-optimize

    Perform loop optimizations on trees. This flag is enabled by
    default at -O and higher.

-ftree-loop-linear

    Perform loop interchange transformations on tree. Same as
    -floop-interchange. To use this code transformation, GCC has to be
    configured with --with-ppl and --with-cloog to enable the Graphite
    loop transformation infrastructure.

-floop-interchange

    Perform loop interchange transformations on loops. Interchanging
    two nested loops switches the inner and outer loops. For example,
    given a loop like:

              DO J = 1, M
                DO I = 1, N
                  A(J, I) = A(J, I) * C
                ENDDO
              ENDDO
         

    loop interchange will transform the loop as if the user had written:

              DO I = 1, N
                DO J = 1, M
                  A(J, I) = A(J, I) * C
                ENDDO
              ENDDO
         

    which can be beneficial when N is larger than the caches, because
    in Fortran, the elements of an array are stored in memory
    contiguously by column, and the original loop iterates over rows,
    potentially creating at each access a cache miss. This
    optimization applies to all the languages supported by GCC and is
    not limited to Fortran. To use this code transformation, GCC has
    to be configured with --with-ppl and --with-cloog to enable the
    Graphite loop transformation infrastructure.

-floop-strip-mine

    Perform loop strip mining transformations on loops. Strip mining
    splits a loop into two nested loops. The outer loop has strides
    equal to the strip size and the inner loop has strides of the
    original loop within a strip. The strip length can be changed
    using the loop-block-tile-size parameter. For example, given a
    loop like:

              DO I = 1, N
                A(I) = A(I) + C
              ENDDO
         

    loop strip mining will transform the loop as if the user had
    written:

              DO II = 1, N, 51
                DO I = II, min (II + 50, N)
                  A(I) = A(I) + C
                ENDDO
              ENDDO
         

    This optimization applies to all the languages supported by GCC
    and is not limited to Fortran. To use this code transformation,
    GCC has to be configured with --with-ppl and --with-cloog to
    enable the Graphite loop transformation infrastructure.

-floop-block

    Perform loop blocking transformations on loops. Blocking strip
    mines each loop in the loop nest such that the memory accesses of
    the element loops fit inside caches. The strip length can be
    changed using the loop-block-tile-size parameter. For example,
    given a loop like:

              DO I = 1, N
                DO J = 1, M
                  A(J, I) = B(I) + C(J)
                ENDDO
              ENDDO
         

    loop blocking will transform the loop as if the user had written:

              DO II = 1, N, 51
                DO JJ = 1, M, 51
                  DO I = II, min (II + 50, N)
                    DO J = JJ, min (JJ + 50, M)
                      A(J, I) = B(I) + C(J)
                    ENDDO
                  ENDDO
                ENDDO
              ENDDO
         

    which can be beneficial when M is larger than the caches, because
    the innermost loop will iterate over a smaller amount of data that
    can be kept in the caches. This optimization applies to all the
    languages supported by GCC and is not limited to Fortran. To use
    this code transformation, GCC has to be configured with --with-ppl
    and --with-cloog to enable the Graphite loop transformation
    infrastructure.

-fgraphite-identity

    Enable the identity transformation for graphite. For every SCoP we
    generate the polyhedral representation and transform it back to
    gimple. Using -fgraphite-identity we can check the costs or
    benefits of the GIMPLE -> GRAPHITE -> GIMPLE transformation. Some
    minimal optimizations are also performed by the code generator
    CLooG, like index splitting and dead code elimination in loops.

-floop-flatten

    Removes the loop nesting structure: transforms the loop nest into
    a single loop. This transformation can be useful to vectorize all
    the levels of the loop nest.

-floop-parallelize-all

    Use the Graphite data dependence analysis to identify loops that
    can be parallelized. Parallelize all the loops that can be
    analyzed to not contain loop carried dependences without checking
    that it is profitable to parallelize the loops.

-fcheck-data-deps

    Compare the results of several data dependence analyzers. This
    option is used for debugging the data dependence analyzers.

-ftree-loop-if-convert

    Attempt to transform conditional jumps in the innermost loops to
    branch-less equivalents. The intent is to remove control-flow from
    the innermost loops in order to improve the ability of the
    vectorization pass to handle these loops. This is enabled by
    default if vectorization is enabled.

-ftree-loop-if-convert-stores

    Attempt to also if-convert conditional jumps containing memory
    writes. This transformation can be unsafe for multi-threaded
    programs as it transforms conditional memory writes into
    unconditional memory writes. For example,

              for (i = 0; i < N; i++)
                if (cond)
                  A[i] = expr;
         

    would be transformed to

              for (i = 0; i < N; i++)
                A[i] = cond ? expr : A[i];
         

    potentially producing data races.

-ftree-loop-distribution

    Perform loop distribution. This flag can improve cache performance
    on big loop bodies and allow further loop optimizations, like
    parallelization or vectorization, to take place. For example, the
    loop

              DO I = 1, N
                A(I) = B(I) + C
                D(I) = E(I) * F
              ENDDO
         

    is transformed to

              DO I = 1, N
                 A(I) = B(I) + C
              ENDDO
              DO I = 1, N
                 D(I) = E(I) * F
              ENDDO
         


-ftree-loop-distribute-patterns

    Perform loop distribution of patterns that can be code generated
    with calls to a library. This flag is enabled by default at -O3.

    This pass distributes the initialization loops and generates a
    call to memset zero. For example, the loop

              DO I = 1, N
                A(I) = 0
                B(I) = A(I) + I
              ENDDO
         

    is transformed to

              DO I = 1, N
                 A(I) = 0
              ENDDO
              DO I = 1, N
                 B(I) = A(I) + I
              ENDDO
         

    and the initialization loop is transformed into a call to memset
    zero.

-ftree-loop-im

    Perform loop invariant motion on trees. This pass moves only
    invariants that would be hard to handle at RTL level (function
    calls, operations that expand to nontrivial sequences of
    insns). With -funswitch-loops it also moves operands of conditions
    that are invariant out of the loop, so that we can use just
    trivial invariantness analysis in loop unswitching. The pass also
    includes store motion.

-ftree-loop-ivcanon

    Create a canonical counter for the number of iterations in loops
    for which determining the number of iterations requires
    complicated analysis. Later optimizations may then determine the
    number easily. Useful especially in connection with unrolling.

-fivopts

    Perform induction variable optimizations (strength reduction,
    induction variable merging and induction variable elimination) on
    trees.

-ftree-parallelize-loops=n

    Parallelize loops, i.e., split their iteration space to run in n
    threads. This is only possible for loops whose iterations are
    independent and can be arbitrarily reordered. The optimization is
    only profitable on multiprocessor machines, for loops that are
    CPU-intensive, rather than constrained e.g. by memory
    bandwidth. This option implies -pthread, and thus is only
    supported on targets that have support for -pthread.

-ftree-pta

    Perform function-local points-to analysis on trees. This flag is
    enabled by default at -O and higher.

-ftree-sra

    Perform scalar replacement of aggregates. This pass replaces
    structure references with scalars to prevent committing structures
    to memory too early. This flag is enabled by default at -O and
    higher.

-ftree-copyrename

    Perform copy renaming on trees. This pass attempts to rename
    compiler temporaries to other variables at copy locations, usually
    resulting in variable names which more closely resemble the
    original variables. This flag is enabled by default at -O and
    higher.

-ftree-ter

    Perform temporary expression replacement during the SSA->normal
    phase. Single use/single def temporaries are replaced at their use
    location with their defining expression. This results in
    non-GIMPLE code, but gives the expanders much more complex trees
    to work on resulting in better RTL generation. This is enabled by
    default at -O and higher.

-ftree-vectorize

    Perform loop vectorization on trees. This flag is enabled by
    default at -O3.

-ftree-slp-vectorize

    Perform basic block vectorization on trees. This flag is enabled
    by default at -O3 and when -ftree-vectorize is enabled.

-ftree-vect-loop-version

    Perform loop versioning when doing loop vectorization on
    trees. When a loop appears to be vectorizable except that data
    alignment or data dependence cannot be determined at compile time
    then vectorized and non-vectorized versions of the loop are
    generated along with runtime checks for alignment or dependence to
    control which version is executed. This option is enabled by
    default except at level -Os where it is disabled.

-fvect-cost-model

    Enable cost model for vectorization.

-ftree-vrp

    Perform Value Range Propagation on trees. This is similar to the
    constant propagation pass, but instead of values, ranges of values
    are propagated. This allows the optimizers to remove unnecessary
    range checks like array bound checks and null pointer checks. This
    is enabled by default at -O2 and higher. Null pointer check
    elimination is only done if -fdelete-null-pointer-checks is
    enabled.

-ftracer

    Perform tail duplication to enlarge superblock size. This
    transformation simplifies the control flow of the function
    allowing other optimizations to do a better job.

-funroll-loops

    Unroll loops whose number of iterations can be determined at
    compile time or upon entry to the loop. -funroll-loops implies
    -frerun-cse-after-loop. This option makes code larger, and may or
    may not make it run faster.

-funroll-all-loops

    Unroll all loops, even if their number of iterations is uncertain
    when the loop is entered. This usually makes programs run more
    slowly. -funroll-all-loops implies the same options as
    -funroll-loops.

-fsplit-ivs-in-unroller

    Enables expressing of values of induction variables in later
    iterations of the unrolled loop using the value in the first
    iteration. This breaks long dependency chains, thus improving
    efficiency of the scheduling passes.

    Combination of -fweb and CSE is often sufficient to obtain the
    same effect. However in cases the loop body is more complicated
    than a single basic block, this is not reliable. It also does not
    work at all on some of the architectures due to restrictions in
    the CSE pass.

    This optimization is enabled by default.

-fvariable-expansion-in-unroller

    With this option, the compiler will create multiple copies of some
    local variables when unrolling a loop which can result in superior
    code.

-fpartial-inlining

    Inline parts of functions. This option has an effect only when
    inlining itself is turned on by the -finline-functions or
    -finline-small-functions options.

    Enabled at level -O2.

-fpredictive-commoning

    Perform predictive commoning optimization, i.e., reusing
    computations (especially memory loads and stores) performed in
    previous iterations of loops.

    This option is enabled at level -O3.

-fprefetch-loop-arrays

    If supported by the target machine, generate instructions to
    prefetch memory to improve the performance of loops that access
    large arrays.

    This option may generate better or worse code; results are highly
    dependent on the structure of loops within the source code.

    Disabled at level -Os.

-fno-peephole
-fno-peephole2

    Disable any machine-specific peephole optimizations. The
    difference between -fno-peephole and -fno-peephole2 is in how they
    are implemented in the compiler; some targets use one, some use
    the other, a few use both.

    -fpeephole is enabled by default. -fpeephole2 is enabled at
    levels -O2, -O3, -Os.

-fno-guess-branch-probability

    Do not guess branch probabilities using heuristics.

    GCC will use heuristics to guess branch probabilities if they are
    not provided by profiling feedback (-fprofile-arcs). These
    heuristics are based on the control flow graph. If some branch
    probabilities are specified by '__builtin_expect', then the
    heuristics will be used to guess branch probabilities for the rest
    of the control flow graph, taking the '__builtin_expect' info into
    account. The interactions between the heuristics and
    '__builtin_expect' can be complex, and in some cases, it may be
    useful to disable the heuristics so that the effects of
    '__builtin_expect' are easier to understand.

    The default is -fguess-branch-probability at levels -O, -O2, -O3,
    -Os.

-freorder-blocks

    Reorder basic blocks in the compiled function in order to reduce
    number of taken branches and improve code locality.

    Enabled at levels -O2, -O3.

-freorder-blocks-and-partition

    In addition to reordering basic blocks in the compiled function,
    in order to reduce number of taken branches, partitions hot and
    cold basic blocks into separate sections of the assembly and .o
    files, to improve paging and cache locality performance.

    This optimization is automatically turned off in the presence of
    exception handling, for linkonce sections, for functions with a
    user-defined section attribute and on any architecture that does
    not support named sections.

-freorder-functions

    Reorder functions in the object file in order to improve code
    locality. This is implemented by using special subsections
    .text.hot for most frequently executed functions and
    .text.unlikely for unlikely executed functions. Reordering is done
    by the linker so object file format must support named sections
    and linker must place them in a reasonable way.

    Also, profile feedback must be available to make this option
    effective. See -fprofile-arcs for details.

    Enabled at levels -O2, -O3, -Os.

-fstrict-aliasing

    Allow the compiler to assume the strictest aliasing rules
    applicable to the language being compiled. For C (and C++), this
    activates optimizations based on the type of expressions. In
    particular, an object of one type is assumed never to reside at
    the same address as an object of a different type, unless the
    types are almost the same. For example, an unsigned int can alias
    an int, but not a void* or a double. A character type may alias
    any other type.

    Pay special attention to code like this:

              union a_union {
                int i;
                double d;
              };
              
              int f() {
                union a_union t;
                t.d = 3.0;
                return t.i;
              }
         

    The practice of reading from a different union member than the one
    most recently written to (called “type-punning”) is common. Even
    with -fstrict-aliasing, type-punning is allowed, provided the
    memory is accessed through the union type. So, the code above will
    work as expected. See Structures unions enumerations and
    bit-fields implementation. However, this code might not:

              int f() {
                union a_union t;
                int* ip;
                t.d = 3.0;
                ip = &t.i;
                return *ip;
              }
         

    Similarly, access by taking the address, casting the resulting
    pointer and dereferencing the result has undefined behavior, even
    if the cast uses a union type, e.g.:

              int f() {
                double d = 3.0;
                return ((union a_union *) &d)->i;
              }
         

    The -fstrict-aliasing option is enabled at levels -O2, -O3, -Os.

-fstrict-overflow

    Allow the compiler to assume strict signed overflow rules,
    depending on the language being compiled. For C (and C++) this
    means that overflow when doing arithmetic with signed numbers is
    undefined, which means that the compiler may assume that it will
    not happen. This permits various optimizations. For example, the
    compiler will assume that an expression like i + 10 > i will
    always be true for signed i. This assumption is only valid if
    signed overflow is undefined, as the expression is false if i + 10
    overflows when using twos complement arithmetic. When this option
    is in effect any attempt to determine whether an operation on
    signed numbers will overflow must be written carefully to not
    actually involve overflow.

    This option also allows the compiler to assume strict pointer
    semantics: given a pointer to an object, if adding an offset to
    that pointer does not produce a pointer to the same object, the
    addition is undefined. This permits the compiler to conclude that
    p + u > p is always true for a pointer p and unsigned integer
    u. This assumption is only valid because pointer wraparound is
    undefined, as the expression is false if p + u overflows using
    twos complement arithmetic.

    See also the -fwrapv option. Using -fwrapv means that integer
    signed overflow is fully defined: it wraps. When -fwrapv is used,
    there is no difference between -fstrict-overflow and
    -fno-strict-overflow for integers. With -fwrapv certain types of
    overflow are permitted. For example, if the compiler gets an
    overflow when doing arithmetic on constants, the overflowed value
    can still be used with -fwrapv, but not otherwise.

    The -fstrict-overflow option is enabled at levels -O2, -O3, -Os.

-falign-functions
-falign-functions=n

    Align the start of functions to the next power-of-two greater than
    n, skipping up to n bytes. For instance, -falign-functions=32
    aligns functions to the next 32-byte boundary, but
    -falign-functions=24 would align to the next 32-byte boundary only
    if this can be done by skipping 23 bytes or less.

    -fno-align-functions and -falign-functions=1 are equivalent and
     mean that functions will not be aligned.

    Some assemblers only support this flag when n is a power of two;
    in that case, it is rounded up.

    If n is not specified or is zero, use a machine-dependent default.

    Enabled at levels -O2, -O3.

-falign-labels
-falign-labels=n

    Align all branch targets to a power-of-two boundary, skipping up
    to n bytes like -falign-functions. This option can easily make
    code slower, because it must insert dummy operations for when the
    branch target is reached in the usual flow of the code.

    -fno-align-labels and -falign-labels=1 are equivalent and mean
     that labels will not be aligned.

    If -falign-loops or -falign-jumps are applicable and are greater
    than this value, then their values are used instead.

    If n is not specified or is zero, use a machine-dependent default
    which is very likely to be '1', meaning no alignment.

    Enabled at levels -O2, -O3.

-falign-loops
-falign-loops=n

    Align loops to a power-of-two boundary, skipping up to n bytes
    like -falign-functions. The hope is that the loop will be executed
    many times, which will make up for any execution of the dummy
    operations.

    -fno-align-loops and -falign-loops=1 are equivalent and mean that
     loops will not be aligned.

    If n is not specified or is zero, use a machine-dependent default.

    Enabled at levels -O2, -O3.

-falign-jumps
-falign-jumps=n

    Align branch targets to a power-of-two boundary, for branch
    targets where the targets can only be reached by jumping, skipping
    up to n bytes like -falign-functions. In this case, no dummy
    operations need be executed.

    -fno-align-jumps and -falign-jumps=1 are equivalent and mean that
    jump targets will not be aligned.

    If n is not specified or is zero, use a machine-dependent default.

    Enabled at levels -O2, -O3.

-funit-at-a-time

    This option is left for compatibility reasons. -funit-at-a-time
    has no effect, while -fno-unit-at-a-time implies
    -fno-toplevel-reorder and -fno-section-anchors.

    Enabled by default.

-fno-toplevel-reorder

    Do not reorder top-level functions, variables, and asm
    statements. Output them in the same order that they appear in the
    input file. When this option is used, unreferenced static
    variables will not be removed. This option is intended to support
    existing code which relies on a particular ordering. For new code,
    it is better to use attributes.

    Enabled at level -O0. When disabled explicitly, it also implies
    -fno-section-anchors that is otherwise enabled at -O0 on some
    targets.

-fweb

    Constructs webs as commonly used for register allocation purposes
    and assigns each web an individual pseudo register. This allows the
    register allocation pass to operate on pseudos directly, but also
    strengthens several other optimization passes, such as CSE, loop
    optimizer and trivial dead code remover. It can, however, make
    debugging impossible, since variables will no longer stay in a
    “home register”.

    Enabled by default with -funroll-loops.

-fwhole-program

    Assume that the current compilation unit represents the whole
    program being compiled. All public functions and variables with
    the exception of main and those merged by attribute
    externally_visible become static functions and in effect are
    optimized more aggressively by interprocedural optimizers. If gold
    is used as the linker plugin, externally_visible attributes are
    automatically added to functions (not variables yet due to a
    current gold issue) that are accessed outside of LTO objects
    according to resolution file produced by gold. For other linkers
    that cannot generate resolution file, explicit externally_visible
    attributes are still necessary. While this option is equivalent to
    proper use of the static keyword for programs consisting of a
    single file, in combination with option -flto this flag can be
    used to compile many smaller scale programs since the functions
    and variables become local for the whole combined compilation
    unit, not for the single source file itself.

    This option implies -fwhole-file for Fortran programs.

-flto[=n]

    This option runs the standard link-time optimizer. When invoked
    with source code, it generates GIMPLE (one of GCC's internal
    representations) and writes it to special ELF sections in the
    object file. When the object files are linked together, all the
    function bodies are read from these ELF sections and instantiated
    as if they had been part of the same translation unit.

    To use the link-time optimizer, -flto needs to be specified at
    compile time and during the final link. For example,

              gcc -c -O2 -flto foo.c
              gcc -c -O2 -flto bar.c
              gcc -o myprog -flto -O2 foo.o bar.o
         

    The first two invocations to GCC will save a bytecode
    representation of GIMPLE into special ELF sections inside foo.o
    and bar.o. The final invocation will read the GIMPLE bytecode from
    foo.o and bar.o, merge the two files into a single internal image,
    and compile the result as usual. Since both foo.o and bar.o are
    merged into a single image, this causes all the inter-procedural
    analyses and optimizations in GCC to work across the two files as
    if they were a single one. This means, for example, that the
    inliner will be able to inline functions in bar.o into functions
    in foo.o and vice-versa.

    Another (simpler) way to enable link-time optimization is,

              gcc -o myprog -flto -O2 foo.c bar.c
         

    The above will generate bytecode for foo.c and bar.c, merge them
    together into a single GIMPLE representation and optimize them as
    usual to produce myprog.

    The only important thing to keep in mind is that to enable
    link-time optimizations the -flto flag needs to be passed to both
    the compile and the link commands.

    To make whole program optimization effective, it is necessary to
    make certain whole program assumptions. The compiler needs to know
    what functions and variables can be accessed by libraries and
    runtime outside of the link time optimized unit. When supported by
    the linker, the linker plugin (see -fuse-linker-plugin) passes to
    the compiler information about used and externally visible
    symbols. When the linker plugin is not available, -fwhole-program
    should be used to allow the compiler to make these assumptions,
    which will lead to more aggressive optimization decisions.

    Note that when a file is compiled with -flto, the generated object
    file will be larger than a regular object file because it will
    contain GIMPLE bytecodes and the usual final code. This means that
    object files with LTO information can be linked as a normal object
    file. So, in the previous example, if the final link is done with

              gcc -o myprog foo.o bar.o
         

    the only difference will be that no inter-procedural optimizations
    will be applied to produce myprog. The two object files foo.o and
    bar.o will be simply sent to the regular linker.

    Additionally, the optimization flags used to compile individual
    files are not necessarily related to those used at link-time. For
    instance,

              gcc -c -O0 -flto foo.c
              gcc -c -O0 -flto bar.c
              gcc -o myprog -flto -O3 foo.o bar.o
         

    This will produce individual object files with unoptimized
    assembler code, but the resulting binary myprog will be optimized
    at -O3. Now, if the final binary is generated without -flto, then
    myprog will not be optimized.

    When producing the final binary with -flto, GCC will only apply
    link-time optimizations to those files that contain
    bytecode. Therefore, you can mix and match object files and
    libraries with GIMPLE bytecodes and final object code. GCC will
    automatically select which files to optimize in LTO mode and which
    files to link without further processing.

    There are some code generation flags that GCC will preserve when
    generating bytecodes, as they need to be used during the final
    link stage. Currently, the following options are saved into the
    GIMPLE bytecode files: -fPIC, -fcommon and all the -m target
    flags.

    At link time, these options are read-in and reapplied. Note that
    the current implementation makes no attempt at recognizing
    conflicting values for these options. If two or more files have a
    conflicting value (e.g., one file is compiled with -fPIC and
    another isn't), the compiler will simply use the last value read
    from the bytecode files. It is recommended, then, that all the
    files participating in the same link be compiled with the same
    options.

    Another feature of LTO is that it is possible to apply
    interprocedural optimizations on files written in different
    languages. This requires some support in the language front
    end. Currently, the C, C++ and Fortran front ends are capable of
    emitting GIMPLE bytecodes, so something like this should work

              gcc -c -flto foo.c
              g++ -c -flto bar.cc
              gfortran -c -flto baz.f90
              g++ -o myprog -flto -O3 foo.o bar.o baz.o -lgfortran
         

    Notice that the final link is done with g++ to get the C++ runtime
    libraries and -lgfortran is added to get the Fortran runtime
    libraries. In general, when mixing languages in LTO mode, you
    should use the same link command used when mixing languages in a
    regular (non-LTO) compilation. This means that if your build
    process was mixing languages before, all you need to add is -flto
    to all the compile and link commands.

    If LTO encounters objects with C linkage declared with
    incompatible types in separate translation units to be linked
    together (undefined behavior according to ISO C99 6.2.7), a
    non-fatal diagnostic may be issued. The behavior is still
    undefined at runtime.

    If object files containing GIMPLE bytecode are stored in a library
    archive, say libfoo.a, it is possible to extract and use them in
    an LTO link if you are using a linker with linker plugin
    support. To enable this feature, use the flag -fuse-linker-plugin
    at link-time:

              gcc -o myprog -O2 -flto -fuse-linker-plugin a.o b.o -lfoo
         

    With the linker plugin enabled, the linker will extract the needed
    GIMPLE files from libfoo.a and pass them on to the running GCC to
    make them part of the aggregated GIMPLE image to be optimized.

    If you are not using a linker with linker plugin support and/or do
    not enable linker plugin then the objects inside libfoo.a will be
    extracted and linked as usual, but they will not participate in
    the LTO optimization process.

    Link time optimizations do not require the presence of the whole
    program to operate. If the program does not require any symbols to
    be exported, it is possible to combine -flto with
    -fwhole-program to allow the interprocedural optimizers to use
    more aggressive assumptions which may lead to improved
    optimization opportunities. Use of -fwhole-program is not needed
    when linker plugin is active (see -fuse-linker-plugin).

    Regarding portability: the current implementation of LTO makes no
    attempt at generating bytecode that can be ported between
    different types of hosts. The bytecode files are versioned and
    there is a strict version check, so bytecode files generated in
    one version of GCC will not work with an older/newer version of
    GCC.

    Link time optimization does not play well with generating
    debugging information. Combining -flto with -g is currently
    experimental and expected to produce wrong results.

    If you specify the optional n, the optimization and code
    generation done at link time is executed in parallel using n
    parallel jobs by utilizing an installed make program. The
    environment variable MAKE may be used to override the program
    used. The default value for n is 1.

    You can also specify -flto=jobserver to use GNU make's job server
    mode to determine the number of parallel jobs. This is useful when
    the Makefile calling GCC is already executing in parallel. The
    parent Makefile will need a '+' prepended to the command recipe
    for this to work. This will likely only work if MAKE is GNU make.

    This option is disabled by default.

-flto-partition=alg

    Specify the partitioning algorithm used by the link time
    optimizer. The value is either 1to1 to specify a partitioning
    mirroring the original source files or balanced to specify
    partitioning into equally sized chunks (whenever
    possible). Specifying none as an algorithm disables partitioning
    and streaming completely. The default value is balanced.

-flto-compression-level=n

    This option specifies the level of compression used for
    intermediate language written to LTO object files, and is only
    meaningful in conjunction with LTO mode (-flto). Valid values are
    0 (no compression) to 9 (maximum compression). Values outside this
    range are clamped to either 0 or 9. If the option is not given, a
    default balanced compression setting is used.

-flto-report

    Prints a report with internal details on the workings of the
    link-time optimizer. The contents of this report vary from version
    to version; it is meant to be useful to GCC developers when
    processing object files in LTO mode (via -flto).

    Disabled by default.

-fuse-linker-plugin

    Enables the use of a linker plugin during link time
    optimization. This option relies on plugin support in the
    linker, which is available in gold or in GNU ld 2.21 or newer.

    This option enables the extraction of object files with GIMPLE
    bytecode out of library archives. This improves the quality of
    optimization by exposing more code to the link time
    optimizer. This information specifies what symbols can be accessed
    externally (by non-LTO object or during dynamic
    linking). Resulting code quality improvements on binaries (and
    shared libraries that do use hidden visibility) is similar to
    -fwhole-program. See -flto for a description on the effect of this
    flag and how to use it.

    Enabled by default when LTO support in GCC is enabled and GCC was
    compiled with a linker supporting plugins (GNU ld 2.21 or newer or
    gold).

-fcompare-elim

    After register allocation and post-register allocation instruction
    splitting, identify arithmetic instructions that compute processor
    flags similar to a comparison operation based on that
    arithmetic. If possible, eliminate the explicit comparison
    operation.

    This pass only applies to certain targets that cannot explicitly
    represent the comparison operation before register allocation is
    complete.

    Enabled at levels -O, -O2, -O3, -Os.

-fcprop-registers

    After register allocation and post-register allocation instruction
    splitting, we perform a copy-propagation pass to try to reduce
    scheduling dependencies and occasionally eliminate the copy.

    Enabled at levels -O, -O2, -O3, -Os.

-fprofile-correction

    Profiles collected using an instrumented binary for multi-threaded
    programs may be inconsistent due to missed counter updates. When
    this option is specified, GCC will use heuristics to correct or
    smooth out such inconsistencies. By default, GCC will emit an
    error message when an inconsistent profile is detected.

-fprofile-dir=path

    Set the directory to search for the profile data files in to
    path. This option affects only the profile data generated by
    -fprofile-generate, -ftest-coverage, -fprofile-arcs and used by
    -fprofile-use and -fbranch-probabilities and its related
    options. By default, GCC will use the current directory as path,
    thus the profile data file will appear in the same directory as
    the object file.

-fprofile-generate
-fprofile-generate=path

    Enable options usually used for instrumenting an application to
    produce a profile useful for later recompilation with profile
    feedback based optimization. You must use -fprofile-generate both
    when compiling and when linking your program.

    The following options are enabled: -fprofile-arcs,
    -fprofile-values, -fvpt.

    If path is specified, GCC will look at the path to find the
    profile feedback data files. See -fprofile-dir.

-fprofile-use
-fprofile-use=path

    Enable profile feedback directed optimizations, and optimizations
    generally profitable only with profile feedback available.

    The following options are enabled: -fbranch-probabilities, -fvpt,
    -funroll-loops, -fpeel-loops, -ftracer.

    By default, GCC emits an error message if the feedback profiles do
    not match the source code. This error can be turned into a warning
    by using -Wcoverage-mismatch. Note this may result in poorly
    optimized code.

    If path is specified, GCC will look at the path to find the
    profile feedback data files. See -fprofile-dir.

########################################################################
The following options control compiler behavior regarding floating
point arithmetic. These options trade off between speed and
correctness. All must be specifically enabled.
########################################################################

-ffloat-store

    Do not store floating point variables in registers, and inhibit
    other options that might change whether a floating point value is
    taken from a register or memory.

    This option prevents undesirable excess precision on machines such
    as the 68000 where the floating registers (of the 68881) keep more
    precision than a double is supposed to have. Similarly for the x86
    architecture. For most programs, the excess precision does only
    good, but a few programs rely on the precise definition of IEEE
    floating point. Use -ffloat-store for such programs, after
    modifying them to store all pertinent intermediate computations
    into variables.

-fexcess-precision=style

    This option allows further control over excess precision on
    machines where floating-point registers have more precision than
    the IEEE float and double types and the processor does not support
    operations rounding to those types. By default,
    -fexcess-precision=fast is in effect; this means that operations
    are carried out in the precision of the registers and that it is
    unpredictable when rounding to the types specified in the source
    code takes place. When compiling C, if -fexcess-precision=standard
    is specified then excess precision will follow the rules specified
    in ISO C99; in particular, both casts and assignments cause values
    to be rounded to their semantic types (whereas -ffloat-store only
    affects assignments). This option is enabled by default for C if a
    strict conformance option such as -std=c99 is used.

    -fexcess-precision=standard is not implemented for languages other
     than C, and has no effect if -funsafe-math-optimizations or
     -ffast-math is specified. On the x86, it also has no effect if
     -mfpmath=sse or -mfpmath=sse+387 is specified; in the former
     case, IEEE semantics apply without excess precision, and in the
     latter, rounding is unpredictable.

-ffast-math

    Sets -fno-math-errno, -funsafe-math-optimizations,
    -ffinite-math-only, -fno-rounding-math, -fno-signaling-nans and
    -fcx-limited-range.

    This option causes the preprocessor macro __FAST_MATH__ to be
    defined.

    This option is not turned on by any -O option besides -Ofast since
    it can result in incorrect output for programs which depend on an
    exact implementation of IEEE or ISO rules/specifications for math
    functions. It may, however, yield faster code for programs that do
    not require the guarantees of these specifications.

-fno-math-errno

    Do not set ERRNO after calling math functions that are executed
    with a single instruction, e.g., sqrt. A program that relies on
    IEEE exceptions for math error handling may want to use this flag
    for speed while maintaining IEEE arithmetic compatibility.

    This option is not turned on by any -O option since it can result
    in incorrect output for programs which depend on an exact
    implementation of IEEE or ISO rules/specifications for math
    functions. It may, however, yield faster code for programs that do
    not require the guarantees of these specifications.

    The default is -fmath-errno.

    On Darwin systems, the math library never sets errno. There is
    therefore no reason for the compiler to consider the possibility
    that it might, and -fno-math-errno is the default.

-funsafe-math-optimizations

    Allow optimizations for floating-point arithmetic that (a) assume
    that arguments and results are valid and (b) may violate IEEE or
    ANSI standards. When used at link-time, it may include libraries
    or startup files that change the default FPU control word or other
    similar optimizations.

    This option is not turned on by any -O option since it can result
    in incorrect output for programs which depend on an exact
    implementation of IEEE or ISO rules/specifications for math
    functions. It may, however, yield faster code for programs that do
    not require the guarantees of these specifications. Enables
    -fno-signed-zeros, -fno-trapping-math, -fassociative-math and
    -freciprocal-math.

    The default is -fno-unsafe-math-optimizations.

-fassociative-math

    Allow re-association of operands in series of floating-point
    operations. This violates the ISO C and C++ language standard by
    possibly changing computation result. NOTE: re-ordering may change
    the sign of zero as well as ignore NaNs and inhibit or create
    underflow or overflow (and thus cannot be used on code which
    relies on rounding behavior like (x + 2**52) - 2**52). May also
    reorder floating-point comparisons and thus may not be used when
    ordered comparisons are required. This option requires that both
    -fno-signed-zeros and -fno-trapping-math be in effect. Moreover,
    it doesn't make much sense with -frounding-math. For Fortran the
    option is automatically enabled when both -fno-signed-zeros and
    -fno-trapping-math are in effect.

    The default is -fno-associative-math.

-freciprocal-math

    Allow the reciprocal of a value to be used instead of dividing by
    the value if this enables optimizations. For example x / y can be
    replaced with x * (1/y) which is useful if (1/y) is subject to
    common subexpression elimination. Note that this loses precision
    and increases the number of flops operating on the value.

    The default is -fno-reciprocal-math.

-ffinite-math-only

    Allow optimizations for floating-point arithmetic that assume that
    arguments and results are not NaNs or +-Infs.

    This option is not turned on by any -O option since it can result
    in incorrect output for programs which depend on an exact
    implementation of IEEE or ISO rules/specifications for math
    functions. It may, however, yield faster code for programs that do
    not require the guarantees of these specifications.

    The default is -fno-finite-math-only.

-fno-signed-zeros

    Allow optimizations for floating point arithmetic that ignore the
    signedness of zero. IEEE arithmetic specifies the behavior of
    distinct +0.0 and −0.0 values, which then prohibits simplification
    of expressions such as x+0.0 or 0.0*x (even with
    -ffinite-math-only). This option implies that the sign of a zero
    result isn't significant.

    The default is -fsigned-zeros.

-fno-trapping-math

    Compile code assuming that floating-point operations cannot
    generate user-visible traps. These traps include division by zero,
    overflow, underflow, inexact result and invalid operation. This
    option requires that -fno-signaling-nans be in effect. Setting
    this option may allow faster code if one relies on “non-stop” IEEE
    arithmetic, for example.

    This option should never be turned on by any -O option since it
    can result in incorrect output for programs which depend on an
    exact implementation of IEEE or ISO rules/specifications for math
    functions.

    The default is -ftrapping-math.

-frounding-math

    Disable transformations and optimizations that assume default
    floating point rounding behavior. This is round-to-zero for all
    floating point to integer conversions, and round-to-nearest for
    all other arithmetic truncations. This option should be specified
    for programs that change the FP rounding mode dynamically, or that
    may be executed with a non-default rounding mode. This option
    disables constant folding of floating point expressions at
    compile-time (which may be affected by rounding mode) and
    arithmetic transformations that are unsafe in the presence of
    sign-dependent rounding modes.

    The default is -fno-rounding-math.

    This option is experimental and does not currently guarantee to
    disable all GCC optimizations that are affected by rounding
    mode. Future versions of GCC may provide finer control of this
    setting using C99's FENV_ACCESS pragma. This command line option
    will be used to specify the default state for FENV_ACCESS.

-fsignaling-nans

    Compile code assuming that IEEE signaling NaNs may generate
    user-visible traps during floating-point operations. Setting this
    option disables optimizations that may change the number of
    exceptions visible with signaling NaNs. This option implies
    -ftrapping-math.

    This option causes the preprocessor macro __SUPPORT_SNAN__ to be
    defined.

    The default is -fno-signaling-nans.

    This option is experimental and does not currently guarantee to
    disable all GCC optimizations that affect signaling NaN behavior.

-fsingle-precision-constant

    Treat floating point constants as single precision constants
    instead of implicitly converting them to double precision
    constants.

-fcx-limited-range

    When enabled, this option states that a range reduction step is
    not needed when performing complex division. Also, there is no
    checking whether the result of a complex multiplication or
    division is NaN + I*NaN, with an attempt to rescue the situation
    in that case. The default is -fno-cx-limited-range, but is enabled
    by -ffast-math.

    This option controls the default setting of the ISO C99
    CX_LIMITED_RANGE pragma. Nevertheless, the option applies to all
    languages.

-fcx-fortran-rules

    Complex multiplication and division follow Fortran rules. Range
    reduction is done as part of complex division, but there is no
    checking whether the result of a complex multiplication or
    division is NaN + I*NaN, with an attempt to rescue the situation
    in that case.

    The default is -fno-cx-fortran-rules. 

########################################################################
The following options control optimizations that may improve
performance, but are not enabled by any -O options. This section
includes experimental options that may produce broken code.
########################################################################

-fbranch-probabilities

    After running a program compiled with -fprofile-arcs (see Options
    for Debugging Your Program or gcc), you can compile it a second
    time using -fbranch-probabilities, to improve optimizations based
    on the number of times each branch was taken. When the program
    compiled with -fprofile-arcs exits it saves arc execution counts
    to a file called sourcename.gcda for each source file. The
    information in this data file is very dependent on the structure
    of the generated code, so you must use the same source code and
    the same optimization options for both compilations.

    With -fbranch-probabilities, GCC puts a 'REG_BR_PROB' note on each
    'JUMP_INSN' and 'CALL_INSN'. These can be used to improve
    optimization. Currently, they are only used in one place: in
    reorg.c, instead of guessing which path a branch is most likely to
    take, the 'REG_BR_PROB' values are used to exactly determine which
    path is taken more often.

-fprofile-values

    If combined with -fprofile-arcs, it adds code so that some data
    about values of expressions in the program is gathered.

    With -fbranch-probabilities, it reads back the data gathered from
    profiling values of expressions for usage in optimizations.

    Enabled with -fprofile-generate and -fprofile-use.

-fvpt

    If combined with -fprofile-arcs, it instructs the compiler to add
    code to gather information about values of expressions.

    With -fbranch-probabilities, it reads back the data gathered and
    actually performs the optimizations based on them. Currently the
    optimizations include specialization of division operation using
    the knowledge about the value of the denominator.

-frename-registers

    Attempt to avoid false dependencies in scheduled code by making
    use of registers left over after register allocation. This
    optimization will most benefit processors with lots of
    registers. Depending on the debug information format adopted by
    the target, however, it can make debugging impossible, since
    variables will no longer stay in a “home register”.

    Enabled by default with -funroll-loops and -fpeel-loops.

-ftracer

    Perform tail duplication to enlarge superblock size. This
    transformation simplifies the control flow of the function
    allowing other optimizations to do a better job.

    Enabled with -fprofile-use.

-funroll-loops

    Unroll loops whose number of iterations can be determined at
    compile time or upon entry to the loop. -funroll-loops implies
    -frerun-cse-after-loop, -fweb and -frename-registers. It also
    turns on complete loop peeling (i.e. complete removal of loops
    with small constant number of iterations). This option makes code
    larger, and may or may not make it run faster.

    Enabled with -fprofile-use.

-funroll-all-loops

    Unroll all loops, even if their number of iterations is uncertain
    when the loop is entered. This usually makes programs run more
    slowly. -funroll-all-loops implies the same options as
    -funroll-loops.

-fpeel-loops

    Peels loops for which there is enough information that they do
    not roll much (from profile feedback). It also turns on complete
    loop peeling (i.e. complete removal of loops with small constant
    number of iterations).

    Enabled with -fprofile-use.

-fmove-loop-invariants

    Enables the loop invariant motion pass in the RTL loop
    optimizer. Enabled at level -O1.

-funswitch-loops

    Move branches with loop invariant conditions out of the loop, with
    duplicates of the loop on both branches (modified according to
    result of the condition).

-ffunction-sections
-fdata-sections

    Place each function or data item into its own section in the
    output file if the target supports arbitrary sections. The name of
    the function or the name of the data item determines the section's
    name in the output file.

    Use these options on systems where the linker can perform
    optimizations to improve locality of reference in the instruction
    space. Most systems using the ELF object format and SPARC
    processors running Solaris 2 have linkers with such
    optimizations. AIX may have these optimizations in the future.

    Only use these options when there are significant benefits from
    doing so. When you specify these options, the assembler and linker
    will create larger object and executable files and will also be
    slower. You will not be able to use gprof on all systems if you
    specify this option and you may have problems with debugging if
    you specify both this option and -g.

-fbranch-target-load-optimize

    Perform branch target register load optimization before prologue /
    epilogue threading. The use of target registers can typically be
    exposed only during reload, thus hoisting loads out of loops and
    doing inter-block scheduling needs a separate optimization pass.

-fbranch-target-load-optimize2

    Perform branch target register load optimization after prologue /
    epilogue threading.

-fbtr-bb-exclusive

    When performing branch target register load optimization, don't
    reuse branch target registers within any basic block.

-fstack-protector

    Emit extra code to check for buffer overflows, such as stack
    smashing attacks. This is done by adding a guard variable to
    functions with vulnerable objects. This includes functions that
    call alloca, and functions with buffers larger than 8 bytes. The
    guards are initialized when a function is entered and then checked
    when the function exits. If a guard check fails, an error message
    is printed and the program exits.

-fstack-protector-all

    Like -fstack-protector except that all functions are protected.

-fsection-anchors

    Try to reduce the number of symbolic address calculations by using
    shared “anchor” symbols to address nearby objects. This
    transformation can help to reduce the number of GOT entries and
    GOT accesses on some targets.

    For example, the implementation of the following function foo:

              static int a, b, c;
              int foo (void) { return a + b + c; }
         

    would usually calculate the addresses of all three variables, but
    if you compile it with -fsection-anchors, it will access the
    variables from a common anchor point instead. The effect is
    similar to the following pseudocode (which isn't valid C):

              int foo (void)
              {
                register int *xr = &x;
                return xr[&a - &x] + xr[&b - &x] + xr[&c - &x];
              }
         

    Not all targets support this option.


########################################################################
--param name=value
########################################################################

    In some places, GCC uses various constants to control the amount
    of optimization that is done. For example, GCC will not inline
    functions that contain more than a certain number of
    instructions. You can control some of these constants on the
    command-line using the --param option.

    The names of specific parameters, and the meaning of the values,
    are tied to the internals of the compiler, and are subject to
    change without notice in future releases.

    In each case, the value is an integer. The allowable choices for
    name are given in the following table:

    struct-reorg-cold-struct-ratio

        The threshold ratio (as a percentage) between a structure
        frequency and the frequency of the hottest structure in the
        program. This parameter is used by struct-reorg optimization
        enabled by -fipa-struct-reorg. We say that if the ratio of a
        structure frequency, calculated by profiling, to the hottest
        structure frequency in the program is less than this
        parameter, then structure reorganization is not applied to
        this structure. The default is 10.

    predictable-branch-outcome

        When a branch is predicted to be taken with probability lower
        than this threshold (in percent), then it is considered well
        predictable. The default is 10.

    max-crossjump-edges

        The maximum number of incoming edges to consider for
        crossjumping. The algorithm used by -fcrossjumping is O(N^2)
        in the number of edges incoming to each block. Increasing
        values mean more aggressive optimization, making the compile
        time increase with probably small improvement in executable
        size.

    min-crossjump-insns

        The minimum number of instructions which must be matched at
        the end of two blocks before crossjumping will be performed on
        them. This value is ignored in the case where all instructions
        in the block being crossjumped from are matched. The default
        value is 5.

    max-grow-copy-bb-insns

        The maximum code size expansion factor when copying basic
        blocks instead of jumping. The expansion is relative to a jump
        instruction. The default value is 8.

    max-goto-duplication-insns

        The maximum number of instructions to duplicate to a block
        that jumps to a computed goto. To avoid O(N^2) behavior in a
        number of passes, GCC factors computed gotos early in the
        compilation process, and unfactors them as late as
        possible. Only computed jumps at the end of basic blocks
        with no more than max-goto-duplication-insns are
        unfactored. The default value is 8.

    max-delay-slot-insn-search

        The maximum number of instructions to consider when looking
        for an instruction to fill a delay slot. If more than this
        arbitrary number of instructions is searched, the time savings
        from filling the delay slot will be minimal so stop
        searching. Increasing values mean more aggressive
        optimization, making the compile time increase with probably
        small improvement in executable run time.

    max-delay-slot-live-search

        When trying to fill delay slots, the maximum number of
        instructions to consider when searching for a block with valid
        live register information. Increasing this arbitrarily chosen
        value means more aggressive optimization, increasing the
        compile time. This parameter should be removed when the delay
        slot code is rewritten to maintain the control-flow graph.

    max-gcse-memory

        The approximate maximum amount of memory that will be
        allocated in order to perform the global common subexpression
        elimination optimization. If more memory than specified is
        required, the optimization will not be done.

    max-gcse-insertion-ratio

        If the ratio of expression insertions to deletions is larger
        than this value for any expression, then RTL PRE will not
        insert or remove the expression and thus leave partially
        redundant computations in the instruction stream. The default
        value is 20.

    max-pending-list-length

        The maximum number of pending dependencies scheduling will
        allow before flushing the current state and starting
        over. Large functions with few branches or calls can create
        excessively large lists which needlessly consume memory and
        resources.

    max-inline-insns-single

        Several parameters control the tree inliner used in gcc. This
        number sets the maximum number of instructions (counted in
        GCC's internal representation) in a single function that the
        tree inliner will consider for inlining. This only affects
        functions declared inline and methods implemented in a class
        declaration (C++). The default value is 400.

    max-inline-insns-auto

        When you use -finline-functions (included in -O3), a lot of
        functions that would otherwise not be considered for inlining
        by the compiler will be investigated. To those functions, a
        different (more restrictive) limit compared to functions
        declared inline can be applied. The default value is 40.

    large-function-insns

        The limit specifying really large functions. For functions
        larger than this limit after inlining, inlining is constrained
        by --param large-function-growth. This parameter is useful
        primarily to avoid extreme compilation time caused by
        non-linear algorithms used by the backend. The default value
        is 2700.

    large-function-growth

        Specifies maximal growth of large function caused by inlining
        in percents. The default value is 100 which limits large
        function growth to 2.0 times the original size.

    large-unit-insns

        The limit specifying large translation unit. Growth caused by
        inlining of units larger than this limit is limited by --param
        inline-unit-growth. For small units this might be too tight
        (consider a unit consisting of function A that is inline and B
        that just calls A three times. If B is small relative to A, the
        growth of the unit is 300% and yet such inlining is very
        sane). For very large units consisting of small inlineable
        functions, however, the overall unit growth limit is needed to
        avoid exponential explosion of code size. Thus for smaller
        units, the size is increased to --param large-unit-insns
        before applying --param inline-unit-growth. The default is
        10000.

    inline-unit-growth

        Specifies maximal overall growth of the compilation unit
        caused by inlining. The default value is 30 which limits unit
        growth to 1.3 times the original size.

    ipcp-unit-growth

        Specifies maximal overall growth of the compilation unit
        caused by interprocedural constant propagation. The default
        value is 10 which limits unit growth to 1.1 times the original
        size.

    large-stack-frame

        The limit specifying large stack frames. While inlining the
        algorithm is trying to not grow past this limit too
        much. Default value is 256 bytes.

    large-stack-frame-growth

        Specifies maximal growth of large stack frames caused by
        inlining in percents. The default value is 1000 which limits
        large stack frame growth to 11 times the original size.

    max-inline-insns-recursive
    max-inline-insns-recursive-auto

        Specifies maximum number of instructions out-of-line copy of
        self recursive inline function can grow into by performing
        recursive inlining.

        For functions declared inline --param
        max-inline-insns-recursive is taken into account. For function
        not declared inline, recursive inlining happens only when
        -finline-functions (included in -O3) is enabled and --param
        max-inline-insns-recursive-auto is used. The default value is
        450.

    max-inline-recursive-depth
    max-inline-recursive-depth-auto

        Specifies maximum recursion depth used by the recursive
        inlining.

        For functions declared inline --param
        max-inline-recursive-depth is taken into account. For function
        not declared inline, recursive inlining happens only when
        -finline-functions (included in -O3) is enabled and --param
        max-inline-recursive-depth-auto is used. The default value is
        8.

    min-inline-recursive-probability

        Recursive inlining is profitable only for function having deep
        recursion in average and can hurt for function having little
        recursion depth by increasing the prologue size or complexity
        of function body to other optimizers.

        When profile feedback is available (see -fprofile-generate)
        the actual recursion depth can be guessed from probability
        that function will recurse via given call expression. This
        parameter limits inlining only to call expression whose
        probability exceeds given threshold (in percents). The default
        value is 10.

    early-inlining-insns

        Specify growth that early inliner can make. In effect it
        increases amount of inlining for code having large abstraction
        penalty. The default value is 10.

    max-early-inliner-iterations

        Limit of iterations of early inliner. This basically bounds
        number of nested indirect calls early inliner can
        resolve. Deeper chains are still handled by late inlining.

    comdat-sharing-probability

        Probability (in percent) that C++ inline function with comdat
        visibility will be shared across multiple compilation
        units. The default value is 20.

    min-vect-loop-bound

        The minimum number of iterations under which a loop will not
        get vectorized when -ftree-vectorize is used. The number of
        iterations after vectorization needs to be greater than the
        value specified by this option to allow vectorization. The
        default value is 0.

    gcse-cost-distance-ratio

        Scaling factor in calculation of maximum distance an
        expression can be moved by GCSE optimizations. This is
        currently supported only in the code hoisting pass. The bigger
        the ratio, the more aggressive code hoisting will be with
        simple expressions, i.e., the expressions which have cost less
        than gcse-unrestricted-cost. Specifying 0 will disable
        hoisting of simple expressions. The default value is 10.

    gcse-unrestricted-cost

        Cost, roughly measured as the cost of a single typical machine
        instruction, at which GCSE optimizations will not constrain
        the distance an expression can travel. This is currently
        supported only in the code hoisting pass. The lesser the cost,
        the more aggressive code hoisting will be. Specifying 0 will
        allow all expressions to travel unrestricted distances. The
        default value is 3.

    max-hoist-depth

        The depth of search in the dominator tree for expressions to
        hoist. This is used to avoid quadratic behavior in hoisting
        algorithm. The value of 0 will avoid limiting the search, but
        may slow down compilation of huge functions. The default value
        is 30.

    max-unrolled-insns

        The maximum number of instructions that a loop should have if
        that loop is unrolled, and if the loop is unrolled, it
        determines how many times the loop code is unrolled.

    max-average-unrolled-insns

        The maximum number of instructions biased by probabilities of
        their execution that a loop should have if that loop is
        unrolled, and if the loop is unrolled, it determines how many
        times the loop code is unrolled.

    max-unroll-times

        The maximum number of unrollings of a single loop.

    max-peeled-insns

        The maximum number of instructions that a loop should have if
        that loop is peeled, and if the loop is peeled, it determines
        how many times the loop code is peeled.

    max-peel-times

        The maximum number of peelings of a single loop.

    max-completely-peeled-insns

        The maximum number of insns of a completely peeled loop.

    max-completely-peel-times

        The maximum number of iterations of a loop to be suitable for
        complete peeling.

    max-completely-peel-loop-nest-depth

        The maximum depth of a loop nest suitable for complete
        peeling.

    max-unswitch-insns

        The maximum number of insns of an unswitched loop.

    max-unswitch-level

        The maximum number of branches unswitched in a single loop.

    lim-expensive

        The minimum cost of an expensive expression in the loop
        invariant motion.

    iv-consider-all-candidates-bound

        Bound on number of candidates for induction variables below
        that all candidates are considered for each use in induction
        variable optimizations. Only the most relevant candidates are
        considered if there are more candidates, to avoid quadratic
        time complexity.

    iv-max-considered-uses

        The induction variable optimizations give up on loops that
        contain more induction variable uses.

    iv-always-prune-cand-set-bound

        If number of candidates in the set is smaller than this value,
        we always try to remove unnecessary ivs from the set during
        its optimization when a new iv is added to the set.

    scev-max-expr-size

        Bound on size of expressions used in the scalar evolutions
        analyzer. Large expressions slow the analyzer.

    scev-max-expr-complexity

        Bound on the complexity of the expressions in the scalar
        evolutions analyzer. Complex expressions slow the analyzer.

    omega-max-vars

        The maximum number of variables in an Omega constraint
        system. The default value is 128.

    omega-max-geqs

        The maximum number of inequalities in an Omega constraint
        system. The default value is 256.

    omega-max-eqs

        The maximum number of equalities in an Omega constraint
        system. The default value is 128.

    omega-max-wild-cards

        The maximum number of wildcard variables that the Omega solver
        will be able to insert. The default value is 18.

    omega-hash-table-size

        The size of the hash table in the Omega solver. The default
        value is 550.

    omega-max-keys

        The maximal number of keys used by the Omega solver. The
        default value is 500.

    omega-eliminate-redundant-constraints

        When set to 1, use expensive methods to eliminate all
        redundant constraints. The default value is 0.

    vect-max-version-for-alignment-checks

        The maximum number of runtime checks that can be performed
        when doing loop versioning for alignment in the
        vectorizer. See option -ftree-vect-loop-version for more
        information.

    vect-max-version-for-alias-checks

        The maximum number of runtime checks that can be performed
        when doing loop versioning for alias in the vectorizer. See
        option -ftree-vect-loop-version for more information.

    max-iterations-to-track

        The maximum number of iterations of a loop the brute force
        algorithm for analysis of the number of iterations of the loop
        tries to evaluate.

    hot-bb-count-fraction

        Select fraction of the maximal count of repetitions of basic
        block in program given basic block needs to have to be
        considered hot.

    hot-bb-frequency-fraction

        Select fraction of the entry block frequency of executions of
        basic block in function given basic block needs to have to be
        considered hot.

    max-predicted-iterations

        The maximum number of loop iterations we predict
        statically. This is useful in cases where a function contains
        a single loop with a known bound and another loop with an
        unknown bound. We predict the known number of iterations
        correctly, while the unknown number of iterations averages to
        roughly 10. This means that the loop without bounds would
        appear artificially cold relative to the other one.

    align-threshold

        Select fraction of the maximal frequency of executions of
        basic block in function given basic block will get aligned.

    align-loop-iterations

        A loop expected to iterate at least the selected number of
        iterations will get aligned.

    tracer-dynamic-coverage
    tracer-dynamic-coverage-feedback

        This value is used to limit superblock formation once the
        given percentage of executed instructions is covered. This
        limits unnecessary code size expansion.

        The tracer-dynamic-coverage-feedback is used only when profile
        feedback is available. The real profiles (as opposed to
        statically estimated ones) are much less balanced allowing the
        threshold to be larger value.

    tracer-max-code-growth

        Stop tail duplication once code growth has reached given
        percentage. This is rather hokey argument, as most of the
        duplicates will be eliminated later in cross jumping, so it
        may be set to much higher values than is the desired code
        growth.

    tracer-min-branch-ratio

        Stop reverse growth when the reverse probability of best edge
        is less than this threshold (in percent).

    tracer-min-branch-probability
    tracer-min-branch-probability-feedback

        Stop forward growth if the best edge has probability lower
        than this threshold.

        Similarly to tracer-dynamic-coverage two values are present,
        one for compilation for profile feedback and one for
        compilation without. The value for compilation with profile
        feedback needs to be more conservative (higher) in order to
        make tracer effective.

    max-cse-path-length

        Maximum number of basic blocks on path that cse considers. The
        default is 10.

    max-cse-insns

        The maximum number of instructions CSE processes before
        flushing. The default is 1000.

    ggc-min-expand

        GCC uses a garbage collector to manage its own memory
        allocation. This parameter specifies the minimum percentage by
        which the garbage collector's heap should be allowed to expand
        between collections. Tuning this may improve compilation
        speed; it has no effect on code generation.

        The default is 30% + 70% * (RAM/1GB) with an upper bound of
        100% when RAM >= 1GB. If getrlimit is available, the notion of
        "RAM" is the smallest of actual RAM and RLIMIT_DATA or
        RLIMIT_AS. If GCC is not able to calculate RAM on a particular
        platform, the lower bound of 30% is used. Setting this
        parameter and ggc-min-heapsize to zero causes a full
        collection to occur at every opportunity. This is extremely
        slow, but can be useful for debugging.

    ggc-min-heapsize

        Minimum size of the garbage collector's heap before it begins
        bothering to collect garbage. The first collection occurs
        after the heap expands by ggc-min-expand% beyond
        ggc-min-heapsize. Again, tuning this may improve compilation
        speed, and has no effect on code generation.

        The default is the smaller of RAM/8, RLIMIT_RSS, or a limit
        which tries to ensure that RLIMIT_DATA or RLIMIT_AS are not
        exceeded, but with a lower bound of 4096 (four megabytes) and
        an upper bound of 131072 (128 megabytes). If GCC is not able
        to calculate RAM on a particular platform, the lower bound is
        used. Setting this parameter very large effectively disables
        garbage collection. Setting this parameter and ggc-min-expand
        to zero causes a full collection to occur at every
        opportunity.

    max-reload-search-insns

        The maximum number of instructions reload should look backward
        for equivalent register. Increasing values mean more
        aggressive optimization, making the compile time increase with
        probably slightly better performance. The default value is
        100.

    max-cselib-memory-locations

        The maximum number of memory locations cselib should take into
        account. Increasing values mean more aggressive optimization,
        making the compile time increase with probably slightly better
        performance. The default value is 500.

    reorder-blocks-duplicate
    reorder-blocks-duplicate-feedback

        Used by basic block reordering pass to decide whether to use
        unconditional branch or duplicate the code on its
        destination. Code is duplicated when its estimated size is
        smaller than this value multiplied by the estimated size of
        unconditional jump in the hot spots of the program.

        The reorder-blocks-duplicate-feedback is used only when
        profile feedback is available and may be set to higher values
        than reorder-blocks-duplicate since information about the hot
        spots is more accurate.

    max-sched-ready-insns

        The maximum number of instructions ready to be issued the
        scheduler should consider at any given time during the first
        scheduling pass. Increasing values mean more thorough
        searches, making the compilation time increase with probably
        little benefit. The default value is 100.

    max-sched-region-blocks

        The maximum number of blocks in a region to be considered for
        interblock scheduling. The default value is 10.

    max-pipeline-region-blocks

        The maximum number of blocks in a region to be considered for
        pipelining in the selective scheduler. The default value is
        15.

    max-sched-region-insns

        The maximum number of insns in a region to be considered for
        interblock scheduling. The default value is 100.

    max-pipeline-region-insns

        The maximum number of insns in a region to be considered for
        pipelining in the selective scheduler. The default value is
        200.

    min-spec-prob

        The minimum probability (in percents) of reaching a source
        block for interblock speculative scheduling. The default value
        is 40.

    max-sched-extend-regions-iters

        The maximum number of iterations through CFG to extend
        regions. 0 - disable region extension, N - do at most N
        iterations. The default value is 0.

    max-sched-insn-conflict-delay

        The maximum conflict delay for an insn to be considered for
        speculative motion. The default value is 3.

    sched-spec-prob-cutoff

        The minimal probability of speculation success (in percents),
        so that speculative insn will be scheduled. The default value
        is 40.

    sched-mem-true-dep-cost

        Minimal distance (in CPU cycles) between store and load
        targeting same memory locations. The default value is 1.

    selsched-max-lookahead

        The maximum size of the lookahead window of selective
        scheduling. It is a depth of search for available
        instructions. The default value is 50.

    selsched-max-sched-times

        The maximum number of times that an instruction will be
        scheduled during selective scheduling. This is the limit on
        the number of iterations through which the instruction may be
        pipelined. The default value is 2.

    selsched-max-insns-to-rename

        The maximum number of best instructions in the ready list that
        are considered for renaming in the selective scheduler. The
        default value is 2.

    max-last-value-rtl

        The maximum size measured as number of RTLs that can be
        recorded in an expression in combiner for a pseudo register as
        last known value of that register. The default is 10000.

    integer-share-limit

        Small integer constants can use a shared data structure,
        reducing the compiler's memory usage and increasing its
        speed. This sets the maximum value of a shared integer
        constant. The default value is 256.

    min-virtual-mappings

        Specifies the minimum number of virtual mappings in the
        incremental SSA updater that should be registered to trigger
        the virtual mappings heuristic defined by
        virtual-mappings-ratio. The default value is 100.

    virtual-mappings-ratio

        If the number of virtual mappings is virtual-mappings-ratio
        bigger than the number of virtual symbols to be updated, then
        the incremental SSA updater switches to a full update for
        those symbols. The default ratio is 3.

    ssp-buffer-size

        The minimum size of buffers (i.e. arrays) that will receive
        stack smashing protection when -fstack-protector is used.

    max-jump-thread-duplication-stmts

        Maximum number of statements allowed in a block that needs to
        be duplicated when threading jumps.

    max-fields-for-field-sensitive

        Maximum number of fields in a structure we will treat in a
        field sensitive manner during pointer analysis. The default is
        zero for -O0 and -O1, and 100 for -Os, -O2, and -O3.

    prefetch-latency

        Estimate on average number of instructions that are executed
        before prefetch finishes. The distance we prefetch ahead is
        proportional to this constant. Increasing this number may also
        lead to fewer streams being prefetched (see
        simultaneous-prefetches).

    simultaneous-prefetches

        Maximum number of prefetches that can run at the same time.

    l1-cache-line-size

        The size of cache line in L1 cache, in bytes.

    l1-cache-size

        The size of L1 cache, in kilobytes.

    l2-cache-size

        The size of L2 cache, in kilobytes.

    min-insn-to-prefetch-ratio

        The minimum ratio between the number of instructions and the
        number of prefetches to enable prefetching in a loop.

    prefetch-min-insn-to-mem-ratio

        The minimum ratio between the number of instructions and the
        number of memory references to enable prefetching in a loop.

    use-canonical-types

        Whether the compiler should use the “canonical” type
        system. By default, this should always be 1, which uses a more
        efficient internal mechanism for comparing types in C++ and
        Objective-C++. However, if bugs in the canonical type system
        are causing compilation failures, set this value to 0 to
        disable canonical types.

    switch-conversion-max-branch-ratio

        Switch initialization conversion will refuse to create arrays
        that are bigger than switch-conversion-max-branch-ratio times
        the number of branches in the switch.

    max-partial-antic-length

        Maximum length of the partial antic set computed during the
        tree partial redundancy elimination optimization (-ftree-pre)
        when optimizing at -O3 and above. For some sorts of source
        code the enhanced partial redundancy elimination optimization
        can run away, consuming all of the memory available on the
        host machine. This parameter sets a limit on the length of the
        sets that are computed, which prevents the runaway
        behavior. Setting a value of 0 for this parameter will allow
        an unlimited set length.

    sccvn-max-scc-size

        Maximum size of a strongly connected component (SCC) during
        SCCVN processing. If this limit is hit, SCCVN processing for
        the whole function will not be done and optimizations
        depending on it will be disabled. The default maximum SCC size
        is 10000.

    ira-max-loops-num

        IRA uses a regional register allocation by default. If a
        function contains more loops than the number given by this
        parameter, only at most the given number of the most
        frequently executed loops will form regions for the regional
        register allocation. The default value of the parameter is
        100.

    ira-max-conflict-table-size

        Although IRA uses a sophisticated algorithm to compress the
        conflict table, the table can still be big for huge
        functions. If the conflict table for a function could be more
        than the size in MB given by this parameter, the conflict
        table is not built and a faster, simpler, and lower quality
        register allocation algorithm will be used. The algorithm does
        not use pseudo-register conflicts. The default value of the
        parameter is 2000.

    ira-loop-reserved-regs

        IRA can be used to evaluate more accurate register pressure in
        loops for decision to move loop invariants (see -O3). The
        number of available registers reserved for some other purposes
        is described by this parameter. The default value of the
        parameter is 2 which is minimal number of registers needed for
        execution of typical instruction. This value is the best found
        from numerous experiments.

    loop-invariant-max-bbs-in-loop

        Loop invariant motion can be very expensive, both in compile
        time and in amount of needed compile time memory, with very
        large loops. Loops with more basic blocks than this parameter
        won't have loop invariant motion optimization performed on
        them. The default value of the parameter is 1000 for -O1 and
        10000 for -O2 and above.

    max-vartrack-size

        Sets a maximum number of hash table slots to use during
        variable tracking dataflow analysis of any function. If this
        limit is exceeded with variable tracking at assignments
        enabled, analysis for that function is retried without it,
        after removing all debug insns from the function. If the limit
        is exceeded even without debug insns, var tracking analysis is
        completely disabled for the function. Setting the parameter to
        zero makes it unlimited.

    min-nondebug-insn-uid

        Use uids starting at this parameter for nondebug insns. The
        range below the parameter is reserved exclusively for debug
        insns created by -fvar-tracking-assignments, but debug insns
        may get (non-overlapping) uids above it if the reserved range
        is exhausted.

    ipa-sra-ptr-growth-factor

        IPA-SRA will replace a pointer to an aggregate with one or
        more new parameters only when their cumulative size is less or
        equal to ipa-sra-ptr-growth-factor times the size of the
        original pointer parameter.

    graphite-max-nb-scop-params

        To avoid exponential effects in the Graphite loop transforms,
        the number of parameters in a Static Control Part (SCoP) is
        bounded. The default value is 10 parameters. A variable whose
        value is unknown at compile time and defined outside a SCoP is
        a parameter of the SCoP.

    graphite-max-bbs-per-function

        To avoid exponential effects in the detection of SCoPs, the
        size of the functions analyzed by Graphite is bounded. The
        default value is 100 basic blocks.

    loop-block-tile-size

        Loop blocking or strip mining transforms, enabled with
        -floop-block or -floop-strip-mine, strip mine each loop in the
        loop nest by a given number of iterations. The strip length
        can be changed using the loop-block-tile-size parameter. The
        default value is 51 iterations.

    devirt-type-list-size

        IPA-CP attempts to track all possible types passed to a
        function's parameter in order to perform
        devirtualization. devirt-type-list-size is the maximum number
        of types it stores per a single formal parameter of a
        function.

    lto-partitions

        Specify desired number of partitions produced during WHOPR
        compilation. The number of partitions should exceed the number
        of CPUs used for compilation. The default value is 32.

    lto-minpartition

        Size of minimal partition for WHOPR (in estimated
        instructions). This prevents expenses of splitting very small
        programs into too many partitions.

    cxx-max-namespaces-for-diagnostic-help

        The maximum number of namespaces to consult for suggestions
        when C++ name lookup fails for an identifier. The default is
        1000.
*/
