"""
Python tools for assigning site information to different exposure configurations

Data:
1. Input slope and elevation data files at 30 arc seconds in hdf5 binaries
2. Input geology model as a shapefile

"""
import os
import warnings
import h5py
import argparse
import numpy as np
import pandas as pd
import geopandas as gpd
from scipy import stats
from scipy.interpolate import RegularGridInterpolator
from pyproj import CRS, Transformer
from shapely.geometry import Point
from exposure2site.node_handler import Node, nrml_read, nrml_write

# Coordinate reference systems
CRS_WGS84 = CRS.from_epsg(4326)
MOLL_STR = '+ellps=WGS84 +lon_0=0 +no_defs +proj=moll +units=m +x_0=0 +y_0=0'
CRS_MOLL = CRS.from_string(MOLL_STR)
EUROPE_EQUAL_AREA = "epsg:3035"
SMOOTHED_N = [1, 3, 5]
# Transformations between Mollweide and WGS84
TRANSFORMER = Transformer.from_crs(CRS_WGS84, CRS_MOLL, always_xy=True)
TRANSFORMER_REV = Transformer.from_crs(CRS_MOLL, CRS_WGS84, always_xy=True)
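
# Illustrative sketch (not part of the module's API): the transformers above
# convert between geographic coordinates and Mollweide metres, e.g.
#   >>> x, y = TRANSFORMER.transform(13.4, 52.5)      # lon, lat -> metres
#   >>> lon, lat = TRANSFORMER_REV.transform(x, y)    # metres -> lon, lat
# (13.4, 52.5 is an arbitrary example location)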

# Geological eras stored as integers - maps the integer key to the era name
GEOL_DICT_KEY_TO_NAME = {0: "UNKNOWN",
                         1: "PRECAMBRIAN",
                         2: "PALEOZOIC",
                         3: "JURASSIC-TRIASSIC",
                         4: "CRETACEOUS",
                         5: "CENOZOIC",
                         6: "PLEISTOCENE",
                         7: "HOLOCENE"}


GEOL_DICT_NAME_TO_KEY = {"UNKNOWN": 0,
                         "PRECAMBRIAN": 1,
                         "PALEOZOIC": 2,
                         "JURASSIC-TRIASSIC": 3,
                         "CRETACEOUS": 4,
                         "CENOZOIC": 5,
                         "PLEISTOCENE": 6,
                         "HOLOCENE": 7}


# Data files
DATA_PATH = os.path.join(os.path.dirname(__file__), "site_data")
851
# Geology shapefiles
GEOLOGY_FILE = os.path.join(DATA_PATH, "GEOL_V8_ERA2.shp")
# File containing 30 arc second grid of slope, geology, elevation and
# built density
# BBOX = [-25.0, 34.0, 45.0, 72.0]
SITE_HDF5_FILE = os.path.join(DATA_PATH, "europe_gebco_topography.hdf5")
# File containing Vs30s at 30 arc seconds using the USGS Wald & Allen method
# BBOX = [-30., 30., 65., 75.]
VS30_HDF5_FILE = os.path.join(DATA_PATH, "europe_2020_vs30.hdf5")
# Pre-computed grid of volcanic front distances at 5 arc-minutes
# BBOX = [-25.0, 34.0, 45.0, 72.0]
VF_DIST_FILE = os.path.join(DATA_PATH, "volcanic_front_distances_5min.hdf5")
# Europe slice of 250 m x 250 m GHS built density dataset (in Mollweide proj.)
BUILT_ENV_FILE = os.path.join(DATA_PATH, "ghs_global_250m_Europe.hdf5")
# ESHM20 Attenuation Region File
REGION_FILE = os.path.join(DATA_PATH, "ATTENUATION_REGIONS.shp")

# See if the files are present and raise warnings if not
for fname in [GEOLOGY_FILE, SITE_HDF5_FILE, VS30_HDF5_FILE, VF_DIST_FILE,
              BUILT_ENV_FILE, REGION_FILE]:
    if not os.path.exists(fname):
        warnings.warn("Required file %s not found in data directory "
                      "- if needed, please download from ..." % fname)


# =============================================================================
# ================= General utility functions =================================
# =============================================================================


def layer_to_gridfile(gx, gy, data, spc, output_file, fmt="%.3f"):
    """
    Exports a layer to an ESRI ascii raster file
    """
    odata = np.copy(data)
    odata[np.isnan(odata)] = -999.0
    with open(output_file, "w") as f:
        f.write("NCOLS %g\n" % len(gx))
        f.write("NROWS %g\n" % len(gy))
        f.write("XLLCENTER %f\n" % gx[0])
        f.write("YLLCENTER %f\n" % gy[-1])
        f.write("CELLSIZE %f\n" % spc)
        f.write("NODATA_VALUE %f\n" % -999.0)
        np.savetxt(f, odata, fmt=fmt)
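
# Usage sketch (hypothetical output file name and synthetic grid, for
# illustration only):
#   >>> gx = np.linspace(5.0, 6.0, 121)        # cell-centre longitudes
#   >>> gy = np.linspace(46.0, 45.0, 121)      # cell-centre latitudes (N to S)
#   >>> data = np.random.uniform(0., 1., (121, 121))
#   >>> layer_to_gridfile(gx, gy, data, 1. / 120., "example_layer.asc")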


def get_admin_level_definition(country_name, path, output_dfs=False):
    """
    < From Cecilia Nievas >
    This function returns the most-detailed administrative level for which the
    SERA exposure model is defined for country_name. The output is a dictionary
    with the admin level for each case of exposure (Res, Com, Ind), and a list
    grouping these three cases according to common levels of administrative
    units.

    I have tested it manually (and it has passed the test) for:
    - France ({'Res': 5, 'Com': 2, 'Ind': 5}, [['Com'], ['Res', 'Ind']], [2, 5], {dataframes_of_csv_files})
    - Germany ({'Res': 4, 'Com': 3, 'Ind': 4}, [['Com'], ['Res', 'Ind']], [3, 4], {dataframes_of_csv_files})
    - Greece ({'Res': 3, 'Com': 3, 'Ind': 3}, [['Res', 'Com', 'Ind']])
    - Italy ({'Res': 3, 'Com': 3, 'Ind': 3}, [['Res', 'Com', 'Ind']])
    - Spain ({'Res': 2, 'Com': 1, 'Ind': 2}, [['Com'], ['Res', 'Ind']])
    - United Kingdom ({'Res': 2, 'Com': 1, 'Ind': 2}, [['Com'], ['Res', 'Ind']])

    NOTE: It is not possible to break the for i in range(1, max_possible_level+1)
    loop because there are files for which intermediate levels are missing but
    more detailed levels are available.
    """
    max_possible_level = 5
    exposure_cases = ['Res', 'Com', 'Ind']
    actual_levels_list = [0, 0, 0]
    out_dataframes = {}
    for k, case in enumerate(exposure_cases):
        aux_df = pd.read_csv(os.path.join(path,
            'Exposure_' + case + '_' + country_name + '.csv'), sep=',')
        if output_dfs:
            out_dataframes[case] = aux_df
        for i in range(0, max_possible_level + 1):
            if 'id_' + str(i) in aux_df.columns:
                if np.all(aux_df['id_' + str(i)].values != 'No_tag'): # if not('No_tag' in aux_df['id_'+str(i)].values):
                    actual_levels_list[k] = i
    groups = []
    groups_levels = []
    case_count = 0
    for i in range(0, max_possible_level + 1):
        which = np.where(np.array(actual_levels_list) == i)[0]
        if len(which) > 0:
            aux_list = []
            for pos in which:
                aux_list.append(exposure_cases[pos])
                case_count = case_count + 1
            groups.append(aux_list)
            groups_levels.append(i)
        if case_count >= len(exposure_cases):
            break
    actual_levels = {}
    actual_levels['Res'] = actual_levels_list[0]
    actual_levels['Com'] = actual_levels_list[1]
    actual_levels['Ind'] = actual_levels_list[2]
    return actual_levels, groups, groups_levels, out_dataframes


def get_maximum_admin_level(admin):
    """
    Get the maximum admin level defined in an administrative geodataframe
    """
    max_admin_level = 0
    for key in admin.columns:
        if key.startswith("ID_"):
            admin_level = int(key.lstrip("ID_"))
            if admin_level > max_admin_level:
                max_admin_level = admin_level
    return max_admin_level


def dissolve_admin(admin, adm_level):
    """
    If the desired admin level is lower than that for which the admin is
    defined (and assuming all admins are given in the file via ID_0, ID_1,
    ID_2 etc.) then dissolve the administrative regions to a lower level
    """
    if adm_level >= get_maximum_admin_level(admin):
        # No need to dissolve as the desired level is at the resolution level
        return admin
    id_str = "ID_{:g}".format(adm_level)
    name_str = "NAME_{:g}".format(adm_level)
    output_admin = admin.dissolve(id_str)
    # As dissolving turns the target column into the index, build a simpler
    # dataframe including all attributes at a lower admin level
    data4admin = {"geometry": output_admin["geometry"].values,
                  id_str: output_admin.index.values,
                  name_str: output_admin[name_str].values}
    for i in range(adm_level):
        lev_id = "ID_{:g}".format(i)
        lev_name = "NAME_{:g}".format(i)
        data4admin[lev_id] = output_admin[lev_id].values
        data4admin[lev_name] = output_admin[lev_name].values
    return gpd.GeoDataFrame(data4admin)
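
# Usage sketch (assumes an admin GeoDataFrame carrying ID_0/NAME_0 ... ID_3/NAME_3
# columns, as in the SERA shapefiles; the file name is hypothetical):
#   >>> admin3 = gpd.GeoDataFrame.from_file("Adm3_Italy.shp")
#   >>> admin1 = dissolve_admin(admin3, 1)   # merge polygons up to admin level 1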


KEY_MAPPER = dict([("id_{:g}".format(g), "ID_{:g}".format(g))
               for g in range(6)])
for i in range(6):
    KEY_MAPPER["name_{:g}".format(i)] = "NAME_{:g}".format(i)


def get_site_set_from_exposure(exposure_file,
                               shapefile_path="./Europe_shapefiles/",
                               target_admin_level=None):
    """
    The point exposure models are separated from the polygon geometries, the
    former in csv (assigning the point to the admin level) and the latter in
    shapefiles (mostly, but not always, corresponding to the same admin level).
    For a given exposure model file this finds the corresponding shapefile,
    loads in the dataframe and returns a geopandas GeoDataframe with the
    polygons as the geometry column and the lons, lats of the exposure model
    added as separate columns. This permits the spatial aggregation of the
    site model over the complete polygon whilst assigning the values to the
    desired exposure points.

    Note that duplicate points are dropped
    """
    # Retrieve country and exposure type from filename
    file_info = os.path.splitext(
        os.path.split(exposure_file)[-1])[0].split("_")

    # Retrieve the country name, which may contain up to three words
    # (e.g., Isle_of_Man)
    if len(file_info) == 4:
        country = file_info[-2] + '_' + file_info[-1]
        exposure_type = file_info[-3]
    elif len(file_info) == 5:
        country = file_info[-3] + '_' + file_info[-2] + '_' + file_info[-1]
        exposure_type = file_info[-4]
    else:
        country = file_info[-1]
        exposure_type = file_info[-2]

    # Read in the exposure file
    exposure = pd.read_csv(exposure_file, sep=",")
    # Some exposure files have the ID and NAME columns in lower case, whilst
    # the polygons have them in upper case. This transforms the admin level ID
    # and NAME headers all to upper case
    exposure = exposure.rename(KEY_MAPPER, axis='columns')
    # Determine the highest admin level adopted within the exposure model
    admin_level = 0
    for col in exposure.columns:
        if col.upper().startswith("ID_"):
            level = int(col.split("_")[1])
            if level > admin_level:
                # Goes to a higher admin level
                admin_level = level
    # Check that a corresponding shapefile can be found
    shp_fname = "Adm{:g}_{:s}.shp".format(admin_level, country)
    if shp_fname not in os.listdir(shapefile_path):
        raise ValueError(
            "Shapefile %s not found in path %s" % (shp_fname, shapefile_path))
    # Load in the geometry shapefiles for the admin units
    admin = gpd.GeoDataFrame.from_file(os.path.join(shapefile_path, shp_fname))
    if target_admin_level and (target_admin_level < admin_level):
        # The admin level of the exposure is lower than the admin level of the
        # shapefile - dissolve the shapefile to a lower admin level
        admin = dissolve_admin(admin, target_admin_level)
        admin.crs = {"init": "epsg:4326"}
    else:
        target_admin_level = admin_level
    # Drop the duplicate sites in the exposure model to retain only one
    # location per admin region
    exposure_clean = exposure.drop_duplicates(["lon", "lat"])

    # Now check that every site in the exposure has a corresponding
    # polygon in the admin file and if so add the exposure site lon and lat
    # to the shapefile
    # If the admin ID column is stored as a string then cast the exposure IDs
    # to string too, for consistency with the admin dataframe
    admin_id = "ID_{:g}".format(target_admin_level)
    if type(admin[admin_id].iloc[0]) == str:
        exposure_clean[admin_id] = exposure_clean[admin_id].astype(str)
    # Discard leading zeros in the admin IDs (e.g., ID_1 = 0111), as the csv
    # exposure files store the IDs without the leading zero
    for ii in range(len(admin)):
        if str(admin[admin_id].loc[ii])[0] == '0':
            admin.loc[ii, admin_id] = admin[admin_id].loc[ii][1:]

    # exclude geometries missing from exposure
    admin = admin[admin[admin_id].isin(exposure_clean[admin_id])]
    admin.index = range(len(admin.index))

    lons = []
    lats = []
    for adm in admin[admin_id].values:
        idx = exposure_clean[admin_id].values == adm
        if np.any(idx):
            lon = exposure_clean[idx].lon.values[0]
            lat = exposure_clean[idx].lat.values[0]
            lons.append(lon)
            lats.append(lat)
        else:
            print("No geometry found for {:s} {:s}".format(admin_id, str(adm)))
            lons.append(np.nan)
            lats.append(np.nan)
    admin["lon"] = np.array(lons)
    admin["lat"] = np.array(lats)
    return admin, target_admin_level
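
# Usage sketch (file and directory names are assumptions following the SERA
# naming convention):
#   >>> sites, level = get_site_set_from_exposure(
#   ...     "Exposure_Res_Italy.csv",
#   ...     shapefile_path="./Europe_shapefiles/",
#   ...     target_admin_level=2)
#   >>> sites[["ID_2", "lon", "lat"]].head()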


# =============================================================================
# ======= Collection of methods for retrieving maxs, means and medians ========
# ======= from weighted data ==================================================


def weighted_mean(values, weights):
    """
    Returns the weighted mean of a vector
    """
    # Normalise the weights
    sum_weights = np.sum(weights)
    if not sum_weights:
        # No weights found in data - return normal mean
        return np.mean(values)
    weights /= sum_weights
    return np.sum(values * weights)


def weighted_median(values, weights):
    """
    Returns the weighted median of a vector
    """
    sum_weights = np.sum(weights)
    if not sum_weights:
        # No weights found in data - return normal median
        return np.percentile(values, 50)
    # Normalise the weights
    weights /= sum_weights
    # Sort the values in ascending order
    idx = np.argsort(values)
    # Cumulate the weights and values
    cum_weights = np.cumsum(weights[idx])
    values = values[idx]
    # Identify the location of the first element for which the cumulative
    # weight reaches 0.5 or greater
    iloc = np.where(cum_weights >= 0.5)[0][0]
    return values[iloc]
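
# Worked example (illustrative): with values [1, 2, 3, 4] and weights
# [0.1, 0.2, 0.3, 0.4], the normalised cumulative weights are
# [0.1, 0.3, 0.6, 1.0], so the first element reaching 0.5 is the third one
# and the weighted median is 3.
#   >>> weighted_median(np.array([1., 2., 3., 4.]),
#   ...                 np.array([0.1, 0.2, 0.3, 0.4]))
#   3.0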


def max_weight(values, weights):
    """
    Returns the highest weighted value from a vector
    """
    if not np.sum(weights):
        # No weights given - take ordinary max
        return np.max(values)
    return values[np.argmax(weights)]


WEIGHTING_METHODS = {
    "MEAN": weighted_mean,
    "MAX": max_weight,
    "MEDIAN": weighted_median,
}


UNWEIGHTED_METHODS = {
    "MEAN": lambda x: np.mean(x),
    "MEDIAN": lambda x: np.percentile(x, 50),
    "MAX": lambda x: np.max(x)
}


# =============================================================================
# === Collection of methods for retrieving weighted centroids from data =======
# =============================================================================


def weighted_centroid(xlocs, ylocs, weights):
    """
    Returns the centroid of a set of weighted points
    """
    sum_weights = np.sum(weights)
    if sum_weights:
        # Normalise the weights
        weights /= sum_weights
    else:
        # Assume evenly weighted - should produce same output as normal
        # centroid
        return np.mean(xlocs), np.mean(ylocs)
    # Get centroid
    centx = np.sum(xlocs * weights)
    centy = np.sum(ylocs * weights)
    return centx, centy


def max_weight_centroid(xlocs, ylocs, weights):
    """
    Returns the location of the point of maximum weight from a set of
    weighted points
    """
    if not np.sum(weights):
        # No weights, so just return normal centroid
        return np.mean(xlocs), np.mean(ylocs)
    amax = np.argmax(weights)
    centx = xlocs[amax]
    centy = ylocs[amax]
    return centx, centy


def weighted_medoid(xlocs, ylocs, weights):
    """
    Returns the medoid of a set of points (i.e. the location within the data
    set at which the cumulative weight first reaches half the total weight)
    """
    sum_weights = np.sum(weights)
    if not sum_weights:
        # No weights, so just return normal centroid
        return np.mean(xlocs), np.mean(ylocs)
    # Normalise the weights
    weights /= np.sum(weights)
    # Re-order the weights and points from lowest weight to highest
    idx = np.argsort(weights)
    xlocs = xlocs[idx]
    ylocs = ylocs[idx]
    # Cumulate the normalised weights
    cum_w = np.cumsum(weights[idx])
    # Find the first point where the weights exceed 0.5
    iloc = np.where(cum_w >= 0.5)[0][0]
    centx = xlocs[iloc]
    centy = ylocs[iloc]
    return centx, centy


CENTROID_METHODS = {
    "MEAN": weighted_centroid,
    "MAX": max_weight_centroid,
    "MEDIAN": weighted_medoid
}
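
# Worked example (illustrative): three points with weights [1, 1, 2] - the
# weighted centroid is pulled towards the heaviest point.
#   >>> xs = np.array([0., 1., 2.])
#   >>> ys = np.array([0., 0., 2.])
#   >>> weighted_centroid(xs, ys, np.array([1., 1., 2.]))
#   (1.25, 1.0)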


# =============================================================================
# =========== Utility methods for accessing/slicing data sets =================
# =============================================================================

def slice_wgs84_datafile(dbfile, bounds, datakey, flatten=False):
    """
    For data stored in an hdf5 data file, slice out the data set within
    a bounding box

    :param str dbfile:
        Path to the database file
    :param list bounds:
        Bounding box defined as [llon, llat, ulon, ulat]
    :param str datakey:
        Name of the dataset to be sliced
    :param bool flatten:
        If True then flattens the sliced data to three 1-D vectors of lon, lat
        and data, otherwise returns the lons and lats as 1-D vectors of
        (nlon,) and (nlat,) respectively and the data as a 2-D array (nlat, nlon)
    """
    llon, llat, ulon, ulat = tuple(bounds)
    db = h5py.File(dbfile, "r")
    lons = db["longitude"][:]
    lats = db["latitude"][:]
    idx_lon = np.logical_and(lons >= llon, lons <= ulon)
    idx_lat = np.logical_and(lats >= llat, lats <= ulat)
    lons = lons[idx_lon]
    lats = lats[idx_lat]
    data = db[datakey][idx_lat, :][:, idx_lon]
    db.close()
    if flatten:
        lons, lats = np.meshgrid(lons, lats)
        return lons.flatten(), lats.flatten(), data.flatten()
    else:
        return lons, lats, data
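
# Usage sketch (relies on the packaged SITE_HDF5_FILE being present; the
# bounding box is an arbitrary example over central Italy):
#   >>> lons, lats, slope = slice_wgs84_datafile(
#   ...     SITE_HDF5_FILE, [12.0, 41.5, 13.5, 42.5], "slope")
#   >>> slope.shape == (len(lats), len(lons))
#   True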


def interpolate_xvf_grid(target_lons, target_lats, default_xvf=150.):
    """
    Defines the volcanic front distance (xvf) at a set of locations by direct
    2D linear interpolation from the 5 arc-minute grid

    :param np.ndarray target_lons:
        Vector (1D) of longitudes
    :param np.ndarray target_lats:
        Vector (1D) of latitudes
    :param float default_xvf:
        Default volcanic front distance (km)
    """

    # Define the bounding box of the region - with a 10 arc minute buffer
    spc = 1. / 6.
    bbox = (np.min(target_lons) - spc, np.min(target_lats) - spc,
            np.max(target_lons) + spc, np.max(target_lats) + spc)
    # Extract the slice of the 5' volcanic distance grid
    lons, lats, xvf = slice_wgs84_datafile(VF_DIST_FILE, bbox, "xvf")
    if np.all(np.fabs(xvf - default_xvf) < 1.0E-3):
        # No relevant volcanic distance within bounding box
        return default_xvf * np.ones(target_lons.shape)
    # Interpolate from regular grid to target sites
    lons, lats = np.meshgrid(lons, lats)
    ipl = RegularGridInterpolator((lons[0, :], lats[:, 0][::-1]),
                                  xvf[::-1, :].T, bounds_error=False,
                                  fill_value=default_xvf)
    output_xvf = ipl(np.column_stack([target_lons.flatten(),
                                      target_lats.flatten()]))
    return np.reshape(output_xvf, target_lons.shape)
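
# Usage sketch (arbitrary example sites; requires the packaged VF_DIST_FILE):
#   >>> site_lons = np.array([14.2, 15.0, 26.5])
#   >>> site_lats = np.array([40.8, 37.5, 38.2])
#   >>> xvf = interpolate_xvf_grid(site_lons, site_lats, default_xvf=150.)
#   >>> xvf.shape
#   (3,)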


# =============================================================================
# =========== Methods to downsample grids (w/ or w/o weighting) ===============
# =============================================================================

def downsample_weighted_grid(lons, lats, data, weights, num, method="MEAN"):
    """
    Downsamples a grid of weighted data returning a coarser grid with the
    cell values determined from the corresponding statistical processing
    (MEAN, MEDIAN or MAX)

    :param np.ndarray lons:
        Vector of (nlon,) longitudes of the grid cell centres
    :param np.ndarray lats:
        Vector of (nlat,) latitudes of the grid cell centres
    :param np.ndarray data:
        Grid of data to be downsampled of size (nlat, nlon)
    :param np.ndarray weights:
        Grid of weights to be applied to the data (nlat, nlon)
    :param int num:
        Number of higher resolution cells per low resolution cell
    :param str method:
        Choice of method for defining the downsampled cell values from ("MEAN",
        "MEDIAN" or "MAX")
    :returns:
        ds_data: Downsampled grid values
        gx: Downsampled longitudes (cell centres)
        gy: Downsampled latitudes (cell centres)
    """
    nlon, nlat = len(lons), len(lats)
    # Longitudes and latitudes must be the same shape as the data
    assert nlat == data.shape[0]
    assert nlon == data.shape[1]
    xloc = np.arange(0, nlon, num)
    yloc = np.arange(0, nlat, num)
    spcx = np.diff(lons)
    spcy = np.diff(lats)
    # Get centres of downsample grid
    gx = lons[xloc] - (spcx[0] / 2.) + ((num / 2.) * spcx[0])
    gy = lats[yloc] - (spcy[0] / 2.) + ((num / 2.) * spcy[0])
    ds_data = np.zeros([len(yloc), len(xloc)])
    for j in range(len(yloc)):
        if j == (len(yloc) - 1):
            rval_m = data[yloc[j]:, :]
            weights_m = weights[yloc[j]:, :]
        else:
            rval_m = data[yloc[j]:yloc[j + 1], :]
            weights_m = weights[yloc[j]:yloc[j + 1], :]
        for i in range(len(xloc)):
            if i == (len(xloc) - 1):
                cell_data = rval_m[:, xloc[i]:]
                cell_weights = weights_m[:, xloc[i]:]
            else:
                cell_data = rval_m[:, xloc[i]:xloc[i + 1]]
                cell_weights = weights_m[:, xloc[i]:xloc[i + 1]]
            # Get valid number of cells
            idx_data = np.logical_and(np.logical_not(np.isnan(cell_data)),
                                      np.logical_not(np.isinf(cell_data)))
            if not np.any(idx_data):
                # No valid data at all in cell
                ds_data[j, i] = np.nan
                continue
            cell_data = cell_data[idx_data]
            cell_weights = cell_weights[idx_data]
            if not np.any(cell_weights > 0.0):
                # All cells have a zero weight - therefore they can be
                # considered evenly weighted and only the mean is taken
                ds_data[j, i] = np.mean(cell_data)
            else:
                # Apply the chosen weighted statistic (the weights are
                # normalised inside the weighting function)
                ds_data[j, i] = WEIGHTING_METHODS[method](
                    cell_data.flatten(), cell_weights.flatten())
    return ds_data, gx, gy


def downsample_grid(lons, lats, data, num, method="MEAN"):
    """
    Downsamples a grid of data returning a coarser grid with the
    cell values determined from the corresponding statistical processing
    (MEAN, MEDIAN or MAX)

    :param np.ndarray lons:
        Vector of (nlon,) longitudes of the grid cell centres
    :param np.ndarray lats:
        Vector of (nlat,) latitudes of the grid cell centres
    :param np.ndarray data:
        Grid of data to be downsampled of size (nlat, nlon)
    :param int num:
        Number of higher resolution cells per low resolution cell
    :param str method:
        Choice of method for defining the downsampled cell values from
        ("MEAN", "MEDIAN" or "MAX")
    :returns:
        ds_data: Downsampled grid values
        gx: Downsampled longitudes (cell centres)
        gy: Downsampled latitudes (cell centres)
    """
    nlon, nlat = len(lons), len(lats)
    # Longitudes and latitudes must be the same shape as the data
    assert nlat == data.shape[0]
    assert nlon == data.shape[1]
    xloc = np.arange(0, nlon, num)
    yloc = np.arange(0, nlat, num)
    spcx = np.diff(lons)
    spcy = np.diff(lats)
    # Get centres of downsample grid
    gx = lons[xloc] - (spcx[0] / 2.) + ((num / 2.) * spcx[0])
    gy = lats[yloc] - (spcy[0] / 2.) + ((num / 2.) * spcy[0])
    ds_data = np.zeros([len(yloc), len(xloc)])
    for j in range(len(yloc)):
        if j == (len(yloc) - 1):
            rval_m = data[yloc[j]:, :]
        else:
            rval_m = data[yloc[j]:yloc[j + 1], :]
        for i in range(len(xloc)):
            if i == (len(xloc) - 1):
                cell_data = rval_m[:, xloc[i]:]
            else:
                cell_data = rval_m[:, xloc[i]:xloc[i + 1]]
            # Get valid number of cells
            idx_data = np.logical_and(np.logical_not(np.isnan(cell_data)),
                                      np.logical_not(np.isinf(cell_data)))
            if not np.any(idx_data):
                # No valid data at all in cell
                ds_data[j, i] = np.nan
            else:
                cell_data = cell_data[idx_data]
                ds_data[j, i] = UNWEIGHTED_METHODS[method](cell_data)
    return ds_data, gx, gy
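
# Usage sketch (synthetic 30 arc-second grid, downsampled by a factor of 4 to
# 120 arc seconds using the cell means):
#   >>> spc = 1. / 120.
#   >>> lons = np.arange(10.0, 11.0, spc) + spc / 2.
#   >>> lats = (np.arange(45.0, 46.0, spc) + spc / 2.)[::-1]   # descending
#   >>> data = np.random.uniform(100., 800., (len(lats), len(lons)))
#   >>> ds_data, gx, gy = downsample_grid(lons, lats, data, 4, method="MEAN")
#   >>> ds_data.shape == (len(gy), len(gx))
#   True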


def downsample_modal_grid(lons, lats, data, num):
    """
    Downsamples a grid of categorical data, assigning to each grid cell the
    modal value found within the subcells

    :param np.ndarray lons:
        Vector of (nlon,) longitudes of the grid cell centres
    :param np.ndarray lats:
        Vector of (nlat,) latitudes of the grid cell centres
    :param np.ndarray data:
        Grid of data to be downsampled of size (nlat, nlon)
    :param int num:
        Number of higher resolution cells per low resolution cell
    :returns:
        ds_data: Downsampled grid values
        gx: Downsampled longitudes (cell centres)
        gy: Downsampled latitudes (cell centres)
    """
    nlon, nlat = len(lons), len(lats)
    # Longitudes and latitudes must be the same shape as the data
    assert nlat == data.shape[0]
    assert nlon == data.shape[1]
    xloc = np.arange(0, nlon, num)
    yloc = np.arange(0, nlat, num)
    spcx = np.diff(lons)
    spcy = np.diff(lats)
    gx = lons[xloc] - (spcx[0] / 2.) + ((num / 2.) * spcx[0])
    gy = lats[yloc] - (spcy[0] / 2.) + ((num / 2.) * spcy[0])
    ds_data = np.zeros([len(yloc), len(xloc)], dtype=data.dtype)
    if ("float" in data.dtype.name) or ("int" in data.dtype.name):
        # Data is float or int
        is_num = True
    else:
        # Could be a categorical string array
        is_num = False

    for j in range(len(yloc)):
        if j == (len(yloc) - 1):
            rval_m = data[yloc[j]:, :]
        else:
            rval_m = data[yloc[j]:yloc[j + 1], :]
        for i in range(len(xloc)):
            if i == (len(xloc) - 1):
                cell_data = rval_m[:, xloc[i]:]
            else:
                cell_data = rval_m[:, xloc[i]:xloc[i + 1]]
            # Get valid number of cells
            if is_num:
                idx_data = np.logical_and(np.logical_not(np.isnan(cell_data)),
                                          np.logical_not(np.isinf(cell_data)))
            else:
                idx_data = np.logical_and(np.logical_not(cell_data == "inf"),
                                          np.logical_not(cell_data == "nan"))
            if not np.any(idx_data):
                # No valid data at all in cell
                ds_data[j, i] = np.nan
            else:
                ds_data[j, i] = stats.mode(cell_data[idx_data],
                                           axis=None).mode[0]
    return ds_data, gx, gy


# =============================================================================
# ===== Methods to build slope, geology, vs30 and backarc distance grids ======
# =============================================================================


def downsample_slope_geology_vs30_grid(bbox, num, weighting, averaging="MEAN",
                                       geol_weighting="MODE", default_xvf=150.,
                                       n=0, onshore_only=True,
                                       as_dataframe=False):
    """
    Starting from the 30 arc-seconds grid, downsamples the model to a
    coarser regular grid within a bounding box with spacing a multiple of 30
    arc-seconds

    :param list bbox:
        Bounding box [llon, llat, ulon, ulat] in decimal degrees
    :param int num:
        Number of 30 arc second cells per downsampled cell (e.g. 60 arc-seconds
        = 2, 90 arc-seconds=3, 120 arc-seconds=4)
    :param bool weighting:
        Weight the averages according to density
    :param str averaging:
        Choice of method for taking the values in the downsampled cells
        ("MEAN", "MEDIAN" or "MAX")
    :param str geol_weighting:
        For the geology take either the most common unit per downsampled
        grid cell ("MODE") or sort the units by weighting and take the median
    :param float default_xvf:
        Default value of volcanic front distance for sites unaffected by
        subduction or deep earthquakes
    :param int n:
        Smoothing level of the built density grid (0 for the unsmoothed
        density, otherwise one of SMOOTHED_N)
    :param bool onshore_only:
        Limit the returned dataframe only to those sites onshore (as determined
        by elevation >= -5 m above M.S.L.)
    :param bool as_dataframe:
        Return the model as a geopandas GeoDataFrame (True) or as a set of
        arrays (False)
    """
    if not (averaging in ("MEAN", "MEDIAN", "MAX")):
        raise ValueError("Averaging method %s not recognised" % averaging)
    # Get the reference longitudes, latitudes, slopes and geology grids
    print("---- Retrieving data within bounding box")
    lons, lats, slope = slice_wgs84_datafile(SITE_HDF5_FILE, bbox, "slope")
    geology = slice_wgs84_datafile(SITE_HDF5_FILE, bbox, "geology")[2]
    vs30 = slice_wgs84_datafile(VS30_HDF5_FILE, bbox, "vs30")[2]

    if weighting or (geol_weighting in ("MEDIAN", "MAX")):
        # Using weighting, so extract density
        density_key = "built_density"
        if n and (n in SMOOTHED_N):
            # Use the smoothed density
            density_key += "_n{:g}".format(n)

        density = slice_wgs84_datafile(SITE_HDF5_FILE, bbox, density_key)[2]
    print("---- Downsampling slope and Vs30")
    if weighting:
        # Get the slopes and vs30s weighted by the building density
        slopes, glons, glats = downsample_weighted_grid(
            lons, lats, slope, density, num, method=averaging)
        vs30s = downsample_weighted_grid(lons, lats, vs30, density, num,
                                         method=averaging)[0]
    else:
        # Get the slopes and vs30s by averaging without weighting
        slopes, glons, glats = downsample_grid(lons, lats, slope, num,
                                               method=averaging)
        vs30s = downsample_grid(lons, lats, vs30, num,
                                method=averaging)[0]
    # Need to add on the geology
    print("---- Downsampling geology")
    if geol_weighting in ("MEDIAN", "MAX"):
        # Get the weighted median or weighted max geological category per cell
        geologies = downsample_weighted_grid(lons, lats, geology, density,
                                             num, method=geol_weighting)[0]
    else:
        # Gets the modal geological category per cell
        geologies = downsample_modal_grid(lons, lats, geology, num)[0]
    # Need to add on backarc distances for the target sites
    print("---- Getting backarc distance")
    glons, glats = np.meshgrid(glons, glats)
    xvf = interpolate_xvf_grid(glons, glats, default_xvf)

    if onshore_only:
        print("---- Clipping to onshore")
        raw_elevation = slice_wgs84_datafile(SITE_HDF5_FILE, bbox,
                                             "elevation")[2]
        if weighting:
            elevations = downsample_weighted_grid(lons, lats, raw_elevation,
                                                  density, num,
                                                  method=averaging)[0]
        else:
            elevations = downsample_grid(lons, lats, raw_elevation,
                                         num, method=averaging)[0]
        idx = elevations >= -5.0
    else:
        idx = np.ones(glons.shape, dtype=bool)

    if as_dataframe:
        print("---- Returning dataframe")
        # Return the grids as geopandas GeoDataFrame
        glons = glons[idx].flatten()
        glats = glats[idx].flatten()
        return gpd.GeoDataFrame({
            "geometry": gpd.GeoSeries([Point(lon, lat) for lon, lat in
                                       zip(glons, glats)]),
            "lon": glons,
            "lat": glats,
            "vs30": vs30s[idx].flatten(),
            "slope": slopes[idx].flatten(),
            "geology": [GEOL_DICT_KEY_TO_NAME[geol]
                        for geol in geologies[idx].flatten()],
            "xvf": xvf[idx]})
    else:
        # Returns the grids as 2D array
        xvf = np.reshape(xvf, glons.shape)
        return slopes, vs30s, geologies, xvf, glons, glats, idx
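
# Usage sketch (requires the packaged hdf5 site files; the bounding box is an
# arbitrary example around Switzerland, downsampled to 120 arc seconds):
#   >>> sites = downsample_slope_geology_vs30_grid(
#   ...     [5.9, 45.8, 10.5, 47.8], num=4, weighting=True,
#   ...     averaging="MEAN", geol_weighting="MODE", as_dataframe=True)
#   >>> sites[["lon", "lat", "vs30", "slope", "geology", "xvf"]].head()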


def get_property_at_location(db, longitude, latitude, property_name):
    """
    Returns the property at the location from a regional raster data set
    stored in hdf5 - good for single sites and attributes, but too slow for
    large data sets
    """
    if latitude < db["latitude"][-1] or latitude > db["latitude"][0]:
        print("Latitude %.5f out of range" % latitude)
        return np.nan
    if longitude < db["longitude"][0] or longitude > db["longitude"][-1]:
        print("Longitude %.5f out of range" % longitude)
        return np.nan
    idx_x = np.where(db["longitude"][:] <= longitude)[0][-1]
    idx_y = np.where(db["latitude"][:] >= latitude)[0][-1]
    value = db[property_name][idx_y, idx_x]
    if value < 0:
        value = np.nan
    else:
        value = float(value)
    return value


def get_slope_geology_vs30_at_location(site_lons, site_lats, spc=(1. / 120.),
                                       onshore_only=True, as_dataframe=False):
    """
    Retrieves the slope and geology at a set of locations
    :param np.ndarray site_lons:
        Longitudes of target sites
    :param np.ndarray site_lats:
        Latitudes of target sites
    :param float spc:
        Spacing (in decimal degrees) of the reference hdf5 grid (default
        1/120 degrees, i.e. 30 arc seconds)
    :returns:
        site_slope: slopes of the sites (in m/m)
        site_vs30: Inferred Vs30 (in m/s)
        site_geology: geological index of the sites
        site_backarc_distance: backarc (volcanic front) distance (in km)
        idx: boolean array indicating which of the sites are onshore
    """
    # Get bounding box of sites (with spc buffer)
    bbox = [np.min(site_lons) - spc, np.min(site_lats) - spc,
            np.max(site_lons) + spc, np.max(site_lats) + spc]
    # Get slope and geology from hdf5 file
    print("---- Retrieving data within bounding box")
    lons, lats, slope = slice_wgs84_datafile(SITE_HDF5_FILE, bbox, "slope")
    geology = slice_wgs84_datafile(SITE_HDF5_FILE, bbox, "geology")[2]
    lons_vs, lats_vs, vs30 = slice_wgs84_datafile(VS30_HDF5_FILE, bbox, "vs30")
    if onshore_only:
        elevation = slice_wgs84_datafile(SITE_HDF5_FILE, bbox, "elevation")[2]
    else:
        elevation = np.zeros([len(lats), len(lons)])
    print("---- Identifying site values")
    # Integer number of multiples of spc indicates the x and y locations
    dx = ((site_lons - (lons[0] - spc / 2.)) / spc).astype(int)
    dy = (np.fabs((site_lats - (lats[0] + spc / 2.)) / spc)).astype(int)
    dx_vs = ((site_lons - (lons_vs[0] - spc / 2.)) / spc).astype(int)
    dy_vs = (np.fabs((site_lats - (lats_vs[0] + spc / 2.)) / spc)).astype(int)

    # Get the volcanic front (backarc) distance
    print("---- Getting volcanic distance")
    xvf = interpolate_xvf_grid(site_lons, site_lats)

    # Return the slope and geology values at the selected locations
    if as_dataframe:
        print("---- Building dataframe")
        elevation = elevation[dy, dx]
        idx = elevation >= -5.0
        #print(site_lons, site_lats)
        return gpd.GeoDataFrame({
            "geometry": gpd.GeoSeries([
                Point(lon, lat)
                for lon, lat in zip(site_lons[idx], site_lats[idx])]),
            "lon": site_lons[idx],
            "lat": site_lats[idx],
            "slope": slope[dy, dx][idx],
            "vs30": vs30[dy_vs, dx_vs][idx],
            "geology": [GEOL_DICT_KEY_TO_NAME[geol]
                        for geol in geology[dy, dx][idx]],
            "xvf": xvf[idx]})
    idx = elevation[dy, dx] >= -5.0
    return slope[dy, dx], vs30[dy_vs, dx_vs], geology[dy, dx], xvf, idx
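
# Usage sketch (arbitrary example sites; requires the packaged hdf5 files):
#   >>> site_lons = np.array([2.35, 13.40, 23.73])
#   >>> site_lats = np.array([48.86, 52.52, 37.98])
#   >>> sites = get_slope_geology_vs30_at_location(site_lons, site_lats,
#   ...                                            as_dataframe=True)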


def get_slope_geology_vs30_polygons(regions, admin_id, weighting=True,
                                    method="MEAN", method_geol="MODE", n=0,
                                    proj=EUROPE_EQUAL_AREA):
    """
    For a set of polygons, retrieves the average slope, geology and vs30
    for each polygon (optionally weighted by built density) and the backarc
    distance of the centroid location.

    :param geopandas.GeoDataFrame regions:
        Administrative regions as a geopandas GeoDataFrame with geometry
        stored as a set of polygons
    :param str admin_id:
        Name of the administrative ID column (e.g. "ID_2") used to group
        the regions
    :param bool weighting:
        Apply weighting to the averaging using the built density
    :param str method:
        How to take the weighted average for the group of values ("MEAN",
        "MEDIAN", "MAX"), default "MEAN"
    :param str method_geol:
        How to determine preferred values from the geology data ("MODE",
        "MEDIAN", "MAX"), default "MODE"
    :param str proj:
        Projection to use for geospatial join operations (default Europe
        Lambert Equal Area)
    """
    if weighting and (method not in WEIGHTING_METHODS):
        raise ValueError("Weighting method %s not recognised"
                         % str(method))
    if method_geol not in ("MEDIAN", "MODE", "MAX"):
        raise ValueError("Geology weighting method %s not recognised"
                         % str(method_geol))
    # Extract bounding box for entire regions
    bbox = tuple(regions.total_bounds)
    print("--- Retrieving grid data")
    # Get slices of slope, geology, vs30 and built density
    lons, lats, slope = slice_wgs84_datafile(SITE_HDF5_FILE, bbox, "slope",
                                             flatten=True)
    geology = slice_wgs84_datafile(SITE_HDF5_FILE, bbox, "geology",
                                   flatten=True)[2]
    vs30 = slice_wgs84_datafile(VS30_HDF5_FILE, bbox, "vs30",
                                flatten=True)[2]

    print("--- Building and reprojecting dataframe")
    df1 = gpd.GeoDataFrame({
        "geometry": gpd.GeoSeries([Point(lon, lat)
                                   for lon, lat in zip(lons, lats)]),
        "slope": slope,
        "geology": geology,
        "vs30": vs30})
    if weighting or (method_geol == "MEDIAN"):
        density_key = "built_density"
        if n and (n in SMOOTHED_N):
            density_key += "_n{:g}".format(n)
        density = slice_wgs84_datafile(SITE_HDF5_FILE, bbox, density_key,
                                       flatten=True)[2]
        df1["density"] = density
    df1.crs = {"init": "epsg:4326"}
    # Re-project to European (Lambert) equal area projection
    df1 = df1.to_crs({"init": EUROPE_EQUAL_AREA})
    regions_ea = regions.to_crs({"init": EUROPE_EQUAL_AREA})
    # Assign polygons to points
    print("--- Assigning sites to polygons")
    df1 = gpd.sjoin(df1, regions_ea, how="left")
    # Loop through the regions and assign the "average" properties
    vs30s = []
    slopes = []
    geologies = []
    df1_admins = df1.groupby(admin_id)
    print("--- Calculating average properties")
    for reg_id in regions[admin_id].values:
        if reg_id not in df1_admins.groups:
            # No sites within region
            print("Region %s contains no site information" % reg_id)
            vs30s.append(np.nan)
            slopes.append(np.nan)
            geologies.append(np.nan)
            continue
        grp = df1_admins.get_group(reg_id)

        if weighting:
            # Take the averages across the polygon, weighted by the density
            vs30i = WEIGHTING_METHODS[method](grp["vs30"].values,
                                              grp["density"].values)
            slopei = WEIGHTING_METHODS[method](grp["slope"].values,
                                               grp["density"].values)
        else:
            # Take the average across the polygon without density weighting
            vs30i = UNWEIGHTED_METHODS[method](grp["vs30"].values)
            slopei = UNWEIGHTED_METHODS[method](grp["slope"].values)
        vs30s.append(vs30i)
        slopes.append(slopei)
        # Get the geology
        if method_geol == "MEDIAN":
            # Get the median geology
            geologyi = WEIGHTING_METHODS["MEDIAN"](grp["geology"].values,
                                                   grp["density"].values)
        else:
            geologyi = stats.mode(grp["geology"].values).mode[0]
        geologies.append(GEOL_DICT_KEY_TO_NAME[geologyi])
    # Append this to the admin dataframe
    regions["slope"] = np.array(slopes)
    regions["vs30"] = np.array(vs30s)
    regions["geology"] = geologies
    return regions
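
# Usage sketch (assumes an admin GeoDataFrame with an "ID_2" column; the
# shapefile name is hypothetical and the weighted means use the built density
# stored in SITE_HDF5_FILE):
#   >>> regions = gpd.GeoDataFrame.from_file("Adm2_Greece.shp")
#   >>> regions = get_slope_geology_vs30_polygons(regions, "ID_2",
#   ...                                           weighting=True, method="MEAN")
#   >>> regions[["ID_2", "slope", "vs30", "geology"]].head()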


def weighted_centroid_grid(gx, gy, spc):
    """
    Function to coarsen the high resolution (250 m x 250 m) built density
    file for Europe onto a lower resolution WGS84-defined grid
    """
    # Transform long/lat boundaries to Mollweide projection
    # Get bounding box in terms of long, lat