# main.py (forked from v-iashin/video_features)

# We import the otherwise-unused `numpy` before `torch` because, when this script
# is launched via `subprocess.run()`, importing `torch` first fails with
# `Error: mkl-service + Intel(R) MKL: MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1 library.`
# (see https://github.com/pytorch/pytorch/issues/37377)
import numpy
import torch
import argparse
from utils.utils import form_list_from_user_input, fix_tensorflow_gpu_allocation, sanity_check
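
# For context, the MKL workaround above matters when this script is launched as
# a child process, e.g. (an illustrative sketch; the caller is not part of this
# repo and the paths are placeholders):
#
#   import subprocess
#   subprocess.run(['python', 'main.py', '--feature_type', 'i3d',
#                   '--device_ids', '0', '--video_paths', 'path/to/video.mp4'])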


def build_extractor(args):
    '''Constructs the feature extractor that corresponds to `args.feature_type`.
    The model modules are imported lazily inside each branch so that the
    dependencies of unused models cannot cause import errors.'''
    if args.feature_type == 'i3d':
        from models.i3d.extract_i3d import ExtractI3D
        extractor = ExtractI3D(args)
    elif args.feature_type == 'r21d_rgb':
        from models.r21d.extract_r21d import ExtractR21D
        extractor = ExtractR21D(args)
    elif args.feature_type == 'vggish':
        from models.vggish.extract_vggish import ExtractVGGish
        fix_tensorflow_gpu_allocation(args)
        extractor = ExtractVGGish(args)
    elif args.feature_type in ['resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152']:
        from models.resnet.extract_resnet import ExtractResNet
        extractor = ExtractResNet(args)
    elif args.feature_type == 'raft':
        from models.raft.extract_raft import ExtractRAFT
        extractor = ExtractRAFT(args)
    elif args.feature_type == 'pwc':
        from models.pwc.extract_pwc import ExtractPWC
        extractor = ExtractPWC(args)
    elif args.feature_type in ['CLIP-ViT-B/32', 'CLIP-ViT-B/16', 'CLIP4CLIP-ViT-B-32']:
        from models.CLIP.extract_clip import ExtractCLIP
        extractor = ExtractCLIP(args)
    elif args.feature_type == 'vggish_torch':
        from models.vggish_torch.extract_vggish import ExtractVGGish
        extractor = ExtractVGGish(args)
    else:
        raise NotImplementedError(f'Unknown feature type: {args.feature_type}')
    return extractor


def parallel_feature_extraction(args):
    '''Distributes the feature extraction in an embarrassingly parallel fashion.
    Specifically, it divides the dataset (the list of video paths) evenly among
    all specified devices and extracts features on each of them.'''
    extractor = build_extractor(args)
    # The indices correspond to the positions of the target videos in the
    # `video_paths` list. They are needed because `scatter` accepts only tensors,
    # and there is no torch tensor type suitable for strings (`video_paths`).
    # Each scattered input also carries a `.device` attribute, which lets the
    # extractor access the device it is running on.
    video_paths = form_list_from_user_input(args)
    indices = torch.arange(len(video_paths))
    replicas = torch.nn.parallel.replicate(extractor, args.device_ids[:len(indices)])
    inputs = torch.nn.parallel.scatter(indices, args.device_ids[:len(indices)])
    torch.nn.parallel.parallel_apply(replicas[:len(inputs)], inputs)
    # close the tqdm progress bar to avoid unexpected errors caused by multi-threading
    extractor.progress.close()
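
# For intuition, the replicate/scatter/parallel_apply pattern above behaves like
# this self-contained toy (a sketch assuming at least one CUDA device; `Toy` is
# hypothetical and not part of this repo):
#
#   import torch
#
#   class Toy(torch.nn.Module):
#       def forward(self, indices):
#           # each replica receives its own chunk of indices on its own device
#           print(indices.tolist(), indices.device)
#
#   device_ids = [0]  # e.g. [0, 1] with two GPUs
#   indices = torch.arange(6)
#   replicas = torch.nn.parallel.replicate(Toy().to('cuda:0'), device_ids)
#   chunks = torch.nn.parallel.scatter(indices, device_ids)
#   torch.nn.parallel.parallel_apply(replicas[:len(chunks)], chunks)
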

def cpu_feature_extraction(args):
    '''Extracts features for all requested videos sequentially on the CPU.'''
    extractor = build_extractor(args)
    video_paths = form_list_from_user_input(args)
    indices = torch.arange(len(video_paths))
    extractor(indices)
    extractor.progress.close()
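
# Both entry points rely on each extractor being a `torch.nn.Module` whose
# `forward` takes a tensor of video indices and handles printing/saving on its
# own, roughly along these lines (a sketch; the real classes live in
# `models/*/extract_*.py`, and `extract_one` is a hypothetical name):
#
#   class ExtractSomething(torch.nn.Module):
#       def forward(self, indices: torch.Tensor):
#           for idx in indices:
#               # print or save features according to `--on_extraction`
#               self.extract_one(indices.device, idx)
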
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Extract Features')
parser.add_argument('--feature_type', required=True,
choices=['i3d', 'vggish', 'r21d_rgb', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
'resnet152', 'raft', 'pwc', 'CLIP-ViT-B/32', 'CLIP-ViT-B/16', 'CLIP4CLIP-ViT-B-32', 'vggish_torch'])
parser.add_argument('--video_paths', nargs='+', help='space-separated paths to videos')
parser.add_argument('--flow_paths', nargs='+', help='space-separated paths to video flow images')
parser.add_argument('--file_with_video_paths', help='.txt file where each line is a path')
parser.add_argument('--video_dir', type=str, help='dir of videos')
parser.add_argument('--flow_dir', type=str,
help='dir of optical flow of videos. [flow_dir]/[video id]/[flow_(x/y)_000001.jpg]')
parser.add_argument('--device_ids', type=int, nargs='+', help='space-separated device ids')
parser.add_argument('--cpu', action='store_true', help='use cpu only')
parser.add_argument('--tmp_path', default='./tmp',
help='folder to store the temporary files used for extraction (frames or aud files)')
    parser.add_argument('--keep_tmp_files', dest='keep_tmp_files', action='store_true', default=False,
                        help='keep the temporary files after feature extraction (relevant only for vggish and i3d)')
parser.add_argument('--on_extraction', default='print',
choices=['print', 'save_numpy', 'save_pickle'],
help='what to do once the stack is extracted')
parser.add_argument('--output_path', default='./output', help='where to store results if saved')
    parser.add_argument('--output_direct', action="store_true",
                        help='if set, files will be saved directly in output_path')
    parser.add_argument('--extraction_fps', type=float,
                        help='(Outdated) fps at which to extract frames; leave unspecified to use the original video fps')
    parser.add_argument('--extract_method', type=str, help='method used to extract frames')
    parser.add_argument('--stack_size', type=int, help='feature time span, in frames')
    parser.add_argument('--step_size', type=int, help='step between consecutive feature stacks, in frames')
parser.add_argument('--streams', nargs='+', choices=['flow', 'rgb'],
help='Streams to use for feature extraction. Both used if not specified')
parser.add_argument('--flow_type', choices=['raft', 'pwc', 'flow'], default='pwc',
help='Flow to use in I3D. PWC is faster while RAFT is more accurate.')
    parser.add_argument('--batch_size', type=int, default=1,
                        help='batch size (supported only by frame-wise extractors)')
    parser.add_argument('--resize_to_larger_edge', dest='resize_to_smaller_edge', action='store_false',
                        default=True, help='The larger side will be resized to this number, maintaining the '
                        'aspect ratio. By default, the smaller side is used (as in torchvision Resize).')
parser.add_argument('--side_size', type=int,
help='If specified, the input images will be resized to this value in RAFT.')
    parser.add_argument(
        '--show_pred', dest='show_pred', action='store_true', default=False,
        help='show model predictions on its pre-training dataset (ImageNet or Kinetics) for each feature'
    )
args = parser.parse_args()
    # report where the results and temporary files will go
if args.on_extraction in ['save_numpy', 'save_pickle']:
print(f'Saving features to {args.output_path}')
if args.keep_tmp_files:
print(f'Keeping temp files in {args.tmp_path}')
sanity_check(args)
if args.cpu:
cpu_feature_extraction(args)
else:
parallel_feature_extraction(args)
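
# Example invocations (illustrative; paths and device ids are placeholders):
#
#   # I3D features on two GPUs, printing the extracted stacks
#   python main.py --feature_type i3d --device_ids 0 1 \
#       --video_paths path/to/a.mp4 path/to/b.mp4
#
#   # ResNet-50 features on CPU, saved as .npy files
#   python main.py --feature_type resnet50 --cpu --on_extraction save_numpy \
#       --file_with_video_paths paths.txt --output_path ./output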